def init_cluster(encrypted=True, clean_dir=True, allow_shell=False):
    """
    If not already done, initializes the ResourceAllocationManager and
    adds a cluster using encrypted or unencrypted communication.
    Returns the name of the configured cluster, or None if no cluster
    could be configured.
    """
    key = 'PublicKey' if encrypted else 'AuthKey'
    current = ResourceAllocationManager.list_allocators()

    if len(current) == 1:
        # First time we've been called.
        # Ensure we aren't held up by local host load problems.
        local_alloc = ResourceAllocationManager.get_allocator(0)
        if local_alloc.max_load < 10:
            local_alloc.max_load = 10
        if clean_dir:
            # Remove any local allocator-created directories.
            for sim_dir in glob.glob('Sim-*'):
                shutil.rmtree(sim_dir, onerror=onerror)

    host = platform.node()
    cluster_name = '%s_%s' % (host.replace('.', '_'), key)
    if any(alloc.name == cluster_name for alloc in current):
        return cluster_name  # Don't add multiple copies.

    interpreter = sys.executable
    if host.startswith('gxterm'):
        # User environment assumed OK on this GRC cluster front-end.
        # Using less than full machine (55 nodes) to allow multiple
        # cluster testing without hitting limit on open files (sockets).
        machines = [{'hostname': 'gx%02d' % i, 'python': interpreter}
                    for i in range(20)]
    elif local_ssh_available():
        machines = [{'hostname': host, 'python': interpreter}]
    else:
        machines = []

    if machines:
        cluster = ClusterAllocator(cluster_name, machines, key, allow_shell)
        ResourceAllocationManager.insert_allocator(0, cluster)
        return cluster_name

    if not encrypted:
        # Create a LocalAllocator so we have *something*.
        cluster_name = 'LocalUnencrypted'
        if any(alloc.name == cluster_name for alloc in current):
            return cluster_name  # Don't add multiple copies.
        fallback = LocalAllocator(cluster_name, authkey=key,
                                  allow_shell=allow_shell)
        ResourceAllocationManager.insert_allocator(0, fallback)
        return cluster_name

    return None
def rundlcs(): """ run the whole process, including startup and shutdown to do: parse input create load cases create app assembly create dispatcher send cases and app to dispatcher run cases collect and save output """ options, arg = get_options() ctrl = parse_input(options) # ctrl will be just the input, but broken up into separate categories, e.g. # ctrl.cases, ctrl.app, ctrl.dispatch, ... # work in progress; running efficiently at NREL. if (options.cluster_allocator): from PeregrineClusterAllocator import ClusterAllocator cluster = ClusterAllocator() RAM.insert_allocator(0, cluster) ### using "factory" functions to create specific subclasses (e.g. distinguish between FAST and HAWC2) # Then we use these to create the cases... cases = create_run_cases(ctrl.cases, options) # and a turbine---never used this "stub" # turbine = create_turbine(ctrl.turbine) # and the appropriate wind code wrapper... aerocode = create_aerocode_wrapper(ctrl.aerocode, ctrl.output, options) # and the appropriate dispatcher... dispatcher = create_dlc_dispatcher(ctrl.dispatcher) ### After this point everything should be generic, all appropriate subclass object created # # # # # # # # # # # dispatcher.presetup_workflow( aerocode, cases) # just makes sure parts are there when configure() is called dispatcher.configure() # Now tell the dispatcher to (setup and ) run the cases using the aerocode on the turbine. # calling configure() is done inside run(). but now it is done already (above), too. # norun does not write directories, but it does set us up to process them if they already exist if (not options.norun): print "calling run" dispatcher.run() # TODO: more complexity will be needed for difference between "run now" and "run later" cases. dispatcher.collect_output(ctrl.output)
def rundlcs():
    """
    Run the whole process, including startup and shutdown.

    Steps: parse input, create load cases, create the app assembly,
    create the dispatcher, send cases and app to the dispatcher, run the
    cases, collect and save output, then compute the final load.
    """
    opts, _args = get_options()
    ctl = parse_input(opts.main_input, opts)
    # ctl is just the input, broken up into separate categories, e.g.
    # ctl.cases, ctl.aerocode, ctl.dispatcher, ...

    if opts.cluster_allocator:
        RAM.insert_allocator(0, ClusterAllocator())

    # "Factory" functions create the specific subclasses
    # (e.g. distinguish between FAST and HAWC2).
    load_cases = create_load_cases(ctl.cases, opts)
    turb = create_turbine(ctl.turbine)
    wrapper = create_aerocode_wrapper(ctl.aerocode, opts)
    disp = create_dlc_dispatcher(ctl.dispatcher)

    # After this point everything should be generic; all appropriate
    # subclass objects have been created.
    # presetup_workflow() just makes sure parts are there when
    # configure() is called; run() calls configure() as well.
    disp.presetup_workflow(wrapper, turb, load_cases)
    disp.configure()

    if not opts.norun:
        disp.run()

    # TODO: more complexity will be needed for difference between
    # "run now" and "run later" cases.
    disp.collect_output(ctl.output)

    ctx = sampler.Context()
    field_idx = 20  # = RootMyc1Std
    final_load_calc(ctx, "dlcproto.out", not disp.raw_cases, field_idx)
def rundlcs(): """ run the whole process, including startup and shutdown to do: parse input create load cases create app assembly create dispatcher send cases and app to dispatcher run cases collect and save output """ options, arg = get_options() ctrl = parse_input(options) # ctrl will be just the input, but broken up into separate categories, e.g. # ctrl.cases, ctrl.app, ctrl.dispatch, ... # work in progress; running efficiently at NREL. if (options.cluster_allocator): from PeregrineClusterAllocator import ClusterAllocator cluster=ClusterAllocator() RAM.insert_allocator(0,cluster) ### using "factory" functions to create specific subclasses (e.g. distinguish between FAST and HAWC2) # Then we use these to create the cases... cases = create_run_cases(ctrl.cases, options) # and a turbine---never used this "stub" # turbine = create_turbine(ctrl.turbine) # and the appropriate wind code wrapper... aerocode = create_aerocode_wrapper(ctrl.aerocode, ctrl.output, options) # and the appropriate dispatcher... dispatcher = create_dlc_dispatcher(ctrl.dispatcher) ### After this point everything should be generic, all appropriate subclass object created # # # # # # # # # # # dispatcher.presetup_workflow(aerocode, cases) # just makes sure parts are there when configure() is called dispatcher.configure() # Now tell the dispatcher to (setup and ) run the cases using the aerocode on the turbine. # calling configure() is done inside run(). but now it is done already (above), too. # norun does not write directories, but it does set us up to process them if they already exist if (not options.norun): print "calling run" dispatcher.run() # TODO: more complexity will be needed for difference between "run now" and "run later" cases. dispatcher.collect_output(ctrl.output)
def rundlcs():
    """
    Run the whole process, including startup and shutdown.

    to do: parse input; create load cases; create app assembly; create
    dispatcher; send cases and app to dispatcher; run cases; collect and
    save output; compute the final load.
    """
    options, _arg = get_options()
    inputs = parse_input(options.main_input, options)
    # inputs is the parsed input split into categories:
    # inputs.cases, inputs.aerocode, inputs.dispatcher, ...

    if options.cluster_allocator:
        RAM.insert_allocator(0, ClusterAllocator())

    # "Factory" functions create specific subclasses
    # (e.g. distinguish between FAST and HAWC2).
    case_set = create_load_cases(inputs.cases, options)
    machine = create_turbine(inputs.turbine)
    code = create_aerocode_wrapper(inputs.aerocode, options)
    dlc_disp = create_dlc_dispatcher(inputs.dispatcher)

    # Everything past this point should be generic; all appropriate
    # subclass objects have been created.
    dlc_disp.presetup_workflow(code, machine, case_set)  # parts present before configure()
    dlc_disp.configure()  # also invoked inside run()

    if not options.norun:
        dlc_disp.run()

    # TODO: distinguish "run now" from "run later" cases.
    dlc_disp.collect_output(inputs.output)

    context = sampler.Context()
    field_idx = 20  # = RootMyc1Std
    final_load_calc(context, "dlcproto.out", not dlc_disp.raw_cases, field_idx)
def InitializeCluster(hostnames, pydir, identity_filename=None): print 'Connecting to cluster...' machines = [] for host in hostnames: machines.append(ClusterHost( hostname=host, python = pydir, tunnel_incoming=True, tunnel_outgoing=True, identity_filename=identity_filename)) _SSH.extend(['-o', 'StrictHostKeyChecking=no']) #somewhat dangerous, this automatically adds the host key to known_hosts cluster = ClusterAllocator('PCCCluster', machines, allow_shell=True) RAM.insert_allocator(0, cluster) print 'Servers connected on cluster:',cluster.max_servers({})[0] global UseCluster UseCluster = True
def main():
    """
    Configure a cluster and use it.

    Builds a ClusterAllocator from either several EC2 hosts (when
    USE_EC2 is true) or a trivial local 'cluster', registers it with the
    ResourceAllocationManager, then runs a GPOptimization whose DOE
    driver executes concurrently and is restricted to that cluster.
    """
    enable_console(logging.DEBUG)
    logging.getLogger().setLevel(0)
    print 'Client PID', os.getpid()

    # Configure cluster.
    cluster_name = 'EC2Cluster'
    machines = []
    if USE_EC2:
        # The identity file used to access EC2 via ssh.
        # PuTTY-style '.ppk' key on Windows, '.pem' elsewhere.
        identity_filename = os.path.expanduser('~/.ssh/lovejoykey')
        identity_filename += '.ppk' if sys.platform == 'win32' else '.pem'

        # Each entry names the remote Python inside a date-stamped
        # OpenMDAO test-branch install on that host.
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.04.529682' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.03.113077' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.05.434412' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        # Next two installs use 'Scripts' rather than 'bin' —
        # presumably Windows hosts; TODO confirm.
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.20.17.379627' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.19.49.348885' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))
    else:
        # Trivial local 'cluster' for debugging without remote host issues.
        machines.append(
            ClusterHost(hostname=socket.getfqdn(), python=sys.executable))
#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))

    # Start it.
    cluster = ClusterAllocator(cluster_name, machines, allow_shell=True,
                               method='load-average')
                               # method='greedy')
                               # method='round-robin')
    print 'Cluster initialized'
    RAM.insert_allocator(0, cluster)

    # Report which hosts actually came up.
    n_servers = RAM.max_servers(dict(allocator=cluster_name))
    print n_servers, 'Servers:'
    for name in RAM.get_hostnames(
            dict(allocator=cluster_name, min_cpus=n_servers)):
        print ' ', name

    # Create model.
    top = GPOptimization()

    # Configure DOE.
    top.driver.sequential = False    # Run concurrently across cluster.
    top.driver.reload_model = False
    # Force use of only cluster hosts by adding this requirement.
    top.driver.extra_resources = dict(allocator=cluster_name)
    # This is necessary more often than it should be.
    top.driver.ignore_egg_requirements = True

    # Perform the optimization.
    top.run()
def main():
    """
    Configure a cluster and use it.

    Builds a ClusterAllocator from either several EC2 hosts (when
    USE_EC2 is true) or a trivial local 'cluster', registers it with the
    ResourceAllocationManager, then runs a GPOptimization whose DOE
    driver executes concurrently and is restricted to that cluster.
    """
    enable_console(logging.DEBUG)
    logging.getLogger().setLevel(0)
    print 'Client PID', os.getpid()

    # Configure cluster.
    cluster_name = 'EC2Cluster'
    machines = []
    if USE_EC2:
        # The identity file used to access EC2 via ssh.
        # PuTTY-style '.ppk' key on Windows, '.pem' elsewhere.
        identity_filename = os.path.expanduser('~/.ssh/lovejoykey')
        identity_filename += '.ppk' if sys.platform == 'win32' else '.pem'

        # Each entry names the remote Python inside a date-stamped
        # OpenMDAO test-branch install on that host.
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.04.529682' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.03.113077' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.05.434412' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        # Next two installs use 'Scripts' rather than 'bin' —
        # presumably Windows hosts; TODO confirm.
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.20.17.379627' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.19.49.348885' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))
#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))
    else:
        # Trivial local 'cluster' for debugging without remote host issues.
        machines.append(ClusterHost(hostname=socket.getfqdn(),
                                    python=sys.executable))
#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))

    # Start it.
    cluster = ClusterAllocator(cluster_name, machines, allow_shell=True,
                               method='load-average')
                               # method='greedy')
                               # method='round-robin')
    print 'Cluster initialized'
    RAM.insert_allocator(0, cluster)

    # Report which hosts actually came up.
    n_servers = RAM.max_servers(dict(allocator=cluster_name))
    print n_servers, 'Servers:'
    for name in RAM.get_hostnames(dict(allocator=cluster_name,
                                       min_cpus=n_servers)):
        print ' ', name

    # Create model.
    top = GPOptimization()

    # Configure DOE.
    top.driver.sequential = False    # Run concurrently across cluster.
    top.driver.reload_model = False
    # Force use of only cluster hosts by adding this requirement.
    top.driver.extra_resources = dict(allocator=cluster_name)
    # This is necessary more often than it should be.
    top.driver.ignore_egg_requirements = True

    # Perform the optimization.
    top.run()
def test_configure(self):
    """
    Exercise RAM.configure() from a config file, error handling for bad
    config values, and add/insert/get/remove of allocators.
    """
    logging.debug('')
    logging.debug('test_configure')

    # Reconfigure: raise the existing LocalHost allocator's max_load.
    with open('resources.cfg', 'w') as out:
        out.write("""
[LocalHost]
max_load: 100
""")
    local = RAM.get_allocator('LocalHost')
    max_load = local.max_load
    try:
        self.assertTrue(max_load < 100)
        RAM.configure('resources.cfg')
        self.assertEqual(local.max_load, 100)
        local.max_load = max_load  # restore so other tests are unaffected
    finally:
        os.remove('resources.cfg')

    # Add another local allocator via config file and verify that every
    # configured attribute reached the new allocator.
    with open('resources.cfg', 'w') as out:
        out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
authkey: PublicKey
allow_shell: False
total_cpus: 42
max_load: 200
""")
    try:
        RAM.configure('resources.cfg')
        local2 = RAM.get_allocator('Local2')
        self.assertEqual(local2.factory._authkey, 'PublicKey')
        self.assertEqual(local2.factory._allow_shell, False)
        self.assertEqual(local2.total_cpus, 42)
        self.assertEqual(local2.max_load, 200)
        self.assertEqual(local2.host, socket.gethostname())
        self.assertTrue(local2.pid > 0)
        RAM.remove_allocator('Local2')
    finally:
        os.remove('resources.cfg')

    # Bad local total_cpus: must be rejected with ValueError.
    with open('resources.cfg', 'w') as out:
        out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
total_cpus: 0
""")
    try:
        assert_raises(self, "RAM.configure('resources.cfg')",
                      globals(), locals(), ValueError,
                      'Local2: total_cpus must be > 0, got 0')
    finally:
        os.remove('resources.cfg')

    # Bad local max_load: must be rejected with ValueError.
    with open('resources.cfg', 'w') as out:
        out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
max_load: 0
""")
    try:
        assert_raises(self, "RAM.configure('resources.cfg')",
                      globals(), locals(), ValueError,
                      'Local2: max_load must be > 0, got 0')
    finally:
        os.remove('resources.cfg')

    # Bad module in classname: RuntimeError on import failure.
    with open('resources.cfg', 'w') as out:
        out.write("""
[BadModule]
classname: no-such-module.Allocator
max_load: 100
""")
    try:
        assert_raises(self, "RAM.configure('resources.cfg')",
                      globals(), locals(), RuntimeError,
                      "RAM configure BadModule: can't import"
                      " 'no-such-module'")
    finally:
        os.remove('resources.cfg')

    # Bad class in classname: RuntimeError on attribute lookup.
    with open('resources.cfg', 'w') as out:
        out.write("""
[BadClass]
classname: openmdao.main.resource.NoSuchAllocator
max_load: 100
""")
    try:
        assert_raises(self, "RAM.configure('resources.cfg')",
                      globals(), locals(), RuntimeError,
                      "RAM configure BadClass: no class"
                      " 'NoSuchAllocator' in openmdao.main.resource")
    finally:
        os.remove('resources.cfg')

    # Add, insert, get, remove (by name and by index).
    local3 = LocalAllocator('Local3')
    local4 = LocalAllocator('Local4', total_cpus=4)
    RAM.add_allocator(local3)
    try:
        allocator_names = \
            [allocator.name for allocator in RAM.list_allocators()]
        self.assertEqual(allocator_names, ['LocalHost', 'Local3'])
        self.assertTrue(RAM.get_allocator('Local3') is local3)
        self.assertTrue(RAM.get_allocator(1) is local3)
        RAM.insert_allocator(0, local4)
        try:
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            self.assertEqual(allocator_names,
                             ['Local4', 'LocalHost', 'Local3'])
        finally:
            RAM.remove_allocator('Local4')
    finally:
        RAM.remove_allocator(1)

    # After removal, lookups by name must fail cleanly.
    assert_raises(self, "RAM.get_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")
    assert_raises(self, "RAM.remove_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")
    # Constructor validation.
    assert_raises(self, "LocalAllocator('BadLoad', max_load=-2)",
                  globals(), locals(), ValueError,
                  "BadLoad: max_load must be > 0, got -2")
def test_configure(self):
    """
    Verify RAM.configure() behavior: updating an existing allocator,
    adding one from a config file, rejecting bad config values, and the
    add/insert/get/remove allocator API.
    """
    logging.debug('')
    logging.debug('test_configure')

    # Reconfigure the existing LocalHost allocator's max_load.
    with open('resources.cfg', 'w') as out:
        out.write("""
[LocalHost]
max_load: 100
""")
    local = RAM.get_allocator('LocalHost')
    max_load = local.max_load
    try:
        self.assertTrue(max_load < 100)
        RAM.configure('resources.cfg')
        self.assertEqual(local.max_load, 100)
        local.max_load = max_load  # restore the original setting
    finally:
        os.remove('resources.cfg')

    # Add another local allocator; check every configured attribute.
    with open('resources.cfg', 'w') as out:
        out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
authkey: PublicKey
allow_shell: False
total_cpus: 42
max_load: 200
""")
    try:
        RAM.configure('resources.cfg')
        local2 = RAM.get_allocator('Local2')
        self.assertEqual(local2.factory._authkey, 'PublicKey')
        self.assertEqual(local2.factory._allow_shell, False)
        self.assertEqual(local2.total_cpus, 42)
        self.assertEqual(local2.max_load, 200)
        self.assertEqual(local2.host, socket.gethostname())
        self.assertTrue(local2.pid > 0)
        RAM.remove_allocator('Local2')
    finally:
        os.remove('resources.cfg')

    # Bad local total_cpus: rejected with ValueError.
    with open('resources.cfg', 'w') as out:
        out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
total_cpus: 0
""")
    try:
        assert_raises(self, "RAM.configure('resources.cfg')",
                      globals(), locals(), ValueError,
                      'Local2: total_cpus must be > 0, got 0')
    finally:
        os.remove('resources.cfg')

    # Bad local max_load: rejected with ValueError.
    with open('resources.cfg', 'w') as out:
        out.write("""
[Local2]
classname: openmdao.main.resource.LocalAllocator
max_load: 0
""")
    try:
        assert_raises(self, "RAM.configure('resources.cfg')",
                      globals(), locals(), ValueError,
                      'Local2: max_load must be > 0, got 0')
    finally:
        os.remove('resources.cfg')

    # Bad module in classname: RuntimeError on import failure.
    with open('resources.cfg', 'w') as out:
        out.write("""
[BadModule]
classname: no-such-module.Allocator
max_load: 100
""")
    try:
        assert_raises(
            self, "RAM.configure('resources.cfg')",
            globals(), locals(), RuntimeError,
            "RAM configure BadModule: can't import"
            " 'no-such-module'")
    finally:
        os.remove('resources.cfg')

    # Bad class in classname: RuntimeError on attribute lookup.
    with open('resources.cfg', 'w') as out:
        out.write("""
[BadClass]
classname: openmdao.main.resource.NoSuchAllocator
max_load: 100
""")
    try:
        assert_raises(
            self, "RAM.configure('resources.cfg')",
            globals(), locals(), RuntimeError,
            "RAM configure BadClass: no class"
            " 'NoSuchAllocator' in openmdao.main.resource")
    finally:
        os.remove('resources.cfg')

    # Add, insert, get, remove (by name and by index).
    local3 = LocalAllocator('Local3')
    local4 = LocalAllocator('Local4', total_cpus=4)
    RAM.add_allocator(local3)
    try:
        allocator_names = \
            [allocator.name for allocator in RAM.list_allocators()]
        self.assertEqual(allocator_names, ['LocalHost', 'Local3'])
        self.assertTrue(RAM.get_allocator('Local3') is local3)
        self.assertTrue(RAM.get_allocator(1) is local3)
        RAM.insert_allocator(0, local4)
        try:
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            self.assertEqual(allocator_names,
                             ['Local4', 'LocalHost', 'Local3'])
        finally:
            RAM.remove_allocator('Local4')
    finally:
        RAM.remove_allocator(1)

    # After removal, name lookups must fail cleanly.
    assert_raises(self, "RAM.get_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")
    assert_raises(self, "RAM.remove_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")
    # Constructor validation.
    assert_raises(self, "LocalAllocator('BadLoad', max_load=-2)",
                  globals(), locals(), ValueError,
                  "BadLoad: max_load must be > 0, got -2")