def init_cluster(encrypted=True, clean_dir=True, allow_shell=False):
    """
    Ensure a cluster allocator is registered with the
    ResourceAllocationManager and return its name.

    encrypted: bool
        Selects the ``PublicKey`` authkey when True, ``AuthKey`` otherwise.
        The authkey is also part of the generated allocator name.

    clean_dir: bool
        If True, ``Sim-*`` directories in the current directory are removed
        the first time this function adjusts the local allocator.

    allow_shell: bool
        Passed through to the allocator that gets created.

    Returns the name of the allocator that is (or already was) configured.
    Returns None when no machines are available and `encrypted` is True.
    """
    authkey = 'PublicKey' if encrypted else 'AuthKey'
    allocators = ResourceAllocationManager.list_allocators()

    if len(allocators) == 1:
        local = ResourceAllocationManager.get_allocator(0)
        first_call = local.max_load < 10
        if first_call:
            # Ensure we aren't held up by local host load problems.
            local.max_load = 10
            if clean_dir:
                # Remove any local allocator-created directories.
                for leftover in glob.glob('Sim-*'):
                    shutil.rmtree(leftover, onerror=onerror)

    node = platform.node()
    name = '%s_%s' % (node.replace('.', '_'), authkey)
    if any(existing.name == name for existing in allocators):
        return name  # Don't add multiple copies.

    python = sys.executable
    if node.startswith('gxterm'):
        # User environment assumed OK on this GRC cluster front-end.
        # Using less than full machine (55 nodes) to allow multiple
        # cluster testing without hitting limit on open files (sockets).
        machines = [{'hostname': 'gx%02d' % index, 'python': python}
                    for index in range(20)]
    elif local_ssh_available():
        machines = [{'hostname': node, 'python': python}]
    else:
        machines = []

    if machines:
        cluster = ClusterAllocator(name, machines, authkey, allow_shell)
        ResourceAllocationManager.insert_allocator(0, cluster)
        return name

    if encrypted:
        # No machines and no unencrypted fallback permitted.
        return None

    # Create a LocalAllocator so we have *something*.
    name = 'LocalUnencrypted'
    if any(existing.name == name for existing in allocators):
        return name  # Don't add multiple copies.
    fallback = LocalAllocator(name, authkey=authkey, allow_shell=allow_shell)
    ResourceAllocationManager.insert_allocator(0, fallback)
    return name
def run_serial(self):
    """
    Run the serial ADPAC executable via the parent class's ``execute()``.

    If RAM has an allocator at index 1 (i.e. more than just the first
    allocator), one CPU is requested through `resources`; otherwise
    `resources` is left empty.  Input is read from ``<casename>.input``,
    output goes to ``<casename>.output``, and stderr is set to
    ``ExternalCode.STDOUT``.
    """
    try:
        # Only the existence of a second allocator matters here.
        RAM.get_allocator(1)
    except IndexError:
        self.resources = {}
    else:
        self.resources = {'n_cpus': 1}

    self.command = [self.serial_adpac]
    for wanted, flag in ((not self.idissf, '-d'), (self.irevs, '-r')):
        if wanted:
            self.command.append(flag)

    casename = self.input.casename
    self.stdin = casename + '.input'
    self.stdout = casename + '.output'
    self.stderr = ExternalCode.STDOUT

    super(ADPAC, self).execute()
def test_configure(self):
    """
    Exercise ``RAM.configure()`` and allocator add/insert/get/remove.

    Scenarios, in order: reconfigure ``LocalHost`` from a file, add a
    second local allocator from a file, reject bad `total_cpus`,
    `max_load`, module, and class settings, then add, insert, look up,
    and remove allocators directly.

    State changed in RAM (``LocalHost.max_load`` and the ``Local2``
    allocator) is restored in ``finally`` clauses so that a failed
    assertion does not leak into later tests.
    """
    logging.debug('')
    logging.debug('test_configure')

    cfg_name = 'resources.cfg'

    def write_config(lines):
        # Joined explicitly so section headers always start in column 0,
        # independent of how this source file happens to be indented.
        with open(cfg_name, 'w') as out:
            out.write('\n' + '\n'.join(lines) + '\n')

    def expect_configure_error(lines, exc_type, message):
        # Configuring from `lines` is expected to fail with `exc_type`
        # and `message`; the scratch file is always removed afterwards.
        write_config(lines)
        try:
            assert_raises(self, "RAM.configure('resources.cfg')",
                          globals(), locals(), exc_type, message)
        finally:
            os.remove(cfg_name)

    def drop_allocator(name):
        # Remove `name` only if it is registered, so cleanup cannot raise
        # "not found" on top of an earlier failure.
        if any(alloc.name == name for alloc in RAM.list_allocators()):
            RAM.remove_allocator(name)

    # Reconfigure.
    local = RAM.get_allocator('LocalHost')
    max_load = local.max_load
    write_config(['[LocalHost]',
                  'max_load: 100'])
    try:
        self.assertTrue(max_load < 100)
        RAM.configure('resources.cfg')
        self.assertEqual(local.max_load, 100)
    finally:
        # Restore even when an assertion above fails.
        local.max_load = max_load
        os.remove(cfg_name)

    # Add another local.
    write_config(['[Local2]',
                  'classname: openmdao.main.resource.LocalAllocator',
                  'authkey: PublicKey',
                  'allow_shell: False',
                  'total_cpus: 42',
                  'max_load: 200'])
    try:
        RAM.configure('resources.cfg')
        local2 = RAM.get_allocator('Local2')
        self.assertEqual(local2.factory._authkey, 'PublicKey')
        self.assertEqual(local2.factory._allow_shell, False)
        self.assertEqual(local2.total_cpus, 42)
        self.assertEqual(local2.max_load, 200)
        self.assertEqual(local2.host, socket.gethostname())
        self.assertTrue(local2.pid > 0)
    finally:
        try:
            os.remove(cfg_name)
        finally:
            # Unregister Local2 whether or not the checks passed.
            drop_allocator('Local2')

    # Bad local total_cpus.
    expect_configure_error(
        ['[Local2]',
         'classname: openmdao.main.resource.LocalAllocator',
         'total_cpus: 0'],
        ValueError, 'Local2: total_cpus must be > 0, got 0')

    # Bad local max_load.
    expect_configure_error(
        ['[Local2]',
         'classname: openmdao.main.resource.LocalAllocator',
         'max_load: 0'],
        ValueError, 'Local2: max_load must be > 0, got 0')

    # Bad module.
    expect_configure_error(
        ['[BadModule]',
         'classname: no-such-module.Allocator',
         'max_load: 100'],
        RuntimeError,
        "RAM configure BadModule: can't import 'no-such-module'")

    # Bad class.
    expect_configure_error(
        ['[BadClass]',
         'classname: openmdao.main.resource.NoSuchAllocator',
         'max_load: 100'],
        RuntimeError,
        "RAM configure BadClass: no class 'NoSuchAllocator'"
        " in openmdao.main.resource")

    # Add, insert, get, remove.
    local3 = LocalAllocator('Local3')
    local4 = LocalAllocator('Local4', total_cpus=4)
    RAM.add_allocator(local3)
    try:
        allocator_names = \
            [allocator.name for allocator in RAM.list_allocators()]
        self.assertEqual(allocator_names, ['LocalHost', 'Local3'])
        self.assertTrue(RAM.get_allocator('Local3') is local3)
        self.assertTrue(RAM.get_allocator(1) is local3)
        RAM.insert_allocator(0, local4)
        try:
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            self.assertEqual(allocator_names,
                             ['Local4', 'LocalHost', 'Local3'])
        finally:
            RAM.remove_allocator('Local4')
    finally:
        # Local3 is back at index 1 once Local4 has been removed.
        RAM.remove_allocator(1)

    assert_raises(self, "RAM.get_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")

    assert_raises(self, "RAM.remove_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")

    assert_raises(self, "LocalAllocator('BadLoad', max_load=-2)",
                  globals(), locals(), ValueError,
                  "BadLoad: max_load must be > 0, got -2")
def test_remote(self):
    """
    Exercise a remote server's allocator through the local RAM.

    Starts a server in a scratch ``Factory`` directory, connects to it,
    adds its allocators via ``RAM.add_remotes()``, checks
    `max_servers`/`time_estimate` responses, allocates and releases a
    server, and removes the remote allocators again.

    Cleanup runs in ``finally`` clauses: any remote allocators still
    registered are removed, the factory connection is cleaned up, the
    server is terminated, and the scratch directory is deleted, even when
    an assertion or the server configuration read fails.
    """
    logging.debug('')
    logging.debug('test_remote')

    def current_names():
        # Names of all allocators currently registered with RAM.
        return [allocator.name for allocator in RAM.list_allocators()]

    def remove_remotes(name_prefix):
        # Unregister every allocator whose name starts with `name_prefix`.
        for name in current_names():
            if name.startswith(name_prefix):
                RAM.remove_allocator(name)

    # Start remote server.
    server_dir = 'Factory'
    if os.path.exists(server_dir):
        shutil.rmtree(server_dir, onerror=onerror)
    os.mkdir(server_dir)
    os.chdir(server_dir)
    try:
        server, server_cfg = start_server()

        factory = None
        prefix = None
        try:
            # Read inside the guarded region so the server is terminated
            # even if its configuration cannot be read.
            cfg = read_server_config(server_cfg)
            factory = connect(cfg['address'], cfg['port'],
                              pubkey=cfg['key'])
            prefix = RAM._make_prefix(factory.host)
            remote = '%s_LocalHost' % prefix

            # Show no remotes currently in RAM.
            allocator_names = current_names()
            logging.debug('%s', allocator_names)
            self.assertFalse(remote in allocator_names)

            # Add remote server's allocator.
            RAM.add_remotes(factory)
            allocator_names = current_names()
            logging.debug('%s', allocator_names)
            self.assertTrue(remote in allocator_names)
            self.assertFalse(
                RAM.get_allocator(remote) is RAM.list_allocators()[0])
            self.assertTrue(
                RAM.get_allocator(remote) is RAM.list_allocators()[1])

            # Max servers.
            max_servers = RAM.max_servers(dict(allocator=remote))
            self.assertTrue(max_servers >= 0)  # Avoid host load issues.

            remote_alloc = RAM.get_allocator(remote)

            max_servers, info = \
                remote_alloc.max_servers(dict(localhost=True))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(localhost='requested local host'))

            max_servers, info = \
                remote_alloc.max_servers(dict(allocator='LocalHost'))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            estimate, info = \
                remote_alloc.time_estimate(dict(allocator='LocalHost'))
            self.assertEqual(estimate, -2)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            # Allocate, release.
            remote_server, info = RAM.allocate(dict(allocator=remote))
            RAM.release(remote_server)

            # Remove remote allocators.
            remove_remotes(prefix)
            allocator_names = current_names()
            logging.debug('%s', allocator_names)
            self.assertFalse(remote in allocator_names)
        finally:
            try:
                # Safety net: a no-op on the success path, but keeps
                # remote allocators from leaking into later tests when
                # something above failed.
                if prefix is not None:
                    remove_remotes(prefix)
            finally:
                if factory is not None:
                    factory.cleanup()
                server.terminate(timeout=10)
    finally:
        os.chdir('..')
        shutil.rmtree(server_dir, onerror=onerror)

    # Access local RAM in manner it would be accessed in the server.
    self.assertEqual(RAM._get_instance().get_total_allocators(), 1)
    self.assertTrue(RAM._get_instance().get_allocator_proxy(0)
                    is RAM.list_allocators()[0])
def test_configure(self):
    """
    Exercise ``RAM.configure()`` and allocator add/insert/get/remove.

    Scenarios, in order: reconfigure ``LocalHost`` from a file, add a
    second local allocator from a file, reject bad `total_cpus`,
    `max_load`, module, and class settings, then add, insert, look up,
    and remove allocators directly.

    State changed in RAM (``LocalHost.max_load`` and the ``Local2``
    allocator) is restored in ``finally`` clauses so that a failed
    assertion does not leak into later tests.
    """
    logging.debug('')
    logging.debug('test_configure')

    cfg_name = 'resources.cfg'

    def write_config(lines):
        # Joined explicitly so section headers always start in column 0,
        # independent of how this source file happens to be indented.
        with open(cfg_name, 'w') as out:
            out.write('\n' + '\n'.join(lines) + '\n')

    def expect_configure_error(lines, exc_type, message):
        # Configuring from `lines` is expected to fail with `exc_type`
        # and `message`; the scratch file is always removed afterwards.
        write_config(lines)
        try:
            assert_raises(self, "RAM.configure('resources.cfg')",
                          globals(), locals(), exc_type, message)
        finally:
            os.remove(cfg_name)

    def drop_allocator(name):
        # Remove `name` only if it is registered, so cleanup cannot raise
        # "not found" on top of an earlier failure.
        if any(alloc.name == name for alloc in RAM.list_allocators()):
            RAM.remove_allocator(name)

    # Reconfigure.
    local = RAM.get_allocator('LocalHost')
    max_load = local.max_load
    write_config(['[LocalHost]',
                  'max_load: 100'])
    try:
        self.assertTrue(max_load < 100)
        RAM.configure('resources.cfg')
        self.assertEqual(local.max_load, 100)
    finally:
        # Restore even when an assertion above fails.
        local.max_load = max_load
        os.remove(cfg_name)

    # Add another local.
    write_config(['[Local2]',
                  'classname: openmdao.main.resource.LocalAllocator',
                  'authkey: PublicKey',
                  'allow_shell: False',
                  'total_cpus: 42',
                  'max_load: 200'])
    try:
        RAM.configure('resources.cfg')
        local2 = RAM.get_allocator('Local2')
        self.assertEqual(local2.factory._authkey, 'PublicKey')
        self.assertEqual(local2.factory._allow_shell, False)
        self.assertEqual(local2.total_cpus, 42)
        self.assertEqual(local2.max_load, 200)
        self.assertEqual(local2.host, socket.gethostname())
        self.assertTrue(local2.pid > 0)
    finally:
        try:
            os.remove(cfg_name)
        finally:
            # Unregister Local2 whether or not the checks passed.
            drop_allocator('Local2')

    # Bad local total_cpus.
    expect_configure_error(
        ['[Local2]',
         'classname: openmdao.main.resource.LocalAllocator',
         'total_cpus: 0'],
        ValueError, 'Local2: total_cpus must be > 0, got 0')

    # Bad local max_load.
    expect_configure_error(
        ['[Local2]',
         'classname: openmdao.main.resource.LocalAllocator',
         'max_load: 0'],
        ValueError, 'Local2: max_load must be > 0, got 0')

    # Bad module.
    expect_configure_error(
        ['[BadModule]',
         'classname: no-such-module.Allocator',
         'max_load: 100'],
        RuntimeError,
        "RAM configure BadModule: can't import 'no-such-module'")

    # Bad class.
    expect_configure_error(
        ['[BadClass]',
         'classname: openmdao.main.resource.NoSuchAllocator',
         'max_load: 100'],
        RuntimeError,
        "RAM configure BadClass: no class 'NoSuchAllocator'"
        " in openmdao.main.resource")

    # Add, insert, get, remove.
    local3 = LocalAllocator('Local3')
    local4 = LocalAllocator('Local4', total_cpus=4)
    RAM.add_allocator(local3)
    try:
        allocator_names = \
            [allocator.name for allocator in RAM.list_allocators()]
        self.assertEqual(allocator_names, ['LocalHost', 'Local3'])
        self.assertTrue(RAM.get_allocator('Local3') is local3)
        self.assertTrue(RAM.get_allocator(1) is local3)
        RAM.insert_allocator(0, local4)
        try:
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            self.assertEqual(allocator_names,
                             ['Local4', 'LocalHost', 'Local3'])
        finally:
            RAM.remove_allocator('Local4')
    finally:
        # Local3 is back at index 1 once Local4 has been removed.
        RAM.remove_allocator(1)

    assert_raises(self, "RAM.get_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")

    assert_raises(self, "RAM.remove_allocator('Local3')",
                  globals(), locals(), ValueError,
                  "allocator 'Local3' not found")

    assert_raises(self, "LocalAllocator('BadLoad', max_load=-2)",
                  globals(), locals(), ValueError,
                  "BadLoad: max_load must be > 0, got -2")
def test_remote(self):
    """
    Exercise a remote server's allocator through the local RAM.

    Starts a server in a scratch ``Factory`` directory, connects to it,
    adds its allocators via ``RAM.add_remotes()``, checks
    `max_servers`/`time_estimate` responses, allocates and releases a
    server, and removes the remote allocators again.

    Cleanup runs in ``finally`` clauses: any remote allocators still
    registered are removed, the factory connection is cleaned up, the
    server is terminated, and the scratch directory is deleted, even when
    an assertion or the server configuration read fails.
    """
    logging.debug('')
    logging.debug('test_remote')

    def current_names():
        # Names of all allocators currently registered with RAM.
        return [allocator.name for allocator in RAM.list_allocators()]

    def remove_remotes(name_prefix):
        # Unregister every allocator whose name starts with `name_prefix`.
        for name in current_names():
            if name.startswith(name_prefix):
                RAM.remove_allocator(name)

    # Start remote server.
    server_dir = 'Factory'
    if os.path.exists(server_dir):
        shutil.rmtree(server_dir, onerror=onerror)
    os.mkdir(server_dir)
    os.chdir(server_dir)
    try:
        server, server_cfg = start_server()

        factory = None
        prefix = None
        try:
            # Read inside the guarded region so the server is terminated
            # even if its configuration cannot be read.
            cfg = read_server_config(server_cfg)
            factory = connect(cfg['address'], cfg['port'],
                              pubkey=cfg['key'])
            prefix = RAM._make_prefix(factory.host)
            remote = '%s_LocalHost' % prefix

            # Show no remotes currently in RAM.
            allocator_names = current_names()
            logging.debug('%s', allocator_names)
            self.assertFalse(remote in allocator_names)

            # Add remote server's allocator.
            RAM.add_remotes(factory)
            allocator_names = current_names()
            logging.debug('%s', allocator_names)
            self.assertTrue(remote in allocator_names)
            self.assertFalse(
                RAM.get_allocator(remote) is RAM.list_allocators()[0])
            self.assertTrue(
                RAM.get_allocator(remote) is RAM.list_allocators()[1])

            # Max servers.
            max_servers = RAM.max_servers(dict(allocator=remote))
            self.assertTrue(max_servers >= 0)  # Avoid host load issues.

            remote_alloc = RAM.get_allocator(remote)

            max_servers, info = \
                remote_alloc.max_servers(dict(localhost=True))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(localhost='requested local host'))

            max_servers, info = \
                remote_alloc.max_servers(dict(allocator='LocalHost'))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            estimate, info = \
                remote_alloc.time_estimate(dict(allocator='LocalHost'))
            self.assertEqual(estimate, -2)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            # Allocate, release.
            remote_server, info = RAM.allocate(dict(allocator=remote))
            RAM.release(remote_server)

            # Remove remote allocators.
            remove_remotes(prefix)
            allocator_names = current_names()
            logging.debug('%s', allocator_names)
            self.assertFalse(remote in allocator_names)
        finally:
            try:
                # Safety net: a no-op on the success path, but keeps
                # remote allocators from leaking into later tests when
                # something above failed.
                if prefix is not None:
                    remove_remotes(prefix)
            finally:
                if factory is not None:
                    factory.cleanup()
                server.terminate(timeout=10)
    finally:
        os.chdir('..')
        shutil.rmtree(server_dir, onerror=onerror)

    # Access local RAM in manner it would be accessed in the server.
    self.assertEqual(RAM._get_instance().get_total_allocators(), 1)
    self.assertTrue(RAM._get_instance().get_allocator_proxy(0)
                    is RAM.list_allocators()[0])
def main():  # pragma no cover
    """
    Entry point for the RJE server.

    Usage: python rje.py [--allocator=name][--dmz-host=name][--poll-delay=secs][--resources=filename]

    --allocator: string
        Allocator to provide remote access to. Default ``PBS``.

    --dmz-host: string
        DMZ file server to use. Default ``dmzfs1``.

    --poll-delay: int
        Maximum seconds between checks for new client activity.
        Default 60.

    --resources: string
        Filename for resource configuration. If not specified then the
        default of ``~/.openmdao/resources.cfg`` will be used.

    --ssh, --scp: string
        ssh and scp commands to use (used during testing).

    Exits with status 1 if positional arguments are given or the
    requested allocator cannot be found.  Otherwise polls for client
    connections, starting a daemon handler thread per client, until
    interrupted from the keyboard, then cleans up and exits with
    status 0.
    """
    global _DMZ_HOST, _RJE_PID

    parser = optparse.OptionParser()
    option_table = (
        # (flag, type, default, help)
        ('--allocator', 'str', 'PBS',
         'Allocator to provide remote access to'),
        ('--dmz-host', 'str', 'dmzfs1',
         'DMZ file server to use'),
        ('--poll-delay', 'int', 60,
         'Max seconds between checks for new client activity'),
        ('--resources', 'str', None,
         'Filename for resource configuration'),
        ('--ssh', 'str', None,
         'ssh command (used during testing)'),
        ('--scp', 'str', None,
         'scp command (used during testing)'),
    )
    for flag, kind, default, text in option_table:
        parser.add_option(flag, action='store', type=kind,
                          default=default, help=text)

    options, arguments = parser.parse_args()
    if arguments:
        parser.print_help()
        sys.exit(1)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Configure ssh and scp.
    if options.ssh:
        configure_ssh(options.ssh.split())
    if options.scp:
        configure_scp(options.scp.split())

    # Optionally configure resources.
    if options.resources is not None:
        RAM.configure(options.resources)

    # Get allocator to wrap.
    try:
        allocator = RAM.get_allocator(options.allocator)
    except ValueError:
        msg = "Can't find allocator %r" % options.allocator
        print(msg)
        logger.error(msg)
        sys.exit(1)

    dmz_host = options.dmz_host
    poll_delay = options.poll_delay

    # Initialize DMZ protocol.
    server_init(dmz_host, logger)
    _DMZ_HOST = dmz_host
    msg = 'RJE server ready'
    print(msg)
    logger.info(msg)

    # Setup for cleanup by this process only.
    _RJE_PID = os.getpid()
    signal.signal(signal.SIGTERM, _sigterm_handler)

    # And away we go...
    wrappers = {}
    try:
        delay = 1  # Start with high polling rate.
        while True:
            conn_info, removed = server_accept(dmz_host, poll_delay, logger)

            # Shut down wrappers for clients reported as removed.
            for client in removed:
                stale = wrappers.pop(client, None)
                if stale is not None:
                    stale.shutdown()

            if conn_info is not None:
                client, connection = conn_info
                wrapper = AllocatorWrapper(allocator, client, connection)
                handler = threading.Thread(name='%s_handler' % client,
                                           target=wrapper.process_requests)
                handler.daemon = True
                handler.start()
                wrappers[client] = wrapper
                delay = 1  # Reset.
                continue

            # Nothing new: heartbeat, then sleep a bit longer each time.
            server_heartbeat(dmz_host, poll_delay, logger)
            delay = min(delay + 1, poll_delay)  # Back-off.
            time.sleep(delay)
    except KeyboardInterrupt:
        pass
    finally:
        _cleanup()
    sys.exit(0)
def main():  # pragma no cover
    """
    Entry point for the RJE server.

    Usage: python rje.py [--allocator=name][--dmz-host=name][--poll-delay=secs][--resources=filename]

    --allocator: string
        Allocator to provide remote access to. Default ``PBS``.

    --dmz-host: string
        DMZ file server to use. Default ``dmzfs1``.

    --poll-delay: int
        Maximum seconds between checks for new client activity.
        Default 60.

    --resources: string
        Filename for resource configuration. If not specified then the
        default of ``~/.openmdao/resources.cfg`` will be used.

    --ssh, --scp: string
        ssh and scp commands to use (used during testing).

    Exits with status 1 if positional arguments are given or the
    requested allocator cannot be found.  Otherwise polls for client
    connections, starting a daemon handler thread per client, until
    interrupted from the keyboard, then cleans up and exits with
    status 0.
    """
    global _DMZ_HOST, _RJE_PID

    parser = optparse.OptionParser()
    option_table = (
        # (flag, type, default, help)
        ('--allocator', 'str', 'PBS',
         'Allocator to provide remote access to'),
        ('--dmz-host', 'str', 'dmzfs1',
         'DMZ file server to use'),
        ('--poll-delay', 'int', 60,
         'Max seconds between checks for new client activity'),
        ('--resources', 'str', None,
         'Filename for resource configuration'),
        ('--ssh', 'str', None,
         'ssh command (used during testing)'),
        ('--scp', 'str', None,
         'scp command (used during testing)'),
    )
    for flag, kind, default, text in option_table:
        parser.add_option(flag, action='store', type=kind,
                          default=default, help=text)

    options, arguments = parser.parse_args()
    if arguments:
        parser.print_help()
        sys.exit(1)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Configure ssh and scp.
    if options.ssh:
        configure_ssh(options.ssh.split())
    if options.scp:
        configure_scp(options.scp.split())

    # Optionally configure resources.
    if options.resources is not None:
        RAM.configure(options.resources)

    # Get allocator to wrap.
    try:
        allocator = RAM.get_allocator(options.allocator)
    except ValueError:
        msg = "Can't find allocator %r" % options.allocator
        print(msg)
        logger.error(msg)
        sys.exit(1)

    dmz_host = options.dmz_host
    poll_delay = options.poll_delay

    # Initialize DMZ protocol.
    server_init(dmz_host, logger)
    _DMZ_HOST = dmz_host
    msg = 'RJE server ready'
    print(msg)
    logger.info(msg)

    # Setup for cleanup by this process only.
    _RJE_PID = os.getpid()
    signal.signal(signal.SIGTERM, _sigterm_handler)

    # And away we go...
    wrappers = {}
    try:
        delay = 1  # Start with high polling rate.
        while True:
            conn_info, removed = server_accept(dmz_host, poll_delay, logger)

            # Shut down wrappers for clients reported as removed.
            for client in removed:
                stale = wrappers.pop(client, None)
                if stale is not None:
                    stale.shutdown()

            if conn_info is not None:
                client, connection = conn_info
                wrapper = AllocatorWrapper(allocator, client, connection)
                handler = threading.Thread(name='%s_handler' % client,
                                           target=wrapper.process_requests)
                handler.daemon = True
                handler.start()
                wrappers[client] = wrapper
                delay = 1  # Reset.
                continue

            # Nothing new: heartbeat, then sleep a bit longer each time.
            server_heartbeat(dmz_host, poll_delay, logger)
            delay = min(delay + 1, poll_delay)  # Back-off.
            time.sleep(delay)
    except KeyboardInterrupt:
        pass
    finally:
        _cleanup()
    sys.exit(0)