def test_allocator(self):
    """ Confirm the remote allocator mirrors LocalHost and can deploy a server. """
    logging.debug('')
    logging.debug('test_allocator')

    # The allocator is a remote front for LocalHost, so capacities agree.
    expected = RAM.max_servers({'allocator': 'LocalHost'})
    actual = RAM.max_servers({'allocator': self.allocator.name})
    self.assertEqual(actual, expected)

    # Asking the remote allocator for the local host is contradictory.
    actual = RAM.max_servers({'allocator': self.allocator.name,
                              'localhost': True})  # Contradictory!
    self.assertEqual(actual, 0)

    deployed = self.allocator.deploy('test_server', {}, {})
    try:
        # Identity and liveness of the deployed server.
        self.assertEqual(deployed.name, 'NAS_Allocator/test_server')
        self.assertEqual(deployed.host, socket.gethostname())
        self.assertTrue(deployed.pid > 0)

        # Round-trip a call through the server.
        echoed = deployed.echo(123, 'twisty', 'narrow', 'passages')
        self.assertEqual(echoed, (123, 'twisty', 'narrow', 'passages'))

        # Server filesystem access works and shows the expected files.
        self.assertTrue(deployed.isdir('.'))
        self.assertEqual(sorted(deployed.listdir('.')),
                         ['openmdao_log.txt', 'stderr', 'stdout'])
    finally:
        # Release even if an assertion above fails.
        self.allocator.release(deployed)
def _start(self): """ Start evaluating cases concurrently. """ # Need credentials in case we're using a PublicKey server. credentials = get_credentials() # Determine maximum number of servers available. resources = { 'required_distributions': self._egg_required_distributions, 'orphan_modules': self._egg_orphan_modules, 'python_version': sys.version[:3] } if self.extra_resources: resources.update(self.extra_resources) max_servers = RAM.max_servers(resources) self._logger.debug('max_servers %d', max_servers) if max_servers <= 0: msg = 'No servers supporting required resources %s' % resources self.raise_exception(msg, RuntimeError) # Kick off initial wave of cases. self._server_lock = threading.Lock() self._reply_q = Queue.Queue() self._generation += 1 n_servers = 0 while n_servers < max_servers: if not self._more_to_go(): break # Get next case. Limits servers started if max_servers > cases. try: case = self._iter.next() except StopIteration: if not self._rerun: self._iter = None break
def run_suite(resource_desc=None, name=None): """ Run suite of tests using `resource_desc` and resord under `name`. """ resource_desc = resource_desc or {} name = name or '' print '\n%s' % name initial = 0.01 limit = 20 results = {} max_servers = ResourceAllocationManager.max_servers(resource_desc) print 'max servers', max_servers model = CID() model.driver.reload_model = False model.driver.sequential = False # Save to an egg to avoid analysis overhead during run_test(). print '\nInitializing egg module analysis' template = Case(inputs=[('sleeper.delay', None, 0.01)]) model.driver.iterator = Iterator(template) model.driver.recorders = [Recorder(model.driver.iterator, 1000)] start = time.time() egg_filename, required_distributions, orphan_modules = \ model.save_to_egg('caseperf', '0') et = time.time() - start print ' done in %.2f' % et os.remove(egg_filename) print results = run_test(model, initial, limit, max_servers) record_results(results, name)
def test_allocator(self):
    """ Verify remote allocator matches LocalHost and can deploy a server. """
    logging.debug("")
    logging.debug("test_allocator")

    # Since we're faking it with a remote LocalHost, we should match.
    local_servers = RAM.max_servers(dict(allocator="LocalHost"))
    max_servers = RAM.max_servers(dict(allocator=self.allocator.name))
    self.assertEqual(max_servers, local_servers)

    # Requesting localhost through a remote allocator can never be satisfied.
    max_servers = RAM.max_servers(dict(allocator=self.allocator.name,
                                       localhost=True))  # Contradictory!
    self.assertEqual(max_servers, 0)

    server = self.allocator.deploy("test_server", {}, {})
    try:
        # Identity, liveness, remote call round-trip, and filesystem view.
        self.assertEqual(server.name, "NAS_Allocator/test_server")
        self.assertEqual(server.host, socket.gethostname())
        self.assertTrue(server.pid > 0)
        retval = server.echo(123, "twisty", "narrow", "passages")
        self.assertEqual(retval, (123, "twisty", "narrow", "passages"))
        self.assertTrue(server.isdir("."))
        self.assertEqual(sorted(server.listdir(".")),
                         ["openmdao_log.txt", "stderr", "stdout"])
    finally:
        # Always release the deployed server, even on assertion failure.
        self.allocator.release(server)
def main():
    """ Configure a cluster and use it. """
    enable_console(logging.DEBUG)
    logging.getLogger().setLevel(0)
    print 'Client PID', os.getpid()

    # Configure cluster.
    cluster_name = 'EC2Cluster'
    machines = []
    if USE_EC2:
        # The identity file used to access EC2 via ssh.
        # Windows (PuTTY) uses .ppk keys; everything else uses .pem.
        identity_filename = os.path.expanduser('~/.ssh/lovejoykey')
        identity_filename += '.ppk' if sys.platform == 'win32' else '.pem'

        # NOTE(review): hostnames below are redacted; each entry names the
        # remote Python interpreter inside a per-host OpenMDAO checkout.
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.04.529682' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.03.113077' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.05.434412' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        # These two hosts use 'Scripts/python' (Windows-style venv layout).
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.20.17.379627' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.19.49.348885' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))
    else:
        # Trivial local 'cluster' for debugging without remote host issues.
        machines.append(
            ClusterHost(hostname=socket.getfqdn(), python=sys.executable))
#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))

    # Start it.
    cluster = ClusterAllocator(cluster_name, machines, allow_shell=True,
                               method='load-average')
#                               method='greedy')
#                               method='round-robin')
    print 'Cluster initialized'
    # Highest priority so the cluster is preferred over the default allocators.
    RAM.insert_allocator(0, cluster)

    n_servers = RAM.max_servers(dict(allocator=cluster_name))
    print n_servers, 'Servers:'
    for name in RAM.get_hostnames(
            dict(allocator=cluster_name, min_cpus=n_servers)):
        print ' ', name

    # Create model.
    top = GPOptimization()

    # Configure DOE.
    top.driver.sequential = False  # Run concurrently across cluster.
    top.driver.reload_model = False
    # Force use of only cluster hosts by adding this requirement.
    top.driver.extra_resources = dict(allocator=cluster_name)
    # This is necessary more often than it should be.
    top.driver.ignore_egg_requirements = True

    # Perform the optimization.
    top.run()
def _start(self):
    """ Start evaluating cases concurrently. """
    # Need credentials in case we're using a PublicKey server.
    credentials = get_credentials()

    # Determine maximum number of servers available.
    # python_version keeps only 'major.minor' (e.g. '2.7').
    resources = {
        'required_distributions':self._egg_required_distributions,
        'orphan_modules':self._egg_orphan_modules,
        'python_version':sys.version[:3]}
    # Caller-supplied constraints (e.g. a specific allocator) take precedence.
    if self.extra_resources:
        resources.update(self.extra_resources)
    max_servers = RAM.max_servers(resources)
    self._logger.debug('max_servers %d', max_servers)
    if max_servers <= 0:
        msg = 'No servers supporting required resources %s' % resources
        self.raise_exception(msg, RuntimeError)

    # Kick off initial wave of cases: one worker thread per case, capped
    # at max_servers. Workers report back through self._reply_q.
    self._server_lock = threading.Lock()
    self._reply_q = Queue.Queue()
    self._generation += 1
    n_servers = 0
    while n_servers < max_servers:
        if not self._more_to_go():
            break

        # Get next case. Limits servers started if max_servers > cases.
        try:
            case = self._iter.next()
        except StopIteration:
            if not self._rerun:
                self._iter = None
                self._seqno = 0
            break

        self._seqno += 1
        self._todo.append((case, self._seqno))

        # Start server worker thread.
        n_servers += 1
        # Names are unique per generation so stale replies are identifiable.
        name = '%s_%d_%d' % (self.name, self._generation, n_servers)
        self._logger.debug('starting worker for %r', name)
        self._servers[name] = None
        self._in_use[name] = True
        self._server_cases[name] = None
        self._server_states[name] = _EMPTY
        self._load_failures[name] = 0
        server_thread = threading.Thread(target=self._service_loop,
                                         args=(name, resources, credentials,
                                               self._reply_q))
        server_thread.daemon = True
        try:
            server_thread.start()
        except thread.error:
            # Out of thread resources; run with what we already have.
            self._logger.warning('worker thread startup failed for %r', name)
            self._in_use[name] = False
            break

        if sys.platform != 'win32':
            # Process any pending events.
            while self._busy():
                try:
                    name, result, exc = self._reply_q.get(True, 0.01)
                except Queue.Empty:
                    break  # Timeout.
                else:
                    # Difficult to force startup failure.
                    if self._servers[name] is None:  #pragma nocover
                        self._logger.debug('server startup failed for %r', name)
                        self._in_use[name] = False
                    else:
                        self._in_use[name] = self._server_ready(name)

    if sys.platform == 'win32':  #pragma no cover
        # Don't start server processing until all servers are started,
        # otherwise we have egg removal issues.
        for name in self._in_use.keys():
            name, result, exc = self._reply_q.get()
            if self._servers[name] is None:
                self._logger.debug('server startup failed for %r', name)
                self._in_use[name] = False

        # Kick-off started servers.
        for name in self._in_use.keys():
            if self._in_use[name]:
                self._in_use[name] = self._server_ready(name)

    # Continue until no servers are busy.
    while self._busy():
        if self._more_to_go():
            timeout = None
        else:
            # Don't wait indefinitely for a server we don't need.
            # This has happened with a server that got 'lost'
            # in RAM.allocate()
            timeout = 60
        try:
            name, result, exc = self._reply_q.get(timeout=timeout)
        # Hard to force worker to hang, which is handled here.
        except Queue.Empty:  #pragma no cover
            # Diagnose which in-use servers never replied or are wedged,
            # and drop them from the in-use set so we can finish.
            msgs = []
            for name, in_use in self._in_use.items():
                if in_use:
                    try:
                        server = self._servers[name]
                        info = self._server_info[name]
                    except KeyError:
                        msgs.append('%r: no startup reply' % name)
                        self._in_use[name] = False
                    else:
                        state = self._server_states[name]
                        if state not in (_LOADING, _EXECUTING):
                            msgs.append('%r: %r %s %s'
                                        % (name, self._servers[name],
                                           state, self._server_info[name]))
                            self._in_use[name] = False
            if msgs:
                self._logger.error('Timeout waiting with nothing left to do:')
                for msg in msgs:
                    self._logger.error(' %s', msg)
        else:
            self._in_use[name] = self._server_ready(name)

    # Shut-down (started) servers.
    self._logger.debug('Shut-down (started) servers')
    # A None request tells each worker's service loop to exit.
    for queue in self._queues.values():
        queue.put(None)
    for i in range(len(self._queues)):
        try:
            name, status, exc = self._reply_q.get(True, 60)
        # Hard to force worker to hang, which is handled here.
        except Queue.Empty:  #pragma no cover
            pass
        else:
            if name in self._queues:  # 'Stale' worker can reply *late*.
                del self._queues[name]
    # Hard to force worker to hang, which is handled here.
    for name in self._queues.keys():  #pragma no cover
        self._logger.warning('Timeout waiting for %r to shut-down.', name)
def main():
    """ Configure a cluster and use it. """
    enable_console(logging.DEBUG)
    logging.getLogger().setLevel(0)
    print 'Client PID', os.getpid()

    # Configure cluster.
    cluster_name = 'EC2Cluster'
    machines = []
    if USE_EC2:
        # The identity file used to access EC2 via ssh.
        # Windows (PuTTY) uses .ppk keys; everything else uses .pem.
        identity_filename = os.path.expanduser('~/.ssh/lovejoykey')
        identity_filename += '.ppk' if sys.platform == 'win32' else '.pem'

        # NOTE(review): hostnames below are redacted; each entry names the
        # remote Python interpreter inside a per-host OpenMDAO checkout.
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.04.529682' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.03.113077' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.17.05.434412' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/bin/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        # These two hosts use 'Scripts/python' (Windows-style venv layout).
        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.20.17.379627' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

        machines.append(ClusterHost(
            hostname='*****@*****.**',
            python='setowns1_2013-05-06_09.19.49.348885' \
                   '/OpenMDAO-OpenMDAO-Framework-testbranch/devenv/Scripts/python',
            tunnel_incoming=True, tunnel_outgoing=True,
            identity_filename=identity_filename))

#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))
    else:
        # Trivial local 'cluster' for debugging without remote host issues.
        machines.append(ClusterHost(hostname=socket.getfqdn(),
                                    python=sys.executable))
#        machines.append(ClusterHost(
#            hostname='viper.grc.nasa.gov',
#            python='OpenMDAO-Framework/devenv/bin/python',
#            tunnel_incoming=True, tunnel_outgoing=True,
#            identity_filename=None))

    # Start it.
    cluster = ClusterAllocator(cluster_name, machines, allow_shell=True,
                               method='load-average')
#                               method='greedy')
#                               method='round-robin')
    print 'Cluster initialized'
    # Highest priority so the cluster is preferred over the default allocators.
    RAM.insert_allocator(0, cluster)

    n_servers = RAM.max_servers(dict(allocator=cluster_name))
    print n_servers, 'Servers:'
    for name in RAM.get_hostnames(dict(allocator=cluster_name,
                                       min_cpus=n_servers)):
        print ' ', name

    # Create model.
    top = GPOptimization()

    # Configure DOE.
    top.driver.sequential = False  # Run concurrently across cluster.
    top.driver.reload_model = False
    # Force use of only cluster hosts by adding this requirement.
    top.driver.extra_resources = dict(allocator=cluster_name)
    # This is necessary more often than it should be.
    top.driver.ignore_egg_requirements = True

    # Perform the optimization.
    top.run()
def test_remote(self):
    """ Exercise add/query/allocate/remove of a remote allocator via a factory server. """
    logging.debug('')
    logging.debug('test_remote')

    # Start remote server.
    # Run it from a scratch directory so its files are isolated.
    server_dir = 'Factory'
    if os.path.exists(server_dir):
        shutil.rmtree(server_dir, onerror=onerror)
    os.mkdir(server_dir)
    os.chdir(server_dir)
    try:
        server, server_cfg = start_server()
        cfg = read_server_config(server_cfg)
        factory = None
        try:
            factory = connect(cfg['address'], cfg['port'],
                              pubkey=cfg['key'])
            # Remote allocators are registered under a host-derived prefix.
            prefix = RAM._make_prefix(factory.host)
            remote = '%s_LocalHost' % prefix

            # Show no remotes currently in RAM.
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            logging.debug('%s', allocator_names)
            self.assertFalse(remote in allocator_names)

            # Add remote server's allocator.
            RAM.add_remotes(factory)
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            logging.debug('%s', allocator_names)
            self.assertTrue(remote in allocator_names)
            # Remote allocator is appended after the local one.
            self.assertFalse(RAM.get_allocator(remote) is
                             RAM.list_allocators()[0])
            self.assertTrue(RAM.get_allocator(remote) is
                            RAM.list_allocators()[1])

            # Max servers.
            max_servers = RAM.max_servers(dict(allocator=remote))
            self.assertTrue(max_servers >= 0)  # Avoid host load issues.

            remote_alloc = RAM.get_allocator(remote)

            # A remote allocator cannot satisfy a localhost requirement.
            max_servers, info = \
                remote_alloc.max_servers(dict(localhost=True))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(localhost='requested local host'))

            # Nor a request naming a different allocator.
            max_servers, info = \
                remote_alloc.max_servers(dict(allocator='LocalHost'))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            # time_estimate() signals 'cannot run' with -2.
            estimate, info = \
                remote_alloc.time_estimate(dict(allocator='LocalHost'))
            self.assertEqual(estimate, -2)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            # Allocate, release.
            remote_server, info = RAM.allocate(dict(allocator=remote))
            RAM.release(remote_server)

            # Remove remote allocators.
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            for name in allocator_names:
                if name.startswith(prefix):
                    RAM.remove_allocator(name)
            allocator_names = \
                [allocator.name for allocator in RAM.list_allocators()]
            logging.debug('%s', allocator_names)
            self.assertFalse(remote in allocator_names)
        finally:
            if factory is not None:
                factory.cleanup()
            server.terminate(timeout=10)
    finally:
        os.chdir('..')
        shutil.rmtree(server_dir, onerror=onerror)

    # Access local RAM in manner it would be accessed in the server.
    self.assertEqual(RAM._get_instance().get_total_allocators(), 1)
    self.assertTrue(RAM._get_instance().get_allocator_proxy(0) is
                    RAM.list_allocators()[0])
def test_remote(self):
    """ Exercise adding, querying, and removing a remote allocator. """
    logging.debug('')
    logging.debug('test_remote')

    # Run the factory server from a clean scratch directory.
    workdir = 'Factory'
    if os.path.exists(workdir):
        shutil.rmtree(workdir, onerror=onerror)
    os.mkdir(workdir)
    os.chdir(workdir)
    try:
        server, server_cfg = start_server()
        cfg = read_server_config(server_cfg)
        factory = None
        try:
            factory = connect(cfg['address'], cfg['port'], pubkey=cfg['key'])
            prefix = RAM._make_prefix(factory.host)
            remote = '%s_LocalHost' % prefix

            # The remote allocator must not be registered yet.
            names = [alloc.name for alloc in RAM.list_allocators()]
            logging.debug('%s', names)
            self.assertFalse(remote in names)

            # Register the remote server's allocator and verify placement.
            RAM.add_remotes(factory)
            names = [alloc.name for alloc in RAM.list_allocators()]
            logging.debug('%s', names)
            self.assertTrue(remote in names)
            self.assertFalse(
                RAM.get_allocator(remote) is RAM.list_allocators()[0])
            self.assertTrue(
                RAM.get_allocator(remote) is RAM.list_allocators()[1])

            # Capacity query must succeed (exact value depends on host load).
            max_servers = RAM.max_servers(dict(allocator=remote))
            self.assertTrue(max_servers >= 0)

            remote_alloc = RAM.get_allocator(remote)

            # Contradictory requests are rejected with explanatory info.
            max_servers, info = \
                remote_alloc.max_servers(dict(localhost=True))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(localhost='requested local host'))

            max_servers, info = \
                remote_alloc.max_servers(dict(allocator='LocalHost'))
            self.assertEqual(max_servers, 0)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            estimate, info = \
                remote_alloc.time_estimate(dict(allocator='LocalHost'))
            self.assertEqual(estimate, -2)
            self.assertEqual(info, dict(allocator='wrong allocator'))

            # Allocate and immediately release a remote server.
            remote_server, info = RAM.allocate(dict(allocator=remote))
            RAM.release(remote_server)

            # Remove every allocator contributed by this factory.
            names = [alloc.name for alloc in RAM.list_allocators()]
            for alloc_name in names:
                if alloc_name.startswith(prefix):
                    RAM.remove_allocator(alloc_name)
            names = [alloc.name for alloc in RAM.list_allocators()]
            logging.debug('%s', names)
            self.assertFalse(remote in names)
        finally:
            if factory is not None:
                factory.cleanup()
            server.terminate(timeout=10)
    finally:
        os.chdir('..')
        shutil.rmtree(workdir, onerror=onerror)

    # Access local RAM in manner it would be accessed in the server.
    self.assertEqual(RAM._get_instance().get_total_allocators(), 1)
    self.assertTrue(RAM._get_instance().get_allocator_proxy(0) is
                    RAM.list_allocators()[0])