def testLocalCluster(self):
    endpoint = gen_endpoint('0.0.0.0')
    with LocalDistributedCluster(endpoint, scheduler_n_process=2,
                                 worker_n_process=3,
                                 shared_memory='20M') as cluster:
        pool = cluster.pool

        # the cluster should have started its core scheduler actors
        self.assertTrue(pool.has_actor(pool.actor_ref(
            ClusterInfoActor.default_name())))
        self.assertTrue(pool.has_actor(pool.actor_ref(
            SessionManagerActor.default_name())))
        self.assertTrue(pool.has_actor(pool.actor_ref(
            DispatchActor.default_name())))

        with new_session(endpoint) as session:
            api = session._api

            t = mt.ones((3, 3), chunk_size=2)
            result = session.run(t)

            np.testing.assert_array_equal(result, np.ones((3, 3)))

        # leaving the context should have unregistered the session
        self.assertNotIn(session._session_id,
                         api.session_manager.get_sessions())
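# For reference, the same flow outside a unit test; a minimal sketch (kept as a
# comment), assuming mars.deploy.local.new_cluster forwards the same keyword
# arguments to LocalDistributedCluster and returns a client exposing `session`
# and `stop()`:
#
#     import mars.tensor as mt
#     from mars.deploy.local import new_cluster
#
#     cluster = new_cluster(scheduler_n_process=2, worker_n_process=3,
#                           shared_memory='20M')
#     try:
#         print(cluster.session.run(mt.ones((3, 3), chunk_size=2)))
#     finally:
#         cluster.stop()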
def setUp(self):
    endpoint = '127.0.0.1:%d' % get_next_port()
    self.endpoint = endpoint
    self.pool = create_actor_pool(n_process=1, backend='gevent',
                                  address=endpoint)
    self.pool.create_actor(SchedulerClusterInfoActor, [endpoint],
                           uid=SchedulerClusterInfoActor.default_name())
    self.pool.create_actor(SessionManagerActor,
                           uid=SessionManagerActor.default_name())
    self.pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

    self.api = MarsAPI(endpoint)
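# A matching tearDown keeps the suite from leaking the pool created above;
# a minimal sketch, assuming the pool returned by create_actor_pool exposes
# stop(), as the gevent-backed pools used in this suite do:
def tearDown(self):
    self.pool.stop()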
def testFailoverMessage(self):
    mock_session_id = str(uuid.uuid4())
    mock_graph_key = str(uuid.uuid4())
    mock_chunk_key = str(uuid.uuid4())
    addr = '127.0.0.1:%d' % get_next_port()
    mock_worker_addr = '127.0.0.1:54132'

    options.scheduler.worker_blacklist_time = 0.5

    with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        session_manager_ref = pool.create_actor(
            SessionManagerActor, uid=SessionManagerActor.default_name())
        resource_ref = pool.create_actor(
            ResourceActor, uid=ResourceActor.default_name())
        chunk_meta_ref = pool.create_actor(
            ChunkMetaActor, uid=ChunkMetaActor.default_name())
        session_ref = pool.actor_ref(
            session_manager_ref.create_session(mock_session_id))
        chunk_meta_ref.set_chunk_meta(mock_session_id, mock_chunk_key, size=80,
                                      shape=(10,), workers=(mock_worker_addr,))

        with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__,
                        new=MockGraphActor):
            session_ref.submit_tensor_graph(None, mock_graph_key)
            graph_ref = pool.actor_ref(
                GraphActor.gen_name(mock_session_id, mock_graph_key))

            # mark the mock worker as expired, then trigger dead-worker detection
            expire_time = time.time() - options.scheduler.status_timeout - 1
            resource_ref.set_worker_meta(mock_worker_addr,
                                         dict(update_time=expire_time))

            resource_ref.detect_dead_workers(_tell=True)
            pool.sleep(0.2)

            # the graph actor should be told about the lost worker and chunks
            _, removes, lost_chunks = graph_ref.get_worker_change_args()
            self.assertListEqual(removes, [mock_worker_addr])
            self.assertListEqual(lost_chunks, [mock_chunk_key])

            # while blacklisted, heartbeats from the dead worker are ignored
            self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())
            resource_ref.set_worker_meta(mock_worker_addr,
                                         dict(update_time=time.time()))
            self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())

            # once the blacklist period expires, the worker may rejoin
            pool.sleep(0.4)
            resource_ref.set_worker_meta(mock_worker_addr,
                                         dict(update_time=time.time()))
            self.assertIn(mock_worker_addr, resource_ref.get_workers_meta())
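# testFailoverMessage patches in MockGraphActor, which is defined elsewhere in
# the suite. The sketch below is an illustrative assumption, not the suite's
# actual definition: all the stub needs to do is skip real execution and
# record the failover notification so get_worker_change_args() can return it.
class MockGraphActor(GraphActor):
    def __init__(self, *args, **kwargs):
        super(MockGraphActor, self).__init__(*args, **kwargs)
        self._worker_change_args = (None, [], [])

    def execute_graph(self, compose=True):
        pass  # no real execution is needed for the failover assertions

    def handle_worker_change(self, adds, removes, lost_chunks, *args, **kwargs):
        # remember (adds, removes, lost_chunks) so the test can assert on them
        self._worker_change_args = (adds, removes, lost_chunks)

    def get_worker_change_args(self):
        return self._worker_change_args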
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, modules=None,
                    log_scheduler=True, log_worker=True):
    # silence gevent hub error reporting while subprocesses come and go
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        # use etcd for service discovery instead of a static scheduler list
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args += ['-Dscheduler.dump_graph_data=true']

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '--level', 'debug' if log_scheduler else 'warning',
                          '-p', p,
                          '--format', '%(asctime)-15s %(message)s',
                          '-Dscheduler.retry_delay=5']
                         + append_args)
        for p in scheduler_ports]
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', '1',
                          '--level', 'debug' if log_worker else 'warning',
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '-Dworker.prepare_data_timeout=30']
                         + append_args)
        for _ in range(n_workers)]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

    # poll until every scheduler and worker has registered, or give up after 20s
    check_time = time.time()
    while True:
        try:
            started_schedulers = self.cluster_info.get_schedulers()
            if len(started_schedulers) < n_schedulers:
                raise RuntimeError('Scheduler count does not meet requirement: %d < %d.'
                                   % (len(started_schedulers), n_schedulers))
            actor_address = self.cluster_info.get_scheduler(
                SessionManagerActor.default_name())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_name(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(
                ResourceActor.default_name())
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_name(), address=actor_address)

            if resource_ref.get_worker_count() < n_workers:
                raise RuntimeError('Worker count does not meet requirement: %d < %d.'
                                   % (resource_ref.get_worker_count(), n_workers))
            break
        except Exception:
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)

    gevent.hub.Hub.NOT_ERROR = old_not_errors
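# start_processes has no shutdown counterpart shown here; a minimal teardown
# sketch using only the attributes set above. terminate_processes is a
# hypothetical name, and EtcdProcessHelper.stop() is assumed to exist.
def terminate_processes(self):
    procs = getattr(self, 'proc_workers', []) + getattr(self, 'proc_schedulers', [])
    for proc in procs:
        proc.terminate()  # ask schedulers and workers to exit
    for proc in procs:
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()  # force-kill anything that ignored the signal
    if getattr(self, 'etcd_helper', None) is not None:
        self.etcd_helper.stop()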