def testLocalCluster(self, *_):
    endpoint = gen_endpoint('0.0.0.0')
    with LocalDistributedCluster(endpoint, scheduler_n_process=2,
                                 worker_n_process=3,
                                 shared_memory='20M') as cluster:
        pool = cluster.pool

        self.assertTrue(pool.has_actor(pool.actor_ref(
            SchedulerClusterInfoActor.default_uid())))
        self.assertTrue(pool.has_actor(pool.actor_ref(
            SessionManagerActor.default_uid())))
        self.assertTrue(pool.has_actor(pool.actor_ref(
            DispatchActor.default_uid())))

        with new_session(endpoint) as session:
            api = session._api

            t = mt.ones((3, 3), chunk_size=2)
            result = session.run(t, timeout=_exec_timeout)

            np.testing.assert_array_equal(result, np.ones((3, 3)))

        self.assertNotIn(session._session_id, api.session_manager.get_sessions())
def testFailoverMessage(self):
    mock_session_id = str(uuid.uuid4())
    mock_graph_key = str(uuid.uuid4())
    mock_chunk_key = str(uuid.uuid4())
    addr = '127.0.0.1:%d' % get_next_port()
    mock_worker_addr = '127.0.0.1:54132'

    options.scheduler.worker_blacklist_time = 0.5

    with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
        cluster_info_ref = pool.create_actor(
            SchedulerClusterInfoActor, [pool.cluster_info.address],
            uid=SchedulerClusterInfoActor.default_uid())
        session_manager_ref = pool.create_actor(
            SessionManagerActor, uid=SessionManagerActor.default_uid())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

        session_ref = pool.actor_ref(session_manager_ref.create_session(mock_session_id))
        chunk_meta_client = ChunkMetaClient(pool, cluster_info_ref)
        chunk_meta_client.set_chunk_meta(mock_session_id, mock_chunk_key,
                                         size=80, shape=(10,),
                                         workers=(mock_worker_addr,))

        with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__,
                        new=MockGraphActor):
            session_ref.submit_tileable_graph(None, mock_graph_key)
            graph_ref = pool.actor_ref(GraphActor.gen_uid(mock_session_id, mock_graph_key))

            # mark the worker as long expired and let the resource actor detect it
            expire_time = time.time() - options.scheduler.status_timeout - 1
            resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=expire_time))

            resource_ref.detect_dead_workers(_tell=True)
            pool.sleep(0.2)

            # the graph actor must be notified of the removed worker and lost chunk
            _, removes, lost_chunks = graph_ref.get_worker_change_args()
            self.assertListEqual(removes, [mock_worker_addr])
            self.assertListEqual(lost_chunks, [mock_chunk_key])

            # while the worker is blacklisted, re-registration is ignored
            self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())
            resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
            self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())

            # after the 0.5s blacklist window expires, re-registration is accepted
            pool.sleep(0.4)
            resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
            self.assertIn(mock_worker_addr, resource_ref.get_workers_meta())
def setUp(self):
    endpoint = '127.0.0.1:%d' % get_next_port()
    self.endpoint = endpoint
    self.pool = create_actor_pool(n_process=1, backend='gevent', address=endpoint)
    self.pool.create_actor(SchedulerClusterInfoActor, [endpoint],
                           uid=SchedulerClusterInfoActor.default_uid())
    self.pool.create_actor(SessionManagerActor, uid=SessionManagerActor.default_uid())
    self.pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())

    self.api = MarsAPI(endpoint)
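# Hypothetical companion sketch (an assumption, not part of this excerpt):
# since setUp creates the actor pool outside a `with` block, a matching
# tearDown would be expected to stop it explicitly. `stop()` is assumed to be
# the pool's shutdown method; the suite's real teardown is not shown here.
def tearDown(self):
    self.pool.stop()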
def _start_worker_process(self, cuda=False, cuda_device=None, extra_env=None,
                          modules=None, check_timeout=None):
    mock_scheduler_addr = f'127.0.0.1:{get_next_port()}'
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(SchedulerClusterInfoActor, [mock_scheduler_addr],
                              uid=SchedulerClusterInfoActor.default_uid())
            pool.create_actor(SessionManagerActor, uid=SessionManagerActor.default_uid())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())

            args = [sys.executable, '-m', 'mars.worker',
                    '-a', '127.0.0.1',
                    '--schedulers', mock_scheduler_addr,
                    '--cpu-procs', '1',
                    '--cache-mem', '10m',
                    '--spill-dir', self._spill_dir,
                    '--log-level', 'debug',
                    '--log-format', '%(asctime)-15s %(message)s',
                    '--ignore-avail-mem']
            if modules:
                args.extend(['--load-modules', ','.join(modules)])
            env = os.environ.copy()
            env.update(extra_env or dict())
            if cuda:
                env['CUDA_VISIBLE_DEVICES'] = cuda_device
            proc = subprocess.Popen(args, env=env)
            worker_endpoint = self._wait_worker_ready(proc, resource_ref, timeout=check_timeout)

            yield pool, worker_endpoint
    finally:
        # guard against Popen never having run, then shut the worker down
        # gracefully with SIGINT before resorting to a hard kill
        if proc is not None:
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)

            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
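# Hypothetical usage sketch (an assumption; the tests consuming the helper
# above are not shown in this excerpt). It assumes _start_worker_process is
# wrapped with contextlib.contextmanager, so a test can use it as a context
# manager and then check the spawned worker's registration through the
# ResourceActor, mirroring the checks in testFailoverMessage. The test name is
# illustrative only.
def testWorkerRegistrationSketch(self):
    with self._start_worker_process() as (pool, worker_endpoint):
        resource_ref = pool.actor_ref(ResourceActor.default_uid())
        self.assertIn(worker_endpoint, resource_ref.get_workers_meta())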
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                    modules=None, log_scheduler=True, log_worker=True, env=None):
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    append_args_scheduler = []
    append_args_worker = []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
    if not cuda:
        append_args_worker += ['--no-cuda']

    proc_env = os.environ.copy()
    if env:
        proc_env.update(env)

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '-p', p,
                          '--log-level', 'debug' if log_scheduler else 'warning',
                          '--log-format', 'SCH%d %%(asctime)-15s %%(message)s' % idx,
                          '-Dscheduler.retry_delay=5',
                          '-Dscheduler.default_cpu_usage=0',
                          '-Dscheduler.status_timeout=10']
                         + append_args + append_args_scheduler, env=proc_env)
        for idx, p in enumerate(scheduler_ports)]
    cuda_count = resource.cuda_count()
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', '1',
                          '--log-level', 'debug' if log_worker else 'warning',
                          '--log-format', 'WOR%d %%(asctime)-15s %%(message)s' % idx,
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '--cuda-device', str(idx % cuda_count) if cuda_count else '0',
                          '-Dworker.prepare_data_timeout=30']
                         + append_args + append_args_worker, env=proc_env)
        for idx in range(n_workers)]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

    check_time = time.time()
    while True:
        try:
            started_schedulers = self.cluster_info.get_schedulers()
            if len(started_schedulers) < n_schedulers:
                raise ProcessRequirementUnmetError(
                    'Scheduler count does not meet requirement: %d < %d.'
                    % (len(started_schedulers), n_schedulers))

            actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_uid(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
            resource_ref = actor_client.actor_ref(ResourceActor.default_uid(),
                                                  address=actor_address)

            if resource_ref.get_worker_count() < n_workers:
                raise ProcessRequirementUnmetError(
                    'Worker count does not meet requirement: %d < %d.'
                    % (resource_ref.get_worker_count(), n_workers))
            break
        except:  # noqa: E722
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)

    gevent.hub.Hub.NOT_ERROR = old_not_errors
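# Hypothetical usage sketch (an assumption; the integration tests that call
# start_processes are not part of this excerpt). Once the helper returns,
# self.session_manager_ref points at the SessionManagerActor, so a test can
# create a session against the live cluster the same way testFailoverMessage
# does. The test name is illustrative only.
def testClusterBootstrapSketch(self):
    self.start_processes(n_schedulers=2, n_workers=2)
    session_id = str(uuid.uuid4())
    self.session_manager_ref.create_session(session_id)
    self.assertIn(session_id, self.session_manager_ref.get_sessions())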
def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                     modules=None, log_scheduler=True, log_worker=True, env=None,
                     scheduler_args=None, worker_args=None, worker_cpu=1):
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    append_args_scheduler = scheduler_args or []
    append_args_worker = worker_args or []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

    proc_env = os.environ.copy()
    if env:
        proc_env.update(env)

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '-p', p,
                          '--log-level', 'debug' if log_scheduler else 'warning',
                          '--log-format', f'SCH{idx} %(asctime)-15s %(message)s',
                          '-Dscheduler.retry_delay=5',
                          '-Dscheduler.default_cpu_usage=0',
                          '-Dscheduler.status_timeout=10']
                         + append_args + append_args_scheduler, env=proc_env)
        for idx, p in enumerate(scheduler_ports)]
    cuda_count = resource.cuda_count()
    cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
        if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', str(worker_cpu),
                          '--log-level', 'debug' if log_worker else 'warning',
                          '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '--cuda-device',
                          str(cuda_devices[idx % cuda_count]) if cuda_count else '',
                          '-Dworker.prepare_data_timeout=30']
                         + append_args + append_args_worker, env=proc_env)
        for idx in range(n_workers)]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

    check_time = time.time()
    while True:
        try:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
            except Exception as e:
                raise ProcessRequirementUnmetError(f'Failed to get scheduler numbers, {e}')
            if len(started_schedulers) < n_schedulers:
                raise ProcessRequirementUnmetError(
                    f'Scheduler count does not meet requirement: '
                    f'{len(started_schedulers)} < {n_schedulers}.')

            actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_uid(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
            resource_ref = actor_client.actor_ref(ResourceActor.default_uid(),
                                                  address=actor_address)

            if not actor_client.has_actor(self.session_manager_ref) \
                    or resource_ref.get_worker_count() < n_workers:
                raise ProcessRequirementUnmetError(
                    f'Worker count does not meet requirement: '
                    f'{resource_ref.get_worker_count()} < {n_workers}.')
            break
        except:  # noqa: E722
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)

    gevent.hub.Hub.NOT_ERROR = old_not_errors
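# Hypothetical cleanup sketch (an assumption; the suite's real teardown is not
# shown in this excerpt). It mirrors the shutdown logic already used in
# _start_worker_process above: interrupt each spawned scheduler and worker
# with SIGINT, wait up to five seconds, then kill whatever is still alive.
def _terminate_processes_sketch(self):
    procs = getattr(self, 'proc_schedulers', []) + getattr(self, 'proc_workers', [])
    for proc in procs:
        if proc.poll() is None:
            proc.send_signal(signal.SIGINT)

    check_time = time.time()
    while any(proc.poll() is None for proc in procs):
        if time.time() - check_time >= 5:
            break
        time.sleep(0.1)

    for proc in procs:
        if proc.poll() is None:
            proc.kill()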