def testPrepareQuota(self, *_):
    """Execution should stall while chunk pinning fails and resume once it succeeds.

    Pinning is mocked to raise ``PinChunkFailed`` until a background thread
    clears the flag after 1 second, so the graph must finish no earlier than
    ``start_time + 1``.
    """
    # mutable cell shared between the mock and the delay thread
    pinned = [True]

    def _mock_pin(graph_key, chunk_keys):
        from mars.errors import PinChunkFailed
        if pinned[0]:
            raise PinChunkFailed
        return chunk_keys

    # ChunkHolderActor is patched (presumably by the test decorators); install pin behavior
    ChunkHolderActor.pin_chunks.side_effect = _mock_pin

    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
        self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
        pool.create_actor(MockSenderActor, mock_data, 'in', uid='w:mock_sender')
        cluster_info_ref = pool.actor_ref(WorkerClusterInfoActor.default_name())
        chunk_meta_client = ChunkMetaClient(pool, cluster_info_ref)

        import mars.tensor as mt
        from mars.tensor.expressions.fetch import TensorFetch
        arr = mt.ones((4,), chunk_size=4)
        arr_add = mt.array(mock_data)
        result_tensor = arr + arr_add
        graph = result_tensor.build_graph(compose=False, tiled=True)

        # replace the input chunk's op with a fetch so the data is pulled remotely
        modified_chunk = arr_add.chunks[0]
        arr_add.chunks[0]._op = TensorFetch(
            dtype=modified_chunk.dtype,
            _outputs=[weakref.ref(o) for o in modified_chunk.op.outputs],
            _key=modified_chunk.op.key)
        chunk_meta_client.set_chunk_meta(
            session_id, modified_chunk.key, size=mock_data.nbytes,
            shape=mock_data.shape, workers=('0.0.0.0:1234', pool_address))
        with self.run_actor_test(pool) as test_actor:
            graph_key = str(uuid.uuid4())
            execution_ref = test_actor.promise_ref(ExecutionActor.default_name())

            start_time = time.time()
            execution_ref.enqueue_graph(
                session_id, graph_key, serialize_graph(graph),
                dict(chunks=[result_tensor.chunks[0].key]), None, _promise=True) \
                .then(lambda *_: test_actor.set_result(time.time())) \
                .catch(lambda *exc: test_actor.set_result(exc, False))

            def _delay_fun():
                # let pinning start succeeding after one second
                time.sleep(1)
                pinned[0] = False

            threading.Thread(target=_delay_fun).start()

            finish_time = self.get_result()
            self.assertGreaterEqual(finish_time, start_time + 1)
def testReExecuteExisting(self):
    """Submitting the same graph key twice must succeed and yield identical data."""
    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address,
                           distributor=MarsDistributor(2, 'w:0:')) as pool:
        self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
        pool.create_actor(CpuCalcActor, uid='w:1:cpu-calc')
        pool.create_actor(InProcHolderActor, uid='w:1:inproc-holder')

        import mars.tensor as mt
        arr = mt.ones((4, ), chunk_size=4)
        arr_add = mt.array(mock_data)
        result_tensor = arr + arr_add
        graph = result_tensor.build_graph(compose=False, tiled=True)

        pool.create_actor(MockSenderActor, mock_data + np.ones((4, )), 'out', uid='w:mock_sender')

        # note: _validate resolves test_actor from the enclosing scope at call time,
        # i.e. inside the run_actor_test blocks below
        def _validate(_):
            data = test_actor.shared_store.get(session_id, result_tensor.chunks[0].key)
            assert_array_equal(data, mock_data + np.ones((4, )))

        # first execution
        with self.run_actor_test(pool) as test_actor:
            graph_key = str(uuid.uuid4())
            execution_ref = test_actor.promise_ref(ExecutionActor.default_uid())
            execution_ref.execute_graph(session_id, graph_key, serialize_graph(graph),
                                        dict(chunks=[result_tensor.chunks[0].key]), None, _promise=True) \
                .then(_validate) \
                .then(lambda *_: test_actor.set_result(None)) \
                .catch(lambda *exc: test_actor.set_result(exc, False))
            self.get_result()

        # second submission with the same graph key re-executes the existing graph
        with self.run_actor_test(pool) as test_actor:
            execution_ref = test_actor.promise_ref(ExecutionActor.default_uid())
            execution_ref.execute_graph(session_id, graph_key, serialize_graph(graph),
                                        dict(chunks=[result_tensor.chunks[0].key]), None, _promise=True) \
                .then(_validate) \
                .then(lambda *_: test_actor.set_result(None)) \
                .catch(lambda *exc: test_actor.set_result(exc, False))
            self.get_result()
def testAddrReject(self):
    """A promise call routed to a dead worker address must reject with ``WorkerDead``."""
    try:
        with create_actor_pool(n_process=1) as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)
            tester_ref.test_addr_reject()
            gc.collect()

            wait_test_actor_result(tester_ref, 30)
            expected = [0, 'WorkerDead']
            self.assertListEqual(server_ref.get_result(), expected)
    finally:
        # no promise may stay registered after the test finishes
        self.assertDictEqual(promise._promise_pool, {})
def testDispatch(self, *_):
    """Dispatch free slots across actor groups, queuing requests when slots run out."""
    call_records = dict()
    group_size = 4

    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=mock_scheduler_addr) as pool:
        dispatch_ref = pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        # actors of g1
        [pool.create_actor(TaskActor, 'g1', call_records) for _ in range(group_size)]
        # actors of g2
        [pool.create_actor(TaskActor, 'g2', call_records) for _ in range(group_size)]

        self.assertEqual(len(dispatch_ref.get_slots('g1')), group_size)
        self.assertEqual(len(dispatch_ref.get_slots('g2')), group_size)
        self.assertEqual(len(dispatch_ref.get_slots('g3')), 0)

        # hash-based slot assignment must be deterministic for the same key
        self.assertEqual(dispatch_ref.get_hash_slot('g1', 'hash_str'),
                         dispatch_ref.get_hash_slot('g1', 'hash_str'))

        # tasks within [0, group_size - 1] will run almost simultaneously,
        # while the last one will be delayed due to lack of free slots (presumably)
        with self.run_actor_test(pool) as test_actor:
            from mars.promise import Promise
            p = Promise(done=True)
            _dispatch_ref = test_actor.promise_ref(DispatchActor.default_name())

            def _call_on_dispatched(uid, key=None):
                # a None uid means no slot exists for the requested group
                if uid is None:
                    call_records[key] = 'NoneUID'
                else:
                    test_actor.promise_ref(uid).queued_call(key, 2, _tell=True)

            for idx in range(group_size + 1):
                # partial() binds idx eagerly; the lambdas themselves do not read idx,
                # avoiding the late-binding closure pitfall
                p = p.then(lambda *_: _dispatch_ref.get_free_slot('g1', _promise=True)) \
                    .then(partial(_call_on_dispatched, key='%d_1' % idx)) \
                    .then(lambda *_: _dispatch_ref.get_free_slot('g2', _promise=True)) \
                    .then(partial(_call_on_dispatched, key='%d_2' % idx))

            # 'g3' has no slots at all: expect a None-uid callback
            p.then(lambda *_: _dispatch_ref.get_free_slot('g3', _promise=True)) \
                .then(partial(_call_on_dispatched, key='N_1')) \
                .then(lambda *_: test_actor.set_result(None))
            self.get_result(20)

        self.assertEqual(call_records['N_1'], 'NoneUID')
        # first group_size tasks started nearly together...
        self.assertLess(sum(abs(call_records['%d_1' % idx] - call_records['0_1'])
                            for idx in range(group_size)), 1)
        # ...while the extra task waited for a slot to free up (between 1 and 3 seconds)
        self.assertGreater(call_records['%d_1' % group_size] - call_records['0_1'], 1)
        self.assertLess(call_records['%d_1' % group_size] - call_records['0_1'], 3)

        dispatch_ref.destroy()
def testAssignerActor(self):
    """The assigner should pick the worker holding the most input chunks."""
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=mock_scheduler_addr) as pool:
        cluster_info_ref = pool.create_actor(SchedulerClusterInfoActor, [pool.cluster_info.address],
                                             uid=SchedulerClusterInfoActor.default_uid())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

        endpoint1 = 'localhost:12345'
        endpoint2 = 'localhost:23456'
        res = dict(hardware=dict(cpu=4, memory=4096))

        def write_mock_meta():
            resource_ref.set_worker_meta(endpoint1, res)
            resource_ref.set_worker_meta(endpoint2, res)

        g = gevent.spawn(write_mock_meta)
        g.join()

        assigner_ref = pool.create_actor(AssignerActor, uid=AssignerActor.default_uid())

        session_id = str(uuid.uuid4())
        op_key = str(uuid.uuid4())
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())

        op_info = {
            'op_name': 'test_op',
            'io_meta': dict(input_chunks=[chunk_key1, chunk_key2, chunk_key3]),
            'retries': 0,
            'optimize': {
                'depth': 0,
                'demand_depths': (),
                'successor_size': 1,
                'descendant_size': 0
            }
        }

        chunk_meta_client = ChunkMetaClient(pool, cluster_info_ref)
        # two of three input chunks live on endpoint1, so it should be chosen
        chunk_meta_client.set_chunk_meta(session_id, chunk_key1, size=512, workers=(endpoint1,))
        chunk_meta_client.set_chunk_meta(session_id, chunk_key2, size=512, workers=(endpoint1,))
        chunk_meta_client.set_chunk_meta(session_id, chunk_key3, size=512, workers=(endpoint2,))

        reply_ref = pool.create_actor(PromiseReplyTestActor)
        reply_callback = ((reply_ref.uid, reply_ref.address), 'reply')
        assigner_ref.apply_for_resource(session_id, op_key, op_info, callback=reply_callback)

        # poll until the assignment reply arrives
        while not reply_ref.get_reply():
            gevent.sleep(0.1)
        _, ret_value = reply_ref.get_reply()
        self.assertEqual(ret_value[0], endpoint1)
def setUp(self):
    """Spin up a local scheduler pool with session/resource actors and an API client."""
    addr = '127.0.0.1:%d' % get_next_port()
    self.endpoint = addr
    self.pool = create_actor_pool(n_process=1, backend='gevent', address=addr)
    self.pool.create_actor(SchedulerClusterInfoActor, [addr],
                           uid=SchedulerClusterInfoActor.default_uid())
    self.pool.create_actor(SessionManagerActor,
                           uid=SessionManagerActor.default_uid())
    self.pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())
    self.api = MarsAPI(addr)
def create_bearer_token_actor():
    """Run an actor pool hosting a ``BearerTokenActor`` that serves cupid bearer tokens."""
    from mars.actors import create_actor_pool, FunctionActor

    class BearerTokenActor(FunctionActor):
        def get_bearer_token(self):
            from cupid import context
            return context().get_bearer_token()

    pool = create_actor_pool(address=ACTOR_ADDRESS, n_process=1)
    pool.create_actor(BearerTokenActor, uid=ACTOR_UID)
    # block until the pool terminates
    pool.join()
def testNoTimeoutActor(self):
    """A promise without a timeout must resolve normally with a single value."""
    try:
        with create_actor_pool(n_process=1) as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)
            tester_ref.test_no_timeout()

            wait_test_actor_result(tester_ref, 30)
            self.assertListEqual(server_ref.get_result(), [0])
    finally:
        # every promise must be released after the test
        self.assertEqual(promise.get_active_promise_count(), 0)
def testRefReject(self):
    """Promises bound to a stopped worker process must reject with ``WorkerProcessStopped``."""
    try:
        with create_actor_pool(n_process=1) as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)
            tester_ref.test_ref_reject()

            wait_test_actor_result(tester_ref, 30)
            expected = [0, 'WorkerProcessStopped']
            self.assertListEqual(server_ref.get_result(), expected)
    finally:
        # every promise must be released after the test
        self.assertEqual(promise.get_active_promise_count(), 0)
def testDispatch(self, *_):
    """Slot-group dispatching exercised by a helper actor driven from a greenlet."""
    call_records = dict()
    group_size = 4

    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=mock_scheduler_addr) as pool:
        dispatch_ref = pool.create_actor(DispatchActor, uid='DispatchActor')
        # actors of g1
        [
            pool.create_actor(TaskActor, 'g1', call_records)
            for _ in range(group_size)
        ]
        # actors of g2
        [
            pool.create_actor(TaskActor, 'g2', call_records)
            for _ in range(group_size)
        ]
        self.assertEqual(len(dispatch_ref.get_slots('g1')), group_size)
        self.assertEqual(len(dispatch_ref.get_slots('g2')), group_size)
        self.assertEqual(len(dispatch_ref.get_slots('g3')), 0)

        # hash-based slot assignment must be deterministic for the same key
        self.assertEqual(dispatch_ref.get_hash_slot('g1', 'hash_str'),
                         dispatch_ref.get_hash_slot('g1', 'hash_str'))

        # tasks within [0, group_size - 1] will run almost simultaneously,
        # while the last one will be delayed due to lack of free slots (presumably)
        def run_tasks():
            test_ref = pool.create_actor(RunTaskTestActor, call_records)
            test_ref.run_tasks(group_size)
            while not test_ref.get_finished():
                gevent.sleep(1)
            test_ref.destroy()

        gl = gevent.spawn(run_tasks)
        gl.join()

        # 'g3' has no slots, so a None-uid marker is recorded
        self.assertEqual(call_records['N_1'], 'NoneUID')
        # first group_size tasks started nearly together...
        self.assertLess(
            sum(
                abs(call_records['%d_1' % idx] - call_records['0_1'])
                for idx in range(group_size)), 1)
        # ...while the extra task waited for a slot (between 1 and 3 seconds)
        self.assertGreater(
            call_records['%d_1' % group_size] - call_records['0_1'], 1)
        self.assertLess(
            call_records['%d_1' % group_size] - call_records['0_1'], 3)

        dispatch_ref.destroy()
def run_transfer_worker(pool_address, session_id, plasma_socket, chunk_keys, spill_dir, msg_queue):
    """Child-process entry point: run a worker pool serving ``chunk_keys`` for transfer tests.

    Puts ``1`` on ``msg_queue`` once all chunks are registered, then stays alive
    until the parent releases the ``HolderActor``.
    """
    from mars.config import options
    from mars.utils import PlasmaProcessHelper

    options.worker.plasma_socket = plasma_socket
    options.worker.spill_directory = spill_dir

    # local plasma store (10 MB) backing the chunk holder
    plasma_helper = PlasmaProcessHelper(size=1024 * 1024 * 10,
                                        socket=options.worker.plasma_socket)
    try:
        plasma_helper.run()

        with create_actor_pool(n_process=2, backend='gevent', distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(ChunkHolderActor, plasma_helper._size,
                                                     uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                # two senders and two receivers with random uids
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # signal the parent process that the worker is ready
                msg_queue.put(1)

                # keep serving until the parent releases the holder
                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                pool.destroy_actor(chunk_holder_ref)
    finally:
        plasma_helper.stop()
def testWorkerProcessRestart(self):
    """Kill a calc process of a spawned worker and check the daemon restarts it.

    Fix: ``proc`` is initialized to ``None`` and guarded in ``finally``; in the
    original, failures in pool/actor setup before ``subprocess.Popen`` made the
    cleanup raise ``NameError`` and mask the real error.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(SchedulerClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=SchedulerClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

            # spawn an actual worker process pointing at the mock scheduler
            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem'
            ])
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name(),
                                        address=worker_endpoint)
            dispatch_ref = pool.actor_ref(DispatchActor.default_name(),
                                          address=worker_endpoint)

            # kill one cpu calc process and wait for the daemon to bring it back
            cpu_slots = dispatch_ref.get_slots('cpu')
            calc_ref = pool.actor_ref(cpu_slots[0], address=worker_endpoint)
            daemon_ref.kill_actor_process(calc_ref)

            check_start = time.time()
            while not daemon_ref.is_actor_process_alive(calc_ref):
                gevent.sleep(0.1)
                if time.time() - check_start > 10:
                    raise TimeoutError('Check process restart timeout')
    finally:
        if proc is not None:
            # interrupt the worker, give it 5 seconds to exit, then kill it
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)
            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def testAllActor(self):
    """``all``-style promise aggregation resolves in the expected order with sentinels."""
    try:
        with create_actor_pool(n_process=1) as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)
            tester_ref.test_all_promise()

            wait_test_actor_result(tester_ref, 30)
            expected = [-128] + list(range(0, 20, 2)) + list(range(1, 20, 2)) + [127]
            self.assertListEqual(server_ref.get_result(), expected)
    finally:
        # every promise must be released after the test
        self.assertEqual(promise.get_active_promise_count(), 0)
def testPromiseActor(self):
    """Chained promise calls should post the values 0..10 to the serving actor."""
    try:
        with create_actor_pool() as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)

            def _drive():
                tester_ref.test_normal()
                gevent.sleep(2)
                self.assertListEqual(server_ref.get_result(), list(range(11)))

            gevent.spawn(_drive).join()
    finally:
        # no promise may stay registered after the test finishes
        self.assertDictEqual(promise._promise_pool, {})
def _run_operand_case(session_id, graph_key, tensor, execution_creator):
    """Build ``tensor``'s graph, run it through ``GraphActor`` with mocked execution
    refs produced by ``execution_creator``, and wait for a terminal graph state.
    """
    graph = tensor.build_graph(compose=False)

    with create_actor_pool(n_process=1, backend='gevent') as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialize_graph(graph),
                                      uid=GraphActor.gen_name(session_id, graph_key))

        addr_dict = dict()

        # one mock execution ref per address, created lazily and cached
        def _build_mock_ref(uid=None, address=None):
            if address in addr_dict:
                return addr_dict[address]
            else:
                r = addr_dict[address] = execution_creator(pool)
                return r

        # handle mock objects
        OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

        mock_resource = dict(hardware=dict(cpu=4, cpu_total=4, memory=512))

        resource_ref.set_worker_meta('localhost:12345', mock_resource)
        resource_ref.set_worker_meta('localhost:23456', mock_resource)

        graph_ref.prepare_graph()
        fetched_graph = graph_ref.get_chunk_graph()

        graph_ref.analyze_graph()

        # collect op keys of terminal (no-successor) chunks
        final_keys = set()
        for c in fetched_graph:
            if fetched_graph.count_successors(c) == 0:
                final_keys.add(c.op.key)

        graph_ref.create_operand_actors()

        graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
        start_time = time.time()
        while True:
            pool.sleep(0.1)
            if time.time() - start_time > 30:
                raise SystemError('Wait for execution finish timeout')
            if graph_meta_ref.get_state() in (GraphState.SUCCEEDED, GraphState.FAILED, GraphState.CANCELLED):
                break
def testRefReject(self):
    """Promises bound to a stopped worker process reject with ``WorkerProcessStopped``."""
    try:
        with create_actor_pool() as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)

            def _drive():
                tester_ref.test_ref_reject()
                gc.collect()
                gevent.sleep(3)
                self.assertListEqual(server_ref.get_result(),
                                     [0, 'WorkerProcessStopped'])

            gevent.spawn(_drive).join()
    finally:
        # no promise may stay registered after the test finishes
        self.assertDictEqual(promise._promise_pool, {})
def testNoTimeoutActor(self):
    """A promise configured without a timeout resolves with a single value."""
    try:
        with create_actor_pool() as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)

            def _drive():
                tester_ref.test_no_timeout()
                gc.collect()
                gevent.sleep(3)
                self.assertListEqual(server_ref.get_result(), [0])

            gevent.spawn(_drive).join()
    finally:
        # no promise may stay registered after the test finishes
        self.assertDictEqual(promise._promise_pool, {})
def testPromiseActor(self):
    """Normal resolution yields 0..10; a raised error reports the marker -1."""
    try:
        with create_actor_pool(n_process=1) as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)

            tester_ref.test_normal()
            wait_test_actor_result(tester_ref, 10)
            self.assertListEqual(server_ref.get_result(), list(range(11)))

            server_ref.clear_result()

            tester_ref.test_error_raise()
            wait_test_actor_result(tester_ref, 10)
            self.assertListEqual(server_ref.get_result(), [-1])
    finally:
        # no promise may stay registered after the test finishes
        self.assertDictEqual(promise._promise_pool, {})
def testResourceActor(self):
    """Allocate/deallocate worker resources against a cpu=4, memory=512 budget."""
    session_id = str(uuid.uuid4())
    with create_actor_pool(n_process=1, backend='gevent') as pool:
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        mock_resource = dict(hardware=dict(cpu=4, memory=512))

        ep1 = 'localhost:12345'
        ep2 = 'localhost:23456'

        def write_mock_meta():
            resource_ref.set_worker_meta(ep1, mock_resource)
            resource_ref.set_worker_meta(ep2, mock_resource)
            return resource_ref.get_workers_meta()

        g = gevent.spawn(write_mock_meta)
        g.join()
        self.assertEqual({ep1: mock_resource, ep2: mock_resource}, g.value)

        key1 = str(uuid.uuid4())
        # cpu=5 exceeds the 4 cores advertised for the worker
        self.assertFalse(
            resource_ref.allocate_resource(session_id, key1, ep1,
                                           dict(cpu=5, memory=256)))
        key2 = str(uuid.uuid4())
        self.assertTrue(
            resource_ref.allocate_resource(session_id, key2, ep1,
                                           dict(cpu=2, memory=256)))
        key3 = str(uuid.uuid4())
        # memory 256 + 260 would exceed the 512 budget
        self.assertFalse(
            resource_ref.allocate_resource(session_id, key3, ep1,
                                           dict(cpu=2, memory=260)))
        key4 = str(uuid.uuid4())
        self.assertTrue(
            resource_ref.allocate_resource(session_id, key4, ep1,
                                           dict(cpu=2, memory=256)))
        key5 = str(uuid.uuid4())
        # capacity is now fully consumed
        self.assertFalse(
            resource_ref.allocate_resource(session_id, key5, ep1,
                                           dict(cpu=2, memory=256)))

        resource_ref.deallocate_resource(session_id, key4, ep1)
        key6 = str(uuid.uuid4())
        # freed capacity can be re-allocated
        self.assertTrue(
            resource_ref.allocate_resource(session_id, key6, ep1,
                                           dict(cpu=2, memory=256)))

        resource_ref.deallocate_resource(session_id, key6, ep1)
def start_transfer_test_pool(**kwargs):
    """Generator context helper: yield an actor pool pre-populated with transfer-test actors.

    Requires ``address`` and ``plasma_size`` in ``kwargs``; all remaining kwargs
    are forwarded to ``create_actor_pool``. The chunk holder is destroyed after
    the caller resumes the generator.
    """
    address = kwargs.pop('address')
    plasma_size = kwargs.pop('plasma_size')
    with create_actor_pool(n_process=1, backend='gevent', address=address, **kwargs) as pool:
        pool.create_actor(PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_name())
        pool.create_actor(ClusterInfoActor, schedulers=[address],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
        chunk_holder_ref = pool.create_actor(ChunkHolderActor, plasma_size,
                                             uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)
        pool.create_actor(StatusActor, address, uid=StatusActor.default_name())

        yield pool

        chunk_holder_ref.destroy()
def testKVStoreActor(self):
    """Single and batched reads/writes against an etcd-backed KV store actor."""
    proc_helper = EtcdProcessHelper(port_range_start=54131)
    with proc_helper.run(), create_actor_pool(n_process=1, backend='gevent') as pool:
        store_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())

        store_ref.write('/node/v1', 'value1')
        store_ref.write('/node/v2', 'value2')
        store_ref.write_batch([
            ('/node/v2', 'value2'),
            ('/node/v3', 'value3'),
        ])

        self.assertEqual(store_ref.read('/node/v1').value, 'value1')
        batch = store_ref.read_batch(['/node/v2', '/node/v3'])
        self.assertListEqual([v.value for v in batch], ['value2', 'value3'])
def testAllActor(self):
    """``all``-style promise aggregation with sentinel values at both ends."""
    try:
        with create_actor_pool() as pool:
            server_ref = pool.create_actor(ServeActor, uid='ServeActor')
            tester_ref = pool.create_actor(PromiseTestActor)

            def _drive():
                tester_ref.test_all_promise()
                gc.collect()
                gevent.sleep(3)
                expected = [-128] + list(range(0, 20, 2)) + list(range(1, 20, 2)) + [127]
                self.assertListEqual(server_ref.get_result(), expected)

            gevent.spawn(_drive).join()
    finally:
        # no promise may stay registered after the test finishes
        self.assertDictEqual(promise._promise_pool, {})
def testTaskQueueActor(self):
    """Queue more tasks than slots, then verify priority updates and releases.

    Fix: ``assertIn(k, res_times)`` is now checked *before* indexing
    ``res_times[k]`` — in the original order a missing key raised a bare
    ``KeyError`` instead of producing a clean assertion failure.
    """
    with create_actor_pool(n_process=1, backend='gevent') as pool:
        pool.create_actor(MockExecutionActor, 10, uid=ExecutionActor.default_name())
        quota_ref = pool.create_actor(QuotaActor, 30, uid=MemQuotaActor.default_name())
        # queue actor configured with 4 parallel slots
        pool.create_actor(TaskQueueActor, 4, uid=TaskQueueActor.default_name())

        session_id = str(uuid.uuid4())
        chunk_keys = [str(uuid.uuid4()).replace('-', '') for _ in range(6)]
        with self.run_actor_test(pool) as test_actor:
            queue_ref = test_actor.promise_ref(TaskQueueActor.default_name())
            res_times = dict()

            def callback_fun(key):
                # record when each task's promise resolved
                res_times[key] = time.time()

            # earlier keys get larger depth (higher priority); only 4 slots exist
            for idx, k in enumerate(chunk_keys):
                depth = len(chunk_keys) - idx
                queue_ref.enqueue_task(session_id, k, dict(depth=depth), _promise=True) \
                    .then(functools.partial(callback_fun, k))

            gevent.sleep(1)
            self.assertEqual(queue_ref.get_allocated_count(), 4)

            # promote the last queued task so it runs as soon as a slot frees up
            queue_ref.update_priority(session_id, chunk_keys[-1],
                                      dict(depth=len(chunk_keys)))
            quota_ref.release_quota(chunk_keys[0])
            queue_ref.release_task(session_id, chunk_keys[0])
            quota_ref.release_quota(chunk_keys[1])
            queue_ref.release_task(session_id, chunk_keys[1])
            gevent.sleep(0.5)

            self.assertIn(chunk_keys[-1], res_times)
            for k in chunk_keys[:3]:
                # membership check first, otherwise a KeyError would obscure the failure
                self.assertIn(k, res_times)
                self.assertLessEqual(res_times[k], res_times[chunk_keys[-1]] - 0.5)
def testFailoverMessage(self):
    """Dead-worker detection should report removed workers and lost chunks to graphs,
    and blacklist the worker until ``worker_blacklist_time`` passes.
    """
    mock_session_id = str(uuid.uuid4())
    mock_graph_key = str(uuid.uuid4())
    mock_chunk_key = str(uuid.uuid4())
    addr = '127.0.0.1:%d' % get_next_port()
    mock_worker_addr = '127.0.0.1:54132'

    options.scheduler.worker_blacklist_time = 0.5

    with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        session_manager_ref = pool.create_actor(
            SessionManagerActor, uid=SessionManagerActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        chunk_meta_ref = pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

        session_ref = pool.actor_ref(session_manager_ref.create_session(mock_session_id))
        # a chunk hosted solely on the soon-to-die worker
        chunk_meta_ref.set_chunk_meta(mock_session_id, mock_chunk_key, size=80,
                                      shape=(10,), workers=(mock_worker_addr,))
        with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__, new=MockGraphActor):
            session_ref.submit_tensor_graph(None, mock_graph_key)
            graph_ref = pool.actor_ref(GraphActor.gen_name(mock_session_id, mock_graph_key))

        # make the worker's heartbeat look expired, then trigger detection
        expire_time = time.time() - options.scheduler.status_timeout - 1
        resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=expire_time))

        resource_ref.detect_dead_workers(_tell=True)
        pool.sleep(0.2)

        _, removes, lost_chunks = graph_ref.get_worker_change_args()
        self.assertListEqual(removes, [mock_worker_addr])
        self.assertListEqual(lost_chunks, [mock_chunk_key])

        self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())
        # re-registering while still blacklisted is ignored
        resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
        self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())

        # after the 0.5s blacklist window, registration succeeds again
        pool.sleep(0.4)
        resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
        self.assertIn(mock_worker_addr, resource_ref.get_workers_meta())
def testMemQuotaAllocation(self):
    """A quota request exceeding free system memory must wait until memory frees up."""
    from mars import resource
    from mars.utils import AttributeDict

    # fake system memory: only 50 of 300 available at first
    mock_mem_stat = AttributeDict(dict(total=300, available=50, used=0, free=50))
    local_pool_addr = 'localhost:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=local_pool_addr) as pool, \
            patch_method(resource.virtual_memory, new=lambda: mock_mem_stat):
        pool.create_actor(WorkerClusterInfoActor, schedulers=[local_pool_addr],
                          uid=WorkerClusterInfoActor.default_name())
        pool.create_actor(StatusActor, local_pool_addr, uid=StatusActor.default_name())

        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(ProcessHelperActor, uid=ProcessHelperActor.default_name())
        quota_ref = pool.create_actor(MemQuotaActor, 300, refresh_time=0.1,
                                      uid=MemQuotaActor.default_name())

        time_recs = []
        with self.run_actor_test(pool) as test_actor:
            ref = test_actor.promise_ref(quota_ref)
            time_recs.append(time.time())

            def actual_exec(x):
                ref.release_quota(x)
                time_recs.append(time.time())
                test_actor.set_result(None)

            # 100 > 50 available, so this request is deferred
            ref.request_quota('req', 100, _promise=True) \
                .then(functools.partial(actual_exec, 'req'))

            pool.sleep(0.5)
            # free enough memory for the pending request to be granted
            mock_mem_stat['available'] = 150
            mock_mem_stat['free'] = 150
            self.get_result(2)

        # the request must have waited noticeably (> 0.4s) before being granted
        self.assertGreater(abs(time_recs[0] - time_recs[1]), 0.4)
def testExecuteWorker(self):
    """Spawn a real worker process and drive a test graph against it.

    Fix: ``proc`` is initialized to ``None`` and guarded in ``finally``; in the
    original, failures in pool/actor setup before ``subprocess.Popen`` made the
    cleanup raise ``NameError`` and mask the real error.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(SchedulerClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=SchedulerClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

            # spawn an actual worker process pointing at the mock scheduler
            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem'
            ])
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            test_ref = pool.create_actor(WorkerProcessTestActor)
            test_ref.run_test(worker_endpoint, _tell=True)

            check_time = time.time()
            while not test_ref.get_reply():
                gevent.sleep(0.1)
                if time.time() - check_time > 20:
                    raise TimeoutError('Check reply timeout')
    finally:
        if proc is not None:
            # interrupt the worker, give it 5 seconds to exit, then kill it
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)
            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def testSendTargets(self):
    """Execution with ``send_addresses`` should push the result chunk to the target worker."""
    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address,
                           distributor=WorkerDistributor(2)) as pool:
        self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
        pool.create_actor(CpuCalcActor)

        import mars.tensor as mt
        arr = mt.ones((4, ), chunk_size=4)
        arr_add = mt.array(mock_data)
        result_tensor = arr + arr_add
        graph = result_tensor.build_graph(compose=False, tiled=True)
        result_key = result_tensor.chunks[0].key

        # the mock sender asserts the outgoing data matches the expected result
        pool.create_actor(MockSenderActor, mock_data + np.ones((4, )), 'out', uid='w:mock_sender')
        with self.run_actor_test(pool) as test_actor:
            def _validate(_):
                data = test_actor._chunk_store.get(session_id, result_tensor.chunks[0].key)
                assert_array_equal(data, mock_data + np.ones((4, )))

            graph_key = str(uuid.uuid4())
            execution_ref = test_actor.promise_ref(ExecutionActor.default_name())
            execution_ref.enqueue_graph(session_id, graph_key, serialize_graph(graph),
                                        dict(chunks=[result_tensor.chunks[0].key]), None,
                                        send_addresses={result_key: (pool_address,)}, _promise=True) \
                .then(lambda *_: execution_ref.start_execution(session_id, graph_key, _promise=True)) \
                .then(_validate) \
                .then(lambda *_: test_actor.set_result(None)) \
                .catch(lambda *exc: test_actor.set_result(exc, False))

            self.get_result()
def _start_worker_process(self, no_cuda=True, cuda_device=None):
    """Generator context helper: start a scheduler pool plus a real worker subprocess.

    Yields ``(pool, worker_endpoint)``. On exit the worker is interrupted,
    given 5 seconds to exit, then killed, and the plasma socket file removed.

    Fix: ``proc`` is initialized to ``None`` and guarded in ``finally``; in the
    original, failures in pool/actor setup before ``subprocess.Popen`` made the
    cleanup raise ``NameError`` and mask the real error.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(SchedulerClusterInfoActor, [mock_scheduler_addr],
                              uid=SchedulerClusterInfoActor.default_uid())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())

            args = [
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem'
            ]
            env = os.environ.copy()
            if no_cuda:
                args.append('--no-cuda')
            else:
                # expose the selected GPU to the worker subprocess
                env['CUDA_VISIBLE_DEVICES'] = cuda_device
            proc = subprocess.Popen(args, env=env)
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            yield pool, worker_endpoint
    finally:
        if proc is not None:
            # interrupt the worker, give it 5 seconds to exit, then kill it
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)
            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def testEvents(self, *_):
    """Single and open events: query windows, closing, pickling, and ``EventContext``.

    Fix: after the ``EventContext`` block exits, events are re-queried before
    asserting ``time_end`` is set — the previously fetched event objects are
    snapshots returned by the actor call and are not updated when the context
    closes the event.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=mock_scheduler_addr) as pool:
        events_ref = pool.create_actor(EventsActor)

        # one point-in-time event and one open (still running) event
        event1 = events_ref.add_single_event(EventCategory.RESOURCE, EventLevel.WARNING,
                                             ResourceEventType.MEM_HIGH, 'test_owner')
        self.assertIsNotNone(event1)
        event2 = events_ref.add_open_event(EventCategory.PROCEDURE, EventLevel.NORMAL,
                                           ProcedureEventType.CPU_CALC, 'test_owner2')
        self.assertIsNotNone(event2)

        time.sleep(1)

        # the single event fell out of the query window; the open one is still live
        proc_events = events_ref.query_by_time(EventCategory.RESOURCE)
        self.assertEqual(len(proc_events), 0)
        proc_events = events_ref.query_by_time(EventCategory.PROCEDURE)
        self.assertEqual(len(proc_events), 1)

        events_ref.close_event(event2)
        proc_events = events_ref.query_by_time(EventCategory.PROCEDURE)
        self.assertGreater(proc_events[0].time_end, proc_events[0].time_start)

        # repeated closing shall not cause any problems
        events_ref.close_event(event2)

        # event records must survive pickling
        reloaded = pickle.loads(pickle.dumps(proc_events[0]))
        self.assertEqual(reloaded.event_id, proc_events[0].event_id)

        with EventContext(events_ref, EventCategory.PROCEDURE, EventLevel.NORMAL,
                          ProcedureEventType.CPU_CALC, 'test_owner3'):
            proc_events = events_ref.query_by_time(EventCategory.PROCEDURE)
            self.assertIsNone(proc_events[-1].time_end)

        # re-query: the context exit closed the event, setting its end time
        proc_events = events_ref.query_by_time(EventCategory.PROCEDURE)
        self.assertIsNotNone(proc_events[-1].time_end)
def testBatchQuota(self):
    """Batched quota requests from two actors must serialize; all quota is released at the end."""
    with create_actor_pool() as pool:
        quota_ref = pool.create_actor(QuotaActor, 300, uid='QuotaActor')
        actor_refs = [pool.create_actor(BatchQuotaTestActor, 100) for _ in range(2)]

        def _drive():
            for actor in actor_refs:
                prefix = str(id(actor))
                actor.mock_step([prefix + '_0', prefix + '_1'])
            gevent.sleep(3)
            return [actor.get_end_time() for actor in actor_refs]

        runner = gevent.spawn(_drive)
        runner.join()
        finish_times = runner.value

        # the two batches could not have overlapped, hence the ~1s gap
        self.assertGreater(abs(finish_times[0] - finish_times[1]), 0.9)
        # everything must be released at the end
        self.assertEqual(quota_ref.get_allocated_size(), 0)