def testInputTileable(self):
    """Spawned functions should accept unexecuted tileables and DataFrames."""
    def f(t, x):
        return (t * x).sum().to_numpy()

    rng = np.random.RandomState(0)
    data = rng.rand(5, 4)
    reduced = mt.tensor(data, chunk_size=3).sum(axis=0)
    remote = spawn(f, args=(reduced, 3))

    sess = new_session()
    sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

    fetched = remote.execute(session=sess).fetch(session=sess)
    self.assertAlmostEqual(fetched, (data.sum(axis=0) * 3).sum())

    df1 = md.DataFrame(data, chunk_size=3)
    df1.execute(session=sess)
    df2 = shuffle(df1)
    df2.execute(session=sess)

    def f2(input_df):
        bonus = input_df.iloc[:, 0].fetch().sum()
        return input_df.sum().to_pandas() + bonus

    # both the plain and the shuffled frame should work as remote inputs
    for frame in (df1, df2):
        fetched2 = spawn(f2, args=(frame,)).execute(session=sess).fetch(session=sess)
        expected2 = pd.DataFrame(data).sum() + data[:, 0].sum()
        pd.testing.assert_series_equal(fetched2, expected2)
def test_input_tileable(setup):
    """Remote functions can consume tileables produced elsewhere in the graph."""
    def f(t, x):
        return (t * x).sum().to_numpy()

    rng = np.random.RandomState(0)
    data = rng.rand(5, 4)
    reduced = mt.tensor(data, chunk_size=3).sum(axis=0)

    fetched = spawn(f, args=(reduced, 3)).execute().fetch()
    assert pytest.approx(fetched) == (data.sum(axis=0) * 3).sum()

    df1 = md.DataFrame(data, chunk_size=3)
    df1.execute()
    df2 = shuffle(df1)
    df2.execute()

    def f2(input_df):
        bonus = input_df.iloc[:, 0].fetch().sum()
        return input_df.sum().to_pandas() + bonus

    # both the plain and the shuffled frame should work as remote inputs
    for frame in (df1, df2):
        fetched = spawn(f2, args=(frame,)).execute().fetch()
        expected = pd.DataFrame(data).sum() + data[:, 0].sum()
        pd.testing.assert_series_equal(fetched, expected)
async def test_task_execution(start_test_service):
    """Submit a small remote-function graph and verify its stored result."""
    _sv_pool_address, task_api, storage_api = start_test_service

    def f1():
        return np.arange(5)

    def f2():
        return np.arange(5, 10)

    def f3(f1r, f2r):
        return np.concatenate([f1r, f2r]).sum()

    head = mr.spawn(f3, args=(mr.spawn(f1), mr.spawn(f2)))
    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
    # right after submission the service has never been idle
    assert await task_api.get_last_idle_time() is None
    assert isinstance(task_id, str)

    await task_api.wait_task(task_id)
    task_result = await task_api.get_task_result(task_id)
    assert task_result.status == TaskStatus.terminated
    assert await task_api.get_last_idle_time() is not None
    if task_result.error is not None:
        raise task_result.error.with_traceback(task_result.traceback)

    fetch_tileable = (await task_api.get_fetch_tileables(task_id))[0]
    # sum of 0..9 == 45
    assert await storage_api.get(fetch_tileable.chunks[0].key) == 45
async def test_cancel_subtask(actor_pool):
    """Cancelling a running subtask should free its slot promptly.

    Covers two paths: a short task whose slot is freed well within the
    ``free_slot`` timeout, and a long task whose cancellation exceeds a
    1-second timeout — presumably the slot actor is then killed and
    auto-recovered (TODO confirm against SubtaskRunnerActor semantics).
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    def sleep(timeout: int):
        # runs remotely; simply blocks for `timeout` seconds
        time.sleep(timeout)
        return timeout

    a = mr.spawn(sleep, 2)
    subtask = _gen_subtask(a, session_id)
    subtask_runner: SubtaskRunnerActor = await manager.get_free_slot()
    asyncio.create_task(subtask_runner.run_subtask(subtask))
    # give the subtask a moment to actually start before cancelling
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # normal cancel by cancel asyncio Task
        await manager.free_slot(subtask_runner, timeout=5)
    # do not need to wait 5 sec
    assert timer.duration < 5
    assert await manager.is_slot_free(subtask_runner) is True

    b = mr.spawn(sleep, 100)
    subtask2 = _gen_subtask(b, session_id)
    subtask_runner: SubtaskRunnerActor = await manager.get_free_slot()
    asyncio.create_task(subtask_runner.run_subtask(subtask2))
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # normal cancel by cancel asyncio Task
        aio_task = asyncio.create_task(manager.free_slot(subtask_runner, timeout=1))
        # while the cancel is in flight the slot is still busy
        assert await manager.is_slot_free(subtask_runner) is False
        await aio_task
        # need 1 sec to reach timeout, then killing actor and wait for auto recovering
        # the time would not be over 5 sec
    assert timer.duration < 5
    assert await manager.is_slot_free(subtask_runner) is True
def testRemoteWithoutEtcd(self):
    """Integration test: run remote functions on a cluster started without etcd,
    submitting tileable graphs directly through the session actor."""
    self.start_processes(
        etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])
    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    rs = np.random.RandomState(0)
    raw1 = rs.rand(10, 10)
    raw2 = rs.rand(10, 10)

    def f_none(_x):
        return None

    # a remote function returning None should still succeed, and fetch None
    r_none = spawn(f_none, raw1)
    graph = r_none.build_graph()
    targets = [r_none.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)
    result = session_ref.fetch_result(graph_key, r_none.key)
    self.assertIsNone(loads(result))

    def f1(x):
        return x + 1

    def f2(x, y, z=None):
        return x * y * (z[0] + z[1])

    # remote results can be passed to other remote calls both positionally
    # and inside keyword containers
    r1 = spawn(f1, raw1)
    r2 = spawn(f1, raw2)
    r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})
    graph = r3.build_graph()
    targets = [r3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)
    result = session_ref.fetch_result(graph_key, r3.key)
    expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
    assert_allclose(loads(result), expected)
def test_none_outputs(setup):
    """Chained remote functions that all return None should yield None."""
    def f(*_args):
        pass

    first = spawn(f, args=(0,))
    second = spawn(f, args=(first, 1))
    third = spawn(f, args=(first, 2))
    final = spawn(f, args=(second, third))
    assert final.execute().fetch() is None
def testRemoteWithoutEtcd(self):
    """Integration test: remote functions on an etcd-less cluster, including
    a tileable argument, then verify worker CPU slots are released."""
    from mars.scheduler.resource import ResourceActor
    from mars.worker.dispatcher import DispatchActor
    self.start_processes(etcd=False,
                         modules=['mars.scheduler.tests.integrated.no_prepare_op'])
    sess = new_session(self.session_manager_ref.address)
    resource_ref = sess._api.actor_client.actor_ref(
        ResourceActor.default_uid(),
        address=self.cluster_info.get_scheduler(ResourceActor.default_uid())
    )
    worker_ips = resource_ref.get_worker_endpoints()

    rs = np.random.RandomState(0)
    raw1 = rs.rand(10, 10)
    raw2 = rs.rand(10, 10)

    def f_none(_x):
        return None

    # a remote function returning None should still succeed and fetch None
    r_none = spawn(f_none, raw1)
    result = r_none.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    self.assertIsNone(result)

    def f1(x):
        return x + 1

    def f2(x, y, z=None):
        return x * y * (z[0] + z[1])

    # remote results can be passed to other remote calls both positionally
    # and inside keyword containers
    r1 = spawn(f1, raw1)
    r2 = spawn(f1, raw2)
    r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})
    result = r3.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
    np.testing.assert_allclose(result, expected)

    def f(t, x):
        # executes a tensor expression from inside a spawned function
        mul = (t * x).execute()
        return mul.sum().to_numpy()

    rs = np.random.RandomState(0)
    raw = rs.rand(5, 4)
    t1 = mt.tensor(raw, chunk_size=3)
    t2 = t1.sum(axis=0)
    s = spawn(f, args=(t2, 3))
    result = s.execute(session=sess, timeout=self.timeout).fetch(session=sess)
    expected = (raw.sum(axis=0) * 3).sum()
    self.assertAlmostEqual(result, expected)

    # allow workers to settle, then each should report exactly one free
    # cpu slot, i.e. slots used by the job were released
    time.sleep(1)
    for worker_ip in worker_ips:
        ref = sess._api.actor_client.actor_ref(DispatchActor.default_uid(),
                                               address=worker_ip)
        self.assertEqual(len(ref.get_slots('cpu')), 1)
async def test_get_tileable_graph(start_test_service):
    """The task service should expose the submitted tileable graph as JSON."""
    _sv_pool_address, task_api, storage_api = start_test_service

    def f1():
        return np.arange(5)

    def f2():
        return np.arange(5, 10)

    def f3(f1r, f2r):
        return np.concatenate([f1r, f2r]).sum()

    r1 = mr.spawn(f1)
    r2 = mr.spawn(f2)
    r3 = mr.spawn(f3, args=(r1, r2))
    graph = TileableGraph([r3.data])
    next(TileableGraphBuilder(graph).build())
    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)

    # unknown task ids are rejected
    with pytest.raises(TaskNotExist):
        await task_api.get_tileable_graph_as_json('non_exist')

    detail = await task_api.get_tileable_graph_as_json(task_id)
    tileable_count = len(detail.get('tileables'))
    dependency_count = len(detail.get('dependencies'))

    assert tileable_count > 0
    # sanity bound on the number of edges
    assert dependency_count <= (tileable_count / 2) * (tileable_count / 2)
    if tileable_count == 1:
        assert dependency_count == 0
    else:
        assert dependency_count > 0

    expected_nodes = []
    expected_edges = []
    for node in graph.iter_nodes():
        expected_nodes.append(node.key)
        for successor in graph.iter_successors(node):
            expected_edges.append({
                'fromTileableId': node.key,
                'toTileableId': successor.key,
                'linkType': 0,
            })

    # every reported tileable must correspond to exactly one graph node
    for tileable in detail.get('tileables'):
        expected_nodes.remove(tileable.get('tileableId'))
    assert len(expected_nodes) == 0

    # edges must match the graph's successor relation, in order
    for idx in range(dependency_count):
        assert expected_edges[idx] == detail.get('dependencies')[idx]
def test_chained_remote(setup):
    """A spawned function may take another spawned result as its argument."""
    def f(x):
        return x + 1

    def g(x):
        return x * 2

    # g(f(2)) == (2 + 1) * 2 == 6
    chained = spawn(g, spawn(f, 2))
    assert chained.execute().fetch() == 6
def testChainedRemote(self):
    """A spawned function may take another spawned result as its argument."""
    def f(x):
        return x + 1

    def g(x):
        return x * 2

    # g(f(2)) == (2 + 1) * 2 == 6
    chained = spawn(g, spawn(f, 2))
    result = self.executor.execute_tileables([chained])[0]
    self.assertEqual(result, 6)
async def test_cancel_subtask(actor_pool):
    """Cancelling subtasks on a runner actor.

    First a long-running task whose cancel exceeds its 1-second timeout —
    the runner is then killed explicitly and the test waits for it to be
    restored — and afterwards a short task that cancels cleanly.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool
    subtask_runner: SubtaskRunnerRef = await mo.actor_ref(
        SubtaskRunnerActor.gen_uid('numa-0', 0), address=pool.external_address)

    def sleep(timeout: int):
        # runs remotely; simply blocks for `timeout` seconds
        time.sleep(timeout)
        return timeout

    b = mr.spawn(sleep, 100)
    subtask = _gen_subtask(b, session_id)
    asyncio.create_task(subtask_runner.run_subtask(subtask))
    # give the subtask a moment to actually start before cancelling
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # normal cancel by cancel asyncio Task
        aio_task = asyncio.create_task(asyncio.wait_for(
            subtask_runner.cancel_subtask(), timeout=1))
        # while the cancel is in flight the runner is still busy
        assert await subtask_runner.is_runner_free() is False
        with pytest.raises(asyncio.TimeoutError):
            await aio_task
    # need 1 sec to reach timeout, then killing actor and wait for auto recovering
    # the time would not be over 5 sec
    assert timer.duration < 5

    async def wait_slot_restore():
        # poll until the runner actor is reachable and free again
        while True:
            try:
                assert await subtask_runner.is_runner_free() is True
            except (mo.ServerClosed, ConnectionRefusedError, mo.ActorNotExist):
                await asyncio.sleep(0.5)
            else:
                break

    await mo.kill_actor(subtask_runner)
    await wait_slot_restore()

    a = mr.spawn(sleep, 2)
    subtask2 = _gen_subtask(a, session_id)
    asyncio.create_task(subtask_runner.run_subtask(subtask2))
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # normal cancel by cancel asyncio Task
        await asyncio.wait_for(subtask_runner.cancel_subtask(), timeout=6)
    # do not need to wait 10 sec
    assert timer.duration < 10
    assert await subtask_runner.is_runner_free() is True
def run_mars_job(odps, func, args=(), kwargs=None, retry_when_fail=False,
                 n_output=None, **kw):
    """Run *func* as a Mars remote job on a Mars cluster created from **kw.

    If no ``name`` is supplied, an anonymous cluster with a random name is
    created and stopped once the job finishes; a named cluster is reused
    (``if_exists='ignore'`` is NOT set in that case) and left running.
    """
    from mars.remote import spawn
    from cupid.config import options as cupid_options

    kw.setdefault('with_notebook', False)
    task_name = kw.get('name', None)
    if task_name is None:
        # anonymous run: random cluster name, torn down afterwards
        kw['name'] = str(uuid.uuid4())
        kw['if_exists'] = 'ignore'

    runtime_endpoint = kw.pop('runtime_endpoint', None) or kw.pop(
        'cupid_internal_endpoint', None)
    client = odps.create_mars_cluster(**kw)
    try:
        r = spawn(func, args=args, kwargs=kwargs,
                  retry_when_fail=retry_when_fail, n_output=n_output)
        r.op.extra_params['project'] = odps.project
        r.op.extra_params['endpoint'] = \
            runtime_endpoint or cupid_options.cupid.runtime.endpoint
        r.execute()
    finally:
        if task_name is None:
            client.stop_server()
def recall(doc, query, topk, sample_count, pk_p, distance_p, row_number=None,
           column_number=None, topk_ids=None, method=None, session=None,
           run_kwargs=None):
    """Estimate recall of approximate search results against exact search.

    A random sample of *sample_count* queries is searched exactly via
    ``linear_build_and_search`` and the overlap with the supplied
    approximate results (*pk_p*, *distance_p*) is computed remotely.
    """
    topk_ids = [topk] if topk_ids is None else topk_ids
    method = "BYSCORE" if method is None else method

    # sample the query set and align the approximate results to the sample
    query_sample, idx = sample_data(query=query, sample_count=sample_count)
    pk_p_sample = pk_p[idx, :]
    distance_p_sample = distance_p[idx, :]

    # exact ground truth for the sampled queries
    pk_l, distance_l = linear_build_and_search(doc=doc, query=query_sample,
                                               topk=topk, row_number=row_number,
                                               column_number=column_number)
    remote = mr.spawn(compute_recall,
                      args=(pk_l, distance_l, pk_p_sample, distance_p_sample,
                            topk_ids, method))
    return remote.execute(session=session, **(run_kwargs or dict())).fetch()
async def test_subtask_op_progress(actor_pool):
    """Progress reported from inside an operand should surface via the runner."""
    pool, session_id, meta_api, storage_api, manager = actor_pool
    subtask_runner: SubtaskRunnerRef = await mo.actor_ref(
        SubtaskRunnerActor.gen_uid('numa-0', 0), address=pool.external_address)

    def progress_sleep(interval: float, count: int):
        # report progress after each sleep step
        for idx in range(count):
            time.sleep(interval)
            get_context().set_progress((1 + idx) * 1.0 / count)

    remote = mr.spawn(progress_sleep, args=(0.75, 2))
    subtask = _gen_subtask(remote, session_id)
    aio_task = asyncio.create_task(subtask_runner.run_subtask(subtask))
    try:
        # before the first step completes, no progress yet
        await asyncio.sleep(0.5)
        result = await subtask_runner.get_subtask_result()
        assert result.progress == 0.0
        # after the first of two steps, progress is half way
        await asyncio.sleep(0.75)
        result = await subtask_runner.get_subtask_result()
        assert result.progress == 0.5
    finally:
        await aio_task
        result = await subtask_runner.get_subtask_result()
        assert result.progress == 1.0
async def _run_web_session_test(web_address):
    """Exercise execute/fetch and remote-function log fetching via a web session."""
    session_id = str(uuid.uuid4())
    session = await AsyncSession.init(web_address, session_id)
    session.as_default()

    raw = np.random.RandomState(0).rand(10, 10)
    a = mt.tensor(raw, chunk_size=5)
    b = a + 1
    info = await session.execute(b)
    await info
    assert info.result() is None
    assert info.exception() is None
    assert info.progress() == 1
    np.testing.assert_equal(raw + 1, await session.fetch(b))
    del a, b

    r = mr.spawn(_my_func)
    info = await session.execute(r)
    await info
    assert info.result() is None
    assert info.exception() is None
    assert info.progress() == 1

    # logs should be retrievable with several offset/size spellings
    log_variants = (
        r.fetch_log(session=session),
        r.fetch_log(session=session, offsets='0k', sizes=[1000]),
        r.fetch_log(session=session, offsets={r.op.key: '0k'}, sizes=[1000]),
    )
    for log in log_variants:
        assert 'output from function' in str(log)

    AsyncSession.reset_default()
    await session.destroy()
async def test_task_progress(start_test_service):
    """Task progress should advance as the remote function reports steps."""
    sv_pool_address, task_api, storage_api = start_test_service

    session_api = await SessionAPI.create(address=sv_pool_address)
    # remote object used to gate each progress step from the test side
    ref = await session_api.create_remote_object(
        task_api._session_id, 'progress_controller', _ProgressController)

    def f1(count: int):
        progress_controller = get_context().get_remote_object('progress_controller')
        for idx in range(count):
            # block until the test releases the next step
            progress_controller.wait()
            get_context().set_progress((1 + idx) * 1.0 / count)

    r = mr.spawn(f1, args=(2,))
    graph = TileableGraph([r.data])
    next(TileableGraphBuilder(graph).build())
    await task_api.submit_tileable_graph(graph, fuse_enabled=False)

    await asyncio.sleep(0.2)
    results = await task_api.get_task_results(progress=True)
    assert results[0].progress == 0.0

    # release one step at a time and watch progress climb
    for expected_progress in (0.5, 1.0):
        await ref.set()
        await asyncio.sleep(1)
        results = await task_api.get_task_results(progress=True)
        assert results[0].progress == expected_progress
async def test_cancel_task(actor_pool):
    """Cancelling a running task terminates it quickly and releases tileables."""
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    def func():
        time.sleep(200)

    remotes = [mr.spawn(func) for _ in range(10)]
    graph = TileableGraph([r.data for r in remotes])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    # let the task actually start before cancelling it
    await asyncio.sleep(.5)
    with Timer() as timer:
        await manager.cancel_task(task_id)
        result = await manager.get_task_result(task_id)
        assert result.status == TaskStatus.terminated
    # cancellation must not take anywhere near the 200s sleep
    assert timer.duration < 20

    keys = [r.key for r in remotes]
    del remotes
    gc.collect()
    await asyncio.sleep(0.5)

    # test ref counts: cancelled tileables should no longer be referenced
    assert (await lifecycle_api.get_tileable_ref_counts(keys)) == [0] * len(keys)
def run_mars_job(odps, func, args=(), kwargs=None, retry_when_fail=False,
                 n_output=None, **kw):
    """Run *func* as a Mars remote job on a temporary Mars cluster.

    A cluster is created from **kw, used for the single job and always
    stopped afterwards.
    """
    from mars.remote import spawn

    kw.setdefault('notebook', False)
    cupid_internal_endpoint = kw.pop('cupid_internal_endpoint', None)

    client = odps.create_mars_cluster(**kw)
    try:
        r = spawn(func, args=args, kwargs=kwargs,
                  retry_when_fail=retry_when_fail, n_output=n_output)
        r.op.extra_params['project'] = odps.project
        r.op.extra_params['endpoint'] = \
            cupid_internal_endpoint or cupid_options.cupid.runtime.endpoint
        r.execute()
    finally:
        client.stop_server()
def testRemote(self):
    """Run fan-out/fan-in remote functions on a freshly created cluster."""
    import mars.remote as mr

    def add_one(x):
        return x + 1

    def sum_all(xs):
        return sum(xs)

    increments = [mr.spawn(add_one, args=(i,)) for i in range(10)]

    client = self.odps.create_mars_cluster(1, 4, 8, name=str(uuid.uuid4()))
    try:
        # sum of (i + 1) for i in 0..9 == 55
        total = mr.spawn(sum_all, args=(increments,)).execute().fetch()
        self.assertEqual(total, 55)
    finally:
        client.stop_server()
def testRemoteFunctionInLocalCluster(self):
    """Remote functions on a local cluster, including a spawned function that
    relies on the default session being shipped to the worker it runs on."""
    with new_cluster(scheduler_n_process=2, worker_n_process=3,
                     shared_memory='20M', modules=[__name__], web=True) as cluster:
        session = cluster.session

        def f(x):
            return x + 1

        def g(x, y):
            return x * y

        a = mr.spawn(f, 3)
        b = mr.spawn(f, 4)
        c = mr.spawn(g, (a, b))

        # g(f(3), f(4)) == 4 * 5 == 20
        r = session.run(c, timeout=_exec_timeout)
        self.assertEqual(r, 20)

        session2 = new_session(cluster.endpoint)
        expect_session_id = session2.session_id

        def f2():
            # runs on a worker: the default session there must be the one
            # that spawned this function
            session = Session.default
            assert isinstance(session._sess, ClusterSession)
            assert session._sess.session_id == expect_session_id

            t = mt.ones((3, 2))
            return t.sum().to_numpy()

        # the default session must survive a cloudpickle round trip, since
        # it is serialized along with the spawned function
        self.assertEqual(
            cloudpickle.loads(cloudpickle.dumps(
                Session.default)).session_id,
            session.session_id)
        self.assertIsInstance(serialize_function(f2), bytes)

        d = mr.spawn(f2, retry_when_fail=False)
        r = session2.run(d, timeout=_exec_timeout)
        # ones((3, 2)).sum() == 6
        self.assertEqual(r, 6)
def _cancel_when_execute(session, cancelled):
    """Cancel an in-flight execution and verify no session state is left behind."""
    def run():
        time.sleep(200)

    remotes = [mr.spawn(run) for _ in range(10)]
    execute(*remotes, cancelled=cancelled)

    # cancelled tileables must not record an executed session
    assert all(not r._executed_sessions for r in remotes)
    del remotes

    # and no reference counts should linger in the session
    ref_counts = session._get_ref_counts()
    assert len(ref_counts) == 0
def test_remote_function(setup):
    """Basic spawn behaviour: ndarray args, remote results as args, kwargs
    validation, and the default session inside a spawned function."""
    session = setup

    def f1(x):
        return x + 1

    def f2(x, y, z=None):
        return x * y * (z[0] + z[1])

    rs = np.random.RandomState(0)
    raw1, raw2 = rs.rand(10, 10), rs.rand(10, 10)

    r1 = spawn(f1, raw1)
    r2 = spawn(f1, raw2)
    r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

    fetched = r3.execute().fetch()
    np.testing.assert_almost_equal(
        fetched, (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1))

    # kwargs must be a mapping
    with pytest.raises(TypeError):
        spawn(f2, (r1, r2), kwargs=())

    session_id = session.session_id

    def f():
        # executed remotely: the default session must match the caller's
        assert get_default_session().session_id == session_id
        return mt.ones((2, 3)).sum().to_numpy()

    assert spawn(f).execute().fetch() == 6
def testRemoteFunction(self):
    """Basic spawn behaviour: ndarray args, remote results as args, kwargs
    validation, and the default session inside a spawned function."""
    def f1(x):
        return x + 1

    def f2(x, y, z=None):
        return x * y * (z[0] + z[1])

    rs = np.random.RandomState(0)
    raw1, raw2 = rs.rand(10, 10), rs.rand(10, 10)

    r1 = spawn(f1, raw1)
    r2 = spawn(f1, raw2)
    r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

    result = self.executor.execute_tileables([r3])[0]
    np.testing.assert_almost_equal(
        result, (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1))

    # kwargs must be a mapping
    with self.assertRaises(TypeError):
        spawn(f2, (r1, r2), kwargs=())

    session = new_session()

    def f():
        # executed remotely: the default session must match the caller's
        assert Session.default.session_id == session.session_id
        return mt.ones((2, 3)).sum().to_numpy()

    self.assertEqual(
        spawn(f).execute(session=session).fetch(session=session), 6)
def test_multi_output(setup):
    """A remote word-count: mappers with n_output=2 feeding two reducers."""
    sentences = ['word1 word2', 'word2 word3', 'word3 word2 word1']

    def mapper(s):
        # count words, then split counts into two shards by word hash
        word_to_count = defaultdict(lambda: 0)
        for word in s.split():
            word_to_count[word] += 1
        downsides = [defaultdict(lambda: 0), defaultdict(lambda: 0)]
        for word, count in word_to_count.items():
            downsides[mmh3_hash(word) % 2][word] += count
        return downsides

    def reducer(word_to_count_list):
        # merge the per-sentence counts of one shard
        d = defaultdict(lambda: 0)
        for word_to_count in word_to_count_list:
            for word, count in word_to_count.items():
                d[word] += count
        return dict(d)

    shard_outputs = ([], [])
    for sentence in sentences:
        first, second = spawn(mapper, sentence, n_output=2)
        shard_outputs[0].append(first)
        shard_outputs[1].append(second)

    reduced = [spawn(reducer, shard) for shard in shard_outputs]

    merged = dict()
    for word_count in ExecutableTuple(reduced).to_object():
        merged.update(word_count)
    assert merged == {'word1': 2, 'word2': 3, 'word3': 2}
def test_params():
    """Chunk and tileable params of a remote op stay dict-shaped and refreshable."""
    def f(x):
        return x + 1

    remote = spawn(f, args=(1,))
    chunk = tile(remote).chunks[0]
    assert isinstance(chunk.params, dict)

    # round-trip params through the data-derived form
    chunk.params = chunk.get_params_from_data(2)
    assert isinstance(chunk.params, dict)

    params = chunk.params
    params.pop('index', None)
    remote.params = params
    remote.refresh_params()
def test_unknown_shape_inputs(setup):
    """Tileables with unknown (NaN) shapes are resolved before the function runs."""
    def f(t, x):
        # by the time the spawned function executes, the shape must be concrete
        assert all(not np.isnan(s) for s in t.shape)
        return (t * x).sum().to_numpy(extra_config={'check_nsplits': False})

    rng = np.random.RandomState(0)
    data = rng.rand(5, 4)
    tensor = mt.tensor(data, chunk_size=3)
    # boolean indexing yields an unknown-shape tileable
    filtered = tensor[tensor > 0]

    fetched = spawn(f, args=(filtered, 3)).execute().fetch()
    assert pytest.approx(fetched) == (data[data > 0] * 3).sum()
async def test_retryable(fault_cluster, fault_config):
    """With retries disabled, injected faults must propagate to the caller."""
    fault_type, fault_count, expect_raises = fault_config
    name = await create_fault_injection_manager(
        session_id=fault_cluster.session.session_id,
        address=fault_cluster.session.address,
        fault_count=fault_count,
        fault_type=fault_type)
    extra_config = {ExtraConfigKey.FAULT_INJECTION_MANAGER_NAME: name}

    def f(x):
        return x + 1

    remote = spawn(f, args=(1,), retry_when_fail=False)
    with expect_raises:
        remote.execute(extra_config=extra_config)
def _cancel_when_execute(session, cancelled):
    """Cancel an in-flight execution; verify session and worker storage are clean."""
    def run():
        time.sleep(200)

    remotes = [mr.spawn(run) for _ in range(10)]
    execute(*remotes, cancelled=cancelled)

    # cancelled tileables must not record an executed session
    assert all(not r._executed_sessions for r in remotes)
    del remotes

    # no reference counts should linger in the session
    ref_counts = session._get_ref_counts()
    assert len(ref_counts) == 0

    # and the worker's in-memory storage should have been emptied
    worker_addr = session._session.client._cluster._worker_pools[0].external_address
    _assert_storage_cleaned(session.session_id, worker_addr, StorageLevel.MEMORY)
def testInputTileable(self):
    """A spawned function may consume an unexecuted tileable argument."""
    def f(t, x):
        return (t * x).sum().to_numpy()

    rng = np.random.RandomState(0)
    data = rng.rand(5, 4)
    reduced = mt.tensor(data, chunk_size=3).sum(axis=0)
    remote = spawn(f, args=(reduced, 3))

    sess = new_session()
    sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

    fetched = remote.execute(session=sess).fetch(session=sess)
    self.assertAlmostEqual(fetched, (data.sum(axis=0) * 3).sum())
async def test_task_error(start_test_service):
    """Errors raised inside remote functions surface in the task results."""
    _sv_pool_address, task_api, storage_api = start_test_service

    # test job cancel
    def f1():
        raise SystemError

    remotes = [mr.spawn(f1) for _ in range(10)]
    graph = TileableGraph([r.data for r in remotes])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
    await task_api.wait_task(task_id, timeout=10)

    results = await task_api.get_task_results(progress=True)
    assert type(results[0].error) is SystemError