def testDistributedContext(self):
    self.start_processes(etcd=False)
    sess = new_session(self.session_manager_ref.address)
    rs = np.random.RandomState(0)
    context = DistributedContext(
        scheduler_address=self.session_manager_ref.address,
        session_id=sess.session_id)

    raw1 = rs.rand(10, 10)
    a = mt.tensor(raw1, chunk_size=4)
    a.execute(session=sess, timeout=self.timeout, name='test')

    tileable_infos = context.get_named_tileable_infos('test')
    self.assertEqual(a.key, tileable_infos.tileable_key)
    self.assertEqual(a.shape, tileable_infos.tileable_shape)

    nsplits = context.get_tileable_metas([a.key], filter_fields=['nsplits'])[0][0]
    self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

    r = context.get_tileable_data(a.key)
    np.testing.assert_array_equal(raw1, r)

    indexes = [slice(3, 9), slice(0, 7)]
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[tuple(indexes)], r)

    indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[tuple(indexes)], r)

    indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)
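# The expected ``nsplits`` above follows directly from shape=(10, 10) and
# chunk_size=4: an axis of length 10 tiles into chunks of 4, 4 and 2. A tiny
# illustrative helper (not a Mars API) reproduces the value the assertion
# checks against:
def _expected_nsplits(shape, chunk_size):
    def _split(n):
        full, rem = divmod(n, chunk_size)
        return (chunk_size,) * full + ((rem,) if rem else ())
    return tuple(_split(n) for n in shape)

# _expected_nsplits((10, 10), 4) == ((4, 4, 2), (4, 4, 2))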
def testDistributedSampler(self, *_):
    service_ep = 'http://127.0.0.1:' + self.web_port
    scheduler_ep = '127.0.0.1:' + self.scheduler_port
    with new_session(service_ep) as sess:
        raw1 = np.random.rand(100, 200)
        data1 = mt.tensor(raw1, chunk_size=40)
        data1.execute(name='data1', session=sess)

        raw2 = np.random.rand(100)
        data2 = mt.tensor(raw2, chunk_size=60)
        data2.execute(name='data2', session=sess)

        with DistributedContext(scheduler_address=scheduler_ep,
                                session_id=sess.session_id):
            dataset = MarsDataset('data1', 'data2')
            self.assertEqual(len(dataset), 100)

            sampler = MarsDistributedSampler(dataset, num_replicas=1, rank=0)
            indices = sampler.generate_indices()
            r1 = np.array(dataset._get_data(indices)[0])
            r2 = np.array([dataset[ind][0] for ind in sampler])
            np.testing.assert_array_equal(r1, r2)

            r1 = np.array(dataset._get_data(indices)[1])
            r2 = np.array([dataset[ind][1] for ind in sampler])
            np.testing.assert_array_equal(r1, r2)

            self.assertEqual(len(sampler), 100)
            sampler.set_epoch(1)
            self.assertEqual(sampler.epoch, 1)
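# The sampler exists so a MarsDataset can feed a standard torch DataLoader:
# MarsDataset implements __len__/__getitem__ and MarsDistributedSampler is a
# torch-style sampler yielding indices. A minimal sketch, assuming torch is
# installed and this runs inside a DistributedContext as above; everything
# except MarsDataset/MarsDistributedSampler is the stock torch API:
def _dataloader_sketch():
    from torch.utils.data import DataLoader

    dataset = MarsDataset('data1', 'data2')  # named tensors from the session
    sampler = MarsDistributedSampler(dataset, num_replicas=1, rank=0)
    loader = DataLoader(dataset, batch_size=32, sampler=sampler)
    for batch_x, batch_y in loader:
        pass  # one pass over the 100 samples in sampler order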
def testDistributedContext(self):
    self.start_processes(etcd=False)
    session_id = uuid.uuid1()
    actor_client = new_client()
    rs = np.random.RandomState(0)
    context = DistributedContext(
        scheduler_address=self.scheduler_endpoints[0], session_id=session_id)
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    raw1 = rs.rand(10, 10)
    a = mt.tensor(raw1, chunk_size=4)

    graph = a.build_graph()
    targets = [a.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(
        json.dumps(graph.to_json()), graph_key,
        target_tileables=targets, names=['test'])

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    tileable_infos = context.get_named_tileable_infos('test')
    self.assertEqual(a.key, tileable_infos.tileable_key)
    self.assertEqual(a.shape, tileable_infos.tileable_shape)

    nsplits = context.get_tileable_metas([a.key], filter_fields=['nsplits'])[0][0]
    self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

    r = context.get_tileable_data(a.key)
    np.testing.assert_array_equal(raw1, r)

    indexes = [slice(3, 9), slice(0, 7)]
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[tuple(indexes)], r)

    indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[tuple(indexes)], r)

    indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)
def testDistributedRunPyTorchScript(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    scheduler_ep = '127.0.0.1:' + self.scheduler_port
    with new_session(service_ep) as sess:
        raw = np.random.rand(100, 200)
        data = mt.tensor(raw, chunk_size=40)
        data.execute(name='data', session=sess)

        with DistributedContext(scheduler_address=scheduler_ep,
                                session_id=sess.session_id):
            dataset = MarsDataset('data')
            self.assertEqual(len(dataset), 100)

            sample = [2, 5, 7, 9, 10]
            r1 = dataset[sample][0]
            np.testing.assert_array_equal(raw[sample], r1)
def testDistributedDataset(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    scheduler_ep = '127.0.0.1:' + self.scheduler_port
    with new_session(service_ep) as sess:
        raw = np.random.rand(100, 200)
        data = mt.tensor(raw, chunk_size=40)
        data.execute(name='data', session=sess)

        with DistributedContext(scheduler_address=scheduler_ep,
                                session_id=sess.session_id):
            dataset = MarsDataset('data')
            self.assertEqual(len(dataset), 100)

            sample = np.random.randint(0, 100, (10,))
            r1 = dataset[sample][0]
            np.testing.assert_array_equal(raw[sample], r1)

            sample = np.random.randint(0, 100, (10,))
            dataset.prefetch(sample)
            r2 = np.array([dataset[ind][0] for ind in sample])
            np.testing.assert_array_equal(raw[sample], r2)
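# The last block above shows the intended access pattern for remote data:
# call ``prefetch`` once for a batch of indices, then read items one by one.
# A minimal sketch of the same pattern (``dataset`` is a MarsDataset as
# above; that prefetching avoids per-index remote fetches is inferred from
# the test, not a documented guarantee):
def _prefetch_then_read(dataset, indices):
    dataset.prefetch(indices)                # fetch the whole batch in one go
    return [dataset[i][0] for i in indices]  # per-index reads that follow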
def testFetchLogWithoutEtcd(self):
    # test fetch log
    with tempfile.TemporaryDirectory() as temp_dir:
        self.start_processes(
            etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'],
            scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])

        sess = new_session(self.session_manager_ref.address)

        def f():
            print('test')

        r = spawn(f)
        r.execute(session=sess)

        custom_log_actor = sess._api.actor_client.actor_ref(
            CustomLogMetaActor.default_uid(),
            address=self.cluster_info.get_scheduler(CustomLogMetaActor.default_uid()))
        chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
            sess.session_id, r.op.key)
        paths = list(chunk_key_to_log_path.values())
        self.assertEqual(len(paths), 1)
        log_path = paths[0][1]
        with open(log_path) as f:
            self.assertEqual(f.read().strip(), 'test')

        context = DistributedContext(
            scheduler_address=self.session_manager_ref.address,
            session_id=sess.session_id)
        log_result = context.fetch_tileable_op_logs(r.op.key)
        log = next(iter(log_result.values()))['log']
        self.assertEqual(log.strip(), 'test')

        log = r.fetch_log()
        self.assertEqual(str(log).strip(), 'test')

        # test multiple functions
        def f1(size):
            print('f1' * size)
            sys.stdout.flush()

        fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
        fs.execute(session=sess)
        log = fs.fetch_log(offsets=20, sizes=10)
        self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
        self.assertGreater(len(log[0].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        # test negative offsets
        log = fs.fetch_log(offsets=-20, sizes=10)
        self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        # test negative offsets represented as strings
        log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
        self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
        self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
        self.assertTrue(all(s > 0 for s in log[0].offsets))
        self.assertGreater(len(log[1].offsets), 0)
        self.assertTrue(all(s > 0 for s in log[1].offsets))
        self.assertGreater(len(log[0].chunk_op_keys), 0)

        def test_nested():
            print('level0')
            fr = spawn(f1, 1)
            fr.execute()
            print(fr.fetch_log())

        r = spawn(test_nested)
        with self.assertRaises(ValueError):
            r.fetch_log()
        r.execute(session=sess)
        log = str(r.fetch_log())
        self.assertIn('level0', log)
        self.assertIn('f1', log)

        df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

        def df_func(c):
            print('df func')
            return c

        df2 = df.map_chunk(df_func)
        df2.execute(session=sess)
        log = df2.fetch_log()
        self.assertIn('Chunk op key:', str(log))
        self.assertIn('df func', repr(log))
        self.assertEqual(len(str(df.fetch_log(session=sess))), 0)

        def test_host(rndf):
            rm = spawn(nested, rndf)
            rm.execute()
            print(rm.fetch_log())

        def nested(_rndf):
            print('log_content')

        ds = [spawn(test_host, n, retry_when_fail=False) for n in np.random.rand(4)]
        xtp = ExecutableTuple(ds)
        xtp.execute(session=sess)
        for log in xtp.fetch_log(session=sess):
            self.assertEqual(str(log).strip(), 'log_content')

        def test_threaded():
            import threading
            exc_info = None

            def print_fun():
                nonlocal exc_info
                try:
                    print('inner')
                except:  # noqa: E722  # nosec  # pylint: disable=bare-except
                    exc_info = sys.exc_info()

            print_thread = threading.Thread(target=print_fun)
            print_thread.start()
            print_thread.join()

            if exc_info is not None:
                raise exc_info[1].with_traceback(exc_info[-1])

            print('after')

        rm = spawn(test_threaded)
        rm.execute(session=sess)
        logs = str(rm.fetch_log(session=sess)).strip()
        self.assertEqual(logs, 'inner\nafter')
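# ``fetch_log`` accepts offsets/sizes either as ints (bytes) or as
# human-readable strings; the assertions above pin '-0.02K'/'0.01K' to the
# same slices as -20/10 bytes. The slicing the tests check is plain byte
# arithmetic on the captured stdout; an illustrative helper (not a Mars API)
# mirrors it:
def _slice_log(content, offset, size):
    start = offset if offset >= 0 else len(content) + offset
    return content[start:start + size]

# _slice_log('f1' * 30 + '\n', -20, 10) == ('f1' * 30 + '\n')[-20:-10]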