def testKVStoreActor(self):
    """Exercise KVStoreActor read/write/delete round-trips against a freshly started etcd."""
    etcd_port = get_next_port()
    etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
    options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
    with etcd_helper.run(), create_actor_pool(n_process=1, backend='gevent') as pool:
        kv_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())

        # Single writes followed by a batched write (the batch repeats
        # '/node/v2' with the same value on purpose).
        kv_ref.write('/node/v1', 'value1')
        kv_ref.write('/node/v2', 'value2')
        kv_ref.write_batch([('/node/v2', 'value2'), ('/node/v3', 'value3')])

        self.assertEqual(kv_ref.read('/node/v1').value, 'value1')
        fetched = [item.value for item in kv_ref.read_batch(['/node/v2', '/node/v3'])]
        self.assertListEqual(fetched, ['value2', 'value3'])

        # Recursive delete succeeds once, raises KeyError the second time,
        # and the error is swallowed when silent=True is passed.
        kv_ref.delete('/node', dir=True, recursive=True)
        with self.assertRaises(KeyError):
            kv_ref.delete('/node', dir=True, recursive=True)
        kv_ref.delete('/node', dir=True, recursive=True, silent=True)
class Test(unittest.TestCase):
    """KVStoreActor tests backed by an external etcd process."""

    def tearDown(self):
        super(Test, self).tearDown()
        # Restore the default in-process kv-store so later tests are unaffected.
        options.kv_store = ':inproc:'

    @unittest.skipIf(sys.platform == 'win32', 'does not run in windows')
    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testKVStoreActor(self):
        """Run KVStoreActor read/write/delete round-trips against etcd."""
        etcd_port = get_next_port()
        etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        with etcd_helper.run(), create_actor_pool(n_process=1, backend='gevent') as pool:
            kv_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())

            kv_ref.write('/node/v1', 'value1')
            kv_ref.write('/node/v2', 'value2')
            kv_ref.write_batch([('/node/v2', 'value2'), ('/node/v3', 'value3')])

            self.assertEqual(kv_ref.read('/node/v1').value, 'value1')
            fetched = [item.value for item in kv_ref.read_batch(['/node/v2', '/node/v3'])]
            self.assertListEqual(fetched, ['value2', 'value3'])

            # Second recursive delete on a removed dir raises; silent=True suppresses it.
            kv_ref.delete('/node', dir=True, recursive=True)
            with self.assertRaises(KeyError):
                kv_ref.delete('/node', dir=True, recursive=True)
            kv_ref.delete('/node', dir=True, recursive=True, silent=True)
def testEtcdWatch(self):
    """Verify one-shot, recursive and eternal watches on an etcd kvstore."""
    # NOTE(review): port 51342 is hard-coded here while other tests use a
    # dynamic port — collisions are possible if the port is taken; confirm.
    with EtcdProcessHelper(port_range_start=51342).run():
        kvstore = get('etcd://localhost:51342')
        kvstore.write('/node/subnode/v1', 'value1')
        kvstore.write('/node/v2', 'value2')

        # One-shot watch on a single key sees the updated value.
        def single_watch():
            return kvstore.watch('/node/v2', timeout=10)

        def single_write():
            gevent.sleep(1)
            kvstore.write('/node/v2', 'value2\'')

        writer_glet = gevent.spawn(single_write)
        watcher_glet = gevent.spawn(single_watch)
        gevent.joinall([writer_glet, watcher_glet])
        self.assertEqual(watcher_glet.value.value, 'value2\'')
        kvstore.delete('/node/v2')

        # Recursive watch on a directory sees updates to its children.
        def dir_watch():
            return kvstore.watch('/node/subnode', timeout=10, recursive=True)

        def child_write():
            gevent.sleep(1)
            kvstore.write('/node/subnode/v1', 'value1\'')

        writer_glet = gevent.spawn(child_write)
        watcher_glet = gevent.spawn(dir_watch)
        gevent.joinall([writer_glet, watcher_glet])
        self.assertEqual(watcher_glet.value.children[0].value, 'value1\'')

        # Eternal watch yields every successive value written to the key.
        kvstore.write('/node/subnode/v3', '-1')

        def eternal_watch():
            collected = []
            for idx, result in enumerate(kvstore.eternal_watch('/node/subnode/v3')):
                collected.append(int(result.value))
                if idx == 4:
                    break
            return collected

        def serial_write():
            gevent.sleep(0.1)
            for v in range(5):
                kvstore.write('/node/subnode/v3', str(v))
                gevent.sleep(0.1)

        writer_glet = gevent.spawn(serial_write)
        watcher_glet = gevent.spawn(eternal_watch)
        gevent.joinall([writer_glet, watcher_glet])
        self.assertEqual(watcher_glet.value, list(range(5)))
        kvstore.delete('/node', dir=True, recursive=True)
def testKVStoreActor(self):
    """Smoke-test KVStoreActor basic reads and writes against a local etcd."""
    etcd_helper = EtcdProcessHelper(port_range_start=54131)
    with etcd_helper.run(), create_actor_pool(n_process=1, backend='gevent') as pool:
        kv_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())

        kv_ref.write('/node/v1', 'value1')
        kv_ref.write('/node/v2', 'value2')
        kv_ref.write_batch([('/node/v2', 'value2'), ('/node/v3', 'value3')])

        self.assertEqual(kv_ref.read('/node/v1').value, 'value1')
        fetched = [item.value for item in kv_ref.read_batch(['/node/v2', '/node/v3'])]
        self.assertListEqual(fetched, ['value2', 'value3'])
class Test(unittest.TestCase):
    """Integration tests that launch real scheduler/worker subprocesses.

    ``start_processes`` boots a small local cluster (optionally backed by
    etcd), the ``testMain*`` cases submit tensor graphs to it, and
    ``testWorkerFailOver`` verifies a graph still completes after a worker
    process is killed.

    Fixes applied in review:
    * ``check_process_statuses`` referenced the non-existent attribute
      ``self.proc_scheduler`` — it raised AttributeError instead of
      reporting a dead scheduler's exit code; now uses the loop variable.
    * the bare ``except:`` in the ``start_processes`` readiness loop also
      swallowed KeyboardInterrupt/SystemExit; narrowed to ``Exception``.
    * ``kill_process_tree`` now imports ``shutil`` locally — the class
      otherwise imports it only inside ``tearDownClass``, so a module-level
      import is not guaranteed from this view.
    """

    @classmethod
    def setUpClass(cls):
        from mars import kvstore

        # Spill directory is shared by every test in this class.
        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)
        # The delay state file may be left behind by testWorkerFailOver if it
        # aborted early; removal is best-effort.
        try:
            delay_state_file = os.environ.get('DELAY_STATE_FILE')
            if delay_state_file:
                os.unlink(delay_state_file)
        except OSError:
            pass

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = []
        self.etcd_helper = None

    def tearDown(self):
        for fn in self.state_files:
            if os.path.exists(fn):
                os.unlink(fn)

        # Ask every subprocess to stop, wait up to 5 seconds, then kill
        # whatever is still alive.
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                p.kill()

        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def add_state_file(self, environ):
        """Create a unique temp-file path, export it through env var
        *environ*, and register it for removal in tearDown."""
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files.append(fn)
        return fn

    def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, modules=None,
                        log_scheduler=True, log_worker=True):
        """Spawn scheduler and worker subprocesses and block until the
        cluster reports the requested number of members.

        Re-raises the last readiness error if the cluster is not up
        within 20 seconds.
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            # Use etcd as the cluster kv-store; otherwise pass explicit
            # scheduler endpoints on the command line.
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args += ['-Dscheduler.dump_graph_data=true']

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '--level', 'debug' if log_scheduler else 'warning',
                              '-p', p,
                              '--format', '%(asctime)-15s %(message)s',
                              '-Dscheduler.retry_delay=5'] + append_args)
            for p in scheduler_ports]
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--level', 'debug' if log_worker else 'warning',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '-Dworker.prepare_data_timeout=30'] + append_args)
            for _ in range(n_workers)]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError('Schedulers does not met requirement: %d < %d.' % (
                        len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_name(), address=actor_address)
                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError('Workers does not met requirement: %d < %d.' % (
                        resource_ref.get_worker_count(), n_workers))
                break
            except Exception:  # was bare except; keep retry-until-timeout behavior
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise SystemError if any launched subprocess has already exited."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # fixed: was ``self.proc_scheduler.poll()`` (AttributeError)
                raise SystemError('Scheduler not started. exit code %s' % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll the graph until it reaches a terminal state and return it.

        Dumps unfinished terminal operands every 10 seconds for debugging;
        raises SystemError after CHECK_TIMEOUT (default 120) seconds.
        """
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    # graph may not be registered yet
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        """Submit several tensor graphs to a cluster without etcd."""
        self.start_processes()

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        # element-wise arithmetic + reduction
        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        # matrix multiplication
        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        # repeated slicing and summation
        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)

        # shuffle-based reshape
        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.params['_reshape_with_shuffle'] = True
        graph = b.build_graph()
        targets = [b.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, b.key)
        assert_allclose(loads(result), np.ones((27, 31)))

    def testMainWithEtcd(self):
        """Submit a tensor graph to a cluster whose kv-store is etcd."""
        self.start_processes(etcd=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

    def testWorkerFailOver(self):
        """Kill one worker mid-execution and check the graph still succeeds."""
        def kill_process_tree(proc):
            # local imports: psutil only needed here; shutil is not
            # guaranteed to be imported at module level in this class
            import psutil
            import shutil
            proc = psutil.Process(proc.pid)
            plasma_sock_dir = None
            for p in proc.children(recursive=True):
                if 'plasma' in p.name():
                    # remember the plasma socket dir so it can be cleaned up
                    socks = [conn.laddr for conn in p.connections('unix')
                             if 'plasma' in conn.laddr]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            proc.kill()
            if plasma_sock_dir:
                shutil.rmtree(plasma_sock_dir, ignore_errors=True)

        # op_delayer module pauses operands until delay_file is removed,
        # and touches terminate_file once execution has started.
        delay_file = self.add_state_file('DELAY_STATE_FILE')
        open(delay_file, 'w').close()
        terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'], log_worker=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))

        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        # wait until execution has actually started, then kill a worker
        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, modules=None,
                    log_scheduler=True, log_worker=True):
    """Spawn scheduler and worker subprocesses and block until the cluster
    reports the requested number of members.

    Re-raises the last readiness error if the cluster is not up within
    20 seconds.

    Fix applied in review: the readiness retry loop used a bare
    ``except:``, which also swallowed KeyboardInterrupt/SystemExit;
    narrowed to ``except Exception:``.
    """
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception, )

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        # etcd-backed kv-store; otherwise pass scheduler endpoints explicitly
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args += ['-Dscheduler.dump_graph_data=true']

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '--level', 'debug' if log_scheduler else 'warning',
                          '-p', p,
                          '--format', '%(asctime)-15s %(message)s',
                          '-Dscheduler.retry_delay=5'] + append_args)
        for p in scheduler_ports]
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', '1',
                          '--level', 'debug' if log_worker else 'warning',
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '-Dworker.prepare_data_timeout=30'] + append_args)
        for _ in range(n_workers)]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

    check_time = time.time()
    while True:
        try:
            started_schedulers = self.cluster_info.get_schedulers()
            if len(started_schedulers) < n_schedulers:
                raise RuntimeError('Schedulers does not met requirement: %d < %d.' % (
                    len(started_schedulers), n_schedulers))
            actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_name())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_name(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(ResourceActor.default_name())
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_name(), address=actor_address)
            if resource_ref.get_worker_count() < n_workers:
                raise RuntimeError('Workers does not met requirement: %d < %d.' % (
                    resource_ref.get_worker_count(), n_workers))
            break
        except Exception:  # was bare except; retry until the 20s deadline
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)
    gevent.hub.Hub.NOT_ERROR = old_not_errors
class SchedulerIntegratedTest(unittest.TestCase):
    """Base class for integration tests driving real scheduler/worker processes.

    Provides process start/stop management, state-file bookkeeping,
    process-tree killing (including the plasma store), and graph-state
    polling helpers for subclasses.

    Fixes applied in review:
    * ``check_process_statuses`` referenced the non-existent attribute
      ``self.proc_scheduler`` — it raised AttributeError instead of
      reporting a dead scheduler's exit code; now uses the loop variable.
    * the bare ``except:`` in the ``start_processes`` readiness loop also
      swallowed KeyboardInterrupt/SystemExit; narrowed to ``Exception``.
    * ``kill_process_tree`` now imports ``shutil`` locally — the class
      otherwise imports it only inside ``tearDownClass``, so a module-level
      import is not guaranteed from this view.
    """

    @classmethod
    def setUpClass(cls):
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        # pids we killed on purpose; check_process_statuses ignores these
        self.intentional_death_pids = set()

    def tearDown(self):
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)

        # Ask every subprocess to stop, wait up to 5 seconds, then kill
        # the process trees of whatever is still alive.
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)

        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def kill_process_tree(self, proc, intentional=True):
        """Kill *proc* and all of its children; clean up the plasma socket dir.

        When *intentional*, the pid is recorded so that
        check_process_statuses will not treat the death as a failure.
        """
        if intentional:
            self.intentional_death_pids.add(proc.pid)
        # local imports: psutil only needed here; shutil availability at
        # module level is unproven from this view
        import psutil
        import shutil
        proc = psutil.Process(proc.pid)
        plasma_sock_dir = None
        for p in proc.children(recursive=True):
            try:
                if 'plasma' in p.name():
                    socks = [conn.laddr for conn in p.connections('unix')
                             if 'plasma' in conn.laddr]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            except psutil.NoSuchProcess:
                # child already gone; keep killing the rest
                continue
        proc.kill()
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)

    def add_state_file(self, environ):
        """Create a unique temp-file path, export it through env var
        *environ*, and register it for removal in tearDown."""
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files[environ] = fn
        return fn

    def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                        modules=None, log_scheduler=True, log_worker=True, env=None):
        """Spawn scheduler and worker subprocesses and block until the
        cluster reports the requested number of members.

        Re-raises the last readiness error if the cluster is not up
        within 20 seconds.
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = []
        append_args_worker = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
        if not cuda:
            append_args_worker += ['--no-cuda']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '-p', p,
                              '--log-level', 'debug' if log_scheduler else 'warning',
                              '--log-format', 'SCH%d %%(asctime)-15s %%(message)s' % idx,
                              '-Dscheduler.retry_delay=5',
                              '-Dscheduler.default_cpu_usage=0',
                              '-Dscheduler.status_timeout=10']
                             + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)]
        cuda_count = resource.cuda_count()
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--log-level', 'debug' if log_worker else 'warning',
                              '--log-format', 'WOR%d %%(asctime)-15s %%(message)s' % idx,
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '--cuda-device', str(idx % cuda_count) if cuda_count else '0',
                              '-Dworker.prepare_data_timeout=30']
                             + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        'Schedulers does not met requirement: %d < %d.' % (
                            len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address=actor_address)
                if resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        'Workers does not met requirement: %d < %d.' % (
                            resource_ref.get_worker_count(), n_workers))
                break
            except Exception:  # was bare except; retry until the 20s deadline
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise ProcessRequirementUnmetError if any subprocess died
        unexpectedly (intentionally killed workers are ignored)."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # fixed: was ``self.proc_scheduler.poll()`` (AttributeError)
                raise ProcessRequirementUnmetError(
                    'Scheduler not started. exit code %s' % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None \
                    and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    'Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll the graph until it reaches a terminal state and return it.

        Dumps unfinished terminal operands every 10 seconds for debugging;
        raises SystemError after CHECK_TIMEOUT (default 120) seconds.
        """
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    # graph may not be registered yet
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                    modules=None, log_scheduler=True, log_worker=True, env=None):
    """Spawn scheduler and worker subprocesses and block until the cluster
    reports the requested number of members.

    Re-raises the last readiness error if the cluster is not up within
    20 seconds.

    Fix applied in review: the outer readiness retry loop used a bare
    ``except:``, which also swallowed KeyboardInterrupt/SystemExit;
    narrowed to ``except Exception:``.
    """
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception, )

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    append_args_scheduler = []
    append_args_worker = []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
    if not cuda:
        append_args_worker += ['--no-cuda']

    proc_env = os.environ.copy()
    if env:
        proc_env.update(env)

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '-p', p,
                          '--log-level', 'debug' if log_scheduler else 'warning',
                          '--log-format', 'SCH%d %%(asctime)-15s %%(message)s' % idx,
                          '-Dscheduler.retry_delay=5',
                          '-Dscheduler.default_cpu_usage=0',
                          '-Dscheduler.status_timeout=10']
                         + append_args + append_args_scheduler, env=proc_env)
        for idx, p in enumerate(scheduler_ports)]

    cuda_count = resource.cuda_count()
    # Honour CUDA_VISIBLE_DEVICES when set; otherwise enumerate devices.
    # NOTE(review): assumes len(cuda_devices) >= cuda_count when the env
    # var is set — confirm against resource.cuda_count semantics.
    cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
        if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', '1',
                          '--log-level', 'debug' if log_worker else 'warning',
                          '--log-format', 'WOR%d %%(asctime)-15s %%(message)s' % idx,
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '--cuda-device', str(cuda_devices[idx % cuda_count]) if cuda_count else '0',
                          '-Dworker.prepare_data_timeout=30']
                         + append_args + append_args_worker, env=proc_env)
        for idx in range(n_workers)]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

    check_time = time.time()
    while True:
        try:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
            except Exception as e:
                # normalize transport errors into the retryable error type
                raise ProcessRequirementUnmetError(
                    'Failed to get scheduler numbers, %s' % e)
            if len(started_schedulers) < n_schedulers:
                raise ProcessRequirementUnmetError(
                    'Schedulers does not met requirement: %d < %d.' % (
                        len(started_schedulers), n_schedulers))
            actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_uid(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_uid(), address=actor_address)
            if resource_ref.get_worker_count() < n_workers:
                raise ProcessRequirementUnmetError(
                    'Workers does not met requirement: %d < %d.' % (
                        resource_ref.get_worker_count(), n_workers))
            break
        except Exception:  # was bare except; retry until the 20s deadline
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)
    gevent.hub.Hub.NOT_ERROR = old_not_errors
class Test(SchedulerIntegratedTest): def testMainTensorWithoutEtcd(self): self.start_processes() session_id = uuid.uuid1() actor_client = new_client() session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id)) a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 c = (a * b * 2 + 1).sum() graph = c.build_graph() targets = [c.key] graph_key = uuid.uuid1() session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key, target_tileables=targets) state = self.wait_for_termination(actor_client, session_ref, graph_key) self.assertEqual(state, GraphState.SUCCEEDED) result = session_ref.fetch_result(graph_key, c.key) expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1 assert_allclose(loads(result), expected.sum()) a = mt.ones((100, 50), chunk_size=35) * 2 + 1 b = mt.ones((50, 200), chunk_size=35) * 2 + 1 c = a.dot(b) graph = c.build_graph() targets = [c.key] graph_key = uuid.uuid1() session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key, target_tileables=targets) state = self.wait_for_termination(actor_client, session_ref, graph_key) self.assertEqual(state, GraphState.SUCCEEDED) result = session_ref.fetch_result(graph_key, c.key) assert_allclose(loads(result), np.ones((100, 200)) * 450) base_arr = np.random.random((100, 100)) a = mt.array(base_arr) sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)]) graph = sumv.build_graph() targets = [sumv.key] graph_key = uuid.uuid1() session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key, target_tileables=targets) state = self.wait_for_termination(actor_client, session_ref, graph_key) self.assertEqual(state, GraphState.SUCCEEDED) expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)]) result = session_ref.fetch_result(graph_key, sumv.key) assert_allclose(loads(result), expected) a = mt.ones((31, 27), chunk_size=10) b = a.reshape(27, 31) b.op.extra_params['_reshape_with_shuffle'] = 
True r = b.sum(axis=1) graph = r.build_graph() targets = [r.key] graph_key = uuid.uuid1() session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key, target_tileables=targets) state = self.wait_for_termination(actor_client, session_ref, graph_key) self.assertEqual(state, GraphState.SUCCEEDED) result = session_ref.fetch_result(graph_key, r.key) assert_allclose(loads(result), np.ones((27, 31)).sum(axis=1)) raw = np.random.RandomState(0).rand(10, 10) a = mt.tensor(raw, chunk_size=(5, 4)) b = a[a.argmin(axis=1), mt.tensor(np.arange(10))] graph = b.build_graph() targets = [b.key] graph_key = uuid.uuid1() session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key, target_tileables=targets) state = self.wait_for_termination(actor_client, session_ref, graph_key) self.assertEqual(state, GraphState.SUCCEEDED) result = session_ref.fetch_result(graph_key, b.key) np.testing.assert_array_equal(loads(result), raw[raw.argmin(axis=1), np.arange(10)]) @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(), 'does not run without etcd') def testMainTensorWithEtcd(self): self.start_processes(etcd=True) session_id = uuid.uuid1() actor_client = new_client() session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id)) a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1 c = (a * b * 2 + 1).sum() graph = c.build_graph() targets = [c.key] graph_key = uuid.uuid1() session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key, target_tileables=targets) state = self.wait_for_termination(actor_client, session_ref, graph_key) self.assertEqual(state, GraphState.SUCCEEDED) result = session_ref.fetch_result(graph_key, c.key) expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1 assert_allclose(loads(result), expected.sum()) @require_cupy @require_cudf def testMainTensorWithCuda(self): self.start_processes(cuda=True) session_id = uuid.uuid1() actor_client 
        # --- NOTE(review): tail of a method truncated before this chunk; the
        # assignment target of `= new_client()` is outside the visible source.
        # Do not review this fragment in isolation. ---
        = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        # GPU variant of the elementwise pipeline: build the tileable graph by
        # hand, submit through the raw session actor and poll for termination.
        a = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

    def testMainDataFrameWithoutEtcd(self):
        """Run DataFrame binary arithmetic through the raw graph-submission API.

        Exercises elementwise addition with aligned, shifted and shuffled
        indexes/columns (forcing index alignment across chunks) and a plain
        Series round-trip, verifying fetched results against pandas.
        """
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        # Case 1: identical default indexes, differing chunk sizes (5 vs 6)
        # so chunk boundaries do not line up between the two operands.
        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas_df(data2, chunk_size=6)

        df3 = add(df1, df2)
        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        # Case 2: monotonic but reversed/offset index plus permuted columns,
        # chunked only along columns — exercises column-wise alignment.
        data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas_df(data1, chunk_size=(10, 5))
        data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas_df(data2, chunk_size=(10, 6))

        df3 = add(df1, df2)
        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        # Case 3: non-monotonic, partially overlapping indexes — full shuffle
        # alignment on both axes.
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas_df(data2, chunk_size=6)
        df3 = add(df1, df2)
        graph = df3.build_graph()
        targets = [df3.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        expected = data1 + data2
        result = session_ref.fetch_result(graph_key, df3.key)
        pd.testing.assert_frame_equal(expected, loads(result))

        # Series round-trip: no computation, just datasource + fetch.
        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = from_pandas_series(s1)
        graph = series1.build_graph()
        targets = [series1.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, series1.key)
        pd.testing.assert_series_equal(s1, loads(result))

    def testIterativeTilingWithoutEtcd(self):
        """Exercise operations whose tiling needs intermediate execution.

        ``sort`` / ``concatenate`` / ``histogram`` with data-dependent chunking
        cannot be tiled in one pass; this verifies results and that
        intermediate tileables are not retained as fetchable results.
        """
        self.start_processes(etcd=False)
        session_id = uuid.uuid1()
        actor_client = new_client()
        rs = np.random.RandomState(0)
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        raw = rs.rand(100)
        a = mt.tensor(raw, chunk_size=10)
        a.sort()
        c = a[:5]
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = np.sort(raw)[:5]
        assert_allclose(loads(result), expected)

        # `a` is an intermediate, not a submitted target — fetching it must
        # fail with KeyError even with check disabled.
        with self.assertRaises(KeyError):
            session_ref.fetch_result(graph_key, a.key, check=False)

        raw1 = rs.rand(20)
        raw2 = rs.rand(20)
        a = mt.tensor(raw1, chunk_size=10)
        a.sort()
        b = mt.tensor(raw2, chunk_size=15) + 1
        c = mt.concatenate([a[:10], b])
        c.sort()
        d = c[:5]
        graph = d.build_graph()
        targets = [d.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, d.key)
        expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        assert_allclose(loads(result), expected)

        # histogram with bins='scott' yields two outputs (counts, bin edges);
        # both keys are submitted and fetched.
        raw = rs.randint(100, size=(100,))
        a = mt.tensor(raw, chunk_size=53)
        a.sort()
        b = mt.histogram(a, bins='scott')
        graph = build_tileable_graph(b, set())
        targets = [b[0].key, b[1].key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        res = session_ref.fetch_result(graph_key, b[0].key), \
            session_ref.fetch_result(graph_key, b[1].key)
        expected = np.histogram(np.sort(raw), bins='scott')
        assert_allclose(loads(res[0]), expected[0])
        assert_allclose(loads(res[1]), expected[1])

    def testDistributedContext(self):
        """Verify DistributedContext metadata and data-fetch APIs.

        Submits a named tensor graph, then resolves the tileable by name and
        fetches metadata (nsplits) and data slices/fancy indexes through the
        context, comparing against the local numpy source.
        """
        self.start_processes(etcd=False)
        session_id = uuid.uuid1()
        actor_client = new_client()
        rs = np.random.RandomState(0)
        context = DistributedContext(scheduler_address=self.scheduler_endpoints[0],
                                     session_id=session_id)
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        raw1 = rs.rand(10, 10)
        a = mt.tensor(raw1, chunk_size=4)

        graph = a.build_graph()
        targets = [a.key]
        graph_key = uuid.uuid1()
        # `names` registers the tileable so it is resolvable by name later.
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets,
                                          names=['test'])

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        tileable_key = context.get_tileable_key_by_name('test')
        self.assertEqual(a.key, tileable_key)

        # chunk_size=4 over a 10x10 tensor -> splits of (4, 4, 2) on each axis
        nsplits = context.get_tileable_metas([a.key], filter_fields=['nsplits'])[0][0]
        self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

        r = context.get_tileable_data(a.key)
        np.testing.assert_array_equal(raw1, r)

        indexes = [slice(3, 9), slice(0, 7)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)

    def testOperandsWithoutPrepareInputs(self):
        """Run an operand flagged to skip input preparation across two workers.

        Pins each input tensor to a distinct worker via ``_expect_worker`` and
        marks both inputs as not-to-be-prepared, then checks the graph still
        reaches SUCCEEDED.
        """
        self.start_processes(etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
        resource_ref = actor_client.actor_ref(ResourceActor.default_uid(), address=actor_address)
        worker_endpoints = resource_ref.get_worker_endpoints()

        t1 = mt.random.rand(10)
        t1.op._expect_worker = worker_endpoints[0]
        t2 = mt.random.rand(10)
        t2.op._expect_worker = worker_endpoints[1]

        t = NoPrepareOperand().new_tileable([t1, t2])
        t.op._prepare_inputs = [False, False]

        graph = t.build_graph()
        targets = [t.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=targets)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
    def testEtcdPathStore(self):
        """Verify hierarchical read/write/delete semantics of the etcd kvstore.

        Spawns a throw-away etcd server and checks: shallow vs recursive
        directory listing, rejection of writes under an existing value key,
        rejection of non-recursive deletion of a non-empty directory, and
        that deleting a child leaves the (now empty) directory node visible.
        All comparisons go through ``repr`` so child ordering (``sort=True``)
        is part of the assertion.
        """
        with EtcdProcessHelper(port_range_start=51342).run():
            kvstore = get(u'etcd://localhost:51342')
            kvstore.write(u'/node/subnode/v1', u'value1')
            kvstore.write(u'/node/v2', u'value2')

            # Shallow read: subnode appears as a directory, not its contents.
            res = kvstore.read(u'/node', sort=True)
            expected = PathResult(key=u'/node', dir=True, children=[
                PathResult(key=u'/node/subnode', dir=True),
                PathResult(key=u'/node/v2', value=u'value2'),
            ])
            self.assertEqual(repr(res), repr(expected))

            # Recursive read flattens to leaf values only.
            res = kvstore.read(u'/node', recursive=True, sort=True)
            expected = PathResult(key=u'/node', dir=True, children=[
                PathResult(key=u'/node/subnode/v1', value=u'value1'),
                PathResult(key=u'/node/v2', value=u'value2'),
            ])
            self.assertEqual(repr(res), repr(expected))

            kvstore.write(u'/node/v3', u'value3')
            # /node/v2 holds a value, so it cannot act as a directory.
            with self.assertRaises(KeyError):
                kvstore.write(u'/node/v2/invalid_value', value=u'invalid')

            res = kvstore.read('/', recursive=False, sort=True)
            expected = PathResult(key='/', dir=True, children=[
                PathResult(key=u'/node', dir=True),
            ])
            self.assertEqual(repr(res), repr(expected))

            res = kvstore.read('/', recursive=True, sort=True)
            expected = PathResult(key='/', dir=True, children=[
                PathResult(key=u'/node/subnode/v1', value=u'value1'),
                PathResult(key=u'/node/v2', value=u'value2'),
                PathResult(key=u'/node/v3', value=u'value3'),
            ])
            self.assertEqual(repr(res), repr(expected))

            kvstore.write(u'/node/subnode2/v4', u'value4')
            # Non-recursive delete of a non-empty directory must fail.
            with self.assertRaises(KeyError):
                kvstore.delete(u'/node/subnode', dir=True)
            kvstore.delete(u'/node/subnode/v1')

            # After removing its only child, /node/subnode remains as an
            # empty directory entry in the recursive listing.
            res = kvstore.read('/', recursive=True, sort=True)
            expected = PathResult(key='/', dir=True, children=[
                PathResult(key=u'/node/subnode', dir=True),
                PathResult(key=u'/node/subnode2/v4', value=u'value4'),
                PathResult(key=u'/node/v2', value=u'value2'),
                PathResult(key=u'/node/v3', value=u'value3'),
            ])
            self.assertEqual(repr(res), repr(expected))

            kvstore.delete(u'/node', recursive=True, dir=True)
class Test(SchedulerIntegratedTest):
    """Integration tests driving a real scheduler/worker cluster via sessions.

    Each test boots the cluster with ``start_processes`` and executes
    tileables through ``new_session``, comparing fetched results against
    numpy/pandas references.
    """

    def testMainTensorWithoutEtcd(self):
        """End-to-end tensor pipelines without etcd: elementwise reduce,
        matmul, repeated-slice accumulation, shuffle-based reshape and
        fancy indexing."""
        self.start_processes()
        sess = new_session(self.session_manager_ref.address)

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        result = c.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(result, expected.sum())

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        result = c.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_allclose(result, np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        r = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        np.testing.assert_allclose(result, expected)

        # Force the shuffle-based reshape implementation.
        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.extra_params['_reshape_with_shuffle'] = True
        r = b.sum(axis=1)
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_allclose(result, np.ones((27, 31)).sum(axis=1))

        raw = np.random.RandomState(0).rand(10, 10)
        a = mt.tensor(raw, chunk_size=(5, 4))
        r = a[a.argmin(axis=1), mt.tensor(np.arange(10))]
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_array_equal(result, raw[raw.argmin(axis=1), np.arange(10)])

    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Same elementwise-reduce pipeline, cluster coordinated via etcd."""
        self.start_processes(etcd=True)
        sess = new_session(self.session_manager_ref.address)

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        r = (a * b * 2 + 1).sum()
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(result, expected.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """GPU variant of the elementwise-reduce pipeline."""
        self.start_processes(cuda=True)
        sess = new_session(self.session_manager_ref.address)

        a = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        r = (a * b * 2 + 1).sum()
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = ((np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1).sum()
        np.testing.assert_allclose(result, expected)

    def testMainDataFrameWithoutEtcd(self):
        """DataFrame addition with misaligned indexes/columns, sort_values
        with string keys, and a Series round-trip."""
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        # Aligned indexes, mismatched chunk sizes.
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # Monotonic/offset index plus permuted columns, column-only chunks.
        raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # Non-monotonic, partially overlapping indexes on both axes.
        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # sort_values on a string column (psrs on non-numeric data).
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        raw1[0] = raw1[0].apply(str)
        df1 = md.DataFrame(raw1, chunk_size=5)
        r = df1.sort_values(0)
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1.sort_values(0))

        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_series_equal(result, s1)

    def testIterativeTilingWithoutEtcd(self):
        """Operations requiring iterative tiling (sort/concat/histogram);
        also checks intermediate chunk data has been released."""
        self.start_processes(etcd=False)
        sess = new_session(self.session_manager_ref.address)
        actor_client = sess._api.actor_client
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(sess.session_id))
        rs = np.random.RandomState(0)

        raw = rs.rand(100)
        a = mt.tensor(raw, chunk_size=10)
        a.sort()
        r = a[:5]
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = np.sort(raw)[:5]
        np.testing.assert_allclose(result, expected)

        # Intermediate chunk data of `a` must no longer be fetchable.
        graph_key = sess._get_tileable_graph_key(r.key)
        graph_ref = actor_client.actor_ref(
            session_ref.get_graph_refs()[graph_key])
        with self.assertRaises(KeyError):
            _, keys, _ = graph_ref.get_tileable_metas([a.key])[0]
            sess._api.fetch_chunk_data(sess.session_id, keys[0])

        raw1 = rs.rand(20)
        raw2 = rs.rand(20)
        a = mt.tensor(raw1, chunk_size=10)
        a.sort()
        b = mt.tensor(raw2, chunk_size=15) + 1
        c = mt.concatenate([a[:10], b])
        c.sort()
        r = c[:5]
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        np.testing.assert_allclose(result, expected)

        raw = rs.randint(100, size=(100, ))
        a = mt.tensor(raw, chunk_size=53)
        a.sort()
        r = mt.histogram(a, bins='scott')
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = np.histogram(np.sort(raw), bins='scott')
        np.testing.assert_allclose(result[0], expected[0])
        np.testing.assert_allclose(result[1], expected[1])

    def testDistributedContext(self):
        """DistributedContext name resolution, metadata and data fetching."""
        self.start_processes(etcd=False)
        sess = new_session(self.session_manager_ref.address)
        rs = np.random.RandomState(0)
        context = DistributedContext(
            scheduler_address=self.session_manager_ref.address,
            session_id=sess.session_id)

        raw1 = rs.rand(10, 10)
        a = mt.tensor(raw1, chunk_size=4)
        a.execute(session=sess, timeout=self.timeout, name='test')

        tileable_infos = context.get_named_tileable_infos('test')
        self.assertEqual(a.key, tileable_infos.tileable_key)
        self.assertEqual(a.shape, tileable_infos.tileable_shape)

        # chunk_size=4 over a 10x10 tensor -> (4, 4, 2) splits per axis
        nsplits = context.get_tileable_metas([a.key], filter_fields=['nsplits'])[0][0]
        self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

        r = context.get_tileable_data(a.key)
        np.testing.assert_array_equal(raw1, r)

        indexes = [slice(3, 9), slice(0, 7)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)

    def testOperandsWithoutPrepareInputs(self):
        """Execute an operand that skips input preparation, with each input
        pinned to a different worker."""
        self.start_processes(
            etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)

        actor_address = self.cluster_info.get_scheduler(
            ResourceActor.default_uid())
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(), address=actor_address)
        worker_endpoints = resource_ref.get_worker_endpoints()

        t1 = mt.random.rand(10)
        t1.op._expect_worker = worker_endpoints[0]
        t2 = mt.random.rand(10)
        t2.op._expect_worker = worker_endpoints[1]

        t = NoPrepareOperand().new_tileable([t1, t2])
        t.op._prepare_inputs = [False, False]

        t.execute(session=sess, timeout=self.timeout)

    def testRemoteWithoutEtcd(self):
        """Remote function execution via ``spawn``: None results, nested
        mars-object arguments, tileables evaluated inside the function, and
        CPU slot release afterwards."""
        from mars.scheduler.resource import ResourceActor
        from mars.worker.dispatcher import DispatchActor

        self.start_processes(
            etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(),
            address=self.cluster_info.get_scheduler(
                ResourceActor.default_uid()))
        worker_ips = resource_ref.get_worker_endpoints()

        rs = np.random.RandomState(0)
        raw1 = rs.rand(10, 10)
        raw2 = rs.rand(10, 10)

        def f_none(_x):
            return None

        r_none = spawn(f_none, raw1)
        result = r_none.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        self.assertIsNone(result)

        def f1(x):
            return x + 1

        def f2(x, y, z=None):
            return x * y * (z[0] + z[1])

        # Spawned results may be passed to other spawned calls, positionally
        # and via keyword containers alike.
        r1 = spawn(f1, raw1)
        r2 = spawn(f1, raw2)
        r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

        result = r3.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
        np.testing.assert_allclose(result, expected)

        def f(t, x):
            mul = (t * x).execute()
            return mul.sum().to_numpy()

        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)
        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1.sum(axis=0)
        s = spawn(f, args=(t2, 3))

        result = s.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(result, expected)

        # Every worker must have returned its single CPU slot to the pool.
        time.sleep(1)
        for worker_ip in worker_ips:
            ref = sess._api.actor_client.actor_ref(DispatchActor.default_uid(),
                                                   address=worker_ip)
            self.assertEqual(len(ref.get_slots('cpu')), 1)

    def testNoWorkerException(self):
        """Execution on a cluster with zero workers must fail with
        ExecutionFailed caused by a RuntimeError."""
        self.start_processes(etcd=False, n_workers=0)

        a = mt.ones((10, 10))
        b = mt.ones((10, 10))
        c = (a + b)

        endpoint = self.scheduler_endpoints[0]
        sess = new_session(endpoint)

        # assertRaises (rather than a bare try/except) guarantees the test
        # fails if no exception is raised at all.
        with self.assertRaises(ExecutionFailed) as assert_ctx:
            c.execute(session=sess, timeout=self.timeout)
        self.assertIsInstance(assert_ctx.exception.__cause__, RuntimeError)
class Test(SchedulerIntegratedTest):
    """Extended scheduler integration tests (tensor, DataFrame, remote
    functions, custom-log fetching) driven through ``new_session``."""

    def testMainTensorWithoutEtcd(self):
        """Tensor pipelines without etcd, including a shuffle reshape,
        fancy indexing and distributed median."""
        self.start_processes()
        sess = new_session(self.session_manager_ref.address)

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        result = c.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(result, expected.sum())

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        result = c.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_allclose(result, np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        r = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        np.testing.assert_allclose(result, expected)

        # Force the shuffle-based reshape implementation.
        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.extra_params['_reshape_with_shuffle'] = True
        r = b.sum(axis=1)
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_allclose(result, np.ones((27, 31)).sum(axis=1))

        raw = np.random.RandomState(0).rand(10, 10)
        a = mt.tensor(raw, chunk_size=(5, 4))
        r = a[a.argmin(axis=1), mt.tensor(np.arange(10))]
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_array_equal(result, raw[raw.argmin(axis=1), np.arange(10)])

        raw = np.random.RandomState(0).rand(1000)
        a = mt.tensor(raw, chunk_size=100)
        r = mt.median(a)
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        np.testing.assert_array_equal(result, np.median(raw))

    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Elementwise-reduce pipeline with etcd-based coordination."""
        self.start_processes(etcd=True)
        sess = new_session(self.session_manager_ref.address)

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        r = (a * b * 2 + 1).sum()
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        np.testing.assert_allclose(result, expected.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """GPU variant of the elementwise-reduce pipeline."""
        self.start_processes(cuda=True)
        sess = new_session(self.session_manager_ref.address)

        a = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        r = (a * b * 2 + 1).sum()
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = ((np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1).sum()
        np.testing.assert_allclose(result, expected)

    def testMainDataFrameWithoutEtcd(self):
        """DataFrame feature sweep: alignment, sort_values (incl. MultiIndex
        and Arrow string dtype), reindex, rebalance, nunique, re-executed
        groupby aggregation and groupby sampling."""
        self.start_processes(
            etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        # test binary arithmetics with different indices
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10),
                            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # test sort_values
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        raw1[0] = raw1[0].apply(str)
        raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        df1 = md.DataFrame(raw1, chunk_size=5)
        r = df1.sort_values([('A', 'C')])
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1.sort_values([('A', 'C')]))

        # sort_values on an Arrow-backed string column after filtering
        rs = np.random.RandomState(0)
        raw2 = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)]
        })
        raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
        mdf = md.DataFrame(raw2, chunk_size=4)
        filtered = mdf[mdf['a'] > 0.5]
        df2 = filtered.sort_values(by='b')
        result = df2.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
        pd.testing.assert_frame_equal(result, expected)

        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_series_equal(result, s1)

        # test reindex
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df3 = md.DataFrame(data, chunk_size=4)
        r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = data.reindex(index=np.arange(10, 1, -1))
        pd.testing.assert_frame_equal(result, expected)

        # test rebalance
        df4 = md.DataFrame(data)
        r = df4.rebalance()
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, data)
        # after rebalance the chunks must be spread over both workers
        chunk_metas = sess.get_tileable_chunk_metas(r.key)
        workers = list(
            set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
        self.assertEqual(len(workers), 2)

        # test nunique
        data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df5 = md.DataFrame(data, chunk_size=4)
        r = df5.nunique()
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = data.nunique()
        pd.testing.assert_series_equal(result, expected)

        # test re-execute df.groupby().agg().sort_values()
        rs = np.random.RandomState(0)
        data = pd.DataFrame({
            'col1': rs.rand(100),
            'col2': rs.randint(10, size=100)
        })
        df6 = md.DataFrame(data, chunk_size=40)
        grouped = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .execute(session=sess, timeout=self.timeout)
        r = grouped.sort_values(by='cnt').head().execute(session=sess,
                                                         timeout=self.timeout)
        result = r.fetch(session=sess)
        expected = data.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .sort_values(by='cnt').head()
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))
        # same pipeline executed in one go must agree with the staged run
        r2 = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}).sort_values(by='cnt').head() \
            .execute(session=sess, timeout=self.timeout)
        result = r2.fetch(session=sess)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # test groupby with sample
        src_data_list = []
        sample_count = 10
        for b in range(5):
            data_count = int(np.random.randint(40, 100))
            src_data_list.append(
                pd.DataFrame({
                    'a': np.random.randint(0, 100, size=data_count),
                    'b': np.array([b] * data_count),
                    'c': np.random.randint(0, 100, size=data_count),
                    'd': np.random.randint(0, 100, size=data_count),
                }))
        data = pd.concat(src_data_list)
        shuffle_idx = np.arange(len(data))
        np.random.shuffle(shuffle_idx)
        data = data.iloc[shuffle_idx].reset_index(drop=True)

        df7 = md.DataFrame(data, chunk_size=40)
        sampled = df7.groupby('b').sample(10)
        r = sampled.execute(session=sess, timeout=self.timeout)
        result = r.fetch(session=sess)
        # every group must contain exactly `sample_count` rows
        self.assertFalse((result.groupby('b').count() - sample_count).any()[0])

    def testIterativeTilingWithoutEtcd(self):
        """Iterative-tiling operations; verifies intermediate chunk data
        is released after execution."""
        self.start_processes(etcd=False)
        sess = new_session(self.session_manager_ref.address)
        actor_client = sess._api.actor_client
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(sess.session_id))
        rs = np.random.RandomState(0)

        raw = rs.rand(100)
        a = mt.tensor(raw, chunk_size=10)
        a.sort()
        r = a[:5]
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = np.sort(raw)[:5]
        np.testing.assert_allclose(result, expected)

        # Intermediate chunk data of `a` must no longer be fetchable.
        graph_key = sess._get_tileable_graph_key(r.key)
        graph_ref = actor_client.actor_ref(
            session_ref.get_graph_refs()[graph_key])
        with self.assertRaises(KeyError):
            _, keys, _ = graph_ref.get_tileable_metas(
                [a.key], filter_fields=['nsplits', 'chunk_keys', 'chunk_indexes'])[0]
            sess._api.fetch_chunk_data(sess.session_id, keys[0])

        raw1 = rs.rand(20)
        raw2 = rs.rand(20)
        a = mt.tensor(raw1, chunk_size=10)
        a.sort()
        b = mt.tensor(raw2, chunk_size=15) + 1
        c = mt.concatenate([a[:10], b])
        c.sort()
        r = c[:5]
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
        np.testing.assert_allclose(result, expected)

        raw = rs.randint(100, size=(100, ))
        a = mt.tensor(raw, chunk_size=53)
        a.sort()
        r = mt.histogram(a, bins='scott')
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = np.histogram(np.sort(raw), bins='scott')
        np.testing.assert_allclose(result[0], expected[0])
        np.testing.assert_allclose(result[1], expected[1])

    def testDistributedContext(self):
        """DistributedContext named-tileable lookup, metadata and data fetch."""
        self.start_processes(etcd=False)
        sess = new_session(self.session_manager_ref.address)
        rs = np.random.RandomState(0)
        context = DistributedContext(
            scheduler_address=self.session_manager_ref.address,
            session_id=sess.session_id)

        raw1 = rs.rand(10, 10)
        a = mt.tensor(raw1, chunk_size=4)
        a.execute(session=sess, timeout=self.timeout, name='test')

        tileable_infos = context.get_named_tileable_infos('test')
        self.assertEqual(a.key, tileable_infos.tileable_key)
        self.assertEqual(a.shape, tileable_infos.tileable_shape)

        # chunk_size=4 over a 10x10 tensor -> (4, 4, 2) splits per axis
        nsplits = context.get_tileable_metas([a.key], filter_fields=['nsplits'])[0][0]
        self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

        r = context.get_tileable_data(a.key)
        np.testing.assert_array_equal(raw1, r)

        indexes = [slice(3, 9), slice(0, 7)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[tuple(indexes)], r)

        indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
        r = context.get_tileable_data(a.key, indexes)
        np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)

    def testOperandsWithPureDepends(self):
        """Execute an operand whose inputs are pure dependencies (ordering
        only, no data transfer), each pinned to a different worker."""
        self.start_processes(
            etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)

        actor_address = self.cluster_info.get_scheduler(
            ResourceActor.default_uid())
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(), address=actor_address)
        worker_endpoints = resource_ref.get_worker_endpoints()

        t1 = mt.random.rand(10)
        t1.op._expect_worker = worker_endpoints[0]
        t2 = mt.random.rand(10)
        t2.op._expect_worker = worker_endpoints[1]

        t = PureDependsOperand().new_tileable([t1, t2])
        t.op._pure_depends = [True, True]
        t.execute(session=sess, timeout=self.timeout)

    def testRemoteWithoutEtcd(self):
        """Remote function execution via ``spawn`` and CPU-slot release."""
        from mars.scheduler.resource import ResourceActor
        from mars.worker.dispatcher import DispatchActor

        self.start_processes(
            etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(),
            address=self.cluster_info.get_scheduler(
                ResourceActor.default_uid()))
        worker_ips = resource_ref.get_worker_endpoints()

        rs = np.random.RandomState(0)
        raw1 = rs.rand(10, 10)
        raw2 = rs.rand(10, 10)

        def f_none(_x):
            return None

        r_none = spawn(f_none, raw1)
        result = r_none.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        self.assertIsNone(result)

        def f1(x):
            return x + 1

        def f2(x, y, z=None):
            return x * y * (z[0] + z[1])

        # Spawned results may feed other spawned calls, both positionally
        # and inside keyword containers.
        r1 = spawn(f1, raw1)
        r2 = spawn(f1, raw2)
        r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

        result = r3.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
        np.testing.assert_allclose(result, expected)

        def f(t, x):
            mul = (t * x).execute()
            return mul.sum().to_numpy()

        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)
        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1.sum(axis=0)
        s = spawn(f, args=(t2, 3))

        result = s.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(result, expected)

        # Every worker must have returned its single CPU slot to the pool.
        time.sleep(1)
        for worker_ip in worker_ips:
            ref = sess._api.actor_client.actor_ref(DispatchActor.default_uid(),
                                                   address=worker_ip)
            self.assertEqual(len(ref.get_slots('cpu')), 1)

    def testFetchLogWithoutEtcd(self):
        """Fetch stdout logs of remotely spawned functions via the custom-log
        actor, the DistributedContext and the tileable ``fetch_log`` API,
        including offset/size windows and nested spawns."""
        # test fetch log
        with tempfile.TemporaryDirectory() as temp_dir:
            self.start_processes(
                etcd=False,
                modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
            sess = new_session(self.session_manager_ref.address)

            def f():
                print('test')

            r = spawn(f)
            r.execute(session=sess)

            # the log file on disk contains exactly what was printed
            custom_log_actor = sess._api.actor_client.actor_ref(
                CustomLogMetaActor.default_uid(),
                address=self.cluster_info.get_scheduler(
                    CustomLogMetaActor.default_uid()))
            chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
                sess.session_id, r.op.key)
            paths = list(chunk_key_to_log_path.values())
            self.assertEqual(len(paths), 1)
            log_path = paths[0][1]
            with open(log_path) as f:
                self.assertEqual(f.read().strip(), 'test')

            context = DistributedContext(
                scheduler_address=self.session_manager_ref.address,
                session_id=sess.session_id)
            log_result = context.fetch_tileable_op_logs(r.op.key)
            log = next(iter(log_result.values()))['log']
            self.assertEqual(log.strip(), 'test')

            log = r.fetch_log()
            self.assertEqual(str(log).strip(), 'test')

            # test multiple functions
            def f1(size):
                print('f1' * size)
                sys.stdout.flush()

            fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
            fs.execute(session=sess)
            log = fs.fetch_log(offsets=20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets
            log = fs.fetch_log(offsets=-20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets which represented in string
            log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            def test_nested():
                print('level0')
                fr = spawn(f1, 1)
                fr.execute()
                print(fr.fetch_log())

            r = spawn(test_nested)
            # fetch_log before execution must be rejected
            with self.assertRaises(ValueError):
                r.fetch_log()
            r.execute(session=sess)
            log = str(r.fetch_log())
            self.assertIn('level0', log)
            self.assertIn('f1', log)

            df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

            def df_func(c):
                print('df func')
                return c

            df2 = df.map_chunk(df_func)
            df2.execute(session=sess)
            log = df2.fetch_log()
            self.assertIn('Chunk op key:', str(log))
            self.assertIn('df func', repr(log))
            # the un-mapped source frame printed nothing
            self.assertEqual(len(str(df.fetch_log(session=sess))), 0)

    def testNoWorkerException(self):
        """Execution on a cluster with zero workers must fail with
        ExecutionFailed caused by a RuntimeError."""
        self.start_processes(etcd=False, n_workers=0)

        a = mt.ones((10, 10))
        b = mt.ones((10, 10))
        c = (a + b)

        sess = new_session(self.session_manager_ref.address)

        # assertRaises (rather than a bare try/except) guarantees the test
        # fails if no exception is raised at all.
        with self.assertRaises(ExecutionFailed) as assert_ctx:
            c.execute(session=sess, timeout=self.timeout)
        self.assertIsInstance(assert_ctx.exception.__cause__, RuntimeError)
class Test(SchedulerIntegratedTest):
    """End-to-end tests that submit tileable graphs to a running scheduler
    cluster and compare fetched results against NumPy/pandas references."""

    def _submit_and_fetch(self, actor_client, session_ref, tileable):
        """Submit *tileable*'s graph, wait for successful termination and
        return the deserialized result.

        Extracted because the original repeated this submit/wait/assert/fetch
        boilerplate verbatim for every tested expression.
        """
        graph = tileable.build_graph()
        graph_key = uuid.uuid1()
        session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                          graph_key, target_tileables=[tileable.key])

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        return loads(session_ref.fetch_result(graph_key, tileable.key))

    def testMainTensorWithoutEtcd(self):
        """Tensor pipelines (elementwise, dot, slicing-sum, shuffle-reshape,
        fancy indexing) against a cluster using the in-process kv-store."""
        self.start_processes()
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        # elementwise arithmetic followed by a full reduction
        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        result = self._submit_and_fetch(actor_client, session_ref, c)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(result, expected.sum())

        # matrix multiplication with non-divisible chunk sizes
        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        result = self._submit_and_fetch(actor_client, session_ref, c)
        assert_allclose(result, np.ones((100, 200)) * 450)

        # repeated addition of identical slices of the same source tensor
        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        result = self._submit_and_fetch(actor_client, session_ref, sumv)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        assert_allclose(result, expected)

        # reshape forced through the shuffle-based implementation
        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.extra_params['_reshape_with_shuffle'] = True
        r = b.sum(axis=1)
        result = self._submit_and_fetch(actor_client, session_ref, r)
        assert_allclose(result, np.ones((27, 31)).sum(axis=1))

        # fancy indexing with a computed index tensor
        raw = np.random.RandomState(0).rand(10, 10)
        a = mt.tensor(raw, chunk_size=(5, 4))
        b = a[a.argmin(axis=1), mt.tensor(np.arange(10))]
        result = self._submit_and_fetch(actor_client, session_ref, b)
        np.testing.assert_array_equal(result, raw[raw.argmin(axis=1), np.arange(10)])

    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testMainTensorWithEtcd(self):
        """Same elementwise/reduction pipeline, cluster backed by etcd."""
        self.start_processes(etcd=True)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        result = self._submit_and_fetch(actor_client, session_ref, c)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(result, expected.sum())

    @require_cupy
    @require_cudf
    def testMainTensorWithCuda(self):
        """Same pipeline with gpu=True tensors on a CUDA-enabled cluster."""
        self.start_processes(cuda=True)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30, gpu=True) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        result = self._submit_and_fetch(actor_client, session_ref, c)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_allclose(result, expected.sum())

    def testMainDataFrameWithoutEtcd(self):
        """DataFrame/Series addition with aligned, shifted and shuffled
        indexes, plus a plain Series round-trip."""
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        # identical default indexes, different chunk sizes
        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas_df(data2, chunk_size=6)
        df3 = add(df1, df2)
        result = self._submit_and_fetch(actor_client, session_ref, df3)
        pd.testing.assert_frame_equal(data1 + data2, result)

        # partially overlapping monotonic indexes, permuted columns
        data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas_df(data1, chunk_size=(10, 5))
        data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas_df(data2, chunk_size=(10, 6))
        df3 = add(df1, df2)
        result = self._submit_and_fetch(actor_client, session_ref, df3)
        pd.testing.assert_frame_equal(data1 + data2, result)

        # non-monotonic, non-identical indexes on both sides
        data1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas_df(data2, chunk_size=6)
        df3 = add(df1, df2)
        result = self._submit_and_fetch(actor_client, session_ref, df3)
        pd.testing.assert_frame_equal(data1 + data2, result)

        # Series round-trip with a non-monotonic index
        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = from_pandas_series(s1)
        result = self._submit_and_fetch(actor_client, session_ref, series1)
        pd.testing.assert_series_equal(s1, result)
class Test(unittest.TestCase):
    # Tests for the kvstore abstraction: the in-process path store and the
    # etcd-backed store (including watch semantics driven by gevent).

    def testLocalPathStore(self):
        # Exercise the ':inproc:' path store: hierarchical reads, recursive
        # listing, directory/leaf conflicts and deletes.  Results are compared
        # via repr() against hand-built PathResult trees.
        kvstore = get(':inproc:')
        kvstore.write('/node/subnode/v1', 'value1')
        kvstore.write('/node/v2', 'value2')

        # non-recursive read lists immediate children only
        res = kvstore.read('/node', sort=True)
        expected = PathResult(key='/node', dir=True, children=[
            PathResult(key='/node/subnode', dir=True),
            PathResult(key='/node/v2', value='value2'),
        ])
        self.assertEqual(repr(res), repr(expected))

        # recursive read flattens to leaf entries
        res = kvstore.read('/node', recursive=True, sort=True)
        expected = PathResult(key='/node', dir=True, children=[
            PathResult(key='/node/subnode/v1', value='value1'),
            PathResult(key='/node/v2', value='value2'),
        ])
        self.assertEqual(repr(res), repr(expected))

        kvstore.write('/node/v3', 'value3')
        # writing under an existing leaf value must be rejected
        with self.assertRaises(KeyError):
            kvstore.write('/node/v2/invalid_value', value='invalid')

        res = kvstore.read('/', recursive=False, sort=True)
        expected = PathResult(key='/', dir=True, children=[
            PathResult(key='/node', dir=True),
        ])
        self.assertEqual(repr(res), repr(expected))

        res = kvstore.read('/', recursive=True, sort=True)
        expected = PathResult(key='/', dir=True, children=[
            PathResult(key='/node/subnode/v1', value='value1'),
            PathResult(key='/node/v2', value='value2'),
            PathResult(key='/node/v3', value='value3'),
        ])
        self.assertEqual(repr(res), repr(expected))

        kvstore.write('/node/subnode2/v4', 'value4')
        # deleting a non-empty directory without recursive=True must fail
        with self.assertRaises(KeyError):
            kvstore.delete('/node/subnode', dir=True)
        kvstore.delete('/node/subnode/v1')

        # after deleting its only leaf, the now-empty dir still appears
        res = kvstore.read('/', recursive=True, sort=True)
        expected = PathResult(key='/', dir=True, children=[
            PathResult(key='/node/subnode', dir=True),
            PathResult(key='/node/subnode2/v4', value='value4'),
            PathResult(key='/node/v2', value='value2'),
            PathResult(key='/node/v3', value='value3'),
        ])
        self.assertEqual(repr(res), repr(expected))

        kvstore.delete('/node/subnode2', dir=True, recursive=True)
        res = kvstore.read('/', recursive=True, sort=True)
        expected = PathResult(key='/', dir=True, children=[
            PathResult(key='/node/subnode', dir=True),
            PathResult(key='/node/v2', value='value2'),
            PathResult(key='/node/v3', value='value3')
        ])
        self.assertEqual(repr(res), repr(expected))

    @unittest.skipIf(sys.platform == 'win32', 'does not run in windows')
    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testEtcdPathStore(self):
        with EtcdProcessHelper(port_range_start=51342).run():
    # NOTE(review): the remainder of testEtcdPathStore appears to have been
    # destroyed by credential redaction in this copy — "*****:*****" swallowed
    # the connection string, the whole method body and the first decorator
    # marker ('@') of testEtcdWatch below.  The corrupted text is reproduced
    # verbatim; restore the original from version control.
            kvstore = get(u'etcd://*****:*****@unittest.skipIf(sys.platform == 'win32', 'does not run in windows')
    @unittest.skipIf('CI' not in os.environ and not EtcdProcessHelper().is_installed(),
                     'does not run without etcd')
    def testEtcdWatch(self):
        # Watch semantics against a real etcd process: single-key watch,
        # recursive directory watch, and an eternal watch consuming a stream
        # of updates.  Writers and watchers run as concurrent greenlets.
        with EtcdProcessHelper(port_range_start=51342).run():
            kvstore = get('etcd://localhost:51342')
            kvstore.write('/node/subnode/v1', 'value1')
            kvstore.write('/node/v2', 'value2')

            # single-key watch: blocks until the writer greenlet updates it
            def watcher():
                return kvstore.watch('/node/v2', timeout=10)

            def writer():
                gevent.sleep(1)
                kvstore.write('/node/v2', 'value2\'')

            g1 = gevent.spawn(writer)
            g2 = gevent.spawn(watcher)
            gevent.joinall([g1, g2])
            self.assertEqual(g2.value.value, 'value2\'')

            kvstore.delete('/node/v2')

            # recursive watch on a directory fires on a child update
            def watcher():
                return kvstore.watch('/node/subnode', timeout=10, recursive=True)

            def writer():
                gevent.sleep(1)
                kvstore.write('/node/subnode/v1', 'value1\'')

            g1 = gevent.spawn(writer)
            g2 = gevent.spawn(watcher)
            gevent.joinall([g1, g2])
            self.assertEqual(g2.value.children[0].value, 'value1\'')

            kvstore.write('/node/subnode/v3', '-1')

            # eternal watch: consume the first five updates, then stop
            def watcher():
                results = []
                for idx, result in enumerate(
                        kvstore.eternal_watch('/node/subnode/v3')):
                    results.append(int(result.value))
                    if idx == 4:
                        break
                return results

            def writer():
                gevent.sleep(0.1)
                for v in range(5):
                    kvstore.write('/node/subnode/v3', str(v))
                    gevent.sleep(0.1)

            g1 = gevent.spawn(writer)
            g2 = gevent.spawn(watcher)
            gevent.joinall([g1, g2])
            self.assertEqual(g2.value, list(range(5)))

            kvstore.delete('/node', dir=True, recursive=True)
class Test(unittest.TestCase):
    """Integration test that launches scheduler and worker subprocesses and
    submits tensor graphs through the session manager actor."""

    @classmethod
    def setUpClass(cls):
        import tempfile
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.etcd_helper = None

    def tearDown(self):
        # Ask every child process to stop gracefully, wait up to 5 seconds,
        # then kill whatever is still alive.
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                p.kill()

        if self.etcd_helper:
            self.etcd_helper.stop()

    def start_processes(self, n_schedulers=1, n_workers=2, etcd=False, modules=None):
        """Spawn scheduler/worker subprocesses and block until the cluster
        reports the requested numbers of schedulers and workers (20s limit).

        :param n_schedulers: number of scheduler processes to launch
        :param n_workers: number of worker processes to launch
        :param etcd: if True, back the cluster with a fresh etcd process
        :param modules: extra modules passed via --load-modules
        """
        # gevent would normally print unexpected exceptions from its hub;
        # treat everything as "not an error" while the cluster boots.
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '--level', 'debug',
                              '-p', p,
                              '--format', '%(asctime)-15s %(message)s']
                             + append_args)
            for p in scheduler_ports]
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--level', 'debug',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem']
                             + append_args)
            for _ in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

        # Poll the cluster until schedulers and workers are all registered.
        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError('Schedulers does not met requirement: %d < %d.' % (
                        len(started_schedulers), n_schedulers
                    ))
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(ResourceActor.default_name(),
                                                      address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError('Workers does not met requirement: %d < %d.' % (
                        resource_ref.get_worker_count(), n_workers
                    ))
                break
            except:  # noqa: E722  # best-effort retry until the 20s deadline
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise SystemError if any child process has already exited."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # fixed: previously referenced nonexistent self.proc_scheduler,
                # which itself raised AttributeError when reporting the failure
                raise SystemError('Scheduler not started. exit code %s'
                                  % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, session_ref, graph_key):
        """Poll graph state until it terminates (60s limit); return the state."""
        check_time = time.time()
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > 60:
                raise SystemError('Check graph status timeout')
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        self.start_processes(n_schedulers=2)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())

        # resubmitting the same graph key is expected to fail
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        # todo this behavior may change when eager mode is introduced
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.FAILED)

        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_array_equal(loads(result), np.ones((100, 200)) * 450)

        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_array_equal(loads(result), expected)

    def testMainWithEtcd(self):
        self.start_processes(n_schedulers=2, etcd=True)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())
class SchedulerIntegratedTest(unittest.TestCase):
    """Base harness for scheduler integration tests: spawns scheduler and
    worker subprocesses, waits for registration, and cleans everything up."""

    @classmethod
    def setUpClass(cls):
        from mars import kvstore

        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)
        # graph-status timeout, overridable through the CHECK_TIMEOUT env var
        cls.timeout = int(os.environ.get('CHECK_TIMEOUT', 120))

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        # pids we killed on purpose; check_process_statuses ignores these
        self.intentional_death_pids = set()

    def tearDown(self):
        # remove state files registered through add_state_file
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)

        self.terminate_processes()
        options.kv_store = ':inproc:'

    def terminate_processes(self):
        """SIGINT all children, wait up to 5s, then kill survivors' trees."""
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)

        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break

        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)

        if self.etcd_helper:
            self.etcd_helper.stop()

    def kill_process_tree(self, proc, intentional=True):
        """Kill *proc* and its children; optionally mark the death intentional
        so status checks will not treat it as a crashed process."""
        if intentional:
            self.intentional_death_pids.add(proc.pid)
        kill_process_tree(proc.pid)

    def add_state_file(self, environ):
        """Create a unique temp-file path, export it via env var *environ*,
        register it for cleanup in tearDown and return the path."""
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(), f'test-main-{environ.lower()}-{os.getpid()}-{id(self)}')
        self.state_files[environ] = fn
        return fn

    def start_processes(self, *args, **kwargs):
        """Start the cluster, retrying up to 10 times on
        ProcessRequirementUnmetError (e.g. port clashes, slow startup)."""
        fail_count = 0
        while True:
            try:
                self._start_processes(*args, **kwargs)
                break
            except ProcessRequirementUnmetError:
                self.terminate_processes()
                fail_count += 1
                if fail_count >= 10:
                    raise
                time.sleep(5)
                logger.error('Failed to start service, retrying')

    def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                         modules=None, log_scheduler=True, log_worker=True, env=None,
                         scheduler_args=None, worker_args=None, worker_cpu=1):
        """Spawn scheduler/worker subprocesses and block until the cluster
        reports the requested counts (20s limit)."""
        # silence gevent hub error reporting while the cluster boots
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = scheduler_args or []
        append_args_worker = worker_args or []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '-p', p,
                              '--log-level', 'debug' if log_scheduler else 'warning',
                              # fixed: a missing comma here implicitly
                              # concatenated the log format with
                              # '-Dscheduler.retry_delay=5', corrupting the
                              # format string and dropping that option
                              '--log-format', f'SCH{idx} %(asctime)-15s %(message)s',
                              '-Dscheduler.retry_delay=5',
                              '-Dscheduler.default_cpu_usage=0',
                              '-Dscheduler.status_timeout=10']
                             + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)]

        cuda_count = resource.cuda_count()
        cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
            if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', str(worker_cpu),
                              '--log-level', 'debug' if log_worker else 'warning',
                              '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '--cuda-device', str(cuda_devices[idx % cuda_count])
                              if cuda_count else '',
                              '-Dworker.prepare_data_timeout=30']
                             + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

        # poll until the expected schedulers and workers have registered
        check_time = time.time()
        while True:
            try:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                except Exception as e:
                    raise ProcessRequirementUnmetError(f'Failed to get scheduler numbers, {e}')
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        f'Schedulers does not met requirement: '
                        f'{len(started_schedulers)} < {n_schedulers}.')
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(ResourceActor.default_uid(),
                                                      address=actor_address)
                if not actor_client.has_actor(self.session_manager_ref) \
                        or resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        f'Workers does not met requirement: '
                        f'{resource_ref.get_worker_count()} < {n_workers}')
                break
            except:  # noqa: E722  # best-effort retry until the 20s deadline
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise if a scheduler died, or a worker died unintentionally."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # fixed: previously referenced nonexistent self.proc_scheduler,
                # which raised AttributeError instead of the intended error
                raise ProcessRequirementUnmetError(
                    f'Scheduler not started. exit code {scheduler_proc.poll()}')
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    f'Worker not started. exit code {worker_proc.poll()}')

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll graph state until it terminates; dump unfinished terminal
        chunks every 10s for debugging; raise on timeout."""
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    # graph not registered yet; try again on the next dump
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)