def testKVStoreActor(self):
    """Exercise KVStoreActor reads, writes and deletes backed by a local etcd."""
    port = get_next_port()
    helper = EtcdProcessHelper(port_range_start=port)
    # route the KV store to the etcd instance we are about to launch
    options.kv_store = 'etcd://127.0.0.1:%s' % port
    with helper.run(), create_actor_pool(n_process=1, backend='gevent') as pool:
        kv_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        # single writes followed by a batch that overwrites one of the keys
        kv_ref.write('/node/v1', 'value1')
        kv_ref.write('/node/v2', 'value2')
        batch = [
            ('/node/v2', 'value2'),
            ('/node/v3', 'value3'),
        ]
        kv_ref.write_batch(batch)
        self.assertEqual(kv_ref.read('/node/v1').value, 'value1')
        read_back = kv_ref.read_batch(['/node/v2', '/node/v3'])
        self.assertListEqual([v.value for v in read_back], ['value2', 'value3'])
        # deleting the same directory twice must raise; silent=True suppresses it
        kv_ref.delete('/node', dir=True, recursive=True)
        with self.assertRaises(KeyError):
            kv_ref.delete('/node', dir=True, recursive=True)
        kv_ref.delete('/node', dir=True, recursive=True, silent=True)
def testKVStoreActor(self):
    """Verify basic single and batched read/write of KVStoreActor against etcd.

    Starts a throw-away etcd process and checks that values written through
    the actor round-trip correctly.
    """
    # Pick a free port dynamically instead of the previous hard-coded 54131,
    # which could collide with another test run or a local service.
    etcd_port = get_next_port()
    proc_helper = EtcdProcessHelper(port_range_start=etcd_port)
    # Point the KV store at the etcd instance we just configured so the actor
    # actually exercises the etcd backend (mirrors the sibling test above).
    options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
    with proc_helper.run(), create_actor_pool(n_process=1, backend='gevent') as pool:
        store_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        store_ref.write('/node/v1', 'value1')
        store_ref.write('/node/v2', 'value2')
        # batch write overwrites /node/v2 and creates /node/v3
        store_ref.write_batch([
            ('/node/v2', 'value2'),
            ('/node/v3', 'value3'),
        ])
        self.assertEqual(store_ref.read('/node/v1').value, 'value1')
        self.assertListEqual([
            v.value for v in store_ref.read_batch(['/node/v2', '/node/v3'])
        ], ['value2', 'value3'])
class Test(unittest.TestCase):
    """Integration tests that launch real scheduler/worker subprocesses.

    Each test boots a small Mars cluster (optionally backed by etcd), submits
    tensor graphs through the session manager and checks the fetched results.
    """

    @classmethod
    def setUpClass(cls):
        from mars import kvstore
        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)
        # best-effort removal of the delay state file a failed test may leave
        try:
            delay_state_file = os.environ.get('DELAY_STATE_FILE')
            if delay_state_file:
                os.unlink(delay_state_file)
        except OSError:
            pass

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = []
        self.etcd_helper = None

    def tearDown(self):
        for fn in self.state_files:
            if os.path.exists(fn):
                os.unlink(fn)
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        # ask nicely first, then hard-kill whatever survives 5 seconds
        for p in procs:
            p.send_signal(signal.SIGINT)
        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break
        for p in procs:
            if p.poll() is None:
                p.kill()
        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def add_state_file(self, environ):
        """Create a unique state-file path, export it via *environ* and track it."""
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files.append(fn)
        return fn

    def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, modules=None,
                        log_scheduler=True, log_worker=True):
        """Spawn scheduler and worker subprocesses and wait until the cluster is ready.

        Raises after ~20s if schedulers or workers fail to register.
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]
        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])
        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])
        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args += ['-Dscheduler.dump_graph_data=true']
        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler',
                '-H', '127.0.0.1',
                '--level', 'debug' if log_scheduler else 'warning',
                '-p', p,
                '--format', '%(asctime)-15s %(message)s',
                '-Dscheduler.retry_delay=5'
            ] + append_args) for p in scheduler_ports
        ]
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--cpu-procs', '1',
                '--level', 'debug' if log_worker else 'warning',
                '--cache-mem', '16m',
                '--ignore-avail-mem',
                '-Dworker.prepare_data_timeout=30'
            ] + append_args) for _ in range(n_workers)
        ]
        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])
        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError(
                        'Schedulers does not met requirement: %d < %d.'
                        % (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)
                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_name(), address=actor_address)
                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError(
                        'Workers does not met requirement: %d < %d.'
                        % (resource_ref.get_worker_count(), n_workers))
                break
            except:  # noqa: E722  # keep retrying until the 20s budget runs out
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise SystemError if any spawned scheduler/worker process has exited."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # bug fix: was `self.proc_scheduler.poll()` — that attribute does
                # not exist (the list is `proc_schedulers`), so the error path
                # itself raised AttributeError instead of the intended message
                raise SystemError('Scheduler not started. exit code %s'
                                  % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll the graph state until it terminates; dump stuck terminals every 10s."""
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        """Run several tensor graphs end-to-end on an inproc-kvstore cluster."""
        self.start_processes()
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        # element-wise expression reduced to a scalar
        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

        # matrix multiplication across chunk boundaries
        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_allclose(loads(result), np.ones((100, 200)) * 450)

        # repeated slicing of the same source tensor, summed together
        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_allclose(loads(result), expected)

        # reshape routed through the shuffle-based implementation
        a = mt.ones((31, 27), chunk_size=10)
        b = a.reshape(27, 31)
        b.op.params['_reshape_with_shuffle'] = True
        graph = b.build_graph()
        targets = [b.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, b.key)
        assert_allclose(loads(result), np.ones((27, 31)))

    def testMainWithEtcd(self):
        """Run one tensor graph end-to-end on a cluster coordinated via etcd."""
        self.start_processes(etcd=True)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))
        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1)**2 * 2 + 1
        assert_allclose(loads(result), expected.sum())

    def testWorkerFailOver(self):
        """Kill a worker mid-computation and verify the graph still succeeds."""
        def kill_process_tree(proc):
            # kill the worker and its children, cleaning up any plasma socket dir
            import psutil
            proc = psutil.Process(proc.pid)
            plasma_sock_dir = None
            for p in proc.children(recursive=True):
                if 'plasma' in p.name():
                    socks = [
                        conn.laddr for conn in p.connections('unix')
                        if 'plasma' in conn.laddr
                    ]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            proc.kill()
            if plasma_sock_dir:
                shutil.rmtree(plasma_sock_dir, ignore_errors=True)

        # op_delayer blocks operands while delay_file exists and touches
        # terminate_file once execution has started — presumably; confirm
        # against mars.scheduler.tests.op_delayer
        delay_file = self.add_state_file('DELAY_STATE_FILE')
        open(delay_file, 'w').close()
        terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

        self.start_processes(modules=['mars.scheduler.tests.op_delayer'], log_worker=True)
        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(
            self.session_manager_ref.create_session(session_id))

        np_a = np.random.random((100, 100))
        np_b = np.random.random((100, 100))
        a = mt.array(np_a, chunk_size=30) * 2 + 1
        b = mt.array(np_b, chunk_size=30) * 2 + 1
        c = a.dot(b) * 2 + 1
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        # wait until at least one operand has started executing
        while not os.path.exists(terminate_file):
            actor_client.sleep(0.05)

        kill_process_tree(self.proc_workers[0])
        logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
        self.proc_workers = self.proc_workers[1:]
        os.unlink(delay_file)

        state = self.wait_for_termination(actor_client, session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
        assert_allclose(loads(result), expected)
class SchedulerIntegratedTest(unittest.TestCase):
    """Shared harness for integration tests: boots scheduler/worker subprocesses
    and tracks processes that tests kill on purpose so health checks skip them.
    """

    @classmethod
    def setUpClass(cls):
        from mars import kvstore
        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        # pids of workers a test killed intentionally; ignored by health checks
        self.intentional_death_pids = set()

    def tearDown(self):
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        # SIGINT first, then hard-kill stragglers after a 5s grace period
        for p in procs:
            p.send_signal(signal.SIGINT)
        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break
        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)
        if self.etcd_helper:
            self.etcd_helper.stop()
        options.kv_store = ':inproc:'

    def kill_process_tree(self, proc, intentional=True):
        """Kill *proc* and its children, removing any leftover plasma socket dir."""
        if intentional:
            self.intentional_death_pids.add(proc.pid)
        import psutil
        proc = psutil.Process(proc.pid)
        plasma_sock_dir = None
        for p in proc.children(recursive=True):
            try:
                if 'plasma' in p.name():
                    socks = [
                        conn.laddr for conn in p.connections('unix')
                        if 'plasma' in conn.laddr
                    ]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            except psutil.NoSuchProcess:
                continue
        proc.kill()
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)

    def add_state_file(self, environ):
        """Create a unique state-file path, export it via *environ* and track it."""
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            'test-main-%s-%d-%d' % (environ.lower(), os.getpid(), id(self)))
        self.state_files[environ] = fn
        return fn

    def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                        modules=None, log_scheduler=True, log_worker=True, env=None):
        """Spawn scheduler and worker subprocesses and wait until the cluster is ready.

        Raises ProcessRequirementUnmetError (re-raised after ~20s) when the
        requested number of schedulers/workers fails to register.
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]
        append_args = []
        append_args_scheduler = []
        append_args_worker = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])
        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])
        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
        if not cuda:
            append_args_worker += ['--no-cuda']
        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)
        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler',
                '-H', '127.0.0.1',
                '-p', p,
                '--log-level', 'debug' if log_scheduler else 'warning',
                '--log-format', 'SCH%d %%(asctime)-15s %%(message)s' % idx,
                '-Dscheduler.retry_delay=5',
                '-Dscheduler.default_cpu_usage=0',
                '-Dscheduler.status_timeout=10'
            ] + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)
        ]
        cuda_count = resource.cuda_count()
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--cpu-procs', '1',
                '--log-level', 'debug' if log_worker else 'warning',
                '--log-format', 'WOR%d %%(asctime)-15s %%(message)s' % idx,
                '--cache-mem', '16m',
                '--ignore-avail-mem',
                '--cuda-device', str(idx % cuda_count) if cuda_count else '0',
                '-Dworker.prepare_data_timeout=30'
            ] + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)
        ]
        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(),
            address=self.scheduler_endpoints[0])
        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        'Schedulers does not met requirement: %d < %d.'
                        % (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)
                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address=actor_address)
                if resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        'Workers does not met requirement: %d < %d.'
                        % (resource_ref.get_worker_count(), n_workers))
                break
            except:  # noqa: E722  # keep retrying until the 20s budget runs out
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise if any scheduler, or any non-intentionally-killed worker, has exited."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # bug fix: was `self.proc_scheduler.poll()` — that attribute does
                # not exist (the list is `proc_schedulers`), so this error path
                # raised AttributeError instead of the intended message
                raise ProcessRequirementUnmetError(
                    'Scheduler not started. exit code %s' % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll(
            ) is not None and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    'Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll the graph state until it terminates; dump stuck terminals every 10s."""
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)
class Test(unittest.TestCase):
    """Integration tests that launch real scheduler/worker subprocesses.

    Tests boot a small Mars cluster (optionally backed by etcd), submit tensor
    graphs and verify the fetched results.
    """

    @classmethod
    def setUpClass(cls):
        import tempfile
        from mars import kvstore
        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.etcd_helper = None

    def tearDown(self):
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        # SIGINT first, then hard-kill whatever survives 5 seconds
        for p in procs:
            p.send_signal(signal.SIGINT)
        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break
        for p in procs:
            if p.poll() is None:
                p.kill()
        if self.etcd_helper:
            self.etcd_helper.stop()

    def start_processes(self, n_schedulers=1, n_workers=2, etcd=False, modules=None):
        """Spawn scheduler and worker subprocesses and wait until the cluster is ready.

        Raises after ~20s if schedulers or workers fail to register.
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)
        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]
        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])
        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])
        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '--level', 'debug',
                              '-p', p,
                              '--format', '%(asctime)-15s %(message)s'] + append_args)
            for p in scheduler_ports]
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--level', 'debug',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem'] + append_args)
            for _ in range(n_workers)
        ]
        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])
        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError('Schedulers does not met requirement: %d < %d.' % (
                        len(started_schedulers), n_schedulers
                    ))
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)
                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(ResourceActor.default_name(),
                                                      address=actor_address)
                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError('Workers does not met requirement: %d < %d.' % (
                        resource_ref.get_worker_count(), n_workers
                    ))
                break
            except:  # noqa: E722  # keep retrying until the 20s budget runs out
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise SystemError if any spawned scheduler/worker process has exited."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # bug fix: was `self.proc_scheduler.poll()` — that attribute does
                # not exist (the list is `proc_schedulers`), so this error path
                # raised AttributeError instead of the intended message
                raise SystemError('Scheduler not started. exit code %s'
                                  % scheduler_proc.poll())
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % worker_proc.poll())

    def wait_for_termination(self, session_ref, graph_key):
        """Poll the graph state (up to 60s) until it reaches a terminal state."""
        check_time = time.time()
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > 60:
                raise SystemError('Check graph status timeout')
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)

    def testMainWithoutEtcd(self):
        """Run several tensor graphs end-to-end on an inproc-kvstore cluster."""
        self.start_processes(n_schedulers=2)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        # element-wise expression reduced to a scalar
        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())

        # re-submitting the same graph under a new key is expected to fail
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        # todo this behavior may change when eager mode is introduced
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.FAILED)

        # matrix multiplication across chunk boundaries
        a = mt.ones((100, 50), chunk_size=35) * 2 + 1
        b = mt.ones((50, 200), chunk_size=35) * 2 + 1
        c = a.dot(b)
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        result = session_ref.fetch_result(graph_key, c.key)
        assert_array_equal(loads(result), np.ones((100, 200)) * 450)

        # repeated slicing of the same source tensor, summed together
        base_arr = np.random.random((100, 100))
        a = mt.array(base_arr)
        sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
        graph = sumv.build_graph()
        targets = [sumv.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)
        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)
        expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
        result = session_ref.fetch_result(graph_key, sumv.key)
        assert_array_equal(loads(result), expected)

    def testMainWithEtcd(self):
        """Run one tensor graph end-to-end on a cluster coordinated via etcd."""
        self.start_processes(n_schedulers=2, etcd=True)

        session_id = uuid.uuid1()
        actor_client = new_client()
        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
        c = (a * b * 2 + 1).sum()
        graph = c.build_graph()
        targets = [c.key]
        graph_key = uuid.uuid1()
        session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                        graph_key, target_tensors=targets)

        state = self.wait_for_termination(session_ref, graph_key)
        self.assertEqual(state, GraphState.SUCCEEDED)

        result = session_ref.fetch_result(graph_key, c.key)
        expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
        assert_array_equal(loads(result), expected.sum())
class SchedulerIntegratedTest(unittest.TestCase):
    """Shared harness for integration tests: boots scheduler/worker subprocesses
    with retry, and tracks processes tests kill on purpose so health checks
    skip them.
    """

    @classmethod
    def setUpClass(cls):
        from mars import kvstore
        options.worker.spill_directory = os.path.join(tempfile.gettempdir(), 'mars_test_spill')
        cls._kv_store = kvstore.get(options.kv_store)
        cls.timeout = int(os.environ.get('CHECK_TIMEOUT', 120))

    @classmethod
    def tearDownClass(cls):
        import shutil
        if os.path.exists(options.worker.spill_directory):
            shutil.rmtree(options.worker.spill_directory)

    def setUp(self):
        self.scheduler_endpoints = []
        self.proc_schedulers = []
        self.proc_workers = []
        self.state_files = dict()
        self.etcd_helper = None
        # pids of workers a test killed intentionally; ignored by health checks
        self.intentional_death_pids = set()

    def tearDown(self):
        for env, fn in self.state_files.items():
            os.environ.pop(env)
            if os.path.exists(fn):
                os.unlink(fn)
        self.terminate_processes()
        options.kv_store = ':inproc:'

    def terminate_processes(self):
        """SIGINT all spawned processes, then hard-kill stragglers after 5s."""
        procs = tuple(self.proc_workers) + tuple(self.proc_schedulers)
        for p in procs:
            p.send_signal(signal.SIGINT)
        check_time = time.time()
        while any(p.poll() is None for p in procs):
            time.sleep(0.1)
            if time.time() - check_time > 5:
                break
        for p in procs:
            if p.poll() is None:
                self.kill_process_tree(p)
        if self.etcd_helper:
            self.etcd_helper.stop()

    def kill_process_tree(self, proc, intentional=True):
        """Kill *proc* and its whole process tree via the module-level helper."""
        if intentional:
            self.intentional_death_pids.add(proc.pid)
        kill_process_tree(proc.pid)

    def add_state_file(self, environ):
        """Create a unique state-file path, export it via *environ* and track it."""
        fn = os.environ[environ] = os.path.join(
            tempfile.gettempdir(),
            f'test-main-{environ.lower()}-{os.getpid()}-{id(self)}')
        self.state_files[environ] = fn
        return fn

    def start_processes(self, *args, **kwargs):
        """Start the cluster, retrying up to 10 times on startup failures."""
        fail_count = 0
        while True:
            try:
                self._start_processes(*args, **kwargs)
                break
            except ProcessRequirementUnmetError:
                self.terminate_processes()
                fail_count += 1
                if fail_count >= 10:
                    raise
                time.sleep(5)
                logger.error('Failed to start service, retrying')

    def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                         modules=None, log_scheduler=True, log_worker=True,
                         env=None, scheduler_args=None, worker_args=None,
                         worker_cpu=1):
        """Spawn scheduler and worker subprocesses and wait until the cluster is ready.

        Raises ProcessRequirementUnmetError (re-raised after ~20s) when the
        requested number of schedulers/workers fails to register.
        """
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = scheduler_args or []
        append_args_worker = worker_args or []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '-p', p,
                              '--log-level', 'debug' if log_scheduler else 'warning',
                              # bug fix: a missing comma here used to fuse the
                              # log format with '-Dscheduler.retry_delay=5' via
                              # implicit string concatenation, corrupting the
                              # format and dropping the retry-delay option
                              '--log-format', f'SCH{idx} %(asctime)-15s %(message)s',
                              '-Dscheduler.retry_delay=5',
                              '-Dscheduler.default_cpu_usage=0',
                              '-Dscheduler.status_timeout=10']
                             + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)]
        cuda_count = resource.cuda_count()
        cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
            if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', str(worker_cpu),
                              '--log-level', 'debug' if log_worker else 'warning',
                              '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              # index by len(cuda_devices): CUDA_VISIBLE_DEVICES may
                              # expose fewer devices than resource.cuda_count(),
                              # and `idx % cuda_count` could then run off the list
                              '--cuda-device',
                              str(cuda_devices[idx % len(cuda_devices)]) if cuda_devices else '',
                              '-Dworker.prepare_data_timeout=30']
                             + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                except Exception as e:
                    raise ProcessRequirementUnmetError(f'Failed to get scheduler numbers, {e}')
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        f'Schedulers does not met requirement: {len(started_schedulers)} < {n_schedulers}.')
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(ResourceActor.default_uid(),
                                                      address=actor_address)

                if not actor_client.has_actor(self.session_manager_ref) \
                        or resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        f'Workers does not met requirement: {resource_ref.get_worker_count()} < {n_workers}')
                break
            except:  # noqa: E722  # keep retrying until the 20s budget runs out
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
        gevent.hub.Hub.NOT_ERROR = old_not_errors

    def check_process_statuses(self):
        """Raise if any scheduler, or any non-intentionally-killed worker, has exited."""
        for scheduler_proc in self.proc_schedulers:
            if scheduler_proc.poll() is not None:
                # bug fix: was `self.proc_scheduler.poll()` — that attribute does
                # not exist (the list is `proc_schedulers`), so this error path
                # raised AttributeError instead of the intended message
                raise ProcessRequirementUnmetError(
                    f'Scheduler not started. exit code {scheduler_proc.poll()}')
        for worker_proc in self.proc_workers:
            if worker_proc.poll() is not None and worker_proc.pid not in self.intentional_death_pids:
                raise ProcessRequirementUnmetError(
                    f'Worker not started. exit code {worker_proc.poll()}')

    def wait_for_termination(self, actor_client, session_ref, graph_key):
        """Poll the graph state until it terminates; dump stuck terminals every 10s."""
        check_time = time.time()
        dump_time = time.time()
        check_timeout = int(os.environ.get('CHECK_TIMEOUT', 120))
        while True:
            time.sleep(0.1)
            self.check_process_statuses()
            if time.time() - check_time > check_timeout:
                raise SystemError('Check graph status timeout')
            if time.time() - dump_time > 10:
                dump_time = time.time()
                graph_refs = session_ref.get_graph_refs()
                try:
                    graph_ref = actor_client.actor_ref(graph_refs[graph_key])
                    graph_ref.dump_unfinished_terminals()
                except KeyError:
                    pass
            if session_ref.graph_state(graph_key) in GraphState.TERMINATED_STATES:
                return session_ref.graph_state(graph_key)