def testOperandsWithoutPrepareInputs(self):
    self.start_processes(etcd=False,
                         modules=['mars.scheduler.tests.integrated.no_prepare_op'])

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
    resource_ref = actor_client.actor_ref(
        ResourceActor.default_uid(), address=actor_address)
    worker_endpoints = resource_ref.get_worker_endpoints()

    # pin each input operand to a distinct worker, then run an operand
    # whose inputs must not be prepared (pulled) before execution
    t1 = mt.random.rand(10)
    t1.op._expect_worker = worker_endpoints[0]
    t2 = mt.random.rand(10)
    t2.op._expect_worker = worker_endpoints[1]

    t = NoPrepareOperand().new_tileable([t1, t2])
    t.op._prepare_inputs = [False, False]

    graph = t.build_graph()
    targets = [t.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)
def testMainWithEtcd(self):
    self.start_processes(etcd=True)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_allclose(loads(result), expected.sum())
def testShuffleFailoverBeforeSuccStart(self):
    pred_finish_file = self.add_state_file('SHUFFLE_ALL_PRED_FINISHED_FILE')
    succ_start_file = self.add_state_file('SHUFFLE_START_SUCC_FILE')

    self.start_processes(modules=['mars.scheduler.tests.integrated.op_delayer'],
                         log_worker=True)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    a = mt.ones((31, 27), chunk_size=10)
    b = a.reshape(27, 31)
    b.op.extra_params['_reshape_with_shuffle'] = True
    graph = b.build_graph()
    targets = [b.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)
    actor_client.sleep(1)

    # wait until all shuffle predecessors have finished, then kill a worker
    # before the successors are allowed to start
    while not os.path.exists(pred_finish_file):
        actor_client.sleep(0.01)

    self.kill_process_tree(self.proc_workers[0])
    logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
    self.proc_workers = self.proc_workers[1:]
    open(succ_start_file, 'w').close()

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, b.key)
    assert_allclose(loads(result), np.ones((27, 31)))
def testWorkerFailOver(self):
    def kill_process_tree(proc):
        import psutil
        proc = psutil.Process(proc.pid)
        plasma_sock_dir = None
        # kill children first, remembering where the plasma store keeps its
        # unix socket so the directory can be removed afterwards
        for p in proc.children(recursive=True):
            if 'plasma' in p.name():
                socks = [conn.laddr for conn in p.connections('unix')
                         if 'plasma' in conn.laddr]
                if socks:
                    plasma_sock_dir = os.path.dirname(socks[0])
            p.kill()
        proc.kill()
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)

    delay_file = self.add_state_file('DELAY_STATE_FILE')
    open(delay_file, 'w').close()
    terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

    self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                         log_worker=True)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    np_a = np.random.random((100, 100))
    np_b = np.random.random((100, 100))

    a = mt.array(np_a, chunk_size=30) * 2 + 1
    b = mt.array(np_b, chunk_size=30) * 2 + 1
    c = a.dot(b) * 2 + 1
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    # kill one worker mid-execution to exercise failover
    while not os.path.exists(terminate_file):
        actor_client.sleep(0.05)

    kill_process_tree(self.proc_workers[0])
    logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
    self.proc_workers = self.proc_workers[1:]
    os.unlink(delay_file)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
    assert_allclose(loads(result), expected)
def testRemoteWithoutEtcd(self):
    self.start_processes(
        etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'])

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    rs = np.random.RandomState(0)
    raw1 = rs.rand(10, 10)
    raw2 = rs.rand(10, 10)

    def f_none(_x):
        return None

    r_none = spawn(f_none, raw1)

    graph = r_none.build_graph()
    targets = [r_none.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, r_none.key)
    self.assertIsNone(loads(result))

    def f1(x):
        return x + 1

    def f2(x, y, z=None):
        return x * y * (z[0] + z[1])

    r1 = spawn(f1, raw1)
    r2 = spawn(f1, raw2)
    r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})

    graph = r3.build_graph()
    targets = [r3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, r3.key)
    expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
    assert_allclose(loads(result), expected)
def testWorkerFailOver(self):
    def kill_process_tree(proc):
        import psutil
        parent = psutil.Process(proc.pid)
        for child in parent.children(recursive=True):
            child.kill()
        parent.kill()

    import tempfile
    delay_file = os.environ['DELAY_STATE_FILE'] = os.path.join(
        tempfile.gettempdir(),
        'test-main-delay-%d-%d' % (os.getpid(), id(self)))
    open(delay_file, 'w').close()
    terminate_file = os.environ['TERMINATE_STATE_FILE'] = os.path.join(
        tempfile.gettempdir(),
        'test-main-terminate-%d-%d' % (os.getpid(), id(self)))

    self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                         log_worker=True)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    np_a = np.random.random((100, 100))
    np_b = np.random.random((100, 100))

    a = mt.array(np_a, chunk_size=30) * 2 + 1
    b = mt.array(np_b, chunk_size=30) * 2 + 1
    c = a.dot(b) * 2 + 1
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    # kill one worker mid-execution to exercise failover
    while not os.path.exists(terminate_file):
        actor_client.sleep(0.05)
    os.unlink(terminate_file)

    kill_process_tree(self.proc_workers[0])
    logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
    self.proc_workers = self.proc_workers[1:]
    os.unlink(delay_file)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
    assert_allclose(loads(result), expected)
def testIterativeTilingWithoutEtcd(self):
    self.start_processes(etcd=False)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    rs = np.random.RandomState(0)
    raw = rs.rand(100)

    a = mt.tensor(raw, chunk_size=10)
    a.sort()
    c = a[:5]

    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = np.sort(raw)[:5]
    assert_allclose(loads(result), expected)

    with self.assertRaises(TypeError):
        session_ref.fetch_result(graph_key, a.key, check=False)

    raw1 = rs.rand(20)
    raw2 = rs.rand(20)

    a = mt.tensor(raw1, chunk_size=10)
    a.sort()
    b = mt.tensor(raw2, chunk_size=15) + 1
    c = mt.concatenate([a[:10], b])
    c.sort()
    d = c[:5]

    graph = d.build_graph()
    targets = [d.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, d.key)
    expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
    assert_allclose(loads(result), expected)
def testMain(self):
    session_id = uuid.uuid1()
    scheduler_address = '127.0.0.1:' + self.scheduler_port
    actor_client = new_client()
    session_ref = actor_client.create_actor(
        SessionActor, uid=SessionActor.gen_name(session_id),
        address=scheduler_address, session_id=session_id)

    a = ones((100, 100), chunks=30) * 2 * 1 + 1
    b = ones((100, 100), chunks=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    check_time = time.time()
    while True:
        time.sleep(1)
        self.check_process_statuses()
        if time.time() - check_time > 60:
            raise SystemError('Check graph status timeout')
        if session_ref.graph_state(graph_key) == GraphState.SUCCEEDED:
            result = session_ref.fetch_result(graph_key, c.key)
            break

    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_array_equal(loads(result), expected.sum())

    a = ones((100, 50), chunks=30) * 2 + 1
    b = ones((50, 200), chunks=30) * 2 + 1
    c = a.dot(b)
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    check_time = time.time()
    while True:
        time.sleep(1)
        self.check_process_statuses()
        if time.time() - check_time > 60:
            raise SystemError('Check graph status timeout')
        if session_ref.graph_state(graph_key) == GraphState.SUCCEEDED:
            result = session_ref.fetch_result(graph_key, c.key)
            break

    assert_array_equal(loads(result), np.ones((100, 200)) * 450)
def testMainWithoutEtcd(self):
    self.start_processes()

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_allclose(loads(result), expected.sum())

    a = mt.ones((100, 50), chunk_size=35) * 2 + 1
    b = mt.ones((50, 200), chunk_size=35) * 2 + 1
    c = a.dot(b)
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    assert_allclose(loads(result), np.ones((100, 200)) * 450)

    base_arr = np.random.random((100, 100))
    a = mt.array(base_arr)
    sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
    graph = sumv.build_graph()
    targets = [sumv.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
    result = session_ref.fetch_result(graph_key, sumv.key)
    assert_allclose(loads(result), expected)
def setUp(self):
    scheduler_port = str(get_next_port())
    proc_worker1 = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                     '-a', '127.0.0.1',
                                     '--cpu-procs', '2',
                                     '--level', 'debug',
                                     '--cache-mem', '16m',
                                     '--schedulers', '127.0.0.1:' + scheduler_port,
                                     '--ignore-avail-mem'])
    proc_worker2 = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                     '-a', '127.0.0.1',
                                     '--cpu-procs', '2',
                                     '--level', 'debug',
                                     '--cache-mem', '16m',
                                     '--schedulers', '127.0.0.1:' + scheduler_port,
                                     '--ignore-avail-mem'])
    proc_scheduler = subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                                       '-H', '127.0.0.1',
                                       '--level', 'debug',
                                       '-p', scheduler_port,
                                       '--format', '%(asctime)-15s %(message)s'])

    self.scheduler_port = scheduler_port
    self.proc_workers = [proc_worker1, proc_worker2]
    self.proc_scheduler = proc_scheduler

    time.sleep(2)
    actor_client = new_client()

    # wait until the ResourceActor is reachable on the scheduler
    check_time = time.time()
    while True:
        try:
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_name(),
                address='127.0.0.1:' + scheduler_port)
            if actor_client.has_actor(resource_ref):
                break
            else:
                raise SystemError('Check ResourceActor timeout')
        except:  # noqa: E722
            if time.time() - check_time > 10:
                raise
            time.sleep(1)

    # wait until both workers have registered themselves
    check_time = time.time()
    while True:
        if resource_ref.get_worker_count() < 2:
            time.sleep(0.5)
            self.check_process_statuses()
            if time.time() - check_time > 20:
                raise SystemError('Check worker count timeout')
        else:
            break

    self.exceptions = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)
def setUp(self):
    self.worker_plasma_sock = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(Test))

    scheduler_port = str(get_next_port())
    proc_worker = subprocess.Popen([
        sys.executable, '-m', 'mars.worker',
        '-a', '127.0.0.1',
        '--cpu-procs', '2',
        '--level', 'debug',
        '--cache-mem', '16m',
        '--schedulers', '127.0.0.1:' + scheduler_port,
        '--plasma-socket', self.worker_plasma_sock,
        '--ignore-avail-mem'
    ])
    proc_scheduler = subprocess.Popen([
        sys.executable, '-m', 'mars.scheduler',
        '-H', '127.0.0.1',
        '--level', 'debug',
        '-p', scheduler_port,
        '--format', '%(asctime)-15s %(message)s'
    ])

    self.scheduler_port = scheduler_port
    self.proc_worker = proc_worker
    self.proc_scheduler = proc_scheduler

    time.sleep(2)
    actor_client = new_client()

    # wait until the KVStoreActor is reachable on the scheduler
    check_time = time.time()
    while True:
        try:
            kv_ref = actor_client.actor_ref(
                KVStoreActor.default_name(),
                address='127.0.0.1:' + scheduler_port)
            if actor_client.has_actor(kv_ref):
                break
            else:
                raise SystemError('Check KVStoreActor timeout')
        except:  # noqa: E722
            if time.time() - check_time > 10:
                raise
            time.sleep(1)

    # wait until the worker has written its metadata timestamp
    check_time = time.time()
    while True:
        content = kv_ref.read('/workers/meta_timestamp', silent=True)
        if not content:
            time.sleep(0.5)
            self.check_process_statuses()
            if time.time() - check_time > 20:
                raise SystemError('Check meta_timestamp timeout')
        else:
            break

    self.exceptions = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)
def testDistributedContext(self):
    self.start_processes(etcd=False)

    session_id = uuid.uuid1()
    actor_client = new_client()
    rs = np.random.RandomState(0)

    context = DistributedContext(
        scheduler_address=self.scheduler_endpoints[0], session_id=session_id)

    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    raw1 = rs.rand(10, 10)
    a = mt.tensor(raw1, chunk_size=4)

    graph = a.build_graph()
    targets = [a.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()), graph_key,
                                      target_tileables=targets, names=['test'])

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    tileable_infos = context.get_named_tileable_infos('test')
    self.assertEqual(a.key, tileable_infos.tileable_key)
    self.assertEqual(a.shape, tileable_infos.tileable_shape)

    nsplits = context.get_tileable_metas([a.key], filter_fields=['nsplits'])[0][0]
    self.assertEqual(((4, 4, 2), (4, 4, 2)), nsplits)

    r = context.get_tileable_data(a.key)
    np.testing.assert_array_equal(raw1, r)

    indexes = [slice(3, 9), slice(0, 7)]
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[tuple(indexes)], r)

    indexes = [[1, 4, 2, 4, 5], slice(None, None, None)]
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[tuple(indexes)], r)

    indexes = ([9, 1, 2, 0], [0, 0, 4, 4])
    r = context.get_tileable_data(a.key, indexes)
    np.testing.assert_array_equal(raw1[[9, 1, 2, 0], [0, 0, 4, 4]], r)
def testMain(self):
    session_id = uuid.uuid1()
    scheduler_address = '127.0.0.1:' + self.scheduler_port
    actor_client = new_client()
    session_ref = actor_client.create_actor(
        SessionActor, uid=SessionActor.gen_name(session_id),
        address=scheduler_address, session_id=session_id)

    a = ones((100, 100), chunks=30) * 2 * 1 + 1
    b = ones((100, 100), chunks=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_array_equal(loads(result), expected.sum())

    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)
    # todo this behavior may change when eager mode is introduced
    state = self.wait_for_termination(session_ref, graph_key)
    self.assertEqual(state, GraphState.FAILED)

    a = ones((100, 50), chunks=30) * 2 + 1
    b = ones((50, 200), chunks=30) * 2 + 1
    c = a.dot(b)
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    assert_array_equal(loads(result), np.ones((100, 200)) * 450)
def testCommonOperandFailover(self):
    delay_file = self.add_state_file('OP_DELAY_STATE_FILE')
    open(delay_file, 'w').close()
    terminate_file = self.add_state_file('OP_TERMINATE_STATE_FILE')

    self.start_processes(
        modules=['mars.scheduler.tests.integrated.op_delayer'], log_worker=True)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    np_a = np.random.random((100, 100))
    np_b = np.random.random((100, 100))

    a = mt.array(np_a, chunk_size=30) * 2 + 1
    b = mt.array(np_b, chunk_size=30) * 2 + 1
    c = a.dot(b) * 2 + 1
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    # kill one worker mid-execution to exercise failover
    while not os.path.exists(terminate_file):
        actor_client.sleep(0.01)

    self.kill_process_tree(self.proc_workers[0])
    logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
    self.proc_workers = self.proc_workers[1:]
    os.unlink(delay_file)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
    assert_allclose(loads(result), expected)
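# Several tests above call self.add_state_file and self.kill_process_tree,
# which belong to the shared test base class and are not part of this excerpt.
# The sketch below is an assumption inferred from the call sites, not the
# actual implementation: add_state_file appears to allocate a temp-file path,
# export it through the named environment variable (so the op_delayer module
# loaded into the workers can see it) and return the path; kill_process_tree
# appears to kill a worker process together with all of its children.
def add_state_file(self, environ_key):
    import tempfile
    fn = os.path.join(
        tempfile.gettempdir(),
        'test-main-%s-%d-%d' % (environ_key.lower(), os.getpid(), id(self)))
    os.environ[environ_key] = fn
    return fn

def kill_process_tree(self, proc):
    import psutil
    parent = psutil.Process(proc.pid)
    for child in parent.children(recursive=True):
        child.kill()
    parent.kill()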
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, modules=None,
                    log_scheduler=True, log_worker=True):
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args += ['-Dscheduler.dump_graph_data=true']

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '--level', 'debug' if log_scheduler else 'warning',
                          '-p', p,
                          '--format', '%(asctime)-15s %(message)s',
                          '-Dscheduler.retry_delay=5'] + append_args)
        for p in scheduler_ports
    ]
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', '1',
                          '--level', 'debug' if log_worker else 'warning',
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '-Dworker.prepare_data_timeout=30'] + append_args)
        for _ in range(n_workers)
    ]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

    # wait until all schedulers and workers have registered
    check_time = time.time()
    while True:
        try:
            started_schedulers = self.cluster_info.get_schedulers()
            if len(started_schedulers) < n_schedulers:
                raise RuntimeError('Schedulers do not meet requirement: %d < %d.'
                                   % (len(started_schedulers), n_schedulers))

            actor_address = self.cluster_info.get_scheduler(
                SessionManagerActor.default_name())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_name(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(
                ResourceActor.default_name())
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_name(), address=actor_address)
            if resource_ref.get_worker_count() < n_workers:
                raise RuntimeError('Workers do not meet requirement: %d < %d.'
                                   % (resource_ref.get_worker_count(), n_workers))
            break
        except:  # noqa: E722
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)

    gevent.hub.Hub.NOT_ERROR = old_not_errors
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False,
                    modules=None, log_scheduler=True, log_worker=True, env=None):
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)

    scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
    self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

    append_args = []
    append_args_scheduler = []
    append_args_worker = []
    if modules:
        append_args.extend(['--load-modules', ','.join(modules)])

    if etcd:
        etcd_port = get_next_port()
        self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
        self.etcd_helper.run()
        options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
        append_args.extend(['--kv-store', options.kv_store])
    else:
        append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

    if 'DUMP_GRAPH_DATA' in os.environ:
        append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
    if not cuda:
        append_args_worker += ['--no-cuda']

    proc_env = os.environ.copy()
    if env:
        proc_env.update(env)

    self.proc_schedulers = [
        subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                          '-H', '127.0.0.1',
                          '-p', p,
                          '--log-level', 'debug' if log_scheduler else 'warning',
                          '--log-format', 'SCH%d %%(asctime)-15s %%(message)s' % idx,
                          '-Dscheduler.retry_delay=5',
                          '-Dscheduler.default_cpu_usage=0',
                          '-Dscheduler.status_timeout=10']
                         + append_args + append_args_scheduler, env=proc_env)
        for idx, p in enumerate(scheduler_ports)
    ]

    cuda_count = resource.cuda_count()
    cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
        if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
    self.proc_workers = [
        subprocess.Popen([sys.executable, '-m', 'mars.worker',
                          '-a', '127.0.0.1',
                          '--cpu-procs', '1',
                          '--log-level', 'debug' if log_worker else 'warning',
                          '--log-format', 'WOR%d %%(asctime)-15s %%(message)s' % idx,
                          '--cache-mem', '16m',
                          '--ignore-avail-mem',
                          '--cuda-device',
                          str(cuda_devices[idx % cuda_count]) if cuda_count else '0',
                          '-Dworker.prepare_data_timeout=30']
                         + append_args + append_args_worker, env=proc_env)
        for idx in range(n_workers)
    ]

    actor_client = new_client()
    self.cluster_info = actor_client.actor_ref(
        SchedulerClusterInfoActor.default_uid(),
        address=self.scheduler_endpoints[0])

    # wait until all schedulers and workers have registered
    check_time = time.time()
    while True:
        try:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
            except Exception as e:
                raise ProcessRequirementUnmetError(
                    'Failed to get scheduler count, %s' % e)
            if len(started_schedulers) < n_schedulers:
                raise ProcessRequirementUnmetError(
                    'Schedulers do not meet requirement: %d < %d.'
                    % (len(started_schedulers), n_schedulers))

            actor_address = self.cluster_info.get_scheduler(
                SessionManagerActor.default_uid())
            self.session_manager_ref = actor_client.actor_ref(
                SessionManagerActor.default_uid(), address=actor_address)

            actor_address = self.cluster_info.get_scheduler(
                ResourceActor.default_uid())
            resource_ref = actor_client.actor_ref(
                ResourceActor.default_uid(), address=actor_address)
            if resource_ref.get_worker_count() < n_workers:
                raise ProcessRequirementUnmetError(
                    'Workers do not meet requirement: %d < %d.'
                    % (resource_ref.get_worker_count(), n_workers))
            break
        except:  # noqa: E722
            if time.time() - check_time > 20:
                raise
            time.sleep(0.1)

    gevent.hub.Hub.NOT_ERROR = old_not_errors
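# Nearly every test above polls graph completion through wait_for_termination,
# which is also defined outside this excerpt. A minimal sketch, assuming it
# simply polls the session's graph_state until the graph reaches a terminal
# state or a timeout elapses; the signature follows the call sites, while the
# timeout value and the exact terminal-state set are assumptions:
def wait_for_termination(self, actor_client, session_ref, graph_key, timeout=600):
    check_time = time.time()
    while True:
        actor_client.sleep(0.1)
        self.check_process_statuses()
        if time.time() - check_time > timeout:
            raise SystemError('Check graph status timeout')
        state = session_ref.graph_state(graph_key)
        if state in (GraphState.SUCCEEDED, GraphState.FAILED, GraphState.CANCELLED):
            return state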
def testMainTensorWithoutEtcd(self):
    self.start_processes()

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_allclose(loads(result), expected.sum())

    a = mt.ones((100, 50), chunk_size=35) * 2 + 1
    b = mt.ones((50, 200), chunk_size=35) * 2 + 1
    c = a.dot(b)
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    assert_allclose(loads(result), np.ones((100, 200)) * 450)

    base_arr = np.random.random((100, 100))
    a = mt.array(base_arr)
    sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
    graph = sumv.build_graph()
    targets = [sumv.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
    result = session_ref.fetch_result(graph_key, sumv.key)
    assert_allclose(loads(result), expected)

    a = mt.ones((31, 27), chunk_size=10)
    b = a.reshape(27, 31)
    b.op.extra_params['_reshape_with_shuffle'] = True
    r = b.sum(axis=1)
    graph = r.build_graph()
    targets = [r.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, r.key)
    assert_allclose(loads(result), np.ones((27, 31)).sum(axis=1))

    raw = np.random.RandomState(0).rand(10, 10)
    a = mt.tensor(raw, chunk_size=(5, 4))
    b = a[a.argmin(axis=1), mt.tensor(np.arange(10))]
    graph = b.build_graph()
    targets = [b.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, b.key)
    np.testing.assert_array_equal(loads(result),
                                  raw[raw.argmin(axis=1), np.arange(10)])
def testMainDataFrameWithoutEtcd(self):
    import pandas as pd
    from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
    from mars.dataframe.datasource.series import from_pandas as from_pandas_series
    from mars.dataframe.arithmetic import add

    self.start_processes(etcd=False,
                         scheduler_args=['-Dscheduler.aggressive_assign=true'])

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    data1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = from_pandas_df(data2, chunk_size=6)
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas_df(data1, chunk_size=(10, 5))
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas_df(data2, chunk_size=(10, 6))
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    data1 = pd.DataFrame(np.random.rand(10, 10),
                         index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10),
                         index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas_df(data2, chunk_size=6)
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = from_pandas_series(s1)

    graph = series1.build_graph()
    targets = [series1.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, series1.key)
    pd.testing.assert_series_equal(s1, loads(result))
def setUp(self):
    self.worker_plasma_sock = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(Test))

    scheduler_port = str(get_next_port())
    proc_worker = subprocess.Popen([
        sys.executable, '-m', 'mars.worker',
        '-a', '127.0.0.1',
        '--level', 'debug',
        '--cpu-procs', '2',
        '--cache-mem', '10m',
        '--schedulers', '127.0.0.1:' + scheduler_port,
        '--plasma-socket', self.worker_plasma_sock,
        '--ignore-avail-mem'
    ])
    proc_scheduler = subprocess.Popen([
        sys.executable, '-m', 'mars.scheduler',
        '--nproc', '1',
        '--level', 'debug',
        '-H', '127.0.0.1',
        '-p', scheduler_port,
        '--format', '%(asctime)-15s %(message)s'
    ])

    self.scheduler_port = scheduler_port
    self.proc_worker = proc_worker
    self.proc_scheduler = proc_scheduler

    actor_client = new_client()
    time.sleep(2)

    # wait until the KVStoreActor is reachable on the scheduler
    check_time = time.time()
    while True:
        try:
            kv_ref = actor_client.actor_ref(
                KVStoreActor.default_name(),
                address='127.0.0.1:' + scheduler_port)
            if actor_client.has_actor(kv_ref):
                break
            else:
                raise SystemError('Check KVStoreActor timeout')
        except:  # noqa: E722
            if time.time() - check_time > 10:
                raise
            time.sleep(1)

    # wait until the worker has written its metadata timestamp
    check_time = time.time()
    while True:
        content = kv_ref.read('/workers/meta_timestamp', silent=True)
        if self.proc_scheduler.poll() is not None:
            raise SystemError('Scheduler not started. exit code %s'
                              % self.proc_scheduler.poll())
        if self.proc_worker.poll() is not None:
            raise SystemError('Worker not started. exit code %s'
                              % self.proc_worker.poll())
        if time.time() - check_time > 20:
            raise SystemError('Check meta_timestamp timeout')
        if not content:
            time.sleep(0.5)
        else:
            break

    # start the web frontend and wait until its API answers
    web_port = str(get_next_port())
    self.web_port = web_port
    proc_web = subprocess.Popen([
        sys.executable, '-m', 'mars.web',
        '-H', '127.0.0.1',
        '--level', 'debug',
        '--ui-port', web_port,
        '-s', '127.0.0.1:' + self.scheduler_port
    ])
    self.proc_web = proc_web

    service_ep = 'http://127.0.0.1:' + self.web_port
    check_time = time.time()
    while True:
        if time.time() - check_time > 30:
            raise SystemError('Wait for service start timeout')
        try:
            resp = requests.get(service_ep + '/api', timeout=1)
        except (requests.ConnectionError, requests.Timeout):
            time.sleep(1)
            continue
        if resp.status_code >= 400:
            time.sleep(1)
            continue
        break

    self.exceptions = gevent.hub.Hub.NOT_ERROR
    gevent.hub.Hub.NOT_ERROR = (Exception,)
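# setUp above replaces gevent.hub.Hub.NOT_ERROR and leaves scheduler, worker
# and web processes running, which implies a matching tearDown. The sketch
# below shows the restore/cleanup such a tearDown would need, reusing the
# attribute names assigned in setUp; it is an assumption, not the excerpt's
# actual tearDown:
def tearDown(self):
    for proc in (self.proc_web, self.proc_worker, self.proc_scheduler):
        if proc.poll() is None:
            proc.terminate()  # a real tearDown may escalate to kill() on timeout
    gevent.hub.Hub.NOT_ERROR = self.exceptions
    # the plasma socket created in setUp would also need removal
    if os.path.exists(self.worker_plasma_sock):
        os.unlink(self.worker_plasma_sock)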