def testOperandPrepush(self):
    """Start both input operands and wait for the middle operand to finish.

    Execution-ref lookup and worker assignment are patched so that every
    operand uses the fake execution actor and the single mock worker.

    Raises:
        TimeoutError: if the middle operand has not reached
            ``OperandState.FINISHED`` after more than 30 seconds.
    """
    session_id = str(uuid.uuid4())
    graph_key = str(uuid.uuid4())
    mock_workers = ['localhost:12345']

    with self._prepare_test_graph(session_id, graph_key, mock_workers) as (pool, graph_ref):
        # The output keys are unpacked only to validate the returned triple.
        in_keys, middle_key, _out_keys = self._filter_graph_level_op_keys(graph_ref)

        fake_exec_ref = pool.create_actor(FakeExecutionActor, 0.5)
        input_refs = []
        for op_key in in_keys:
            input_refs.append(pool.actor_ref(OperandActor.gen_uid(session_id, op_key)))
        mid_ref = pool.actor_ref(OperandActor.gen_uid(session_id, middle_key))

        def _fake_raw_execution_ref(*_, **__):
            # Always hand out the fake execution actor.
            return fake_exec_ref

        def _fake_worker_assignments(*_):
            # Always assign to the mock worker list.
            return mock_workers

        with patch_method(OperandActor._get_raw_execution_ref,
                          new=_fake_raw_execution_ref):
            with patch_method(AssignerActor.get_worker_assignments,
                              new=_fake_worker_assignments):
                input_refs[0].start_operand(OperandState.READY)
                input_refs[1].start_operand(OperandState.READY)

                began = time.time()
                # submission without pre-push will fail
                while mid_ref.get_state() != OperandState.FINISHED:
                    pool.sleep(0.1)
                    elapsed = time.time() - began
                    if elapsed > 30:
                        raise TimeoutError('Check middle chunk state timed out.')
def testExecuteWorker(self):
    """Run the worker-process test actor against a started worker.

    Raises:
        TimeoutError: if the test actor has produced no reply after more
            than 20 seconds.
    """
    with self._start_worker_process() as (pool, worker_endpoint):
        tester = pool.create_actor(WorkerProcessTestActor)
        tester.run_test(worker_endpoint, _tell=True)

        began = time.time()
        # Poll the test actor until it reports a truthy reply.
        while True:
            if tester.get_reply():
                break
            gevent.sleep(0.1)
            waited = time.time() - began
            if waited > 20:
                raise TimeoutError('Check reply timeout')
def testExecuteCudaWorker(self):
    """Run the worker-process test actor with ``calc_device='cuda'``.

    The CUDA device passed to the worker is the first entry of the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``'0'`` when unset.

    Raises:
        TimeoutError: if the test actor has produced no reply after more
            than 2000 seconds.
    """
    # Text before the first comma, i.e. the first listed device.
    dev_id = os.environ.get('CUDA_VISIBLE_DEVICES', '0').partition(',')[0]

    with self._start_worker_process(no_cuda=False, cuda_device=dev_id) as (pool, worker_endpoint):
        tester = pool.create_actor(WorkerProcessTestActor)
        tester.run_test(worker_endpoint, calc_device='cuda', _tell=True)

        began = time.time()
        # Poll the test actor until it reports a truthy reply.
        while True:
            if tester.get_reply():
                break
            gevent.sleep(0.1)
            waited = time.time() - began
            if waited > 2000:
                raise TimeoutError('Check reply timeout')
def testWorkerProcessRestart(self):
    """Kill the actor process of the first 'cpu' slot and await its return.

    The kill is issued through the worker daemon actor; the test then polls
    the daemon until it reports the actor process as alive.

    Raises:
        TimeoutError: if the process is not reported alive after more than
            10 seconds.
    """
    with self._start_worker_process() as (pool, worker_endpoint):
        daemon = pool.actor_ref(WorkerDaemonActor.default_uid(), address=worker_endpoint)
        dispatcher = pool.actor_ref(DispatchActor.default_uid(), address=worker_endpoint)

        first_cpu_slot = dispatcher.get_slots('cpu')[0]
        target = pool.actor_ref(first_cpu_slot, address=worker_endpoint)
        daemon.kill_actor_process(target)

        began = time.time()
        while True:
            if daemon.is_actor_process_alive(target):
                break
            gevent.sleep(0.1)
            waited = time.time() - began
            if waited > 10:
                raise TimeoutError('Check process restart timeout')
def testWorkerProcessRestart(self):
    """Kill the actor process of the first 'cpu' slot and await its return.

    Creates a scheduler-side actor pool, spawns a ``mars.worker``
    subprocess pointing at it, kills the actor process behind the first
    'cpu' slot through the worker daemon and polls until the daemon reports
    that process as alive.

    Raises:
        TimeoutError: if the process is not reported alive after more than
            10 seconds.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    # Bind ``proc`` before ``try``: if pool/actor setup or ``Popen`` itself
    # fails, the cleanup below must not raise UnboundLocalError and mask the
    # original exception.
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(SchedulerClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=SchedulerClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem',
            ])
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name(), address=worker_endpoint)
            dispatch_ref = pool.actor_ref(DispatchActor.default_name(), address=worker_endpoint)
            cpu_slots = dispatch_ref.get_slots('cpu')
            calc_ref = pool.actor_ref(cpu_slots[0], address=worker_endpoint)
            daemon_ref.kill_actor_process(calc_ref)

            check_start = time.time()
            while not daemon_ref.is_actor_process_alive(calc_ref):
                gevent.sleep(0.1)
                if time.time() - check_start > 10:
                    raise TimeoutError('Check process restart timeout')
    finally:
        if proc is not None and proc.poll() is None:
            # Request a graceful stop first and give the worker about five
            # seconds to exit before killing it.
            proc.send_signal(signal.SIGINT)
            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        # Remove the plasma socket file if one is left at the configured path.
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def waiter():
    """Wait until the resource actor reports workers, then record their keys.

    ``resource_ref``, ``proc`` and ``worker_ips`` are taken from the
    enclosing scope. Once ``resource_ref.get_workers_meta()`` is truthy, its
    keys are appended to ``worker_ips``.

    Raises:
        SystemError: if ``proc`` has exited while waiting.
        TimeoutError: if no worker meta shows up after more than 20 seconds.
    """
    began = time.time()
    while not resource_ref.get_workers_meta():
        gevent.sleep(0.1)
        # A dead worker process will never register; fail right away.
        if proc.poll() is not None:
            raise SystemError('Worker dead. exit code %s' % proc.poll())
        if time.time() - began > 20:
            raise TimeoutError('Check meta_timestamp timeout')

    workers_meta = resource_ref.get_workers_meta()
    worker_ips.extend(workers_meta.keys())
def testExecuteWorker(self):
    """Run the worker-process test actor against a spawned worker.

    Creates a scheduler-side actor pool, spawns a ``mars.worker``
    subprocess pointing at it, asks a ``WorkerProcessTestActor`` to run its
    test against the worker endpoint and polls until it reports a reply.

    Raises:
        TimeoutError: if the test actor has produced no reply after more
            than 20 seconds.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    # Bind ``proc`` before ``try``: if pool/actor setup or ``Popen`` itself
    # fails, the cleanup below must not raise UnboundLocalError and mask the
    # original exception.
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(SchedulerClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=SchedulerClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem',
            ])
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            test_ref = pool.create_actor(WorkerProcessTestActor)
            test_ref.run_test(worker_endpoint, _tell=True)

            check_time = time.time()
            while not test_ref.get_reply():
                gevent.sleep(0.1)
                if time.time() - check_time > 20:
                    raise TimeoutError('Check reply timeout')
    finally:
        if proc is not None and proc.poll() is None:
            # Request a graceful stop first and give the worker about five
            # seconds to exit before killing it.
            proc.send_signal(signal.SIGINT)
            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        # Remove the plasma socket file if one is left at the configured path.
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)