async def test_cancel_subtask(actor_pool):
    """Check that ``manager.free_slot`` releases a busy runner slot.

    Two cases are exercised against the ``manager`` taken from the
    ``actor_pool`` fixture:

    1. a 2-second subtask freed with ``timeout=5``;
    2. a 100-second subtask freed with ``timeout=1``.

    In both cases the call must return in under 5 seconds and the slot must
    be reported free afterwards.
    """
    _pool, session_id, _meta_api, _storage_api, manager = actor_pool

    def sleep(timeout: int):
        time.sleep(timeout)
        return timeout

    # Case 1: the free_slot timeout (5 s) is longer than the subtask (2 s).
    short_job = mr.spawn(sleep, 2)
    short_subtask = _gen_subtask(short_job, session_id)
    runner: SubtaskRunnerActor = await manager.get_free_slot()
    asyncio.create_task(runner.run_subtask(short_subtask))
    # short pause before freeing the slot (presumably so the subtask has started)
    await asyncio.sleep(0.2)
    with Timer() as short_timer:
        await manager.free_slot(runner, timeout=5)
    # freeing must not consume the whole 5 second timeout
    assert short_timer.duration < 5
    assert await manager.is_slot_free(runner) is True

    # Case 2: the free_slot timeout (1 s) is far shorter than the subtask (100 s).
    long_job = mr.spawn(sleep, 100)
    long_subtask = _gen_subtask(long_job, session_id)
    runner = await manager.get_free_slot()
    asyncio.create_task(runner.run_subtask(long_subtask))
    await asyncio.sleep(0.2)
    with Timer() as long_timer:
        free_task = asyncio.create_task(manager.free_slot(runner, timeout=1))
        # while the release is still pending the slot is reported as busy
        assert await manager.is_slot_free(runner) is False
        await free_task
    # need 1 sec to reach timeout, then killing actor and wait for auto recovering;
    # the whole sequence must stay under 5 sec
    assert long_timer.duration < 5
    assert await manager.is_slot_free(runner) is True
async def test_cancel_subtask(actor_pool):
    """Check ``cancel_subtask`` on the runner actor in slot ``('numa-0', 0)``.

    A 100-second subtask is cancelled under a 1-second ``asyncio.wait_for``
    limit, which is expected to raise ``asyncio.TimeoutError``.  The runner
    is then killed, polled until it answers as free again, and a 2-second
    subtask is cancelled under a 6-second limit.
    """
    pool, session_id, _meta_api, _storage_api, _manager = actor_pool
    runner_uid = SubtaskRunnerActor.gen_uid('numa-0', 0)
    subtask_runner: SubtaskRunnerRef = await mo.actor_ref(
        runner_uid, address=pool.external_address)

    def sleep(timeout: int):
        time.sleep(timeout)
        return timeout

    async def wait_slot_restore():
        # Poll until the runner answers; connection-level errors mean it is
        # not reachable yet, so retry after half a second.  An answer other
        # than ``True`` fails the assertion and propagates.
        while True:
            try:
                runner_free = await subtask_runner.is_runner_free()
            except (mo.ServerClosed, ConnectionRefusedError, mo.ActorNotExist):
                await asyncio.sleep(0.5)
                continue
            assert runner_free is True
            return

    # --- long subtask: the cancel call itself times out ---------------------
    long_job = mr.spawn(sleep, 100)
    long_subtask = _gen_subtask(long_job, session_id)
    asyncio.create_task(subtask_runner.run_subtask(long_subtask))
    await asyncio.sleep(0.2)
    with Timer() as long_timer:
        cancel_task = asyncio.create_task(asyncio.wait_for(
            subtask_runner.cancel_subtask(), timeout=1))
        # the runner is still busy while the cancel is pending
        assert await subtask_runner.is_runner_free() is False
        with pytest.raises(asyncio.TimeoutError):
            await cancel_task
    # need 1 sec to reach timeout, then killing actor and wait for auto recovering
    # the time would not be over 5 sec
    assert long_timer.duration < 5

    await mo.kill_actor(subtask_runner)
    await wait_slot_restore()

    # --- short subtask: normal cancel ----------------------------------------
    short_job = mr.spawn(sleep, 2)
    short_subtask = _gen_subtask(short_job, session_id)
    asyncio.create_task(subtask_runner.run_subtask(short_subtask))
    await asyncio.sleep(0.2)
    with Timer() as short_timer:
        await asyncio.wait_for(subtask_runner.cancel_subtask(), timeout=6)
    # do not need to wait 10 sec
    assert short_timer.duration < 10
    assert await subtask_runner.is_runner_free() is True
async def test_cancel_task(actor_pool):
    """Cancel a task of ten 200-second remote functions and check cleanup.

    The task must reach ``TaskStatus.terminated`` within 20 seconds of the
    cancel request.  After the tileables are dropped and garbage-collected,
    their reference counts reported by ``lifecycle_api`` must all be zero.
    """
    (_pool, _session_id, _meta_api,
     lifecycle_api, _storage_api, manager) = actor_pool

    def func():
        time.sleep(200)

    tileables = [mr.spawn(func) for _ in range(10)]

    graph = TileableGraph([t.data for t in tileables])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    await asyncio.sleep(.5)

    with Timer() as cancel_timer:
        await manager.cancel_task(task_id)
        task_result = await manager.get_task_result(task_id)
        assert task_result.status == TaskStatus.terminated

    assert cancel_timer.duration < 20

    # Comprehensions are used on purpose: their loop variable does not leak,
    # so no local keeps a tileable referenced after the ``del`` below.
    tileable_keys = [t.key for t in tileables]
    del tileables
    gc.collect()
    await asyncio.sleep(0.5)

    # test ref counts
    ref_counts = await lifecycle_api.get_tileable_ref_counts(tileable_keys)
    assert ref_counts == [0] * len(tileable_keys)
async def test_supervisor_peer_locator(actor_pool, temp_address_file):
    """Drive a ``SupervisorPeerLocatorActor`` through an address file.

    The test rewrites ``temp_address_file`` several times and checks that the
    node info collector is filled with the initial addresses, that
    ``watch_supervisors_by_keys`` follows a shrinking address list, and that
    ``wait_all_supervisors_ready`` takes more than 0.4 seconds while a
    background task rewrites the file.
    """
    addresses = [
        '1.2.3.4:1234',
        '1.2.3.4:1235',
        '1.2.3.4:1236',
        '1.2.3.4:1237',
    ]

    def rewrite_address_file(lines):
        # Overwrite the file with one entry per line (no trailing newline).
        with open(temp_address_file, 'w') as file_obj:
            file_obj.write('\n'.join(lines))

    rewrite_address_file(addresses)

    pool_address = actor_pool.external_address
    locator_ref = await mo.create_actor(
        SupervisorPeerLocatorActor, 'test', temp_address_file,
        uid=SupervisorPeerLocatorActor.default_uid(),
        address=pool_address)

    # test starting nodes filled
    info_ref = await mo.actor_ref(
        uid=NodeInfoCollectorActor.default_uid(), address=pool_address)
    known_nodes = await info_ref.get_nodes_info()
    assert set(known_nodes) == set(addresses)

    # test watch nodes changes
    version, result = await locator_ref.watch_supervisors_by_keys(
        ['mock_name'])
    assert result[0] in addresses

    remaining = addresses[2:]
    rewrite_address_file(remaining)

    version, result = await locator_ref.watch_supervisors_by_keys(
        ['mock_name'], version=version)
    assert result[0] in addresses[2:]

    # test wait all supervisors ready
    rewrite_address_file(
        f'{a},{idx % 2}' for idx, a in enumerate(addresses))

    async def delay_read_fun():
        # After 0.2 s flip the second field of every entry, and 0.3 s later
        # restore the plain address list.
        await asyncio.sleep(0.2)
        rewrite_address_file(
            f'{a},{(idx + 1) % 2}' for idx, a in enumerate(addresses))
        await asyncio.sleep(0.3)
        rewrite_address_file(addresses)

    asyncio.create_task(delay_read_fun())

    with Timer() as ready_timer:
        await locator_ref.wait_all_supervisors_ready()
    assert ready_timer.duration > 0.4

    await mo.destroy_actor(locator_ref)
async def test_changing_locator(actor_pool):
    """Query a ``SupervisorLocatorActor`` through its ``watch_*`` methods.

    NOTE(review): the actor is created with the ``'fixed'`` backend yet the
    test expects ``wait_all_supervisors_ready`` to take more than 0.1 s,
    which is the opposite of what ``test_fixed_locator`` asserts for the same
    setup -- confirm which expectation is intended.
    """
    addresses = [
        '1.2.3.4:1234',
        '1.2.3.4:1235',
        '1.2.3.4:1236',
        '1.2.3.4:1237',
    ]
    lookup_address = ','.join(addresses)
    locator_ref = await mo.create_actor(
        SupervisorLocatorActor, 'fixed', lookup_address,
        address=actor_pool.external_address)

    # The same query is issued twice and element 0 of each reply is checked.
    # NOTE(review): another test unpacks this reply as (version, result);
    # verify that element 0 is really an address here.
    first_reply = await locator_ref.watch_supervisors_by_keys(['mock_name'])
    assert first_reply[0] in addresses
    second_reply = await locator_ref.watch_supervisors_by_keys(['mock_name'])
    assert second_reply[0] in addresses

    watched = await locator_ref.watch_supervisors()
    assert all(addr in addresses for addr in watched)

    with Timer() as ready_timer:
        await locator_ref.wait_all_supervisors_ready()
    assert ready_timer.duration > 0.1

    await mo.destroy_actor(locator_ref)
async def test_fixed_locator(actor_pool):
    """Check a ``SupervisorLocatorActor`` created with the ``'fixed'`` backend.

    Lookups must return addresses from the configured list (one address, or
    two when a size of 2 is requested), and ``wait_all_supervisors_ready``
    must return in under 0.1 seconds.
    """
    addresses = [
        '1.2.3.4:1234',
        '1.2.3.4:1235',
        '1.2.3.4:1236',
        '1.2.3.4:1237',
    ]
    lookup_address = ','.join(addresses)
    locator_ref = await mo.create_actor(
        SupervisorLocatorActor, 'fixed', lookup_address,
        address=actor_pool.external_address)

    single_addr = await locator_ref.get_supervisor('mock_name')
    assert single_addr in addresses

    dbl_addrs = await locator_ref.get_supervisor('mock_name', 2)
    assert len(dbl_addrs) == 2
    assert all(addr in addresses for addr in dbl_addrs)

    with Timer() as ready_timer:
        await locator_ref.wait_all_supervisors_ready()
    assert ready_timer.duration < 0.1

    await mo.destroy_actor(locator_ref)
def test_merge_index_value():
    """Check ``merge_index_value`` on large range indexes, within one second.

    Two inputs are merged, each made of 20 parsed ``pd.RangeIndex`` objects
    of 10 million entries:

    * 20 identical ranges ``[0, 1e7)``: the merged value converts to an empty
      int64 index while ``min_val`` / ``max_val`` still span the range;
    * 20 contiguous ranges: the merged value converts to one
      ``RangeIndex`` covering ``[0, 2e8)``.

    Integer constants are used for the range bounds; ``pd.RangeIndex``
    documents integer arguments, so float literals such as ``1e7`` are
    avoided.
    """
    chunk_len = 10_000_000
    n_chunks = 20

    with Timer() as timer:
        # identical range indexes
        index_values = {
            i: parse_index(pd.RangeIndex(chunk_len)) for i in range(n_chunks)
        }
        index_value = merge_index_value(index_values)
        pd.testing.assert_index_equal(index_value.to_pandas(),
                                      pd.Index([], dtype=np.int64))
        assert index_value.min_val == 0
        assert index_value.max_val == chunk_len - 1

        # range indexes that are continuous
        index_values = {
            i: parse_index(pd.RangeIndex(i * chunk_len, (i + 1) * chunk_len))
            for i in range(n_chunks)
        }
        index_value = merge_index_value(index_values)
        pd.testing.assert_index_equal(index_value.to_pandas(),
                                      pd.RangeIndex(chunk_len * n_chunks))
        assert index_value.min_val == 0
        assert index_value.max_val == chunk_len * n_chunks - 1
    # duration is read after the timed block has exited
    assert timer.duration < 1
async def test_cancel_task(actor_pool):
    """Cancel a task of ten 20-second remote functions.

    The task must be reported as ``TaskStatus.terminated`` and the
    cancel-plus-query sequence must finish in under 15 seconds.
    """
    _pool, _session_id, _meta_api, _storage_api, manager = actor_pool

    def func():
        time.sleep(20)

    tileables = [mr.spawn(func) for _ in range(10)]

    graph = TileableGraph([t.data for t in tileables])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    # short pause between submitting and cancelling
    await asyncio.sleep(.5)

    with Timer() as cancel_timer:
        await manager.cancel_task(task_id)
        task_result = await manager.get_task_result(task_id)
        assert task_result.status == TaskStatus.terminated

    assert cancel_timer.duration < 15
async def test_task_cancel(start_test_service):
    """Cancel a task of ten 100-second remote functions through ``task_api``.

    The cancelled task must be ``TaskStatus.terminated`` within 20 seconds.
    Afterwards a last idle time must be available and every result returned
    by ``get_task_results(progress=True)`` must be terminated.
    """
    _sv_pool_address, task_api, _storage_api = start_test_service

    # test job cancel
    def f1():
        time.sleep(100)

    tileables = [mr.spawn(f1) for _ in range(10)]

    graph = TileableGraph([t.data for t in tileables])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
    await asyncio.sleep(.5)

    with Timer() as cancel_timer:
        await task_api.cancel_task(task_id)
        cancelled = await task_api.get_task_result(task_id)
        assert cancelled.status == TaskStatus.terminated
    assert cancel_timer.duration < 20

    await asyncio.sleep(.1)
    last_idle = await task_api.get_last_idle_time()
    assert last_idle is not None

    all_results = await task_api.get_task_results(progress=True)
    assert all(res.status == TaskStatus.terminated for res in all_results)
async def test_subtask_service(actor_pools):
    """Run and cancel subtasks through ``SubtaskAPI`` on a two-pool setup.

    Supervisor and worker services are started from one shared config, a
    ``FakeTaskManager`` is (re)created for the session, and then:

    1. ``ones((10, 10)) + 1`` is run in slot ``('numa-0', 0)``; its result is
       checked in storage and its chunk meta is checked to name the worker
       band;
    2. a 1-second sleeping subtask is started in the same slot and cancelled,
       which must complete in under 2 seconds.

    The mock storage is cleaned up in a ``finally`` clause so that a failing
    assertion does not skip the cleanup.
    """
    sv_pool, worker_pool = actor_pools

    config = {
        "services": ["cluster", "session", "meta", "lifecycle",
                     "scheduling", "subtask", "task"],
        "cluster": {
            "backend": "fixed",
            "lookup_address": sv_pool.external_address,
            "resource": {"numa-0": 2}
        },
        "meta": {
            "store": "dict"
        },
        "scheduling": {},
        "subtask": {},
    }
    await start_services(
        NodeRole.SUPERVISOR, config, address=sv_pool.external_address)
    await start_services(
        NodeRole.WORKER, config, address=worker_pool.external_address)

    session_id = 'test_session'
    session_api = await SessionAPI.create(sv_pool.external_address)
    await session_api.create_session(session_id)

    # destroy the actor registered under the FakeTaskManager uid and create a
    # FakeTaskManager in its place
    ref = await mo.actor_ref(FakeTaskManager.gen_uid(session_id),
                             address=sv_pool.external_address)
    await mo.destroy_actor(ref)
    await mo.create_actor(
        FakeTaskManager, session_id,
        uid=FakeTaskManager.gen_uid(session_id),
        address=sv_pool.external_address)

    subtask_api = await SubtaskAPI.create(worker_pool.external_address)
    # create mock meta and storage APIs
    meta_api = await MetaAPI.create(session_id, sv_pool.external_address)
    storage_api = await MockStorageAPI.create(
        session_id, worker_pool.external_address)

    try:
        a = mt.ones((10, 10), chunk_size=10)
        b = a + 1

        subtask = _gen_subtask(b, session_id)
        await subtask_api.run_subtask_in_slot('numa-0', 0, subtask)

        # check storage
        expected = np.ones((10, 10)) + 1
        result_key = subtask.chunk_graph.results[0].key
        result = await storage_api.get(result_key)
        np.testing.assert_array_equal(expected, result)

        # check meta
        chunk_meta = await meta_api.get_chunk_meta(result_key)
        assert chunk_meta is not None
        assert chunk_meta['bands'][0] == \
            (worker_pool.external_address, 'numa-0')

        def sleep(timeout: int):
            time.sleep(timeout)
            return timeout

        b = mr.spawn(sleep, 1)

        subtask2 = _gen_subtask(b, session_id)
        asyncio.create_task(
            subtask_api.run_subtask_in_slot('numa-0', 0, subtask2))
        await asyncio.sleep(0.2)
        with Timer() as timer:
            # normal cancel by cancel asyncio Task
            await asyncio.wait_for(
                subtask_api.cancel_subtask_in_slot('numa-0', 0), timeout=2)
        # the cancel must return before the 2 sec wait_for limit is reached
        assert timer.duration < 2
    finally:
        # always release the mock storage, even when an assertion above fails
        await MockStorageAPI.cleanup(worker_pool.external_address)
async def test_execute_with_cancel(actor_pool, cancel_phase):
    """Cancel a running subtask and check that execution can continue.

    Depending on ``cancel_phase`` (``'prepare'``, ``'quota'`` or ``'slot'``)
    one actor is told to delay by 100 via ``set_delay_fetch_time``; any other
    value delays nothing.  A subtask wrapping a 100-second remote function is
    started, cancelled after one second with ``kill_timeout=1``, and must
    raise ``asyncio.CancelledError`` within 6 seconds.  Finally a 0.5-second
    subtask is run on the same band to show the slot is usable again.
    """
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool
    band_name = 'numa-0'
    pool_address = pool.external_address

    # config for different phases: pick the actor whose work gets delayed
    if cancel_phase == 'prepare':
        delayed_uid = StorageManagerActor.default_uid()
    elif cancel_phase == 'quota':
        delayed_uid = QuotaActor.gen_uid('numa-0')
    elif cancel_phase == 'slot':
        delayed_uid = BandSlotManagerActor.gen_uid('numa-0')
    else:
        delayed_uid = None

    ref_to_delay = None
    if delayed_uid is not None:
        ref_to_delay = await mo.actor_ref(delayed_uid, address=pool_address)
    if ref_to_delay:
        await ref_to_delay.set_delay_fetch_time(100)

    def delay_fun(delay, _inp1):
        time.sleep(delay)
        return delay

    # build a two-node chunk graph: fetched input -> long-running remote call
    fetch_op = TensorFetch(
        key='input1', source_key='input1', dtype=np.dtype(int))
    input1 = fetch_op.new_chunk([])
    remote_op = RemoteFunction(
        function=delay_fun, function_args=[100, input1],
        function_kwargs={}, n_output=1)
    remote_result = remote_op.new_chunk([input1])

    data1 = np.random.rand(10, 10)
    await meta_api.set_chunk_meta(
        input1, memory_size=data1.nbytes, store_size=data1.nbytes,
        bands=[(pool_address, 'numa-0')])
    await storage_api.put(input1.key, data1)

    chunk_graph = ChunkGraph([remote_result])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(remote_result)
    chunk_graph.add_edge(input1, remote_result)

    subtask = Subtask(f'test_task_{uuid.uuid4()}', session_id=session_id,
                      chunk_graph=chunk_graph)
    aiotask = asyncio.create_task(
        execution_ref.run_subtask(subtask, band_name, pool_address))
    await asyncio.sleep(1)

    with Timer() as cancel_timer:
        await execution_ref.cancel_subtask(
            subtask.subtask_id, kill_timeout=1)
        with pytest.raises(asyncio.CancelledError):
            await asyncio.wait_for(aiotask, timeout=30)
    assert cancel_timer.duration < 6

    # check for different phases: the delayed actor must have seen the
    # cancel; its delay is then reset so the follow-up subtask is not held up
    if ref_to_delay is not None:
        assert await ref_to_delay.get_is_cancelled()
        await ref_to_delay.set_delay_fetch_time(0)

    # test if slot is restored
    remote_tileable = mr.spawn(delay_fun, args=(0.5, None))
    graph = TileableGraph([remote_tileable.data])
    next(TileableGraphBuilder(graph).build())

    chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build())

    subtask = Subtask(f'test_task2_{uuid.uuid4()}', session_id=session_id,
                      chunk_graph=chunk_graph)
    follow_up = execution_ref.run_subtask(subtask, band_name, pool_address)
    await asyncio.wait_for(follow_up, timeout=30)
async def test_task_service(actor_pools, use_web_api):
    """Submit, wait for and cancel tasks through the task service API.

    Supervisor and worker services are started from one shared config; when
    ``use_web_api`` is true the ``web`` service is added and the task API is
    a ``WebTaskAPI`` pointed at the web actor's address, otherwise it is a
    ``TaskAPI``.

    Two scenarios are covered:

    1. a three-function graph whose result, the sum of ``arange(10)``, must
       be 45 in storage, with the last idle time going from ``None`` to a
       value;
    2. a task of ten 100-second functions that is cancelled and must be
       ``TaskStatus.terminated`` within 20 seconds.

    The service list names each service once (the original listed
    ``"lifecycle"`` twice), and the mock storage is cleaned up in a
    ``finally`` clause so that a failing assertion does not skip it.
    """
    sv_pool, worker_pool = actor_pools

    config = {
        "services": ["cluster", "session", "lifecycle", "meta",
                     "scheduling", "task", "subtask"],
        "cluster": {
            "backend": "fixed",
            "lookup_address": sv_pool.external_address,
            "resource": {"numa-0": 2}
        },
        "meta": {
            "store": "dict"
        },
        "scheduling": {},
        "task": {},
    }
    if use_web_api:
        config['services'].append('web')

    await start_services(
        NodeRole.SUPERVISOR, config, address=sv_pool.external_address)
    await start_services(
        NodeRole.WORKER, config, address=worker_pool.external_address)

    session_id = 'test_session'
    session_api = await SessionAPI.create(sv_pool.external_address)
    await session_api.create_session(session_id)

    if not use_web_api:
        task_api = await TaskAPI.create(session_id, sv_pool.external_address)
    else:
        web_actor = await mo.actor_ref(WebActor.default_uid(),
                                       address=sv_pool.external_address)
        web_address = await web_actor.get_web_address()
        task_api = WebTaskAPI(session_id, web_address)

    # create mock meta and storage APIs
    _ = await MetaAPI.create(session_id, sv_pool.external_address)
    storage_api = await MockStorageAPI.create(
        session_id, worker_pool.external_address)

    try:
        def f1():
            return np.arange(5)

        def f2():
            return np.arange(5, 10)

        def f3(f1r, f2r):
            return np.concatenate([f1r, f2r]).sum()

        r1 = mr.spawn(f1)
        r2 = mr.spawn(f2)
        r3 = mr.spawn(f3, args=(r1, r2))

        graph = TileableGraph([r3.data])
        next(TileableGraphBuilder(graph).build())
        task_id = await task_api.submit_tileable_graph(
            graph, fuse_enabled=False)
        assert await task_api.get_last_idle_time() is None
        assert isinstance(task_id, str)

        await task_api.wait_task(task_id)
        task_result = await task_api.get_task_result(task_id)

        assert task_result.status == TaskStatus.terminated
        assert await task_api.get_last_idle_time() is not None
        if task_result.error is not None:
            # surface the task's own failure with its recorded traceback
            raise task_result.error.with_traceback(task_result.traceback)

        result_tileable = (await task_api.get_fetch_tileables(task_id))[0]
        data_key = result_tileable.chunks[0].key
        # sum(range(10)) == 45
        assert await storage_api.get(data_key) == 45

        # test job cancel
        def f4():
            time.sleep(100)

        rs = [mr.spawn(f4) for _ in range(10)]

        graph = TileableGraph([r.data for r in rs])
        next(TileableGraphBuilder(graph).build())

        task_id = await task_api.submit_tileable_graph(
            graph, fuse_enabled=False)
        await asyncio.sleep(.5)
        with Timer() as timer:
            await task_api.cancel_task(task_id)
            result = await task_api.get_task_result(task_id)
            assert result.status == TaskStatus.terminated
        assert timer.duration < 20
        await asyncio.sleep(.1)
        assert await task_api.get_last_idle_time() is not None
    finally:
        # always release the mock storage, even when an assertion above fails
        await MockStorageAPI.cleanup(worker_pool.external_address)