Example #1
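This example exercises the subtask manager: two subtasks are added, one is submitted to two GPU bands, both are then cancelled and their results are checked for the cancelled status; finally an error injected into the queue is expected to propagate to a subsequently added subtask.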
async def test_subtask_manager(actor_pool):
    pool, session_id, execution_ref, manager_ref, queue_ref, task_manager_ref = actor_pool

    subtask1 = Subtask('subtask1', session_id)
    subtask2 = Subtask('subtask2', session_id)

    await manager_ref.add_subtasks([subtask1, subtask2], [(1, ), (2, )])
    await manager_ref.submit_subtask_to_band(subtask1.subtask_id,
                                             (pool.external_address, 'gpu-0'))
    await manager_ref.submit_subtask_to_band(subtask1.subtask_id,
                                             (pool.external_address, 'gpu-1'))

    await manager_ref.cancel_subtasks(
        [subtask1.subtask_id, subtask2.subtask_id])
    await asyncio.wait_for(asyncio.gather(
        execution_ref.wait_subtask(subtask1.subtask_id, 'gpu-0'),
        execution_ref.wait_subtask(subtask1.subtask_id, 'gpu-1'),
    ),
                           timeout=10)
    assert (await task_manager_ref.get_result(subtask1.subtask_id)).status \
           == SubtaskStatus.cancelled
    assert (await task_manager_ref.get_result(subtask2.subtask_id)).status \
           == SubtaskStatus.cancelled

    subtask3 = Subtask('subtask3', session_id)

    await queue_ref.set_error(ValueError())
    await manager_ref.add_subtasks.tell([subtask3], [(3, )])
    await asyncio.sleep(0.1)
    subtask3_result = await task_manager_ref.get_result(subtask3.subtask_id)
    assert subtask3_result.status == SubtaskStatus.errored
    assert isinstance(subtask3_result.error, ValueError)
Example #2
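This example exercises subtask queueing against a band capacity of 2: five subtasks are enqueued by priority, submitted, removed or re-prioritized, and the order of committed subtask ids is verified after each submission.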
async def test_subtask_queueing(actor_pool):
    _pool, session_id, queueing_ref, slots_ref, manager_ref = actor_pool
    await slots_ref.set_capacity(2)

    subtasks = [Subtask(str(i)) for i in range(5)]
    priorities = [(i, ) for i in range(5)]

    await queueing_ref.add_subtasks(subtasks, priorities)
    # queue: [4 3 2 1 0]

    await queueing_ref.submit_subtasks()
    # queue: [2 1 0]
    committed_subtask_ids, _committed_bands = await manager_ref.dump_data()
    assert committed_subtask_ids == ['4', '3']

    await queueing_ref.remove_queued_subtasks(['1'])
    # queue: [2 0]
    await queueing_ref.update_subtask_priority.batch(
        queueing_ref.update_subtask_priority.delay('0', (3, )),
        queueing_ref.update_subtask_priority.delay('4', (5, )),
    )
    # queue: [0(3) 2]
    await queueing_ref.submit_subtasks()
    # queue: []
    committed_subtask_ids, _committed_bands = await manager_ref.dump_data()
    assert committed_subtask_ids == ['4', '3', '0', '2']
Example #3
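This example builds a chunk graph whose result op has gpu=True and checks that the assigner places the subtask on a GPU band.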
async def test_assign_gpu_tasks(actor_pool):
    pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool

    input1 = TensorFetch(key='a', source_key='a',
                         dtype=np.dtype(int)).new_chunk([])
    input2 = TensorFetch(key='b', source_key='b',
                         dtype=np.dtype(int)).new_chunk([])
    result_chunk = TensorTreeAdd(args=[input1, input2], gpu=True) \
        .new_chunk([input1, input2])

    chunk_graph = ChunkGraph([result_chunk])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(input2)
    chunk_graph.add_node(result_chunk)
    chunk_graph.add_edge(input1, result_chunk)
    chunk_graph.add_edge(input2, result_chunk)

    await meta_api.set_chunk_meta(input1,
                                  memory_size=200,
                                  store_size=200,
                                  bands=[('address0', 'numa-0')])
    await meta_api.set_chunk_meta(input2,
                                  memory_size=200,
                                  store_size=200,
                                  bands=[('address0', 'numa-0')])

    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    [result] = await assigner_ref.assign_subtasks([subtask])
    assert result[1].startswith('gpu')
Example #4
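A helper that turns a tileable into a Subtask by building its tileable graph and an unfused chunk graph.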
def _gen_subtask(t, session_id):
    graph = TileableGraph([t.data])
    next(TileableGraphBuilder(graph).build())

    chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build())
    subtask = Subtask(new_task_id(), session_id, new_task_id(), chunk_graph)

    return subtask
Example #5
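A helper that creates a batch of subtasks bound to an expected band and enqueues them with increasing priorities.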
async def _queue_subtasks(num_subtasks, expect_bands, queueing_ref):
    if not num_subtasks:
        return
    subtasks = [
        Subtask(expect_bands[0] + '-' + str(i)) for i in range(num_subtasks)
    ]
    for subtask in subtasks:
        subtask.expect_bands = [expect_bands]
    priorities = [(i, ) for i in range(num_subtasks)]

    await queueing_ref.add_subtasks(subtasks, priorities)
Example #6
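This example runs a TensorTreeAdd subtask end to end on a band: input chunks are registered in meta and put into storage, the subtask is executed, and the stored result, the quota request and the recorded chunk meta are verified.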
async def test_execute_tensor(actor_pool):
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool

    data1 = np.random.rand(10, 10)
    data2 = np.random.rand(10, 10)

    input1 = TensorFetch(key='input1',
                         source_key='input1',
                         dtype=np.dtype(int)).new_chunk([])
    input2 = TensorFetch(key='input2',
                         source_key='input2',
                         dtype=np.dtype(int)).new_chunk([])
    result_chunk = TensorTreeAdd(args=[input1, input2]) \
        .new_chunk([input1, input2], shape=data1.shape, dtype=data1.dtype)

    await meta_api.set_chunk_meta(input1,
                                  memory_size=data1.nbytes,
                                  store_size=data1.nbytes,
                                  bands=[(pool.external_address, 'numa-0')])
    await meta_api.set_chunk_meta(input2,
                                  memory_size=data2.nbytes,
                                  store_size=data2.nbytes,
                                  bands=[(pool.external_address, 'numa-0')])
    # todo use different storage level when storage ready
    await storage_api.put(input1.key, data1)
    await storage_api.put(input2.key, data2)

    chunk_graph = ChunkGraph([result_chunk])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(input2)
    chunk_graph.add_node(result_chunk)
    chunk_graph.add_edge(input1, result_chunk)
    chunk_graph.add_edge(input2, result_chunk)

    subtask = Subtask('test_task',
                      session_id=session_id,
                      chunk_graph=chunk_graph)
    await execution_ref.run_subtask(subtask, 'numa-0', pool.external_address)

    # check if results are correct
    result = await storage_api.get(result_chunk.key)
    np.testing.assert_array_equal(data1 + data2, result)

    # check if quota computations are correct
    quota_ref = await mo.actor_ref(QuotaActor.gen_uid('numa-0'),
                                   address=pool.external_address)
    [quota] = await quota_ref.get_batch_quota_reqs()
    assert quota[(subtask.session_id, subtask.subtask_id)] == data1.nbytes

    # check if metas are correct
    result_meta = await meta_api.get_chunk_meta(result_chunk.key)
    assert result_meta['object_id'] == result_chunk.key
    assert result_meta['shape'] == result.shape
Example #7
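This example cancels a slow remote-function subtask without killing its slot, then runs a second subtask in the same slot to confirm that a marker set by the first function is still visible, i.e. the worker process survived the cancellation.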
async def test_cancel_without_kill(actor_pool):
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool

    def delay_fun(delay):
        import mars
        time.sleep(delay)
        mars._slot_marker = 1
        return delay

    def check_fun():
        import mars
        return getattr(mars, '_slot_marker', False)

    remote_result = RemoteFunction(function=delay_fun, function_args=[2],
                                   function_kwargs={}).new_chunk([])
    chunk_graph = ChunkGraph([remote_result])
    chunk_graph.add_node(remote_result)

    subtask = Subtask(f'test_task_{uuid.uuid4()}', session_id=session_id,
                      chunk_graph=chunk_graph)
    aiotask = asyncio.create_task(execution_ref.run_subtask(
        subtask, 'numa-0', pool.external_address))
    await asyncio.sleep(0.5)

    await execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1)
    with pytest.raises(asyncio.CancelledError):
        await asyncio.wait_for(aiotask, timeout=30)

    remote_result = RemoteFunction(function=check_fun, function_args=[],
                                   function_kwargs={}).new_chunk([])
    chunk_graph = ChunkGraph([remote_result])
    chunk_graph.add_node(remote_result)

    subtask = Subtask(f'test_task_{uuid.uuid4()}', session_id=session_id,
                      chunk_graph=chunk_graph)
    await execution_ref.run_subtask(
        subtask, 'numa-0', pool.external_address)

    # check if results are correct
    assert await storage_api.get(remote_result.key)
Example #8
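This example checks locality-aware assignment: with inputs of different sizes stored on different addresses, the subtask is expected to be assigned to one of the bands holding the larger inputs.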
async def test_assigner(actor_pool):
    pool, session_id, assigner_ref, meta_api = actor_pool

    input1 = TensorFetch(key='a', source_key='a',
                         dtype=np.dtype(int)).new_chunk([])
    input2 = TensorFetch(key='b', source_key='b',
                         dtype=np.dtype(int)).new_chunk([])
    input3 = TensorFetch(key='c', source_key='c',
                         dtype=np.dtype(int)).new_chunk([])
    result_chunk = TensorTreeAdd(args=[input1, input2, input3]) \
        .new_chunk([input1, input2, input3])

    chunk_graph = ChunkGraph([result_chunk])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(input2)
    chunk_graph.add_node(input3)
    chunk_graph.add_node(result_chunk)
    chunk_graph.add_edge(input1, result_chunk)
    chunk_graph.add_edge(input2, result_chunk)
    chunk_graph.add_edge(input3, result_chunk)

    await meta_api.set_chunk_meta(input1,
                                  memory_size=200,
                                  store_size=200,
                                  bands=[('address0', 'numa-0')])
    await meta_api.set_chunk_meta(input2,
                                  memory_size=400,
                                  store_size=400,
                                  bands=[('address1', 'numa-0')])
    await meta_api.set_chunk_meta(input3,
                                  memory_size=400,
                                  store_size=400,
                                  bands=[('address2', 'numa-0')])

    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    [result] = await assigner_ref.assign_subtasks([subtask])
    assert result in (('address1', 'numa-0'), ('address2', 'numa-0'))
Example #9
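This example cancels a running subtask while it is stalled in a particular phase (storage prepare, quota request, or slot acquisition, selected by cancel_phase): the cancellation must finish quickly, the delayed actor must report that it was cancelled, and a follow-up subtask verifies that the slot is released.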
async def test_execute_with_cancel(actor_pool, cancel_phase):
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool

    # config for different phases
    ref_to_delay = None
    if cancel_phase == 'prepare':
        ref_to_delay = await mo.actor_ref(StorageManagerActor.default_uid(),
                                          address=pool.external_address)
    elif cancel_phase == 'quota':
        ref_to_delay = await mo.actor_ref(QuotaActor.gen_uid('numa-0'),
                                          address=pool.external_address)
    elif cancel_phase == 'slot':
        ref_to_delay = await mo.actor_ref(
            BandSlotManagerActor.gen_uid('numa-0'),
            address=pool.external_address)
    if ref_to_delay:
        await ref_to_delay.set_delay_fetch_time(100)

    def delay_fun(delay, _inp1):
        time.sleep(delay)
        return delay

    input1 = TensorFetch(key='input1',
                         source_key='input1',
                         dtype=np.dtype(int)).new_chunk([])
    remote_result = RemoteFunction(function=delay_fun, function_args=[100, input1],
                                   function_kwargs={}, n_output=1) \
        .new_chunk([input1])

    data1 = np.random.rand(10, 10)
    await meta_api.set_chunk_meta(input1,
                                  memory_size=data1.nbytes,
                                  store_size=data1.nbytes,
                                  bands=[(pool.external_address, 'numa-0')])
    await storage_api.put(input1.key, data1)

    chunk_graph = ChunkGraph([remote_result])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(remote_result)
    chunk_graph.add_edge(input1, remote_result)

    subtask = Subtask(f'test_task_{uuid.uuid4()}',
                      session_id=session_id,
                      chunk_graph=chunk_graph)
    aiotask = asyncio.create_task(
        execution_ref.run_subtask(subtask, 'numa-0', pool.external_address))
    await asyncio.sleep(1)

    with Timer() as timer:
        await execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1)
        with pytest.raises(asyncio.CancelledError):
            await asyncio.wait_for(aiotask, timeout=30)
    assert timer.duration < 6

    # check for different phases
    if ref_to_delay is not None:
        assert await ref_to_delay.get_is_cancelled()
        await ref_to_delay.set_delay_fetch_time(0)

    # test if slot is restored
    remote_tileable = mr.spawn(delay_fun, args=(0.5, None))
    graph = TileableGraph([remote_tileable.data])
    next(TileableGraphBuilder(graph).build())

    chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build())

    subtask = Subtask(f'test_task2_{uuid.uuid4()}',
                      session_id=session_id,
                      chunk_graph=chunk_graph)
    await asyncio.wait_for(execution_ref.run_subtask(subtask, 'numa-0',
                                                     pool.external_address),
                           timeout=30)
Example #10
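This example checks CPU band assignment under node status changes: stopping workers are excluded, expect_bands are honored only when still available, and requesting a GPU op on a cluster without GPU slots raises NoMatchingSlots.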
async def test_assign_cpu_tasks(actor_pool):
    pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool

    input1 = TensorFetch(key='a', source_key='a',
                         dtype=np.dtype(int)).new_chunk([])
    input2 = TensorFetch(key='b', source_key='b',
                         dtype=np.dtype(int)).new_chunk([])
    input3 = TensorFetch(key='c', source_key='c',
                         dtype=np.dtype(int)).new_chunk([])
    result_chunk = TensorTreeAdd(args=[input1, input2, input3]) \
        .new_chunk([input1, input2, input3])

    chunk_graph = ChunkGraph([result_chunk])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(input2)
    chunk_graph.add_node(input3)
    chunk_graph.add_node(result_chunk)
    chunk_graph.add_edge(input1, result_chunk)
    chunk_graph.add_edge(input2, result_chunk)
    chunk_graph.add_edge(input3, result_chunk)

    await meta_api.set_chunk_meta(input1,
                                  memory_size=200,
                                  store_size=200,
                                  bands=[('address0', 'numa-0')])
    await meta_api.set_chunk_meta(input2,
                                  memory_size=400,
                                  store_size=400,
                                  bands=[('address1', 'numa-0')])
    await meta_api.set_chunk_meta(input3,
                                  memory_size=400,
                                  store_size=400,
                                  bands=[('address2', 'numa-0')])

    await cluster_api.set_node_status(node='address1',
                                      role=NodeRole.WORKER,
                                      status=NodeStatus.STOPPING)
    await cluster_api.set_node_status(node='address3',
                                      role=NodeRole.WORKER,
                                      status=NodeStatus.STOPPING)

    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    [result] = await assigner_ref.assign_subtasks([subtask])
    assert result in (('address0', 'numa-0'), ('address2', 'numa-0'))

    subtask.expect_bands = [('address0', 'numa-0')]
    [result] = await assigner_ref.assign_subtasks([subtask])
    assert result == ('address0', 'numa-0')

    subtask.expect_bands = [('address0', 'numa-0'), ('address1', 'numa-0')]
    [result] = await assigner_ref.assign_subtasks([subtask])
    assert result == ('address0', 'numa-0')

    subtask.expect_bands = [('address1', 'numa-0')]
    [result] = await assigner_ref.assign_subtasks([subtask])
    assert result in (('address0', 'numa-0'), ('address2', 'numa-0'))

    result_chunk.op.gpu = True
    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    with pytest.raises(NoMatchingSlots) as err:
        await assigner_ref.assign_subtasks([subtask])
    assert 'gpu' in str(err.value)