Пример #1
0
def test_groupby_prune_read_parquet(gen_data1):
    """Check the columns recorded on the optimized ``read_parquet`` op.

    Two groupby aggregations are optimized in turn against the same parquet
    source; after each run the optimized source op must list only the
    expected columns.
    """
    pdf, tempdir = gen_data1
    parquet_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(parquet_path)

    def _optimize_single(result):
        # Build a one-result tileable graph and run the optimizer on it.
        tileable_graph = TileableGraph([result.data])
        next(TileableGraphBuilder(tileable_graph).build())
        return optimize(tileable_graph)

    source = md.read_parquet(parquet_path)

    # Aggregating column ``a`` grouped by ``c``.
    summed = source.groupby('c').agg({'a': 'sum'})
    records = _optimize_single(summed)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    assert opt_source.op.columns == ['a', 'c']
    # The user-facing tileable must still point at the original source.
    assert summed.inputs[0] is source.data

    # Counting column ``c`` grouped by itself.
    counted = source.groupby('c', as_index=False).c.agg({'cnt': 'count'})
    records = _optimize_single(counted)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['c']
Пример #2
0
def test_sort_head(prepare_data):
    """Check optimizer records for ``head(10)`` applied to a sorted frame.

    Covers both ``sort_values`` and ``sort_index``. In each case the sorted
    input has no optimization record, the optimized head op reports
    ``nrows == 10``, the graph holds two nodes and the optimized head is a
    graph result.
    """
    _, pdf = prepare_data

    def _check_head_of(sorted_frame):
        top_rows = sorted_frame.head(10)
        tileable_graph = TileableGraph([top_rows.data])
        next(TileableGraphBuilder(tileable_graph).build())
        records = optimize(tileable_graph)
        assert records.get_optimization_result(sorted_frame.data) is None
        opt_top_rows = records.get_optimization_result(top_rows.data)
        assert opt_top_rows.op.nrows == 10
        assert len(tileable_graph) == 2
        assert opt_top_rows in tileable_graph.results

    # Sorted by the values of column ``b``.
    _check_head_of(md.DataFrame(pdf, chunk_size=20).sort_values(by='b'))

    # Sorted by index, with column ``b`` promoted to the index first.
    indexed_pdf = pdf.copy()
    indexed_pdf.set_index('b', inplace=True)
    _check_head_of(md.DataFrame(indexed_pdf, chunk_size=20).sort_index())
Пример #3
0
def test_getitem_prune_read_parquet(gen_data1):
    """Check column pruning of ``read_parquet`` driven by two getitem results.

    A series selection (``df1.c``) and a frame selection (``df1[['a']]``)
    share one parquet source. After optimization the source, and both
    selections, must have optimization records; the optimized source must be
    wired in as the input of both optimized selections and must list only
    columns ``a`` and ``c``.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    df1 = md.read_parquet(file_path)
    df2 = df1.c
    df3 = df1[['a']]
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None
    # The optimized source must be linked both in the graph and in the
    # tileables' own input lists.
    assert opt_df1 in graph.predecessors(opt_df2)
    assert opt_df1 in opt_df2.inputs
    assert opt_df1 in graph.predecessors(opt_df3)
    assert opt_df1 in opt_df3.inputs
    assert opt_df1.op.columns == ['a', 'c']
    # original tileable should not be modified
    assert df2.inputs[0] is df1.data
    assert df3.inputs[0] is df1.data
Пример #4
0
async def test_get_tileable_graph(start_test_service):
    """Compare the JSON tileable graph of a task with the submitted graph.

    Also checks that asking for the graph of an unknown task id raises
    ``TaskNotExist``.
    """
    _sv_pool_address, task_api, storage_api = start_test_service

    def f1():
        return np.arange(5)

    def f2():
        return np.arange(5, 10)

    def f3(f1r, f2r):
        return np.concatenate([f1r, f2r]).sum()

    r1 = mr.spawn(f1)
    r2 = mr.spawn(f2)
    r3 = mr.spawn(f3, args=(r1, r2))

    graph = TileableGraph([r3.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)

    with pytest.raises(TaskNotExist):
        await task_api.get_tileable_graph_as_json('non_exist')

    tileable_detail = await task_api.get_tileable_graph_as_json(task_id)
    reported_tileables = tileable_detail.get('tileables')
    reported_dependencies = tileable_detail.get('dependencies')

    num_tileable = len(reported_tileables)
    num_dependencies = len(reported_dependencies)
    assert num_tileable > 0
    assert num_dependencies <= (num_tileable / 2) * (num_tileable / 2)

    # Either a lone node without edges, or several nodes with some edges.
    single_node = num_tileable == 1 and num_dependencies == 0
    connected = num_tileable > 1 and num_dependencies > 0
    assert single_node or connected

    # Collect what the submitted graph says the response should contain.
    expected_keys = []
    expected_links = []
    for node in graph.iter_nodes():
        expected_keys.append(node.key)
        expected_links.extend(
            {
                'fromTileableId': node.key,
                'toTileableId': successor.key,
                'linkType': 0,
            }
            for successor in graph.iter_successors(node)
        )

    # Every reported tileable must match exactly one key of the graph.
    for reported in reported_tileables:
        expected_keys.remove(reported.get('tileableId'))

    assert not expected_keys

    for position, dependency in enumerate(reported_dependencies):
        assert expected_links[position] == dependency
Пример #5
0
async def test_iterative_tiling(actor_pool):
    """Run a sum of two boolean-mask selections and compare with NumPy."""
    pool, session_id, meta_api, storage_api, manager = actor_pool

    rng = np.random.RandomState(0)
    left_raw = rng.rand(10, 10)
    right_raw = rng.rand(10, 10)
    left = mt.tensor(left_raw, chunk_size=5)
    right = mt.tensor(right_raw, chunk_size=5)

    # Rows are selected with a boolean mask on the first column.
    total = left[left[:, 0] < 3] + right[right[:, 0] < 3]
    tileable_graph = TileableGraph([total.data])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(tileable_graph,
                                                  fuse_enabled=False)
    assert isinstance(task_id, str)

    await manager.wait_task(task_id)
    outcome: TaskResult = await manager.get_task_result(task_id)

    assert outcome.status == TaskStatus.terminated
    assert outcome.error is None
    assert await manager.get_task_progress(task_id) == 1.0

    expected = left_raw[left_raw[:, 0] < 3] + right_raw[right_raw[:, 0] < 3]
    result_tileables = await manager.get_task_result_tileables(task_id)
    actual = await _merge_data(result_tileables[0], storage_api)
    np.testing.assert_array_equal(actual, expected)
Пример #6
0
async def test_run_tasks_with_same_name(actor_pool):
    """Submit two different graphs under one task name and check both results."""
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    raw = np.random.RandomState(0).rand(10, 10)
    base = mt.tensor(raw, chunk_size=5)
    plus_one = base + 1
    doubled = base * 2

    cases = zip([plus_one, doubled], [raw + 1, raw * 2])
    for tensor, expected in cases:
        tileable_graph = TileableGraph([tensor.data])
        next(TileableGraphBuilder(tileable_graph).build())

        # Both submissions deliberately reuse the same task name.
        task_id = await manager.submit_tileable_graph(
            tileable_graph, task_name='my_task', fuse_enabled=False)
        assert isinstance(task_id, str)

        await manager.wait_task(task_id)
        outcome: TaskResult = await manager.get_task_result(task_id)

        assert outcome.status == TaskStatus.terminated
        if outcome.error is not None:
            # Surface the task's own failure with its traceback.
            raise outcome.error.with_traceback(outcome.traceback)
        assert await manager.get_task_progress(task_id) == 1.0

        result_tileables = await manager.get_task_result_tileables(task_id)
        actual = await _merge_data(result_tileables[0], storage_api)
        np.testing.assert_array_equal(actual, expected)
Пример #7
0
async def test_cancel_task(actor_pool):
    """Cancel a running task and check it terminates and releases references.

    Ten remote functions that each sleep for 200 seconds are submitted; the
    cancellation plus result lookup must complete in under 20 seconds, and
    the tileable reference counts must be zero once the remote objects are
    dropped.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    def func():
        time.sleep(200)

    remotes = [mr.spawn(func) for _ in range(10)]

    tileable_graph = TileableGraph([remote.data for remote in remotes])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(tileable_graph,
                                                  fuse_enabled=False)
    assert isinstance(task_id, str)

    # Pause briefly before cancelling.
    await asyncio.sleep(.5)

    with Timer() as stopwatch:
        await manager.cancel_task(task_id)
        outcome = await manager.get_task_result(task_id)
        assert outcome.status == TaskStatus.terminated

    assert stopwatch.duration < 20

    # Remember the keys, then drop the remote objects themselves.
    remote_keys = [remote.key for remote in remotes]
    del remotes
    gc.collect()
    await asyncio.sleep(0.5)

    # test ref counts
    ref_counts = await lifecycle_api.get_tileable_ref_counts(remote_keys)
    assert ref_counts == [0] * len(remote_keys)
Пример #8
0
async def test_run_task(actor_pool):
    """Run ``a + 1`` through the task manager and verify data and ref counts."""
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    raw = np.random.RandomState(0).rand(10, 10)
    a = mt.tensor(raw, chunk_size=5)
    incremented = a + 1

    tileable_graph = TileableGraph([incremented.data])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(tileable_graph,
                                                  fuse_enabled=False)
    assert isinstance(task_id, str)

    await manager.wait_task(task_id)
    outcome: TaskResult = await manager.get_task_result(task_id)

    assert outcome.status == TaskStatus.terminated
    if outcome.error is not None:
        # Surface the task's own failure with its traceback.
        raise outcome.error.with_traceback(outcome.traceback)
    assert await manager.get_task_progress(task_id) == 1.0

    result_tileables = await manager.get_task_result_tileables(task_id)
    result_tileable = result_tileables[0]
    actual = await _merge_data(result_tileable, storage_api)
    np.testing.assert_array_equal(actual, raw + 1)

    # test ref counts
    tileable_counts = await lifecycle_api.get_tileable_ref_counts(
        [incremented.key])
    assert tileable_counts[0] == 1
    chunk_keys = [chunk.key for chunk in result_tileable.chunks]
    chunk_counts = await lifecycle_api.get_chunk_ref_counts(chunk_keys)
    assert chunk_counts == [1] * len(result_tileable.chunks)
Пример #9
0
async def test_task_execution(start_test_service):
    """Execute a three-function remote graph through the task API.

    Checks that the last idle time is ``None`` right after submission and
    set once the task has terminated, and that the stored result equals 45.
    """
    _sv_pool_address, task_api, storage_api = start_test_service

    def f1():
        return np.arange(5)

    def f2():
        return np.arange(5, 10)

    def f3(f1r, f2r):
        return np.concatenate([f1r, f2r]).sum()

    first = mr.spawn(f1)
    second = mr.spawn(f2)
    combined = mr.spawn(f3, args=(first, second))

    tileable_graph = TileableGraph([combined.data])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await task_api.submit_tileable_graph(tileable_graph,
                                                   fuse_enabled=False)
    assert await task_api.get_last_idle_time() is None
    assert isinstance(task_id, str)

    await task_api.wait_task(task_id)
    outcome = await task_api.get_task_result(task_id)

    assert outcome.status == TaskStatus.terminated
    assert await task_api.get_last_idle_time() is not None
    if outcome.error is not None:
        # Surface the task's own failure with its traceback.
        raise outcome.error.with_traceback(outcome.traceback)

    fetch_tileables = await task_api.get_fetch_tileables(task_id)
    data_key = fetch_tileables[0].chunks[0].key
    # sum(range(10)) == 45
    assert await storage_api.get(data_key) == 45
Пример #10
0
async def test_shuffle(actor_pool):
    """Run fancy indexing ``a[b]`` through the task manager and verify it.

    The input data comes from a seeded ``RandomState`` so that a failing run
    can be reproduced with exactly the same values.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    # Seeded generator instead of the global ``np.random`` state.
    rs = np.random.RandomState(0)
    raw = rs.rand(10, 10)
    raw2 = rs.randint(10, size=(10, ))
    a = mt.tensor(raw, chunk_size=5)
    b = mt.tensor(raw2, chunk_size=5)
    c = a[b]

    graph = TileableGraph([c.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    await manager.wait_task(task_id)
    task_result: TaskResult = await manager.get_task_result(task_id)

    assert task_result.status == TaskStatus.terminated
    assert task_result.error is None
    assert await manager.get_task_progress(task_id) == 1.0

    expect = raw[raw2]
    result_tileables = (await manager.get_task_result_tileables(task_id))[0]
    result = await _merge_data(result_tileables, storage_api)
    np.testing.assert_array_equal(result, expect)
Пример #11
0
async def test_task_progress(start_test_service):
    """Check the progress reported for a task as its function advances.

    The remote function waits on a shared ``_ProgressController`` before
    each of its two progress updates; the test calls ``set()`` on the
    controller before each later check and expects progress 0.0, 0.5 and
    1.0 in turn.
    """
    sv_pool_address, task_api, storage_api = start_test_service

    session_api = await SessionAPI.create(address=sv_pool_address)
    controller_ref = await session_api.create_remote_object(
        task_api._session_id, 'progress_controller', _ProgressController)

    def f1(count: int):
        progress_controller = get_context().get_remote_object(
            'progress_controller')
        for idx in range(count):
            progress_controller.wait()
            get_context().set_progress((1 + idx) * 1.0 / count)

    remote = mr.spawn(f1, args=(2, ))

    tileable_graph = TileableGraph([remote.data])
    next(TileableGraphBuilder(tileable_graph).build())

    await task_api.submit_tileable_graph(tileable_graph, fuse_enabled=False)

    # Nothing has been released yet, so no progress is expected.
    await asyncio.sleep(0.2)
    results = await task_api.get_task_results(progress=True)
    assert results[0].progress == 0.0

    # Each set() call is followed by a pause and a progress check.
    for expected_progress in (0.5, 1.0):
        await controller_ref.set()
        await asyncio.sleep(1)
        results = await task_api.get_task_results(progress=True)
        assert results[0].progress == expected_progress
Пример #12
0
def test_cannot_prune(gen_data1):
    """Check three graphs for which the optimizer must record nothing.

    In every scenario neither the ``read_csv`` source nor any of the result
    tileables may have an optimization record.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    def _assert_untouched(source, *results):
        tileable_graph = TileableGraph([result.data for result in results])
        next(TileableGraphBuilder(tileable_graph).build())
        records = optimize(tileable_graph)
        for tileable in (source, *results):
            assert records.get_optimization_result(tileable.data) is None

    # does not support prune
    source = md.read_csv(csv_path)
    _assert_untouched(source,
                      source.groupby('c').agg({'a': 'sum'}),
                      source + 1)

    # does not support prune, another rule
    source = md.read_csv(csv_path)
    _assert_untouched(source,
                      source.groupby('c').agg({'a': 'sum'}),
                      source.head(3))

    # all columns selected
    source = md.read_csv(csv_path)
    _assert_untouched(source, source[source.dtypes.index.tolist()])
Пример #13
0
async def test_optimization(actor_pool):
    """Run a groupby and a column selection over one CSV source end to end.

    The graph is submitted without ``fuse_enabled=False``, unlike the other
    task-manager tests in this file. Both results are compared with the
    pandas equivalents, then the reference counts of the second result
    tileable and its chunks are checked.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        pdf = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce')
        })
        pdf.to_csv(file_path, index=False)

        df = md.read_csv(file_path)
        df2 = df.groupby('c').agg({'a': 'sum'})
        df3 = df[['b', 'a']]

        graph = TileableGraph([df2.data, df3.data])
        next(TileableGraphBuilder(graph).build())

        task_id = await manager.submit_tileable_graph(graph)
        assert isinstance(task_id, str)

        await manager.wait_task(task_id)
        task_result: TaskResult = await manager.get_task_result(task_id)

        assert task_result.status == TaskStatus.terminated
        if task_result.error is not None:
            # Surface the task's own failure with its traceback.
            raise task_result.error.with_traceback(task_result.traceback)
        assert await manager.get_task_progress(task_id) == 1.0

        # First result: the groupby aggregation.
        expect = pdf.groupby('c').agg({'a': 'sum'})
        result_tileables = (await manager.get_task_result_tileables(task_id))
        result1 = result_tileables[0]
        result = await _merge_data(result1, storage_api)
        np.testing.assert_array_equal(result, expect)

        # Second result: the column selection.
        expect = pdf[['b', 'a']]
        result2 = result_tileables[1]
        result = await _merge_data(result2, storage_api)
        np.testing.assert_array_equal(result, expect)

        # test ref counts
        assert (await lifecycle_api.get_tileable_ref_counts([df3.key]))[0] == 1
        assert (await lifecycle_api.get_chunk_ref_counts([
            c.key for c in result_tileables[1].chunks
        ])) == [1] * len(result_tileables[1].chunks)
Пример #14
0
def test_read_csv_head(prepare_data, setup):
    """Check optimizer records and execution of ``head`` on ``read_csv``.

    Three scenarios: a single head, two heads sharing one source, and a
    head followed by another operation.
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)

    # Half the file size is passed as ``chunk_bytes``.
    half_size = os.stat(csv_path).st_size / 2
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = source.head(5)
    tileable_graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(tileable_graph) == 1
    assert opt_head5 in tileable_graph.results

    executed = head5.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    expected = pdf.head(5)
    pd.testing.assert_frame_equal(actual, expected)

    # test multiple head
    head10 = source.head(10)
    tileable_graph = TileableGraph([head5.data, head10.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    assert opt_source.op.nrows == 10
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5 is not None
    assert tileable_graph.predecessors(opt_head5)[0] is opt_source
    assert opt_head5.inputs[0] is opt_source
    opt_head10 = records.get_optimization_result(head10.data)
    assert opt_head10 is not None
    assert tileable_graph.predecessors(opt_head10)[0] is opt_source
    assert opt_head10.inputs[0] is opt_source

    # test head with successor
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = source.head(5)
    shifted = head5 + 1
    tileable_graph = TileableGraph([shifted.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(tileable_graph) == 2
Пример #15
0
def test_k_means_init_large_n_clusters():
    """Check chunk sizes produced by ``k-means||`` initialisation.

    Every chunk with a known ``nbytes`` must not exceed twice
    ``options.chunk_store_limit``.
    """
    size_limit = options.chunk_store_limit * 2
    n_cluster = 2000
    samples = mt.random.rand(1000_000, 64, chunk_size=250_000)

    centers = _init_centroids(samples, n_cluster, init='k-means||')
    tileable_graph = next(
        TileableGraphBuilder(TileableGraph([centers])).build())
    chunk_graph = next(ChunkGraphBuilder(tileable_graph).build())
    for chunk in chunk_graph:
        chunk_size = chunk.nbytes
        # Chunks whose size is NaN are skipped.
        if np.isnan(chunk_size):
            continue
        assert chunk_size <= size_limit
Пример #16
0
def test_sort_head(prepare_data, setup):
    """Check optimizer records and execution of ``head(10)`` after a sort.

    Covers ``sort_values`` and ``sort_index``; each executed result is
    compared with the pandas equivalent.
    """
    _, pdf = prepare_data

    # Sorted by the values of column ``b``.
    sorted_frame = md.DataFrame(pdf, chunk_size=20).sort_values(by='b')
    top_rows = sorted_frame.head(10)
    tileable_graph = TileableGraph([top_rows.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(sorted_frame.data) is None
    opt_top_rows = records.get_optimization_result(top_rows.data)
    assert opt_top_rows.op.nrows == 10
    assert len(tileable_graph) == 2
    assert opt_top_rows in tileable_graph.results

    executed = top_rows.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    expected = pdf.sort_values(by='b').head(10)
    pd.testing.assert_frame_equal(actual, expected)

    # Sorted by index, with column ``b`` promoted to the index first.
    indexed_pdf = pdf.copy()
    indexed_pdf.set_index('b', inplace=True)
    sorted_frame = md.DataFrame(indexed_pdf, chunk_size=20).sort_index()
    top_rows = sorted_frame.head(10)
    tileable_graph = TileableGraph([top_rows.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(sorted_frame.data) is None
    opt_top_rows = records.get_optimization_result(top_rows.data)
    assert opt_top_rows.op.nrows == 10
    assert len(tileable_graph) == 2
    assert opt_top_rows in tileable_graph.results

    executed = top_rows.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    expected = indexed_pdf.sort_index().head(10)
    pd.testing.assert_frame_equal(actual, expected)
Пример #17
0
def test_value_counts_head(prepare_data, chunk_size):
    """Check optimizer records for ``head(3)`` applied to ``value_counts``."""
    _, pdf = prepare_data
    frame = md.DataFrame(pdf, chunk_size=chunk_size)

    counts = frame['a'].value_counts()
    top_counts = counts.head(3)
    tileable_graph = TileableGraph([top_counts.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    # The value_counts node has no record; the head node does.
    assert records.get_optimization_result(counts.data) is None
    opt_top_counts = records.get_optimization_result(top_counts.data)
    assert opt_top_counts.op.nrows == 3
    assert len(tileable_graph) == 3
    assert opt_top_counts in tileable_graph.results
Пример #18
0
def test_groupby_and_getitem(gen_data1):
    """Check ``usecols`` on ``read_csv`` shared by a groupby and a getitem.

    The optimized source must precede both optimized results in the graph
    and its op must report ``usecols == ['a', 'b', 'c']``.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    selected = source[['b', 'a']]
    tileable_graph = TileableGraph([grouped.data, selected.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_grouped = records.get_optimization_result(grouped.data)
    assert opt_grouped is not None
    assert opt_source in tileable_graph.predecessors(opt_grouped)
    opt_selected = records.get_optimization_result(selected.data)
    assert opt_selected is not None
    assert opt_source in tileable_graph.predecessors(opt_selected)
    assert opt_source.op.usecols == ['a', 'b', 'c']
    # The user-facing tileables must still point at the original source.
    assert grouped.inputs[0] is source.data
    assert selected.inputs[0] is source.data
Пример #19
0
def test_no_head(prepare_data):
    """Check two graphs for which the optimizer must record nothing.

    The first uses an ``iloc[1:10]`` slice; the second uses ``head(3)`` on a
    source that also feeds ``df1 + 1``.
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)

    def _assert_untouched(origin, *results):
        tileable_graph = TileableGraph([result.data for result in results])
        next(TileableGraphBuilder(tileable_graph).build())
        records = optimize(tileable_graph)
        for tileable in (origin, *results):
            assert records.get_optimization_result(tileable.data) is None

    half_size = os.stat(csv_path).st_size / 2
    source = md.read_csv(csv_path, chunk_bytes=half_size)

    _assert_untouched(source, source.iloc[1:10])

    _assert_untouched(source, source.head(3), source + 1)
Пример #20
0
def test_read_csv_head(prepare_data):
    """Check optimizer records for ``head`` applied to ``read_csv``.

    Three scenarios: a single head, two heads sharing one source, and a
    head followed by another operation.
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)

    # Half the file size is passed as ``chunk_bytes``.
    half_size = os.stat(csv_path).st_size / 2
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = source.head(5)
    tileable_graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(tileable_graph) == 1
    assert opt_head5 in tileable_graph.results

    # test multiple head
    head10 = source.head(10)
    tileable_graph = TileableGraph([head5.data, head10.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    assert opt_source.op.nrows == 10
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5 is not None
    assert tileable_graph.predecessors(opt_head5)[0] is opt_source
    assert opt_head5.inputs[0] is opt_source
    opt_head10 = records.get_optimization_result(head10.data)
    assert opt_head10 is not None
    assert tileable_graph.predecessors(opt_head10)[0] is opt_source
    assert opt_head10.inputs[0] is opt_source

    # test head with successor
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = source.head(5)
    shifted = head5 + 1
    tileable_graph = TileableGraph([shifted.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(tileable_graph) == 2
Пример #21
0
def test_cupy():
    """Check that ``CupyRuntimeOptimizer`` adds a ``TensorCpFuseChunk`` node."""
    ones_2d = mt.ones((100, 50), chunk_size=50, gpu=True)
    ones_1d = mt.ones(50, chunk_size=50, gpu=True)
    scale = mt.sqrt(ones_1d * (1 - ones_1d) * len(ones_1d))
    expr = (ones_2d - ones_1d) / scale

    tileable_graph = TileableGraph([expr.data])
    next(TileableGraphBuilder(tileable_graph).build())
    tile_context = {}
    builder = ChunkGraphBuilder(tileable_graph,
                                fuse_enabled=False,
                                tile_context=tile_context)
    chunk_graph = next(builder.build())

    CupyRuntimeOptimizer(chunk_graph).optimize()
    # The check matches on the operand's class name.
    op_class_names = (node.op.__class__.__name__ for node in chunk_graph)
    assert any(name == 'TensorCpFuseChunk' for name in op_class_names)
Пример #22
0
async def test_task_error(start_test_service):
    """Check that an exception raised in a remote function is reported.

    Ten remote functions that raise ``SystemError`` are submitted; the first
    task result must carry an error of exactly that type.
    """
    _sv_pool_address, task_api, storage_api = start_test_service

    # Every spawned function fails immediately.
    def f1():
        raise SystemError

    failing = [mr.spawn(f1) for _ in range(10)]

    tileable_graph = TileableGraph([remote.data for remote in failing])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await task_api.submit_tileable_graph(tileable_graph,
                                                   fuse_enabled=False)

    await task_api.wait_task(task_id, timeout=10)
    task_results = await task_api.get_task_results(progress=True)
    first_error = task_results[0].error
    assert type(first_error) is SystemError
Пример #23
0
def test_read_parquet_head(prepare_data):
    """Check optimizer records for ``head(5)`` on a parquet directory.

    The source frame is written as three parquet files of 40-row slices.
    """
    tempdir, pdf = prepare_data
    parquet_dir = os.path.join(tempdir, 'test_parquet')
    os.makedirs(parquet_dir)
    for part in range(3):
        part_path = os.path.join(parquet_dir, f'test{part}.parquet')
        part_rows = pdf[part * 40: (part + 1) * 40]
        part_rows.to_parquet(part_path, index=False)

    source = md.read_parquet(parquet_dir)
    head5 = source.head(5)
    tileable_graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(tileable_graph) == 1
    assert opt_head5 in tileable_graph.results
Пример #24
0
async def test_error_task(actor_pool):
    """Check that a failing computation reports ``FloatingPointError``.

    The tensor expression ``ones / 0`` is created under
    ``mt.errstate(divide='raise')``.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    with mt.errstate(divide='raise'):
        ones = mt.ones((10, 10), chunk_size=10)
        quotient = ones / 0

    tileable_graph = TileableGraph([quotient.data])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(tileable_graph,
                                                  fuse_enabled=False)
    assert isinstance(task_id, str)

    await manager.wait_task(task_id)
    outcome: TaskResult = await manager.get_task_result(task_id)

    # The task terminates, but with an error attached.
    assert outcome.status == TaskStatus.terminated
    assert outcome.error is not None
    assert isinstance(outcome.error, FloatingPointError)
Пример #25
0
def test_value_counts_head(prepare_data, setup, chunk_size):
    """Check optimizer records and execution of ``head(3)`` on value counts.

    ``value_counts`` is called with ``method='tree'``; the executed result
    is compared with the pandas equivalent.
    """
    _, pdf = prepare_data
    frame = md.DataFrame(pdf, chunk_size=chunk_size)

    counts = frame['a'].value_counts(method='tree')
    top_counts = counts.head(3)
    tileable_graph = TileableGraph([top_counts.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(counts.data) is None
    opt_top_counts = records.get_optimization_result(top_counts.data)
    assert opt_top_counts.op.nrows == 3
    assert len(tileable_graph) == 3
    assert opt_top_counts in tileable_graph.results

    executed = top_counts.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    expected = pdf['a'].value_counts().head(3)
    pd.testing.assert_series_equal(actual, expected)
Пример #26
0
def test_groupby_prune_read_sql(gen_data2):
    """Check the columns recorded on the optimized ``read_sql_table`` op.

    The frame is written to a SQLite table; a groupby count over column
    ``a`` must leave only ``['a']`` on the optimized source op.
    """
    pdf, tempdir = gen_data2
    db_path = os.path.join(tempdir, 'test.db')
    uri = 'sqlite:///' + db_path
    table_name = 'test'
    pdf.to_sql(table_name, uri, index=False)

    # test read df with columns
    source = md.read_sql_table(table_name, uri, chunk_size=4)
    counted = source.groupby('a', as_index=False).a.agg({'cnt': 'count'})
    tileable_graph = TileableGraph([counted.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['a']
    # The user-facing tileable must still point at the original source.
    assert counted.inputs[0] is source.data
Пример #27
0
async def test_shuffle(actor_pool):
    """Run fancy indexing ``a[b]`` and verify data, ref counts and storage.

    After the result is checked, the result tileable is dereferenced; no
    chunk reference counts and no memory-level storage entries may remain.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    rng = np.random.RandomState(0)
    raw = rng.rand(10, 10)
    raw_index = rng.randint(10, size=(10, ))
    data = mt.tensor(raw, chunk_size=5)
    index = mt.tensor(raw_index, chunk_size=5)
    picked = data[index]

    tileable_graph = TileableGraph([picked.data])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(tileable_graph,
                                                  fuse_enabled=False)
    assert isinstance(task_id, str)

    await manager.wait_task(task_id)
    outcome: TaskResult = await manager.get_task_result(task_id)

    assert outcome.status == TaskStatus.terminated
    if outcome.error is not None:
        # Surface the task's own failure with its traceback.
        raise outcome.error.with_traceback(outcome.traceback)
    assert await manager.get_task_progress(task_id) == 1.0

    expected = raw[raw_index]
    result_tileables = await manager.get_task_result_tileables(task_id)
    result_tileable = result_tileables[0]
    actual = await _merge_data(result_tileable, storage_api)
    np.testing.assert_array_equal(actual, expected)

    # test ref counts
    tileable_counts = await lifecycle_api.get_tileable_ref_counts([picked.key])
    assert tileable_counts[0] == 1
    chunk_keys = [chunk.key for chunk in result_tileable.chunks]
    chunk_counts = await lifecycle_api.get_chunk_ref_counts(chunk_keys)
    assert chunk_counts == [1] * len(result_tileable.chunks)
    await lifecycle_api.decref_tileables([picked.key])
    remaining_counts = await lifecycle_api.get_all_chunk_ref_counts()
    assert len(remaining_counts) == 0

    # test if exists in storage
    from mars.storage import StorageLevel
    stored = await storage_api.list(level=StorageLevel.MEMORY)
    assert len(stored) == 0
Пример #28
0
async def test_cancel_task(actor_pool):
    """Cancel a running task and check that it terminates quickly.

    Ten remote functions that each sleep for 20 seconds are submitted; the
    cancellation plus result lookup must complete in under 15 seconds.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    def func():
        time.sleep(20)

    remotes = [mr.spawn(func) for _ in range(10)]

    tileable_graph = TileableGraph([remote.data for remote in remotes])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(tileable_graph,
                                                  fuse_enabled=False)
    assert isinstance(task_id, str)

    # Pause briefly before cancelling.
    await asyncio.sleep(.5)

    with Timer() as stopwatch:
        await manager.cancel_task(task_id)
        outcome = await manager.get_task_result(task_id)
        assert outcome.status == TaskStatus.terminated

    assert stopwatch.duration < 15
Пример #29
0
def test_read_csv_head(gen_data1):
    """Check chunk-level optimizer records for ``head(5)`` on ``read_csv``.

    The tileable graph is tiled into a chunk graph first, and the optimizer
    is then run on the chunk graph.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    head5 = source.head(5)
    tileable_graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(tileable_graph).build())

    # The tile context is used below to look up each tileable's chunks.
    tile_context = {}
    builder = ChunkGraphBuilder(tileable_graph,
                                fuse_enabled=False,
                                tile_context=tile_context)
    chunk_graph = next(builder.build())
    source_chunk = tile_context[source.data].chunks[0].data
    head_chunk = tile_context[head5.data].chunks[0].data

    records = optimize(chunk_graph)
    assert records.get_optimization_result(source_chunk) is None
    opt_head_chunk = records.get_optimization_result(head_chunk)
    assert opt_head_chunk.op.nrows == 5
    assert len(chunk_graph) == 1
    assert opt_head_chunk in chunk_graph.results
Пример #30
0
def test_read_parquet_head(prepare_data, setup):
    """Check optimizer records and execution of ``head(5)`` on parquet.

    The source frame is written as three parquet files of 40-row slices;
    the executed result is compared with ``pdf.head(5)``.
    """
    tempdir, pdf = prepare_data
    parquet_dir = os.path.join(tempdir, 'test_parquet')
    os.makedirs(parquet_dir)
    for part in range(3):
        part_path = os.path.join(parquet_dir, f'test{part}.parquet')
        part_rows = pdf[part * 40: (part + 1) * 40]
        part_rows.to_parquet(part_path, index=False)

    source = md.read_parquet(parquet_dir)
    head5 = source.head(5)
    tileable_graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(tileable_graph).build())
    records = optimize(tileable_graph)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(tileable_graph) == 1
    assert opt_head5 in tileable_graph.results

    executed = head5.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    expected = pdf.head(5)
    pd.testing.assert_frame_equal(actual, expected)