예제 #1
0
def test__futures_to_collection(s, a, b):
    """Check ``_futures_to_collection`` against the type-specific helpers.

    For each kind of remote data (dataframes, arrays, lists) the collection
    produced by the generic ``_futures_to_collection`` is compared with the
    one produced by the matching ``_futures_to_dask_*`` helper.

    ``s`` provides the ``ip``/``port`` the Executor connects to.  ``a`` and
    ``b`` are not used by the body (presumably workers injected by the test
    harness -- the decorator is not visible here, confirm there).
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # Use dedicated local names instead of rebinding the ``b`` parameter.
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)

    assert type(bag) == type(bag2)
    # Compare the two bags with each other; the previous check compared a
    # graph with itself and therefore could never fail.
    assert bag.dask == bag2.dask

    yield e._shutdown()
예제 #2
0
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
        header=True, names=None, **kwargs):
    """Build a dask dataframe from the CSV files matching ``fn``.

    Written as a generator-style coroutine: the result is delivered by
    raising ``gen.Return``.

    Parameters
    ----------
    fn : pattern handed to ``hdfs.glob`` to list the input files.
    executor : resolved through ``default_executor``.
    hdfs : filesystem object; a fresh ``HDFileSystem()`` is used when falsy.
    lazy : when true, return ``from_imperative(...)`` over the delayed
        pieces; otherwise submit them with ``executor.compute`` and wrap the
        resulting futures with ``_futures_to_dask_dataframe``.
    lineterminator : used as the block delimiter for ``read_binary`` and
        forwarded to the CSV parsing calls through ``kwargs``.
    header : when true and ``names`` is None, column names are taken from
        the first five rows of the first matched file.
    names : explicit column names; skips the sniffing step when given.
    **kwargs : forwarded to ``pd.read_csv`` (sniffing) and ``buffer_to_csv``.
    """
    from itertools import chain
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = hdfs.glob(fn)
    # ``path`` rather than ``fn``: avoids shadowing the parameter (and, on
    # Python 2, rebinding it when the comprehension variable leaks).
    blockss = [read_binary(path, executor, hdfs, lazy=True,
                           delimiter=lineterminator)
               for path in filenames]
    if names is None and header:
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    # The first block of every file is parsed with ``skiprows=1``.
    # NOTE(review): this happens even when ``header`` is false, which looks
    # like it would drop a data row for header-less files -- confirm intent.
    dfs1 = [[do(buffer_to_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(buffer_to_csv)(b, names=names, **kwargs) for b in blocks[1:]]
            for blocks in blockss]
    # Flatten in linear time; ``sum(dfs1, [])`` copies the accumulator on
    # every addition and is quadratic in the number of files.
    dfs2 = list(chain.from_iterable(dfs1))
    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs2, columns=names))
    else:
        futures = executor.compute(*dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
예제 #3
0
def test_dataframes(e, s, a, b):
    """Compare a dataframe built from remote futures with a local twin.

    Ten 100-row frames of random ``x``/``y`` values are mapped onto the
    executor ``e`` and assembled with ``_futures_to_dask_dataframe``; the
    same frames are assembled locally into ``dd.DataFrame``.  Divisions, the
    full result and a list of derived expressions must agree.

    ``s``, ``a`` and ``b`` are not used by the body.
    """
    dfs = []
    for i in range(0, 100*10, 100):
        columns = {'x': np.random.random(100),
                   'y': np.random.random(100)}
        dfs.append(pd.DataFrame(columns, index=list(range(i, i + 100))))

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    local_graph = dict(((name, i), df) for i, df in enumerate(dfs))
    local_divisions = list(range(0, 1000, 100)) + [999]
    ldf = dd.DataFrame(local_graph, name, dfs[0].columns, local_divisions)

    assert rdf.divisions == ldf.divisions

    future = e.compute(rdf)
    result = yield future._result()
    expected = ldf.compute(get=dask.get)
    tm.assert_frame_equal(result, expected)

    # Each expression is evaluated once locally and once on the executor.
    exprs = (
        lambda frame: frame.x.mean(),
        lambda frame: frame.y.std(),
        lambda frame: frame.assign(z=frame.x + frame.y).drop_duplicates(),
        lambda frame: frame.index,
        lambda frame: frame.x,
        lambda frame: frame.x.cumsum(),
        lambda frame: frame.groupby(['x', 'y']).count(),
        lambda frame: frame.loc[50:75],
    )
    for expr in exprs:
        local = expr(ldf).compute(get=dask.get)
        future = e.compute(expr(rdf))
        pending = future._result()
        remote = yield gen.with_timeout(timedelta(seconds=5), pending)
        assert_equal(local, remote)
예제 #4
0
def test__futures_to_collection(s, a, b):
    """Verify the generic collection builder matches the specific helpers.

    Dataframe, array and list futures are each converted twice: once with
    ``_futures_to_collection`` and once with the dedicated
    ``_futures_to_dask_dataframe`` / ``_futures_to_dask_array`` /
    ``_futures_to_dask_bag`` helper, and the resulting graphs are compared.

    ``s`` provides the ``ip``/``port`` the Executor connects to.  ``a`` and
    ``b`` are unused in the body (presumably harness-provided workers --
    confirm against the decorator, which is outside this snippet).
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # Separate names keep the ``b`` parameter from being rebound.
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)

    assert type(bag) == type(bag2)
    # The graphs of the two bags are compared with each other; comparing a
    # graph with itself (as before) is always true and checks nothing.
    assert bag.dask == bag2.dask

    yield e._shutdown()
예제 #5
0
def test_dataframes(e, s, a, b):
    """Check remote-backed and locally built dataframes give equal results.

    Random 100-row frames (ten of them, with consecutive integer indexes)
    are sent through executor ``e`` and wrapped by
    ``_futures_to_dask_dataframe``.  A ``dd.DataFrame`` over the same frames
    serves as the reference for divisions, the computed frame, and several
    derived expressions.

    ``s``, ``a`` and ``b`` are not referenced in the body.
    """
    def make_frame(start):
        # 'x' is drawn before 'y', matching the dict-literal order.
        data = {
            'x': np.random.random(100),
            'y': np.random.random(100),
        }
        return pd.DataFrame(data, index=list(range(start, start + 100)))

    dfs = [make_frame(i) for i in range(0, 100 * 10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    graph = {(name, i): df for i, df in enumerate(dfs)}
    divisions = list(range(0, 1000, 100)) + [999]
    ldf = dd.DataFrame(graph, name, dfs[0].columns, divisions)

    assert rdf.divisions == ldf.divisions

    computed = e.compute(rdf)
    result = yield computed._result()
    reference = ldf.compute(get=dask.get)
    tm.assert_frame_equal(result, reference)

    exprs = [
        lambda d: d.x.mean(),
        lambda d: d.y.std(),
        lambda d: d.assign(z=d.x + d.y).drop_duplicates(),
        lambda d: d.index,
        lambda d: d.x,
        lambda d: d.x.cumsum(),
        lambda d: d.groupby(['x', 'y']).count(),
        lambda d: d.loc[50:75],
    ]
    limit_args = dict(seconds=5)
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        computed = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(**limit_args),
                                        computed._result())
        assert_equal(local, remote)
예제 #6
0
def test_no_divisions(s, a, b):
    """Dataframes built without ``divisions`` report unknown divisions.

    Time-indexed frames are created remotely with ``tm.makeTimeDataFrame``
    and combined by ``_futures_to_dask_dataframe`` without requesting
    divisions.  The result must have no known divisions and the same
    columns as a locally created frame.

    ``s`` provides the ``ip``/``port`` the Executor connects to; ``a`` and
    ``b`` are not used by the body.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)

    # Shut the executor down like the sibling tests that start their own
    # Executor; it was previously left running.
    yield e._shutdown()
예제 #7
0
def test_no_divisions(s, a, b):
    """Check that omitting ``divisions`` yields unknown divisions.

    Frames produced remotely by ``tm.makeTimeDataFrame`` are assembled with
    ``_futures_to_dask_dataframe`` (no ``divisions`` argument).  The
    assembled dataframe must not have known divisions, and its columns must
    equal those of a locally built ``tm.makeTimeDataFrame(5)``.

    ``s`` provides the ``ip``/``port`` for the Executor; ``a`` and ``b`` are
    unused in the body.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)

    # Release the executor started above, consistent with the other tests
    # in this file that create their own Executor.
    yield e._shutdown()
예제 #8
0
def test__futures_to_dask_dataframe(c, s, a, b):
    """Build a dask dataframe from futures on client ``c`` and sum column x.

    Checks the result type, the expected divisions ``(0, 30, 60, 80)`` and
    that the distributed sum of ``x`` equals the sum over the module-level
    ``dfs``.  ``s``, ``a`` and ``b`` are not used by the body.
    """
    futures = c.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(futures, divisions=True, client=c)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    total = ddf.x.sum()
    graph, keys = total.dask, total._keys()
    result = yield c._get(graph, keys)
    expected = sum([df.x.sum() for df in dfs])
    assert result == [expected]
def test__futures_to_dask_dataframe(c, s, a, b):
    """Assemble futures into a dask dataframe and validate it.

    The frames in the module-level ``dfs`` are mapped through ``identity``
    on client ``c``; the assembled dataframe must be a ``dd.DataFrame`` with
    divisions ``(0, 30, 60, 80)``, and summing ``x`` through the client must
    match the locally computed sum.  ``s``, ``a``, ``b`` are unused here.
    """
    remote = c.map(identity, dfs)
    options = dict(divisions=True, client=c)
    ddf = yield _futures_to_dask_dataframe(remote, **options)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    column_sum = ddf.x.sum()
    result = yield c._get(column_sum.dask, column_sum._keys())
    partial_sums = [df.x.sum() for df in dfs]
    assert result == [sum(partial_sums)]
예제 #10
0
def test__futures_to_dask_dataframe(e, s, a, b):
    """Build a dask dataframe from futures on executor ``e`` and check it.

    Asserts the result type, divisions ``(0, 30, 60, 80)``, that
    ``_known_dtype`` is truthy, and that the distributed sum of ``x``
    equals the sum over the module-level ``dfs``.  ``s``, ``a`` and ``b``
    are not used by the body.
    """
    futures = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(futures, divisions=True, executor=e)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    assert ddf._known_dtype

    total = ddf.x.sum()
    graph, keys = total.dask, total._keys()
    result = yield e._get(graph, keys)
    expected = sum([df.x.sum() for df in dfs])
    assert result == [expected]
예제 #11
0
def test__futures_to_dask_dataframe(e, s, a, b):
    """Assemble futures into a dask dataframe via executor ``e``.

    The assembled object must be a ``dd.DataFrame`` with divisions
    ``(0, 30, 60, 80)`` and a truthy ``_known_dtype``; the sum of column
    ``x`` fetched through ``e._get`` must equal the locally computed sum
    over ``dfs``.  ``s``, ``a``, ``b`` are unused here.
    """
    remote = e.map(identity, dfs)
    options = dict(divisions=True, executor=e)
    ddf = yield _futures_to_dask_dataframe(remote, **options)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    assert ddf._known_dtype

    column_sum = ddf.x.sum()
    result = yield e._get(column_sum.dask, column_sum._keys())
    partial_sums = [df.x.sum() for df in dfs]
    assert result == [sum(partial_sums)]
예제 #12
0
    def f(c, a, b):
        """Build a dask dataframe from futures and check its sum of ``x``.

        An Executor is pointed at ``c.ip``/``c.port`` and its ``_go``
        coroutine is spawned on the current IOLoop.  The executor is shut
        down at the end.  ``a`` and ``b`` are not used by the body.
        """
        address = (c.ip, c.port)
        e = Executor(address, start=False)
        loop = IOLoop.current()
        loop.spawn_callback(e._go)

        futures = e.map(lambda x: x, dfs)
        ddf = yield _futures_to_dask_dataframe(e, futures, divisions=True)

        assert isinstance(ddf, dd.DataFrame)
        assert ddf.divisions == (0, 30, 60, 80)

        total = ddf.x.sum()
        result = yield e._get(total.dask, total._keys())
        expected = sum([df.x.sum() for df in dfs])
        assert result == [expected]

        yield e._shutdown()
예제 #13
0
def test__futures_to_dask_dataframe(s, a, b):
    """Build a dask dataframe from futures on a locally started Executor.

    The Executor connects to ``s.ip``/``s.port``, is started explicitly and
    shut down at the end.  The assembled dataframe must be a
    ``dd.DataFrame`` with divisions ``(0, 30, 60, 80)`` and its sum of
    ``x`` must equal the sum over ``dfs``.  ``a`` and ``b`` are unused.
    """
    address = (s.ip, s.port)
    e = Executor(address, start=False)
    yield e._start()

    futures = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(futures, divisions=True, executor=e)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    total = ddf.x.sum()
    graph, keys = total.dask, total._keys()
    result = yield e._get(graph, keys)
    expected = sum([df.x.sum() for df in dfs])
    assert result == [expected]

    yield e._shutdown()
예제 #14
0
def test__futures_to_dask_dataframe(s, a, b):
    """Assemble futures into a dask dataframe using a fresh Executor.

    An Executor is created against ``s.ip``/``s.port``, started, used to
    map ``identity`` over ``dfs`` and finally shut down.  The test checks
    the dataframe type, its divisions ``(0, 30, 60, 80)`` and the sum of
    column ``x``.  ``a`` and ``b`` are not referenced in the body.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote = e.map(identity, dfs)
    options = dict(divisions=True, executor=e)
    ddf = yield _futures_to_dask_dataframe(remote, **options)

    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    column_sum = ddf.x.sum()
    result = yield e._get(column_sum.dask, column_sum._keys())
    partial_sums = [df.x.sum() for df in dfs]
    assert result == [sum(partial_sums)]

    yield e._shutdown()
예제 #15
0
def test__futures_to_collection(c, s, a, b):
    """Check ``_futures_to_collection`` against the type-specific helpers.

    Using client ``c``, dataframe, array and list futures are each
    converted with the generic ``_futures_to_collection`` and with the
    matching ``_futures_to_dask_*`` helper; the resulting graphs (and, for
    arrays and bags, types) are compared.

    ``s``, ``a`` and ``b`` are not used by the body.
    """
    remote_dfs = c.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = c.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield c._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # Dedicated names: the original rebound the ``b`` and ``c`` parameters,
    # losing the client reference.
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)

    assert type(bag) == type(bag2)
    # Compare the two bags with each other; the previous check compared a
    # graph with itself and therefore could never fail.
    assert bag.dask == bag2.dask
예제 #16
0
def test__futures_to_collection(c, s, a, b):
    """Verify the generic collection builder matches the specific helpers.

    Futures for dataframes, arrays and lists are created through client
    ``c`` and converted twice: with ``_futures_to_collection`` and with the
    dedicated ``_futures_to_dask_dataframe`` / ``_futures_to_dask_array`` /
    ``_futures_to_dask_bag`` helper.  Both conversions must agree.

    ``s``, ``a`` and ``b`` are unused in the body.
    """
    remote_dfs = c.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = c.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield c._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    # Separate names keep the ``b`` and ``c`` (client) parameters from
    # being rebound.
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)

    assert type(bag) == type(bag2)
    # The graphs of the two bags are compared with each other; comparing a
    # graph with itself (as before) is always true and checks nothing.
    assert bag.dask == bag2.dask
예제 #17
0
def test_dataframes(s, a, b):
    """Compare a dataframe built from remote futures with a local twin.

    Ten 100-row frames of random ``x``/``y`` values are mapped onto a
    freshly started Executor and assembled with
    ``_futures_to_dask_dataframe``; the same frames are assembled locally
    into ``dd.DataFrame``.  Divisions, the full result and several derived
    expressions must agree.

    ``s`` provides the ``ip``/``port`` the Executor connects to; ``a`` and
    ``b`` are not used by the body.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [
        pd.DataFrame({
            'x': np.random.random(100),
            'y': np.random.random(100)
        },
                     index=list(range(i, i + 100)))
        for i in range(0, 100 * 10, 100)
    ]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    name = 'foo'
    ldf = dd.DataFrame({(name, i): df
                        for i, df in enumerate(dfs)}, name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    # ``e.compute`` is unpacked as a one-element sequence here.
    remote, = e.compute(rdf)
    result = yield remote._result()

    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    # Each expression is evaluated locally and on the executor.
    exprs = [
        lambda df: df.x.mean(), lambda df: df.y.std(),
        lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
        lambda df: df.index, lambda df: df.x, lambda df: df.x.cumsum(),
        lambda df: df.loc[50:75]
    ]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote, = e.compute(f(rdf))
        remote = yield remote._result()
        assert_equal(local, remote)

    # Shut the executor down like the sibling tests that start their own
    # Executor; it was previously left running.
    yield e._shutdown()
예제 #18
0
def test_dataframes(s, a, b):
    """Check remote-backed and locally built dataframes give equal results.

    A new Executor is started against ``s.ip``/``s.port``.  Ten random
    100-row frames are sent through it and wrapped by
    ``_futures_to_dask_dataframe``; a ``dd.DataFrame`` over the same frames
    is the reference for divisions, the computed frame and several derived
    expressions.

    ``a`` and ``b`` are not referenced in the body.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100*10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    name = 'foo'
    ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                       name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    # ``e.compute`` is unpacked as a one-element sequence here.
    remote, = e.compute(rdf)
    result = yield remote._result()

    tm.assert_frame_equal(result,
                          ldf.compute(get=dask.get))

    # Each expression is evaluated locally and on the executor.
    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote, = e.compute(f(rdf))
        remote = yield remote._result()
        assert_equal(local, remote)

    # Release the executor started above, consistent with the other tests
    # in this file that create their own Executor.
    yield e._shutdown()
예제 #19
0
def test_no_divisions(c, s, a, b):
    """Dataframes built without ``divisions`` report unknown divisions.

    Frames are created remotely through client ``c`` with
    ``tm.makeTimeDataFrame`` and combined by ``_futures_to_dask_dataframe``.
    The columns must match a locally created frame.  ``s``, ``a`` and ``b``
    are not used by the body.
    """
    futures = c.map(tm.makeTimeDataFrame, range(5, 10))

    ddf = yield _futures_to_dask_dataframe(futures)
    assert not ddf.known_divisions

    actual_columns = list(ddf.columns)
    expected_columns = list(tm.makeTimeDataFrame(5).columns)
    assert actual_columns == expected_columns
예제 #20
0
def test_no_divisions(e, s, a, b):
    """Check that omitting ``divisions`` yields unknown divisions.

    ``tm.makeTimeDataFrame`` is mapped over ``range(5, 10)`` on executor
    ``e`` and the futures are assembled with ``_futures_to_dask_dataframe``.
    The result must not have known divisions and must expose the same
    columns as ``tm.makeTimeDataFrame(5)``.  ``s``, ``a``, ``b`` are unused.
    """
    remote_frames = e.map(tm.makeTimeDataFrame, range(5, 10))

    assembled = yield _futures_to_dask_dataframe(remote_frames)
    assert not assembled.known_divisions

    found = list(assembled.columns)
    wanted = list(tm.makeTimeDataFrame(5).columns)
    assert found == wanted