Exemplo n.º 1
0
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()
Exemplo n.º 2
0
def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)

    assert type(b) == type(c)
    assert b.dask == b.dask

    yield e._shutdown()
Exemplo n.º 3
0
def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)

    assert type(b) == type(c)
    assert b.dask == b.dask

    yield e._shutdown()
Exemplo n.º 4
0
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [
        lambda x: x.map(lambda x: x + 1).sum(),
        lambda x: x.filter(lambda x: x % 2)
    ]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()
Exemplo n.º 5
0
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    filenames = sorted(hdfs.glob(fn))
    blocks = [block for fn in filenames
                    for block in read_bytes(fn, executor, hdfs, lazy=True,
                                            delimiter=lineterminator.encode())]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    if lazy:
        from dask.bag import from_imperative
        if collection:
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)

    else:
        futures = executor.compute(lines)
        from distributed.collections import _futures_to_dask_bag
        if collection:
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)
Exemplo n.º 6
0
def test__futures_to_collection(c, s, a, b):
    remote_dfs = c.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = c.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield c._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)

    assert type(b) == type(c)
    assert b.dask == b.dask
Exemplo n.º 7
0
def test__futures_to_collection(c, s, a, b):
    remote_dfs = c.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = c.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield c._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)

    assert type(b) == type(c)
    assert b.dask == b.dask
Exemplo n.º 8
0
def test__futures_to_dask_bag(c, s, a, b):
    import dask.bag as db

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield c._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = c.compute(expr(rb))
        remote = yield remote

        assert local == remote
Exemplo n.º 9
0
def _read_text(fn,
               encoding='utf-8',
               errors='strict',
               lineterminator='\n',
               executor=None,
               hdfs=None,
               lazy=True,
               collection=True):
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    filenames = sorted(hdfs.glob(fn))
    blocks = [
        block for fn in filenames for block in read_bytes(
            fn, executor, hdfs, lazy=True, delimiter=lineterminator.encode())
    ]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    if lazy:
        from dask.bag import from_imperative
        if collection:
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)

    else:
        futures = executor.compute(lines)
        from distributed.collections import _futures_to_dask_bag
        if collection:
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)