예제 #1
0
def test__dask_array_collections(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
                                                   for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
                                                   for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()
예제 #2
0
def test_read_text(s, a, b):
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    b = read_text(test_bucket_name, 'test/accounts', lazy=True,
                  collection=True, anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    assert not s.tasks

    future = e.compute(b.filter(None).map(json.loads).pluck('amount').sum())
    result = yield future._result()

    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    text = read_text(test_bucket_name, 'test/accounts', lazy=True,
                     collection=False, anon=True)
    assert all(isinstance(v, Value) for v in text)

    text = read_text(test_bucket_name, 'test/accounts', lazy=False,
                     collection=False, anon=True)
    assert all(isinstance(v, Future) for v in text)

    yield e._shutdown()
예제 #3
0
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()
예제 #4
0
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
예제 #5
0
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
예제 #6
0
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [
        lambda x: x.map(lambda x: x + 1).sum(),
        lambda x: x.filter(lambda x: x % 2)
    ]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()
예제 #7
0
def test__dask_array_collections(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [
        lambda x, y: x.T + y, lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None]
    ]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote, = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()
예제 #8
0
def test__read_text(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = yield _read_text('/tmp/test/text.*.txt',
                             collection=True,
                             lazy=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        future = e.compute(b.str.strip().str.split().map(len))
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = yield _read_text('/tmp/test/other.txt',
                             collection=True,
                             lazy=False)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = yield _read_text('/tmp/test/text.*.txt',
                             collection=False,
                             lazy=False)
        assert all(isinstance(x, Future) for x in L)

        L = yield _read_text('/tmp/test/text.*.txt',
                             collection=False,
                             lazy=True)
        assert all(isinstance(x, Value) for x in L)

        yield e._shutdown()
예제 #9
0
def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [
        pd.DataFrame({
            'x': np.random.random(100),
            'y': np.random.random(100)
        },
                     index=list(range(i, i + 100)))
        for i in range(0, 100 * 10, 100)
    ]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    name = 'foo'
    ldf = dd.DataFrame({(name, i): df
                        for i, df in enumerate(dfs)}, name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()

    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [
        lambda df: df.x.mean(), lambda df: df.y.std(),
        lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
        lambda df: df.index, lambda df: df.x, lambda df: df.x.cumsum(),
        lambda df: df.loc[50:75]
    ]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5), remote._result())
        assert_equal(local, remote)

    yield e._shutdown()
예제 #10
0
def test_read_bytes_lazy(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(v, Value) for v in values)

    results = e.compute(values, sync=False)
    results = yield e._gather(results)

    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()
예제 #11
0
def test_read_bytes_lazy(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(v, Value) for v in values)

    results = e.compute(values, sync=False)
    results = yield e._gather(results)

    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()
예제 #12
0
def test__read_text(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = yield _read_text('/tmp/test/text.*.txt',
                             collection=True, lazy=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        future = e.compute(b.str.strip().str.split().map(len))
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = yield _read_text('/tmp/test/other.txt',
                             collection=True, lazy=False)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = yield _read_text('/tmp/test/text.*.txt',
                             collection=False, lazy=False)
        assert all(isinstance(x, Future) for x in L)

        L = yield _read_text('/tmp/test/text.*.txt',
                             collection=False, lazy=True)
        assert all(isinstance(x, Value) for x in L)

        yield e._shutdown()
예제 #13
0
def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100*10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    name = 'foo'
    ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                       name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()

    tm.assert_frame_equal(result,
                          ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5), remote._result())
        assert_equal(local, remote)

    yield e._shutdown()
예제 #14
0
def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True, lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
예제 #15
0
def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv',
                             header=True,
                             lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4
예제 #16
0
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5, ), (5, ))

    y_result, = e.compute(y)
    yy = yield y_result._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
예제 #17
0
def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result = e.compute(y)
    yy = yield y_result._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
예제 #18
0
def test_read_csv_lazy(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', lazy=True,
                             lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()
예제 #19
0
def test_read_csv_lazy(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv',
                             header=True,
                             lazy=True,
                             lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()
예제 #20
0
def test_read_text(s, a, b):
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    b = read_text(test_bucket_name,
                  'test/accounts',
                  lazy=True,
                  collection=True,
                  anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    assert not s.tasks

    future = e.compute(b.filter(None).map(json.loads).pluck('amount').sum())
    result = yield future._result()

    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    text = read_text(test_bucket_name,
                     'test/accounts',
                     lazy=True,
                     collection=False,
                     anon=True)
    assert all(isinstance(v, Value) for v in text)

    text = read_text(test_bucket_name,
                     'test/accounts',
                     lazy=False,
                     collection=False,
                     anon=True)
    assert all(isinstance(v, Future) for v in text)

    yield e._shutdown()
예제 #21
0
import referenceAssembler
import sys
from dask import delayed, multiprocessing, threaded
from distributed import Executor

@delayed
def ingest(src):
    print(src)

@delayed
def run(src,k=21):
    referenceAssembler.runAssembler(k,src)
    return 1


if __name__ == '__main__':
    e = Executor('127.0.0.1:8786')
    print(sys.argv[1])
    #ingest(sys.argv[1])
    e.compute(run(sys.argv[1]))