def test__dask_array_collections(s, a, b):
    """Compare dask arrays built from local blocks and from scattered futures.

    The same block dictionaries are wrapped twice as ``da.Array`` objects:
    once directly, and once through the futures returned by ``e._scatter``.
    Every expression must give equal results when computed locally with
    ``dask.get`` and remotely through the executor.
    """
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    block_shape = (3, 3)
    x_dsk = {('x', i, j): np.random.random(block_shape)
             for i in range(3)
             for j in range(2)}
    y_dsk = {('y', i, j): np.random.random(block_shape)
             for i in range(2)
             for j in range(3)}

    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_chunks = ((3, 3, 3), (3, 3))
    y_chunks = ((3, 3), (3, 3, 3))

    x_local = da.Array(x_dsk, 'x', x_chunks, dt)
    y_local = da.Array(y_dsk, 'y', y_chunks, dt)
    x_remote = da.Array(x_futures, 'x', x_chunks, dt)
    y_remote = da.Array(y_futures, 'y', y_chunks, dt)

    exprs = [
        lambda x, y: x.T + y,
        lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None],
    ]

    for expr in exprs:
        expected = expr(x_local, y_local).compute(get=dask.get)
        future = e.compute(expr(x_remote, y_remote))
        actual = yield future._result()
        assert np.all(expected == actual)

    yield e._shutdown()
def test_read_text(s, a, b):
    """Exercise ``read_text`` in its bag, lazy-value and future modes.

    Skipped when ``dask.bag`` cannot be imported.
    """
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    bag = read_text(test_bucket_name, 'test/accounts',
                    lazy=True, collection=True, anon=True)
    assert isinstance(bag, db.Bag)

    # After a short pause the scheduler must still have no tasks for the
    # lazily created bag.
    yield gen.sleep(0.2)
    assert not s.tasks

    total = bag.filter(None).map(json.loads).pluck('amount').sum()
    future = e.compute(total)
    result = yield future._result()
    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    lazy_parts = read_text(test_bucket_name, 'test/accounts',
                           lazy=True, collection=False, anon=True)
    assert all(isinstance(part, Value) for part in lazy_parts)

    eager_parts = read_text(test_bucket_name, 'test/accounts',
                            lazy=False, collection=False, anon=True)
    assert all(isinstance(part, Future) for part in eager_parts)

    yield e._shutdown()
def test__futures_to_dask_bag(s, a, b):
    """Check that a bag built from scattered futures matches a local bag.

    Three lists are scattered, wrapped with ``_futures_to_dask_bag``, and
    compared against ``db.from_sequence`` over the same nine integers.
    """
    import dask.bag as db

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [
        lambda bag: bag.map(lambda x: x + 1).sum(),
        lambda bag: bag.filter(lambda x: x % 2),
    ]

    for expr in exprs:
        expected = expr(lb).compute(get=dask.get)
        future = e.compute(expr(rb))
        actual = yield future._result()
        assert expected == actual

    yield e._shutdown()
def test_lazy_values(s, a, b):
    """Check that lazy ``read_bytes`` values compute to the file contents.

    Six one-byte files are written to HDFS under ``/tmp/test/``. They are
    read back lazily, computed through the executor, and the gathered
    results compared to the written payload.
    """
    with make_hdfs() as hdfs:
        data = b'a'

        # Three directories with two files each.
        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        # Wait until the scheduler holds restrictions; at that point its
        # graph (``s.dask``) must still be empty.
        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == data for x in results)

        # Shut the executor down like the sibling tests do, so the
        # connection is not left open after the test.
        yield e._shutdown()
def test_lazy_values(s, a, b):
    """Check that lazy ``read_binary`` values compute to the file contents.

    Six one-byte files are written to HDFS under ``/tmp/test/``. They are
    read back lazily, computed through the executor, and the gathered
    results compared to the written payload.
    """
    with make_hdfs() as hdfs:
        data = b'a'

        # Three directories with two files each.
        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        # Wait until the scheduler holds restrictions; at that point its
        # graph (``s.dask``) must still be empty.
        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == data for x in results)

        # Shut the executor down like the sibling tests do, so the
        # connection is not left open after the test.
        yield e._shutdown()
def test__futures_to_dask_bag(s, a, b):
    """Verify ``_futures_to_dask_bag`` against an equivalent local bag.

    The remote bag wraps three scattered lists; the local bag holds the
    same nine integers in three partitions. Each expression must give the
    same result on both.
    """
    import dask.bag as db

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)
    rb = yield _futures_to_dask_bag(futures)

    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    def incremented_sum(bag):
        return bag.map(lambda x: x + 1).sum()

    def odd_only(bag):
        return bag.filter(lambda x: x % 2)

    for build in (incremented_sum, odd_only):
        local_result = build(lb).compute(get=dask.get)
        pending = e.compute(build(rb))
        remote_result = yield pending._result()
        assert local_result == remote_result

    yield e._shutdown()
def test__dask_array_collections(s, a, b):
    """Compare local dask arrays with arrays backed by scattered futures.

    Here ``e.compute`` returns a one-element sequence, which is unpacked
    into a single future before its result is awaited.
    """
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3)
             for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2)
             for j in range(3)}

    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_chunks = ((3, 3, 3), (3, 3))
    y_chunks = ((3, 3), (3, 3, 3))

    x_local = da.Array(x_dsk, 'x', x_chunks, dt)
    y_local = da.Array(y_dsk, 'y', y_chunks, dt)
    x_remote = da.Array(x_futures, 'x', x_chunks, dt)
    y_remote = da.Array(y_futures, 'y', y_chunks, dt)

    exprs = [
        lambda x, y: x.T + y,
        lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None],
    ]

    for expr in exprs:
        expected = expr(x_local, y_local).compute(get=dask.get)
        (future,) = e.compute(expr(x_remote, y_remote))
        actual = yield future._result()
        assert np.all(expected == actual)

    yield e._shutdown()
def test__read_text(s, a, b):
    """Exercise ``_read_text`` on HDFS in its collection and list modes.

    Three text files are written. The two ``text.*.txt`` files are read
    with a glob, and ``other.txt`` is read on its own.
    """
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        # Written in this order: text.1, text.2, other.
        fixtures = [
            ('/tmp/test/text.1.txt', 'Alice 100\nBob 200\nCharlie 300'),
            ('/tmp/test/text.2.txt', 'Dan 400\nEdith 500\nFrank 600'),
            ('/tmp/test/other.txt', 'a b\nc d'),
        ]
        for path, content in fixtures:
            with hdfs.open(path, 'wb') as f:
                f.write(content.encode())

        bag = yield _read_text('/tmp/test/text.*.txt',
                               collection=True, lazy=True)
        # The lazy collection must not have put tasks on the scheduler.
        yield gen.sleep(0.5)
        assert not s.tasks

        word_counts = bag.str.strip().str.split().map(len)
        future = e.compute(word_counts)
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        bag = yield _read_text('/tmp/test/other.txt',
                               collection=True, lazy=False)
        future = e.compute(bag.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = yield _read_text('/tmp/test/text.*.txt',
                             collection=False, lazy=False)
        assert all(isinstance(item, Future) for item in L)

        L = yield _read_text('/tmp/test/text.*.txt',
                             collection=False, lazy=True)
        assert all(isinstance(item, Value) for item in L)

        yield e._shutdown()
def test_dataframes(s, a, b):
    """Compare a dask dataframe built from futures with a local equivalent.

    Ten 100-row pandas frames are mapped onto the cluster and combined via
    ``_futures_to_dask_dataframe``. The same frames are assembled locally
    into a ``dd.DataFrame``. Divisions, the full frame, and a list of
    derived expressions must agree.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = []
    for start in range(0, 100 * 10, 100):
        frame = pd.DataFrame({'x': np.random.random(100),
                              'y': np.random.random(100)},
                             index=list(range(start, start + 100)))
        dfs.append(frame)

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    graph = {(name, i): df for i, df in enumerate(dfs)}
    divisions = list(range(0, 1000, 100)) + [999]
    ldf = dd.DataFrame(graph, name, dfs[0].columns, divisions)

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [
        lambda df: df.x.mean(),
        lambda df: df.y.std(),
        lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
        lambda df: df.index,
        lambda df: df.x,
        lambda df: df.x.cumsum(),
        lambda df: df.loc[50:75],
    ]

    for f in exprs:
        expected = f(ldf).compute(get=dask.get)
        future = e.compute(f(rdf))
        # Each remote result is awaited with a five-second timeout.
        actual = yield gen.with_timeout(timedelta(seconds=5),
                                        future._result())
        assert_equal(expected, actual)

    yield e._shutdown()
def test_read_bytes_lazy(s, a, b):
    """Check that lazy ``read_bytes`` values compute to the known contents.

    The gathered results must include every value of the ``files``
    mapping.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(value, Value) for value in values)

    futures = e.compute(values, sync=False)
    gathered = yield e._gather(futures)
    assert set(gathered) >= set(files.values())

    yield e._shutdown()
def test_dataframes(s, a, b):
    """Check that a dask dataframe built from futures matches a local one.

    The remote frame comes from ``_futures_to_dask_dataframe`` over ten
    mapped pandas frames. The local frame wraps the same pandas frames in
    a ``dd.DataFrame`` with explicit divisions.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    rows = 100
    dfs = [
        pd.DataFrame(
            {'x': np.random.random(rows), 'y': np.random.random(rows)},
            index=list(range(offset, offset + rows)),
        )
        for offset in range(0, 100 * 10, 100)
    ]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    ldf = dd.DataFrame(
        {(name, i): df for i, df in enumerate(dfs)},
        name,
        dfs[0].columns,
        list(range(0, 1000, 100)) + [999],
    )

    assert rdf.divisions == ldf.divisions

    whole = e.compute(rdf)
    result = yield whole._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [
        lambda df: df.x.mean(),
        lambda df: df.y.std(),
        lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
        lambda df: df.index,
        lambda df: df.x,
        lambda df: df.x.cumsum(),
        lambda df: df.loc[50:75],
    ]

    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        pending = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5),
                                        pending._result())
        assert_equal(local, remote)

    yield e._shutdown()
def test_read_csv(s, a, b):
    """Check that ``_read_csv`` over an HDFS glob gives a usable dataframe.

    Two CSV files with a header row are written to ``/tmp/test/`` and read
    with one glob. The sum of the ``id`` column across both files must be
    ``1 + 2 + 3 + 4``.
    """
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True,
                             lineterminator='\n')

        # ``compute`` returns a one-element sequence here; unpack it.
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4

        # Shut the executor down like the sibling tests do, so the
        # connection is not left open after the test.
        yield e._shutdown()
def test__stack(s, a, b):
    """Check that ``_stack`` joins futures of arrays along a new axis.

    Six ``np.ones((5, 5))`` futures are stacked on axis 0. The result must
    have shape ``(6, 5, 5)`` with one chunk per input, and compute to an
    all-ones NumPy array.
    """
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)

    expected_shape = (6, 5, 5)
    expected_chunks = ((1, 1, 1, 1, 1, 1), (5,), (5,))
    assert y.shape == expected_shape
    assert y.chunks == expected_chunks

    # ``compute`` returns a one-element sequence here; unpack it.
    (y_result,) = e.compute(y)
    yy = yield y_result._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
def test__stack(s, a, b):
    """Check that ``_stack`` joins futures of arrays along a new axis.

    Six ``np.ones((5, 5))`` futures are stacked on axis 0 and computed
    through the executor. The result must be an all-ones NumPy array with
    the stacked shape.
    """
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)

    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    future = e.compute(y)
    yy = yield future._result()

    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()
def test_read_csv_lazy(s, a, b):
    """Check that a lazily read CSV dataframe computes on demand.

    The lazy frame must report ``_known_dtype``, and the scheduler must
    still have no tasks after a short pause. The ``id`` column sum is
    checked afterwards.
    """
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', lazy=True,
                             lineterminator='\n')
        assert df._known_dtype

        yield gen.sleep(0.5)
        assert not s.tasks

        future = e.compute(df.id.sum(), sync=False)
        result = yield future._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()
def test_read_csv_lazy(s, a, b):
    """Check that a lazily read CSV dataframe (``header=True``) computes.

    The lazy frame must report ``_known_dtype``, and the scheduler must
    still have no tasks after a short pause. The ``id`` column sum is
    checked afterwards.
    """
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')
        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True, lazy=True,
                             lineterminator='\n')
        assert df._known_dtype

        yield gen.sleep(0.5)
        assert not s.tasks

        id_total = df.id.sum()
        pending = e.compute(id_total, sync=False)
        result = yield pending._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()
import referenceAssembler
import sys
from dask import delayed, multiprocessing, threaded
from distributed import Executor


@delayed
def ingest(src):
    """Print ``src``; evaluation is deferred because of ``@delayed``."""
    print(src)


@delayed
def run(src, k=21):
    """Run the reference assembler on ``src`` as a delayed task.

    ``k`` and ``src`` are passed to ``referenceAssembler.runAssembler`` in
    that order. Returns 1 once the assembler call has returned.
    """
    referenceAssembler.runAssembler(k, src)
    return 1


if __name__ == '__main__':
    # Give a readable usage error instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('usage: %s SRC' % sys.argv[0])
    src = sys.argv[1]

    e = Executor('127.0.0.1:8786')
    print(src)
    # NOTE: ingest(src) is defined above but is not part of the pipeline.

    # Keep the future and block on it. If it is dropped, the script can
    # exit before the assembler has run on the cluster.
    future = e.compute(run(src))
    future.result()