def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)
    assert type(b) == type(c)
    assert b.dask == c.dask

    yield e._shutdown()

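# The collection tests above and below rely on module-level helpers that are
# not shown in this section. A minimal sketch, assuming `identity` is a
# pass-through function and `dfs` is a list of pandas DataFrames whose
# indexes produce the divisions (0, 30, 60, 80) asserted later; the exact
# frame contents are an assumption, not the original fixture data:
def identity(x):
    return x

dfs = [pd.DataFrame({'x': np.random.random(len(r))}, index=list(r))
       for r in (range(0, 30), range(30, 60), range(60, 81))]
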
def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

    yield e._shutdown()

def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)

        assert sum(map(len, dfs2)) == n * 2 - 1

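# The HDFS tests in this section enter a `make_hdfs` context manager that is
# defined elsewhere. A minimal sketch of what it is assumed to do, built on
# hdfs3.HDFileSystem (the host/port defaults and cleanup behavior are
# assumptions): hand the test a filesystem with a fresh /tmp/test directory
# and remove that directory again on exit.
from contextlib import contextmanager
from hdfs3 import HDFileSystem

@contextmanager
def make_hdfs(host='localhost', port=8020):
    hdfs = HDFileSystem(host=host, port=port)
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test')
    hdfs.mkdir('/tmp/test')
    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test')
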
def test_read_text(s, a, b):
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    b = read_text(test_bucket_name, 'test/accounts', lazy=True,
                  collection=True, anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    assert not s.tasks

    future = e.compute(b.filter(None).map(json.loads).pluck('amount').sum())
    result = yield future._result()
    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    text = read_text(test_bucket_name, 'test/accounts', lazy=True,
                     collection=False, anon=True)
    assert all(isinstance(v, Value) for v in text)

    text = read_text(test_bucket_name, 'test/accounts', lazy=False,
                     collection=False, anon=True)
    assert all(isinstance(v, Future) for v in text)

    yield e._shutdown()

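# The S3 tests (test_read_text, test_read_bytes, test_read_bytes_lazy)
# assume a public test bucket populated with newline-delimited JSON records.
# A hedged sketch of those fixtures, chosen so the amounts sum to
# (1 + 2 + ... + 8) * 100 as asserted above; the bucket name and exact file
# contents are assumptions, not the real test data:
test_bucket_name = 'distributed-test'
files = {'test/accounts.1.json': (b'{"amount": 100, "name": "Alice"}\n'
                                  b'{"amount": 200, "name": "Bob"}\n'
                                  b'{"amount": 300, "name": "Charlie"}\n'
                                  b'{"amount": 400, "name": "Dennis"}\n'),
         'test/accounts.2.json': (b'{"amount": 500, "name": "Alice"}\n'
                                  b'{"amount": 600, "name": "Bob"}\n'
                                  b'{"amount": 700, "name": "Charlie"}\n'
                                  b'{"amount": 800, "name": "Dennis"}\n')}
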
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json'
                                  % ss.port)
    out = json.loads(response.body.decode())
    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(map(sys.getsizeof,
                                        [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch(
        'http://localhost:%s/memory-load-by-key.json' % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())
    assert sum(v for d in out.values() for v in d.values()) == \
        sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()

def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)

        assert sum(map(len, dfs2)) == n * 2 - 1

def test__dask_array_collections(s, a, b):
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote, = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()

def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()
        assert local == remote

    yield e._shutdown()

def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'w') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()
        assert local == remote

    yield e._shutdown()

def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json'
                                  % ss.port)
    out = json.loads(response.body.decode())
    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(map(sys.getsizeof,
                                        [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch(
        'http://localhost:%s/memory-load-by-key.json' % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())
    assert sum(v for d in out.values() for v in d.values()) == \
        sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()

def test__dask_array_collections(s, a, b):
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()

def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local)
        local_results = dask.get(local.dask, local._keys())
        local_result = da.Array._finalize(local, local_results)

        remote = expr(x_remote, y_remote)
        remote_results = yield e._get(remote.dask, remote._keys())
        remote_result = da.Array._finalize(remote, remote_results)

        assert np.all(local_result == remote_result)

    yield e._shutdown()

def test_no_divisions(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))
    df = yield _futures_to_dask_dataframe(dfs)

    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)

def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True)
    assert len(futures) >= len(files)
    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()

def test_multiple_executors_restart(s, a, b):
    e1 = Executor((s.ip, s.port), start=False)
    yield e1._start()
    e2 = Executor((s.ip, s.port), start=False)
    yield e2._start()

    x = e1.submit(inc, 1)
    y = e2.submit(inc, 2)
    xx = yield x._result()
    yy = yield y._result()
    assert xx == 2
    assert yy == 3

    yield e1._restart()

    assert x.cancelled()
    assert y.cancelled()

    yield e1._shutdown(fast=True)
    yield e2._shutdown(fast=True)

def _get_executor(self):
    loop = tornado.ioloop.IOLoop.current()
    IP = '127.0.0.1'
    PORT = 63000
    PORT_SCHEDULER = 63500

    from distributed import Executor
    executor = Executor('{}:{}'.format(IP, PORT_SCHEDULER),
                        loop=loop, start=False)
    yield executor._start()
    return executor

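# _get_executor is itself a coroutine (note the bare `return` after `yield`,
# which requires Python 3), so it only makes sense when yielded from another
# coroutine on the same IOLoop. A hedged usage sketch; the surrounding class
# and the scheduler listening on port 63500 are assumptions:
@tornado.gen.coroutine
def _run_simple_task(self):
    executor = yield self._get_executor()
    future = executor.submit(lambda x: x + 1, 41)
    result = yield future._result()
    assert result == 42
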
def test_read_csv_with_names(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = yield _read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                             lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']

        yield e._shutdown()

def test_read_bytes_lazy(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(v, Value) for v in values)

    results = e.compute(values, sync=False)
    results = yield e._gather(results)

    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()

def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True,
                         lazy=False)
    assert len(futures) >= len(files)
    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()

def test__futures_to_dask_dataframe(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True,
                                           executor=e)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()

def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True,
                             lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4

def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    remote_dfs = e.map(lambda x: x, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True,
                                           executor=e)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)

    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()

def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_results = yield e._get(y.dask, y._keys())
    yy = da.Array._finalize(y, y_results)
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()

def test__read_text_unicode(s, a, b):
    fn = '/tmp/test/data.txt'
    data = b'abcd\xc3\xa9'
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = yield _read_text(fn, collection=False, lazy=False)
        result = yield f[0]._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0]) == 5

        yield e._shutdown()

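# `unicode` above is the Python 2 built-in; on Python 3 this test is assumed
# to run under a compatibility alias along these lines:
import sys
if sys.version_info[0] >= 3:
    unicode = str
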
def test__stack(s, a, b):
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result, = e.compute(y)
    yy = yield y_result._result()
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()

def test__stack(s, a, b):
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result = e.compute(y)
    yy = yield y_result._result()
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()

def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    remote_arrays = [[[e.submit(np.full, (2, 3, 4), i + j + k)
                       for i in range(2)]
                      for j in range(2)]
                     for k in range(4)]

    x = yield _futures_to_dask_array(remote_arrays, executor=e)
    assert x.chunks == ((2, 2, 2, 2), (3, 3), (4, 4))
    assert x.dtype == np.full((), 0).dtype
    assert isinstance(x, da.Array)

    expr = x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert isinstance(result[0], np.number)

    yield e._shutdown()

def test__futures_to_dask_array(s, a, b):
    import dask.array as da

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_arrays = [[[e.submit(np.full, (2, 3, 4), i + j + k)
                       for i in range(2)]
                      for j in range(2)]
                     for k in range(4)]

    x = yield _futures_to_dask_array(remote_arrays, executor=e)
    assert x.chunks == ((2, 2, 2, 2), (3, 3), (4, 4))
    assert x.dtype == np.full((), 0).dtype
    assert isinstance(x, da.Array)

    expr = x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert isinstance(result[0], np.number)

    yield e._shutdown()

def test__read_text(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/text.1.txt', 'wb') as f:
            f.write('Alice 100\nBob 200\nCharlie 300'.encode())

        with hdfs.open('/tmp/test/text.2.txt', 'wb') as f:
            f.write('Dan 400\nEdith 500\nFrank 600'.encode())

        with hdfs.open('/tmp/test/other.txt', 'wb') as f:
            f.write('a b\nc d'.encode())

        b = yield _read_text('/tmp/test/text.*.txt', collection=True,
                             lazy=True)
        yield gen.sleep(0.5)
        assert not s.tasks

        future = e.compute(b.str.strip().str.split().map(len))
        result = yield future._result()
        assert result == [2, 2, 2, 2, 2, 2]

        b = yield _read_text('/tmp/test/other.txt', collection=True,
                             lazy=False)
        future = e.compute(b.str.split().concat())
        result = yield future._result()
        assert result == ['a', 'b', 'c', 'd']

        L = yield _read_text('/tmp/test/text.*.txt', collection=False,
                             lazy=False)
        assert all(isinstance(x, Future) for x in L)

        L = yield _read_text('/tmp/test/text.*.txt', collection=False,
                             lazy=True)
        assert all(isinstance(x, Value) for x in L)

        yield e._shutdown()

def test_write_bytes(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        data = [b'123', b'456', b'789']
        remote_data = yield e._scatter(data)

        futures = write_bytes('/tmp/test/data/file.*.dat', remote_data,
                              hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data/')) == 3
        with hdfs.open('/tmp/test/data/file.1.dat') as f:
            assert f.read() == b'456'

        futures = write_bytes('/tmp/test/data2/', remote_data, hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data2/')) == 3

def test_read_bytes(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'w', repl=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)

def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

def test_read_csv_lazy(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', lazy=True,
                             lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()

def test_write_binary(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        data = [b'123', b'456', b'789']
        remote_data = yield e._scatter(data)

        futures = write_binary('/tmp/test/data/file.*.dat', remote_data,
                               hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data/')) == 3
        with hdfs.open('/tmp/test/data/file.1.dat') as f:
            assert f.read() == b'456'

        futures = write_binary('/tmp/test/data2/', remote_data, hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data2/')) == 3

def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100 * 10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                       name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5),
                                        remote._result())
        assert_equal(local, remote)

    yield e._shutdown()

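# The dataframe round-trip tests call an `assert_equal` helper that is not
# defined in this section. A minimal sketch, assuming it simply dispatches
# on the pandas type of the expected value:
def assert_equal(a, b):
    if isinstance(a, pd.DataFrame):
        tm.assert_frame_equal(a, b)
    elif isinstance(a, pd.Series):
        tm.assert_series_equal(a, b)
    elif isinstance(a, pd.Index):
        tm.assert_index_equal(a, b)
    else:
        assert a == b
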
def test_read_binary(s, a, b):
    with make_hdfs() as hdfs:
        assert hdfs._handle > 0
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'w', repl=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)

def test_dataframes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100 * 10, 100)]

    remote_dfs = e.map(lambda x: x, dfs)
    rdf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)

    name = 'foo'
    ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                       name, dfs[0].columns,
                       list(range(0, 1000, 100)) + [999])

    assert rdf.divisions == ldf.divisions

    remote = e.compute(rdf)
    result = yield remote._result()
    tm.assert_frame_equal(result, ldf.compute(get=dask.get))

    exprs = [lambda df: df.x.mean(),
             lambda df: df.y.std(),
             lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
             lambda df: df.index,
             lambda df: df.x,
             lambda df: df.x.cumsum(),
             lambda df: df.loc[50:75]]
    for f in exprs:
        local = f(ldf).compute(get=dask.get)
        remote = e.compute(f(rdf))
        remote = yield gen.with_timeout(timedelta(seconds=5),
                                        remote._result())
        assert_equal(local, remote)

    yield e._shutdown()

def test_read_csv_lazy(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True, lazy=True,
                             lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()