def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'abc\n' * int(1e3)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        x = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        y = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'\n')
        z = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True, delimiter=b'c')

        assert [f.key for f in x] == [f.key for f in y]
        assert [f.key for f in x] != [f.key for f in z]

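# The tests in this file assume a make_hdfs context manager that yields a
# connected HDFS client and clears '/tmp/test' around each test, plus the
# imports sketched below. Module paths differ across the dask/distributed
# versions these tests span, so treat both as a plausible sketch rather than
# the authoritative setup:
#
#     import pytest
#     from tornado import gen
#     from dask.delayed import delayed, Delayed  # older tests use dask.imperative.Value
#     from distributed import Executor
#     from distributed.utils_test import cluster
#     from distributed.hdfs import read_bytes, get_block_locations

from contextlib import contextmanager


@contextmanager
def make_hdfs():
    from hdfs3 import HDFileSystem  # assumed client library
    hdfs = HDFileSystem(host='localhost', port=8020)  # assumed NameNode address
    if hdfs.exists('/tmp/test'):
        hdfs.rm('/tmp/test', recursive=True)  # start from a clean slate
    hdfs.mkdir('/tmp/test')
    try:
        yield hdfs
    finally:
        if hdfs.exists('/tmp/test'):
            hdfs.rm('/tmp/test', recursive=True)  # always clean up after the test
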
def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

        yield e._shutdown()

def test_read_bytes(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        sample, values = read_bytes(fn, hdfs=hdfs)
        assert sample[:5] == b'aaaaa'
        assert len(values) == len(blocks)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        assert {v.key for v in values} == set(s.restrictions)
        assert {v.key for v in values} == set(s.loose_restrictions)

        futures = e.compute(values)
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions

def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'wb') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n', lazy=False)
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1

        yield e._shutdown()

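# Note on the test above: passing delimiter=b'\r\n' to read_bytes makes block
# boundaries fall on line endings, so every future holds whole CSV rows and
# load() can parse each block independently of its neighbors.
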
def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1

        yield e._shutdown()

def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        results = e.compute(values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

        yield e._shutdown()

def test_read_bytes_sync(loop):
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            data = b'a' * int(1e3)
            for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*', lazy=False)
                results = e.gather(futures)
                assert b''.join(results) == 100 * data

def test_read_bytes_sync(loop):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e3)
        for fn in ['/tmp/test/file.%d' % i for i in range(100)]:
            with hdfs.open(fn, 'w', repl=1) as f:
                f.write(data)

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_bytes('/tmp/test/file.*')
                results = e.gather(futures)
                assert b''.join(results) == 100 * data

def test_write_bytes(e, s, a, b):
    from dask.bytes.core import write_bytes, read_bytes
    with make_hdfs() as hdfs:
        path = 'hdfs:///tmp/test/'
        data = [b'test data %i' % i for i in range(5)]
        values = [delayed(d) for d in data]
        out = write_bytes(values, path, hdfs=hdfs)
        futures = e.compute(out)
        results = yield e._gather(futures)
        assert len(hdfs.ls('/tmp/test/')) == 5

        sample, vals = read_bytes('hdfs:///tmp/test/*.part', hdfs=hdfs,
                                  lazy=True)
        futures = e.compute(vals)
        results = yield e._gather(futures)
        assert data == results

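# Note on the round trip above: write_bytes produces one output file per input
# value (five files, matching the ls() count), and the outputs are read back
# with a '*.part' glob, so the final equality check exercises the
# one-file-per-partition naming scheme end to end.
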
def test_read_bytes(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        futures = read_bytes(fn, hdfs=hdfs, lazy=False)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)

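# In the test above, s.restrictions maps each key to the workers that hold its
# HDFS blocks, while listing the same keys in s.loose_restrictions marks that
# placement as a data-locality preference the scheduler may relax rather than
# a hard requirement.
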
def test_get_block_locations_nested(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        futures = read_bytes('/tmp/test/', hdfs=hdfs, lazy=False)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

        yield e._shutdown()

def test_lazy_values(e, s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'wb', replication=1) as f:
                    f.write(data)

        sample, values = read_bytes('/tmp/test/', hdfs=hdfs)
        assert all(isinstance(v, Delayed) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.tasks

        results = e.compute(values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)
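
# For reference, the two read_bytes conventions exercised in this file (exact
# signatures vary across the versions these tests span; an illustrative
# summary, not a stable API):
#
#     futures = read_bytes(fn, hdfs=hdfs, lazy=False)  # eager: concrete Futures
#     sample, values = read_bytes(fn, hdfs=hdfs)       # lazy: head sample plus Delayed values
#
# Lazy reads register worker restrictions with the scheduler up front (hence
# the polling on s.restrictions) but create no tasks until computed (hence the
# asserts that s.tasks is empty).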