# Test snippets for distributed's HDFS/S3 byte-reading helpers. Names such
# as make_hdfs, read_bytes/read_binary, _read_avro, get_block_locations,
# Executor, Future, Value, gen, pytest, files, data, avro_bytes, and
# test_bucket_name are assumed to be supplied by the surrounding test
# modules.


def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'
        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        # Lazy values register restrictions with the scheduler but submit
        # no tasks until computed.
        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1
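

# The load helper above is the crux: read_bytes with delimiter=b'\r\n'
# yields byte chunks that begin and end on record boundaries, and each chunk
# is parsed independently with skiprows=1 (dropping the header in the first
# chunk and one data row in each later chunk, hence n * 2 - 1 rows when the
# file splits into two chunks). A minimal local sketch of that round trip,
# with chunk boundaries chosen by hand rather than by HDFS block size:

def _delimited_csv_sketch():
    from io import BytesIO

    import pandas as pd

    n = 3
    payload = b'name,amount,id\r\n' + b'Alice,100,1\r\nBob,200,2\r\n' * n

    # Split on the record delimiter so every chunk holds whole lines, as
    # read_bytes(..., delimiter=b'\r\n') would.
    lines = payload.split(b'\r\n')
    chunks = [b'\r\n'.join(lines[:4]) + b'\r\n',
              b'\r\n'.join(lines[4:])]

    def load(b, **kwargs):
        assert b
        return pd.read_csv(BytesIO(b), **kwargs)

    dfs = [load(c, names=['name', 'amount', 'id'], skiprows=1)
           for c in chunks]
    assert sum(map(len, dfs)) == n * 2 - 1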


def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'w') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)


def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

        yield e._shutdown()
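

# avro_bytes and data are fixtures from the surrounding module: an Avro
# container file serialized to bytes, and the list of records it holds. One
# plausible way to build such a pair, sketched with fastavro (an assumption;
# the original fixtures may be constructed differently):

def _make_avro_fixtures():
    from io import BytesIO

    import fastavro

    schema = {'name': 'Person',
              'type': 'record',
              'fields': [{'name': 'name', 'type': 'string'},
                         {'name': 'amount', 'type': 'int'}]}
    data = [{'name': 'Alice', 'amount': i} for i in range(100)]

    # Serialize the records into an in-memory Avro container file.
    buf = BytesIO()
    fastavro.writer(buf, schema, data)
    avro_bytes = buf.getvalue()

    # Reading the bytes back yields the original records, which is what the
    # futures/values from _read_avro resolve to in the tests above.
    assert list(fastavro.reader(BytesIO(avro_bytes))) == data
    return avro_bytes, data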


def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1


def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'
        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True)
    assert len(futures) >= len(files)

    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()


def test_read_bytes_lazy(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(v, Value) for v in values)

    results = e.compute(values, sync=False)
    results = yield e._gather(results)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()


def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True,
                         lazy=False)
    assert len(futures) >= len(files)

    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()
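

# files maps S3 key names to their byte contents, and test_bucket_name is a
# module-level fixture. A rough local analogue of the eager read_bytes path,
# using s3fs directly (assuming anonymous access to the same public bucket):

def _s3_read_sketch():
    import s3fs

    fs = s3fs.S3FileSystem(anon=True)

    # List every key under the prefix and fetch its bytes; read_bytes
    # returns the same payloads, but as futures executing on the cluster.
    keys = fs.ls(test_bucket_name + '/test/')
    return [fs.cat(key) for key in keys]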


def test_read_bytes(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'
        with hdfs.open(fn, 'w', repl=1) as f:
            f.write(data)
        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data

        # Each block-read task is restricted to the hosts holding that
        # block, but only loosely, so other workers may still steal it.
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)


def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'
        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'
        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def test_read_binary(s, a, b):
    with make_hdfs() as hdfs:
        # A positive handle means the underlying libhdfs connection is live.
        assert hdfs._handle > 0

        data = b'a' * int(1e8)
        fn = '/tmp/test/file'
        with hdfs.open(fn, 'w', repl=1) as f:
            f.write(data)
        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)
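

# All of the tests above are coroutines expecting a live scheduler s and
# workers a/b. In distributed's test suite that wiring comes from a decorator
# in the style of distributed.utils_test.gen_cluster, which starts a
# scheduler and workers, passes them in, and drives the generator on the
# event loop. A minimal sketch; the decorator's arguments and the (s, a, b)
# signature follow the old API and are assumptions that vary across versions:

from distributed import Executor
from distributed.utils_test import gen_cluster


@gen_cluster(timeout=60)
def _example_cluster_test(s, a, b):
    # The decorator supplies a running Scheduler and two Workers and runs
    # this generator to completion; the body mirrors the tests above. The
    # leading underscore keeps pytest from collecting the sketch.
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    yield e._shutdown()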