def test_read_text(s, a, b):
    """Exercise read_text against S3: lazy collection, lazy values, eager futures.

    Starts its own Executor against the running scheduler and shuts it
    down at the end.
    """
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value

    executor = Executor((s.ip, s.port), start=False)
    yield executor._start()

    bag = read_text(test_bucket_name, 'test/accounts', lazy=True,
                    collection=True, anon=True)
    assert isinstance(bag, db.Bag)
    yield gen.sleep(0.2)
    # lazy=True: nothing should have hit the scheduler yet
    assert not s.tasks

    total = executor.compute(
        bag.filter(None).map(json.loads).pluck('amount').sum())
    result = yield total._result()
    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    lazy_parts = read_text(test_bucket_name, 'test/accounts', lazy=True,
                           collection=False, anon=True)
    assert all(isinstance(part, Value) for part in lazy_parts)

    eager_parts = read_text(test_bucket_name, 'test/accounts', lazy=False,
                            collection=False, anon=True)
    assert all(isinstance(part, Future) for part in eager_parts)

    yield executor._shutdown()
def test_read_text(e, s, a, b):
    """read_text over a glob path: lazy collection, lazy Delayed values,
    eager futures."""
    import dask.bag as db

    path = test_bucket_name + '/test/accounts*'

    b = read_text(path, lazy=True, collection=True, anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    # lazy=True: the scheduler should still be idle
    assert not s.tasks

    total = e.compute(b.map(json.loads).pluck('amount').sum())
    result = yield total._result()
    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    lazy_parts = read_text(path, lazy=True, collection=False, anon=True)
    assert all(isinstance(part, Delayed) for part in lazy_parts)

    eager_parts = read_text(path, lazy=False, collection=False, anon=True)
    assert all(isinstance(part, Future) for part in eager_parts)
def test_read_text_bucket_key_inputs(loop):
    """Bucket and key may be passed separately (with or without a leading
    slash) or as a single joined path; all three spell the same keys."""
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            split_leading = read_text(test_bucket_name, '/text/accounts',
                                      lazy=True)
            split_plain = read_text(test_bucket_name, 'text/accounts',
                                    lazy=True)
            joined = read_text(test_bucket_name + '/text/accounts',
                               lazy=True)
            assert (split_leading._keys() == split_plain._keys()
                    == joined._keys())
def test_read_text_blocksize(e, s, a, b):
    """Partition count follows from file sizes and the requested blocksize."""
    for blocksize in [20, 27, 12]:
        bag = read_text(test_bucket_name + '/test/accounts*', lazy=True,
                        blocksize=blocksize, collection=True)
        expected = sum(ceil(len(data) / blocksize)
                       for data in files.values())
        assert bag.npartitions == expected
def test_read_text_compression(e, s, a, b):
    """Gzip files decompress transparently when blocksize is None; every
    non-empty decoded line comes back with its newline restored."""
    bag = read_text('distributed-test/csv/gzip/*', compression='gzip',
                    blocksize=None, anon=True)
    result = yield e.compute(bag)._result()
    expected = [line + '\n'
                for key in sorted(csv_files)
                for line in csv_files[key].decode().split('\n')
                if line]
    assert result == expected
def test_read_text_compression(e, s, a, b):
    """Gzip files decompress transparently when blocksize is None; every
    non-empty decoded line comes back with its newline restored."""
    bag = read_text('distributed-test/csv/gzip/*', compression='gzip',
                    blocksize=None)
    result = yield e.compute(bag)._result()
    expected = [line + '\n'
                for key in sorted(csv_files)
                for line in csv_files[key].decode().split('\n')
                if line]
    assert result == expected
def test_read_text_sync(loop):
    """Synchronous path: a lazy bag computes correctly via the executor's
    get."""
    import dask.bag as db
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            bag = read_text(test_bucket_name + '/test/accounts*',
                            lazy=True, collection=True)
            assert isinstance(bag, db.Bag)
            amounts = bag.map(json.loads).pluck('amount').sum()
            total = amounts.compute(get=e.get)
            assert total == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100
def test_read_text_sync(loop):
    """Synchronous path: a lazy bag (empty records filtered out) computes
    correctly via the executor's get."""
    import dask.bag as db
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            bag = read_text(test_bucket_name + '/test/accounts*',
                            lazy=True, collection=True)
            assert isinstance(bag, db.Bag)
            amounts = bag.filter(None).map(json.loads).pluck('amount').sum()
            total = amounts.compute(get=e.get)
            assert total == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100
def test_read_text(e, s, a, b):
    """read_text over a glob path: lazy collection, lazy Delayed values,
    eager futures."""
    import dask.bag as db

    path = test_bucket_name + '/test/accounts*'

    b = read_text(path, lazy=True, collection=True, anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    # lazy=True: the scheduler should still be idle
    assert not s.tasks

    total = e.compute(b.map(json.loads).pluck('amount').sum())
    result = yield total._result()
    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    lazy_parts = read_text(path, lazy=True, collection=False, anon=True)
    assert all(isinstance(part, Delayed) for part in lazy_parts)

    eager_parts = read_text(path, lazy=False, collection=False, anon=True)
    assert all(isinstance(part, Future) for part in eager_parts)
def test_read_text_blocksize(e, s, a, b):
    """Partition count follows from file sizes and the requested blocksize."""
    for blocksize in [20, 27, 12]:
        bag = read_text(test_bucket_name + '/test/accounts*', lazy=True,
                        blocksize=blocksize, collection=True)
        expected = sum(ceil(len(data) / blocksize)
                       for data in files.values())
        assert bag.npartitions == expected
def test_read_text_compression(e, s, a, b):
    """Gzip directory read: every decoded line comes back as-is (empty
    lines included, no newline re-appended)."""
    bag = read_text('distributed-test/csv/gzip/', compression='gzip')
    result = yield e.compute(bag)._result()
    expected = [line
                for key in sorted(csv_files)
                for line in csv_files[key].decode().split('\n')]
    assert result == expected