def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
                                                   for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
                                                   for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local)
        local_results = dask.get(local.dask, local._keys())
        local_result = da.Array._finalize(local, local_results)

        remote = expr(x_remote, y_remote)
        remote_results = yield e._get(remote.dask, remote._keys())
        remote_result = da.Array._finalize(remote, remote_results)

        assert np.all(local_result == remote_result)

    yield e._shutdown()


def test_read_text(s, a, b):
    pytest.importorskip('dask.bag')
    import dask.bag as db
    from dask.imperative import Value

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    b = read_text(test_bucket_name, 'test/accounts', lazy=True,
                  collection=True, anon=True)
    assert isinstance(b, db.Bag)
    yield gen.sleep(0.2)
    assert not s.tasks

    future = e.compute(b.filter(None).map(json.loads).pluck('amount').sum())
    result = yield future._result()
    assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100

    text = read_text(test_bucket_name, 'test/accounts', lazy=True,
                     collection=False, anon=True)
    assert all(isinstance(v, Value) for v in text)

    text = read_text(test_bucket_name, 'test/accounts', lazy=False,
                     collection=False, anon=True)
    assert all(isinstance(v, Future) for v in text)

    yield e._shutdown()


def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1


def test__dask_array_collections(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
                                                   for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
                                                   for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()


def test_with_data(s, a, b):
    ss = HTTPScheduler(s)
    ss.listen(0)

    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = e.map(inc, [1, 2, 3])
    L2 = yield e._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()
    response = yield client.fetch('http://localhost:%s/memory-load.json'
                                  % ss.port)
    out = json.loads(response.body.decode())

    assert all(isinstance(v, int) for v in out.values())
    assert set(out) == {a.address_string, b.address_string}
    assert sum(out.values()) == sum(map(sys.getsizeof,
                                        [1, 2, 3, 'Hello', 'world!']))

    response = yield client.fetch(
            'http://localhost:%s/memory-load-by-key.json' % ss.port)
    out = json.loads(response.body.decode())
    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(v, dict) for v in out.values())
    assert all(k in {'inc', 'data'} for d in out.values() for k in d)
    assert all(isinstance(v, int) for d in out.values() for v in d.values())

    assert sum(v for d in out.values() for v in d.values()) == \
        sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    ss.stop()
    yield e._shutdown()


def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost',
                             '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)


def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()


def test_bokeh_non_standard_ports():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler',
                      '--port', '3448',
                      '--http-port', '4824',
                      '--bokeh-port', '4832'],
                     stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:3448')

        while True:
            line = proc.stderr.readline()
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                response = requests.get('http://localhost:4832/status/')
                assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)


def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)

    yield e._shutdown()


def test_procs(loop):
    with LocalCluster(2, scheduler_port=0, nanny=False, threads_per_worker=3,
                      diagnostic_port=None, silence_logs=False) as c:
        assert len(c.workers) == 2
        assert all(isinstance(w, Worker) for w in c.workers)
        with Executor((c.scheduler.ip, c.scheduler.port), loop=loop) as e:
            assert all(w.ncores == 3 for w in c.workers)
        repr(c)

    with LocalCluster(2, scheduler_port=0, nanny=True, threads_per_worker=3,
                      diagnostic_port=None, silence_logs=False) as c:
        assert len(c.workers) == 2
        assert all(isinstance(w, Nanny) for w in c.workers)
        with Executor((c.scheduler.ip, c.scheduler.port), loop=loop) as e:
            assert all(v == 3 for v in e.ncores().values())
            c.start_worker(nanny=False)
            assert isinstance(c.workers[-1], Worker)
        repr(c)


def test_avro(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'w') as f:
                f.write(v)
            assert hdfs.info(k)['size'] > 0

        L = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(L, list)
        assert all(isinstance(x, Future) for x in L)

        results = yield e._gather(L)
        assert all(isinstance(r, list) for r in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        L = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(L, list)
        assert all(isinstance(x, Value) for x in L)


def dont_test_dataframes(s, a):  # slow
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'],
                    skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1


def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_bytes('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def test_lazy_values(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        while not s.restrictions:
            yield gen.sleep(0.01)
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def test_bokeh():
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            if b'Start Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost',
                             '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            except:
                sleep(0.1)
                assert time() < start + 5
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)


def test_no_divisions(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)


def test_hostport():
    try:
        proc = Popen(['dask-scheduler', '--no-bokeh',
                      '--host', '127.0.0.1:8978'],
                     stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:8978')
    finally:
        e.shutdown()
        os.kill(proc.pid, signal.SIGINT)


def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True)
    assert len(futures) >= len(files)
    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()


def test_defaults():
    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        response = requests.get('http://127.0.0.1:9786/info.json')
        assert response.ok
        assert response.json()['status'] == 'running'
    finally:
        e.shutdown()
        os.kill(proc.pid, signal.SIGINT)


def test_read_csv_with_names(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        df = yield _read_csv('/tmp/test/*.csv', names=['amount', 'name'],
                             lineterminator='\n', lazy=False)
        assert list(df.columns) == ['amount', 'name']

        yield e._shutdown()


def test_read_bytes(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    futures = read_bytes(test_bucket_name, prefix='test/', anon=True,
                         lazy=False)
    assert len(futures) >= len(files)
    results = yield e._gather(futures)
    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()


def test_read_bytes_lazy(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    values = read_bytes(test_bucket_name, 'test/', lazy=True, anon=True)
    assert all(isinstance(v, Value) for v in values)

    results = e.compute(values, sync=False)
    results = yield e._gather(results)

    assert set(results).issuperset(set(files.values()))

    yield e._shutdown()


def test_no_bokeh():
    pytest.importorskip('bokeh')
    try:
        proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        for i in range(3):
            assert b'bokeh' not in next(proc.stderr)
    finally:
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)


def _get_executor(self):
    loop = tornado.ioloop.IOLoop.current()
    IP = '127.0.0.1'
    PORT = 63000
    PORT_SCHEDULER = 63500
    from distributed import Executor
    executor = Executor('{}:{}'.format(IP, PORT_SCHEDULER),
                        loop=loop, start=False)
    yield executor._start()
    return executor


def f(c, a, b):
    e = Executor((c.ip, c.port), start=False)
    IOLoop.current().spawn_callback(e._go)

    remote_dfs = e.map(lambda x: x, dfs)
    ddf = yield _futures_to_dask_dataframe(e, remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()


def test_read_csv(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'w') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'w') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', header=True,
                             lineterminator='\n')
        result, = e.compute(df.id.sum(), sync=False)
        result = yield result._result()
        assert result == 1 + 2 + 3 + 4


def test__futures_to_dask_dataframe(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_dask_dataframe(remote_dfs, divisions=True,
                                           executor=e)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.divisions == (0, 30, 60, 80)
    expr = ddf.x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert result == [sum([df.x.sum() for df in dfs])]

    yield e._shutdown()


def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd
    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_csv('/tmp/test/*.csv', lineterminator='\n',
                                   collection=False, lazy=False, header=0)
                assert all(isinstance(f, Future) for f in futures)
                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv', lineterminator='\n',
                              collection=True, lazy=False, header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]


def dsubmit(*a, args=(), kwargs=None, rtn='', **kw):
    """Returns a distributed submission context manager, DSubmitter(),
    with a new executor instance.

    Parameters
    ----------
    args : Sequence of str, optional
        A tuple of argument names for DSubmitter.
    kwargs : Mapping of str to values or list of item tuples, optional
        Keyword argument names and values for DSubmitter.
    rtn : str, optional
        Name of object to return for DSubmitter.
    a, kw : Sequence and Mapping
        All other arguments and keyword arguments are used to construct
        the executor instance.

    Returns
    -------
    dsub : DSubmitter
        An instance of the DSubmitter context manager.
    """
    from distributed import Executor
    e = Executor(*a, **kw)
    dsub = DSubmitter(e, args=args, kwargs=kwargs, rtn=rtn)
    return dsub


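# Hedged usage sketch, not part of the source above: the scheduler address
# '127.0.0.1:8786' and the names 'data'/'result' are illustrative assumptions.
# Per the docstring, dsubmit() returns a DSubmitter context manager wrapping a
# fresh Executor; exactly how the body binds those names is up to DSubmitter,
# which is defined elsewhere.
def _dsubmit_usage_example():
    # Positional args go to Executor(); args/rtn name the submission's
    # inputs and return object.
    dsub = dsubmit('127.0.0.1:8786', args=('data',), rtn='result')
    with dsub:
        # Work placed here runs under the submission context created above.
        pass

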
def test_nanny_worker_ports(loop):
    try:
        worker = Popen(['dworker', '127.0.0.1:8989', '--host', '127.0.0.1',
                        '--worker-port', '8788', '--nanny-port', '8789'],
                       stdout=PIPE, stderr=PIPE)
        sched = Popen(['dscheduler', '--port', '8989'],
                      stdout=PIPE, stderr=PIPE)
        with Executor('127.0.0.1:8989', loop=loop) as e:
            start = time()
            while True:
                d = sync(e.loop, e.scheduler.identity)
                if d['workers']:
                    break
                else:
                    assert time() - start < 5
                    sleep(0.1)

            assert d['workers']['127.0.0.1:8788']['services']['nanny'] == 8789
    finally:
        with ignoring(Exception):
            w = rpc('127.0.0.1:8789')
            sync(loop, w.terminate)
        with ignoring(Exception):
            os.kill(sched.pid, signal.SIGINT)
        with ignoring(Exception):
            worker.kill()


def client_context(dask_client=None, dask_scheduler=None):
    '''client_context creates a dask distributed or threadpool client or None

    Parameters:
        dask_client:    str from choices ("DISTRIBUTED", 'THREAD_POOL',
                        'SERIAL') or None to take DASK_CLIENT from environment
        dask_scheduler: Distributed scheduler url or None to take
                        DASK_SCHEDULER from environment
    '''
    env = parse_env_vars()
    dask_client = dask_client or env.get('DASK_CLIENT', 'SERIAL')
    dask_scheduler = dask_scheduler or env.get('DASK_SCHEDULER')
    if dask_client == 'DISTRIBUTED':
        if Executor is None:
            raise ValueError('distributed is not installed - '
                             '"conda install distributed"')
        client = Executor(dask_scheduler)
    elif dask_client == 'THREAD_POOL':
        client = ThreadPool(env.get('DASK_THREADS'))
    elif dask_client == 'SERIAL':
        client = None
    else:
        raise ValueError('Did not expect DASK_CLIENT to be '
                         '{}'.format(dask_client))
    get_func = _find_get_func_for_client(client)
    with da.set_options(pool=dask_client):
        yield client


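# Hedged usage sketch, not part of the module above. client_context yields its
# client, so it is presumably wrapped with @contextlib.contextmanager in the
# full source; the scheduler address below is an illustrative assumption.
def _client_context_usage_example():
    with client_context(dask_client='DISTRIBUTED',
                        dask_scheduler='127.0.0.1:8786') as client:
        # Here client is a distributed Executor; with dask_client='SERIAL'
        # it would be None, and with 'THREAD_POOL' a multiprocessing pool.
        print(client)

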
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)

    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            with dask.set_options(get=e.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)
                assert_equal(a, b.compute(get=dask.get).sort_index())


def test_gather_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = e.gather(L)
            assert result == list(map(inc, range(10)))


def test_submit_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = e.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))


def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_results = yield e._get(y.dask, y._keys())
    yy = da.Array._finalize(y, y_results)
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()


def test_read_text_bucket_key_inputs(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            a = read_text(test_bucket_name, '/text/accounts', lazy=True)
            b = read_text(test_bucket_name, 'text/accounts', lazy=True)
            c = read_text(test_bucket_name + '/text/accounts', lazy=True)

            assert a._keys() == b._keys() == c._keys()


def test_stress_gc(loop, func, n):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(func, 1)
            for i in range(n):
                x = e.submit(func, x)

            assert x.result() == n + 2


def test_framework_runs(self):
    with MesosCluster() as cluster:
        time.sleep(2)
        driver = DistributedDriver().create_driver(DistributedScheduler)
        driver.start()
        time.sleep(5)

        expect(cluster).to(have_activated_slaves(1))
        expect(cluster).to(have_framework_name('distributed-framework'))

        # distributed test - this probably doesn't belong here
        executor = Executor('127.0.0.1:8787')
        A = executor.map(lambda x: x ** 2, range(10))
        B = executor.map(lambda x: -x, A)
        total = executor.submit(sum, B)
        expect(total.result()).to(equal(-285))

        driver.stop()


def test_progress_function(loop, capsys):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            f = e.submit(lambda: 1)
            g = e.submit(lambda: 2)

            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)


def test__read_text_unicode(s, a, b):
    fn = '/tmp/test/data.txt'
    data = b'abcd\xc3\xa9'
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open(fn, 'wb') as f:
            f.write(b'\n'.join([data, data]))

        f = yield _read_text(fn, collection=False, lazy=False)
        result = yield f[0]._result()
        assert len(result) == 2
        assert list(map(unicode.strip, result)) == [data.decode('utf-8')] * 2
        assert len(result[0]) == 5

        yield e._shutdown()


def test__futures_to_collection(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)
    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)
    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = yield _futures_to_collection(remote_lists)
    c = yield _futures_to_dask_bag(remote_lists)
    assert type(b) == type(c)
    assert b.dask == c.dask

    yield e._shutdown()


def test_Executor_with_local(loop):
    with LocalCluster(1, scheduler_port=0, silence_logs=False,
                      diagnostic_port=None, loop=loop) as c:
        with Executor(c, loop=loop) as e:
            assert len(e.ncores()) == len(c.workers)
            assert c.scheduler_address in repr(e)


def test_futures_to_dask_dataframe(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            remote_dfs = e.map(lambda x: x, dfs)
            ddf = futures_to_dask_dataframe(remote_dfs, divisions=True)

            assert isinstance(ddf, dd.DataFrame)
            assert ddf.x.sum().compute(get=e.get) == sum(
                    [df.x.sum() for df in dfs])


def test_read_text_sync(loop):
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                b = read_text('/tmp/test/*.txt', lazy=False)
                assert list(b.str.upper()) == ['HELLO', 'WORLD']


def test__stack(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    arrays = e.map(np.ones, [(5, 5)] * 6)
    y = yield _stack(arrays, axis=0)
    assert y.shape == (6, 5, 5)
    assert y.chunks == ((1, 1, 1, 1, 1, 1), (5,), (5,))

    y_result = e.compute(y)
    yy = yield y_result._result()
    assert isinstance(yy, np.ndarray)
    assert yy.shape == y.shape
    assert (yy == 1).all()

    yield e._shutdown()


def test_restart_sync_no_center(loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(inc, 1)
            e.restart()
            assert x.cancelled()
            y = e.submit(inc, 2)
            assert y.result() == 3
            assert len(e.ncores()) == 2


def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    remote_arrays = [[[e.submit(np.full, (2, 3, 4), i + j + k)
                       for i in range(2)]
                      for j in range(2)]
                     for k in range(4)]

    x = yield _futures_to_dask_array(remote_arrays, executor=e)
    assert x.chunks == ((2, 2, 2, 2), (3, 3), (4, 4))
    assert x.dtype == np.full((), 0).dtype
    assert isinstance(x, da.Array)

    expr = x.sum()
    result = yield e._get(expr.dask, expr._keys())
    assert isinstance(result[0], np.number)

    yield e._shutdown()


def test_futures_to_dask_arrays(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            futures = e.map(np.ones, [(5, i) for i in range(1, 6)])

            x = future_to_dask_array(futures[0])
            assert x.shape == (5, 1)
            assert (x.compute(get=e.get) == 1).all()

            xs = futures_to_dask_arrays(futures)
            assert [x.shape for x in xs] == [(5, i) for i in range(1, 6)]


def test_write_bytes(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        data = [b'123', b'456', b'789']
        remote_data = yield e._scatter(data)

        futures = write_bytes('/tmp/test/data/file.*.dat', remote_data,
                              hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data/')) == 3
        with hdfs.open('/tmp/test/data/file.1.dat') as f:
            assert f.read() == b'456'

        futures = write_bytes('/tmp/test/data2/', remote_data, hdfs=hdfs)
        yield _wait(futures)

        assert len(hdfs.ls('/tmp/test/data2/')) == 3


def test_read_bytes(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a' * int(1e8)
        fn = '/tmp/test/file'

        with hdfs.open(fn, 'w', repl=1) as f:
            f.write(data)

        blocks = hdfs.get_block_locations(fn)
        assert len(blocks) > 1

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes(fn, hdfs=hdfs)
        assert len(futures) == len(blocks)
        assert futures[0].executor is e
        results = yield e._gather(futures)
        assert b''.join(results) == data
        assert s.restrictions
        assert {f.key for f in futures}.issubset(s.loose_restrictions)


def test_get_block_locations_nested(s, a, b):
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        L = get_block_locations(hdfs, '/tmp/test/')
        assert len(L) == 6

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_bytes('/tmp/test/', hdfs=hdfs)
        results = yield e._gather(futures)
        assert len(results) == 6
        assert all(x == b'a' for x in results)


def test_read_csv_lazy(s, a, b):
    with make_hdfs() as hdfs:
        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        with hdfs.open('/tmp/test/1.csv', 'wb') as f:
            f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

        with hdfs.open('/tmp/test/2.csv', 'wb') as f:
            f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

        df = yield _read_csv('/tmp/test/*.csv', lazy=True,
                             lineterminator='\n')
        assert df._known_dtype
        yield gen.sleep(0.5)
        assert not s.tasks

        result = yield e.compute(df.id.sum(), sync=False)._result()
        assert result == 1 + 2 + 3 + 4

        yield e._shutdown()