def test_procs(loop):
    """LocalCluster should hold Workers or Nannies depending on ``nanny``.

    The first cluster is started with ``nanny=False`` and must contain two
    ``Worker`` objects with three cores each.  The second is started with
    ``nanny=True`` and must contain two ``Nanny`` objects, while a worker
    added afterwards with ``nanny=False`` must be a plain ``Worker``.
    """
    # Options shared by both clusters; only ``nanny`` differs between them.
    common = dict(scheduler_port=0, threads_per_worker=3,
                  diagnostic_port=None, silence_logs=False)

    with LocalCluster(2, nanny=False, **common) as local:
        assert len(local.workers) == 2
        assert all(isinstance(worker, Worker) for worker in local.workers)
        address = (local.scheduler.ip, local.scheduler.port)
        with Executor(address, loop=loop) as executor:
            assert all(worker.ncores == 3 for worker in local.workers)
        # repr() must not raise.
        repr(local)

    with LocalCluster(2, nanny=True, **common) as local:
        assert len(local.workers) == 2
        assert all(isinstance(worker, Nanny) for worker in local.workers)
        address = (local.scheduler.ip, local.scheduler.port)
        with Executor(address, loop=loop) as executor:
            assert all(count == 3 for count in executor.ncores().values())

            local.start_worker(nanny=False)
            newest = local.workers[-1]
            assert isinstance(newest, Worker)
        repr(local)
def dont_test_dataframes(s, a):  # slow
    """Write a large CSV to HDFS, read it back in blocks and parse with pandas.

    The ``dont_`` prefix presumably keeps the test runner from collecting
    this function, since it is marked as slow.

    Parameters
    ----------
    s : scheduler object exposing ``ip`` and ``port``
    a : worker supplied by the test harness (unused here)
    """
    pytest.importorskip('pandas')
    n = 3000000
    fn = '/tmp/test/file.csv'
    with make_hdfs() as hdfs:
        # One header line followed by ``2 * n`` data rows.
        data = (b'name,amount,id\r\n' +
                b'Alice,100,1\r\nBob,200,2\r\n' * n)
        with hdfs.open(fn, 'w') as f:
            f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        futures = read_binary(fn, hdfs=hdfs, delimiter=b'\r\n')
        assert len(futures) > 1

        def load(b, **kwargs):
            # Imports are local because this function runs on the workers.
            assert b
            from io import BytesIO
            import pandas as pd
            bio = BytesIO(b)
            return pd.read_csv(bio, **kwargs)

        dfs = e.map(load, futures, names=['name', 'amount', 'id'], skiprows=1)
        dfs2 = yield e._gather(dfs)
        assert sum(map(len, dfs2)) == n * 2 - 1

    # Release the executor, as the other coroutine tests in this file do.
    yield e._shutdown()
def test_bokeh():
    """``dscheduler`` should serve the Bokeh status page on port 8787.

    Starts ``dscheduler`` as a subprocess, waits for its stderr to announce
    the Bokeh UI, then polls ``/status/`` on several names for this host for
    up to five seconds.  The executor and the subprocess are always cleaned
    up on a best-effort basis.
    """
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)

        while True:
            line = proc.stderr.readline()
            # readline() returns b'' at EOF; without this check the loop
            # would spin forever if dscheduler exits before announcing Bokeh.
            assert line, 'dscheduler closed stderr before starting Bokeh UI'
            if b'Start Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                for name in [socket.gethostname(), 'localhost',
                             '127.0.0.1', get_ip()]:
                    response = requests.get('http://%s:8787/status/' % name)
                    assert response.ok
                break
            # ``Exception`` still covers connection errors and the failed
            # assertion above, but lets KeyboardInterrupt/SystemExit through.
            except Exception:
                sleep(0.1)
                assert time() < start + 5

    finally:
        # Best-effort cleanup: either name may be unbound if startup failed.
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
def dsubmit(*a, args=(), kwargs=None, rtn='', **kw):
    """Build a DSubmitter context manager around a fresh executor.

    Parameters
    ----------
    args : Sequence of str, optional
        Argument names forwarded to DSubmitter.
    kwargs : Mapping of str to values or list of item tuples, optional
        Keyword argument names and values forwarded to DSubmitter.
    rtn : str, optional
        Name of the object DSubmitter should return.
    a, kw : Sequence and Mapping
        Every remaining positional and keyword argument is passed to the
        executor constructor.

    Returns
    -------
    dsub : DSubmitter
        The DSubmitter context manager wrapping the new executor.
    """
    # Imported lazily, at call time rather than at module import.
    from distributed import Executor
    executor = Executor(*a, **kw)
    return DSubmitter(executor, args=args, kwargs=kwargs, rtn=rtn)
def test_lazy_values(s, a, b):
    """``read_binary(..., lazy=True)`` should return Values computed on demand.

    Six one-byte files are written across three HDFS directories.  The lazy
    read must yield ``Value`` objects and leave ``s.dask`` empty until they
    are computed; computing and gathering must return the six payloads.
    """
    with make_hdfs() as hdfs:
        data = b'a'

        for i in range(3):
            hdfs.mkdir('/tmp/test/data-%d' % i)
            for j in range(2):
                fn = '/tmp/test/data-%d/file-%d.csv' % (i, j)
                with hdfs.open(fn, 'w', repl=1) as f:
                    f.write(data)

        e = Executor((s.ip, s.port), start=False)
        yield e._start()

        values = read_binary('/tmp/test/', hdfs=hdfs, lazy=True)
        assert all(isinstance(v, Value) for v in values)

        # Wait until the scheduler has recorded restrictions for the blocks.
        while not s.restrictions:
            yield gen.sleep(0.01)
        # Nothing should have been scheduled yet.
        assert not s.dask

        results = e.compute(*values, sync=False)
        results = yield e._gather(results)
        assert len(results) == 6
        assert all(x == b'a' for x in results)

    # Release the executor, as the other coroutine tests in this file do.
    yield e._shutdown()
def test_with_data(s, a, b):
    """The HTTP scheduler should report memory held on each worker.

    Checks both the ``memory-load.json`` and ``memory-load-by-key.json``
    endpoints after mapping ``inc`` over three integers and scattering two
    strings.
    """
    http_scheduler = HTTPScheduler(s)
    http_scheduler.listen(0)

    executor = Executor((s.ip, s.port), start=False)
    yield executor._start()

    L = executor.map(inc, [1, 2, 3])
    # Keep a reference to the scattered futures for the rest of the test.
    L2 = yield executor._scatter(['Hello', 'world!'])
    yield _wait(L)

    client = AsyncHTTPClient()

    # Per-worker totals.
    url = 'http://localhost:%s/memory-load.json' % http_scheduler.port
    response = yield client.fetch(url)
    out = json.loads(response.body.decode())

    assert all(isinstance(nbytes, int) for nbytes in out.values())
    assert set(out) == {a.address_string, b.address_string}
    total = sum(out.values())
    assert total == sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    # Per-worker totals, broken down by key prefix.
    url = 'http://localhost:%s/memory-load-by-key.json' % http_scheduler.port
    response = yield client.fetch(url)
    out = json.loads(response.body.decode())

    assert set(out) == {a.address_string, b.address_string}
    assert all(isinstance(by_key, dict) for by_key in out.values())
    assert all(key in {'inc', 'data'}
               for by_key in out.values() for key in by_key)
    assert all(isinstance(nbytes, int)
               for by_key in out.values() for nbytes in by_key.values())
    total = sum(nbytes for by_key in out.values()
                for nbytes in by_key.values())
    assert total == sum(map(sys.getsizeof, [1, 2, 3, 'Hello', 'world!']))

    http_scheduler.stop()
    yield executor._shutdown()
def test_bokeh_non_standard_ports():
    """``dscheduler`` should honour ``--port``/``--http-port``/``--bokeh-port``.

    Starts ``dscheduler`` on ports 3448/4824/4832, waits for its stderr to
    mention the Bokeh UI, then polls the status page on port 4832 for up to
    five seconds.  The executor and the subprocess are always cleaned up on
    a best-effort basis.
    """
    pytest.importorskip('bokeh')

    try:
        proc = Popen(['dscheduler',
                      '--port', '3448',
                      '--http-port', '4824',
                      '--bokeh-port', '4832'], stdout=PIPE, stderr=PIPE)
        e = Executor('127.0.0.1:3448')

        while True:
            line = proc.stderr.readline()
            # readline() returns b'' at EOF; without this check the loop
            # would spin forever if dscheduler exits before announcing Bokeh.
            assert line, 'dscheduler closed stderr before starting Bokeh UI'
            if b'Bokeh UI' in line:
                break

        start = time()
        while True:
            try:
                response = requests.get('http://localhost:4832/status/')
                assert response.ok
                break
            # ``Exception`` still covers connection errors and the failed
            # assertion above, but lets KeyboardInterrupt/SystemExit through.
            except Exception:
                sleep(0.1)
                assert time() < start + 5

    finally:
        # Best-effort cleanup: either name may be unbound if startup failed.
        with ignoring(Exception):
            e.shutdown()
        with ignoring(Exception):
            os.kill(proc.pid, signal.SIGINT)
def test_read_csv_sync(loop):
    """``read_csv`` should return futures or a dask DataFrame on request.

    Two small CSV files are written to HDFS and read back with a glob, once
    with ``collection=False`` (a list of futures holding pandas DataFrames)
    and once with ``collection=True`` (a dask DataFrame).
    """
    import dask.dataframe as dd
    import pandas as pd

    csv_files = [
        ('/tmp/test/1.csv', b'name,amount,id\nAlice,100,1\nBob,200,2'),
        ('/tmp/test/2.csv', b'name,amount,id\nCharlie,300,3\nDennis,400,4'),
    ]
    # Options common to both read_csv calls below.
    read_options = dict(lineterminator='\n', lazy=False, header=0)

    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as hdfs:
            for path, payload in csv_files:
                with hdfs.open(path, 'wb') as f:
                    f.write(payload)

            with Executor(('127.0.0.1', s['port']), loop=loop) as executor:
                futures = read_csv('/tmp/test/*.csv', collection=False,
                                   **read_options)
                assert all(isinstance(fut, Future) for fut in futures)
                frames = executor.gather(futures)
                first = frames[0]
                assert isinstance(first, pd.DataFrame)
                assert list(first.columns) == ['name', 'amount', 'id']

                df = read_csv('/tmp/test/*.csv', collection=True,
                              **read_options)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
def test_dataframe_groupby_tasks(loop):
    """Groupby-apply on a dask DataFrame should match pandas without partd.

    With ``e.get`` set as the default scheduler, groupby by column name and
    by series is compared against pandas, and the resulting graphs must not
    contain any key mentioning ``'partd'``.  Grouping by a DataFrame must
    raise ``NotImplementedError``.
    """
    df = pd.util.testing.makeTimeDataFrame()
    # Bucket the float columns so that groups contain more than one row.
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)

    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as executor:
            with dask.set_options(get=executor.get):
                # Group either by column name or by the column itself.
                for ind in [lambda x: 'A', lambda x: x.A]:
                    expected = df.groupby(ind(df)).apply(len)
                    result = ddf.groupby(ind(ddf)).apply(len)
                    computed = result.compute(get=dask.get).sort_index()
                    assert_equal(expected, computed)
                    assert not any('partd' in k[0] for k in result.dask)

                    expected = df.groupby(ind(df)).B.apply(len)
                    result = ddf.groupby(ind(ddf)).B.apply(len)
                    computed = result.compute(get=dask.get).sort_index()
                    assert_equal(expected, computed)
                    assert not any('partd' in k[0] for k in result.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                expected = df.groupby(['A', 'B']).apply(len)
                result = ddf.groupby(['A', 'B']).apply(len)

                computed = result.compute(get=dask.get).sort_index()
                assert_equal(expected, computed)
def test__dask_array_collections(s, a, b):
    """Dask arrays backed by scattered futures should compute like local ones.

    Two block dictionaries are scattered to the cluster; arrays built on the
    returned futures are run through several expressions and compared with
    the same expressions evaluated locally via ``dask.get``.
    """
    import dask.array as da

    executor = Executor((s.ip, s.port), start=False)
    yield executor._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield executor._scatter(x_dsk)
    y_futures = yield executor._scatter(y_dsk)

    dt = np.random.random(0).dtype
    # Chunk layouts: x is a 3x2 grid of 3x3 blocks, y is a 2x3 grid.
    x_chunks = ((3, 3, 3), (3, 3))
    y_chunks = ((3, 3), (3, 3, 3))

    x_local = da.Array(x_dsk, 'x', x_chunks, dt)
    y_local = da.Array(y_dsk, 'y', y_chunks, dt)
    x_remote = da.Array(x_futures, 'x', x_chunks, dt)
    y_remote = da.Array(y_futures, 'y', y_chunks, dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        # compute() hands back a one-element sequence here.
        remote, = executor.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield executor._shutdown()
def test__futures_to_collection(s, a, b):
    """``_futures_to_collection`` should agree with the type-specific helpers.

    For futures holding DataFrames, arrays and lists, the generic helper
    must build the same graph as ``_futures_to_dask_dataframe``,
    ``_futures_to_dask_array`` and ``_futures_to_dask_bag`` respectively.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    remote_dfs = e.map(identity, dfs)
    ddf = yield _futures_to_collection(remote_dfs, divisions=True)
    ddf2 = yield _futures_to_dask_dataframe(remote_dfs, divisions=True)
    assert isinstance(ddf, dd.DataFrame)

    assert ddf.dask == ddf2.dask

    remote_arrays = e.map(np.arange, range(3, 5))
    x = yield _futures_to_collection(remote_arrays)
    y = yield _futures_to_dask_array(remote_arrays)

    assert type(x) == type(y)
    assert x.dask == y.dask

    remote_lists = yield e._scatter([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    bag = yield _futures_to_collection(remote_lists)
    bag2 = yield _futures_to_dask_bag(remote_lists)

    assert type(bag) == type(bag2)
    # Compare the two bags with each other; the previous check compared a
    # graph with itself and therefore could never fail.
    assert bag.dask == bag2.dask

    yield e._shutdown()
def test__futures_to_dask_bag(s, a, b):
    """A bag built from scattered lists should compute like a local bag.

    Three lists are scattered, wrapped with ``_futures_to_dask_bag`` and
    compared against ``db.from_sequence`` over the same nine items.
    """
    import dask.bag as db

    executor = Executor((s.ip, s.port), start=False)
    yield executor._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

    futures = yield executor._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    # One partition per scattered list.
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        pending = executor.compute(expr(rb))
        remote = yield pending._result()

        assert local == remote

    yield executor._shutdown()
def test_nanny_worker_ports(loop):
    """``dworker`` should register with the requested worker and nanny ports.

    Starts a ``dworker`` (worker port 8788, nanny port 8789) and a
    ``dscheduler`` on port 8989, waits up to five seconds for the worker to
    appear in the scheduler identity, and checks the advertised nanny port.
    """
    worker_cmd = ['dworker', '127.0.0.1:8989', '--host', '127.0.0.1',
                  '--worker-port', '8788', '--nanny-port', '8789']
    scheduler_cmd = ['dscheduler', '--port', '8989']

    try:
        worker = Popen(worker_cmd, stdout=PIPE, stderr=PIPE)
        sched = Popen(scheduler_cmd, stdout=PIPE, stderr=PIPE)
        with Executor('127.0.0.1:8989', loop=loop) as executor:
            start = time()
            while True:
                d = sync(executor.loop, executor.scheduler.identity)
                if d['workers']:
                    break
                # No worker registered yet: enforce the deadline, then retry.
                assert time() - start < 5
                sleep(0.1)
            services = d['workers']['127.0.0.1:8788']['services']
            assert services['nanny'] == 8789
    finally:
        # Best-effort teardown; any of these may fail if startup did.
        with ignoring(Exception):
            w = rpc('127.0.0.1:8789')
            sync(loop, w.terminate)

        with ignoring(Exception):
            os.kill(sched.pid, signal.SIGINT)
        with ignoring(Exception):
            worker.kill()
def test_avro(s, a, b):
    """``_read_avro`` should return futures or Values depending on ``lazy``.

    Two identical avro files are written to HDFS.  With ``lazy=False`` the
    result is a list of futures whose gathered contents start and end like
    ``data``; with ``lazy=True`` the result is a list of ``Value`` objects.
    """
    executor = Executor((s.ip, s.port), start=False)
    yield executor._start()

    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for path, payload in avro_files.items():
            with hdfs.open(path, 'wb') as f:
                f.write(payload)

            # The file must have been written with non-zero size.
            assert hdfs.info(path)['size'] > 0

        eager = yield _read_avro('/tmp/test/*.avro', lazy=False)
        assert isinstance(eager, list)
        assert all(isinstance(item, Future) for item in eager)

        results = yield executor._gather(eager)
        assert all(isinstance(records, list) for records in results)
        assert results[0][:5] == data[:5]
        assert results[-1][-5:] == data[-5:]

        lazy_values = yield _read_avro('/tmp/test/*.avro', lazy=True)
        assert isinstance(lazy_values, list)
        assert all(isinstance(item, Value) for item in lazy_values)

    yield executor._shutdown()
def client_context(dask_client=None, dask_scheduler=None):
    '''client_context yields a dask distributed or threadpool client or None

    Parameters:
        dask_client: str from choices ("DISTRIBUTED", 'THREAD_POOL', 'SERIAL')
                     or None to take DASK_CLIENT from the parsed environment
                     (defaulting to 'SERIAL')
        dask_scheduler: Distributed scheduler url or None to take
                     DASK_SCHEDULER from the parsed environment
    Yields:
        An Executor for "DISTRIBUTED", a ThreadPool for 'THREAD_POOL',
        or None for 'SERIAL', inside a ``da.set_options`` block
    Raises:
        ValueError: if "DISTRIBUTED" is requested while Executor is None,
                    or if the client name is not one of the choices
    '''
    env = parse_env_vars()
    # Explicit arguments win; otherwise fall back to the parsed settings.
    if not dask_client:
        dask_client = env.get('DASK_CLIENT', 'SERIAL')
    if not dask_scheduler:
        dask_scheduler = env.get('DASK_SCHEDULER')

    if dask_client == 'SERIAL':
        client = None
    elif dask_client == 'THREAD_POOL':
        client = ThreadPool(env.get('DASK_THREADS'))
    elif dask_client == 'DISTRIBUTED':
        if Executor is None:
            raise ValueError('distributed is not installed - "conda install distributed"')
        client = Executor(dask_scheduler)
    else:
        raise ValueError('Did not expect DASK_CLIENT to be {}'.format(dask_client))

    # NOTE(review): the result is not used below; the call is kept in case
    # the lookup has side effects or raises for an unsupported client.
    get_func = _find_get_func_for_client(client)
    # NOTE(review): ``pool`` receives the client *name* string, not the
    # client object - confirm whether ``client`` was intended here.
    with da.set_options(pool=dask_client):
        yield client
def test_stress_gc(loop, func, n):
    """Chain ``n + 1`` submissions, dropping each intermediate future.

    Every new task depends on the previous future, whose only reference is
    then overwritten.  The final result is expected to be ``n + 2``.
    """
    with cluster() as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            future = executor.submit(func, 1)
            for _ in range(n):
                # Rebinding releases the previous link in the chain.
                future = executor.submit(func, future)

            assert future.result() == n + 2
def test_read_text_bucket_key_inputs(loop):
    """``read_text`` should accept bucket and key in several spellings.

    Bucket plus key with a leading slash, bucket plus key without it, and a
    single combined path must all produce the same keys.
    """
    with cluster() as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            slash_key = read_text(test_bucket_name, '/text/accounts',
                                  lazy=True)
            plain_key = read_text(test_bucket_name, 'text/accounts',
                                  lazy=True)
            combined = read_text(test_bucket_name + '/text/accounts',
                                 lazy=True)

            assert slash_key._keys() == plain_key._keys() == combined._keys()
def test_submit_after_failed_worker(loop):
    """Submitting on existing futures should work after a worker dies.

    Ten ``inc`` results are computed, one worker process is terminated, and
    a ``sum`` over the futures must still return the correct total.
    """
    with cluster() as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            futures = executor.map(inc, range(10))
            wait(futures)

            # Kill the first worker once all results are in memory.
            a['proc'].terminate()

            total = executor.submit(sum, futures)
            expected = sum(map(inc, range(10)))
            assert total.result() == expected
def test_no_divisions(s, a, b):
    """A dask DataFrame built without ``divisions`` has unknown divisions.

    Five time-indexed frames are created remotely and wrapped with
    ``_futures_to_dask_dataframe``; the result must report unknown
    divisions while keeping the columns of the underlying frames.
    """
    e = Executor((s.ip, s.port), start=False)
    yield e._start()
    dfs = e.map(tm.makeTimeDataFrame, range(5, 10))

    df = yield _futures_to_dask_dataframe(dfs)
    assert not df.known_divisions
    assert list(df.columns) == list(tm.makeTimeDataFrame(5).columns)

    # Release the executor, as the other coroutine tests in this file do.
    yield e._shutdown()
def test_progress_function(loop, capsys):
    """``progress`` should accept nested lists of futures and finish the bar."""
    with cluster() as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            first = executor.submit(lambda: 1)
            second = executor.submit(lambda: 2)

            # Futures are deliberately nested at different depths.
            nested = [[first], [[second]]]
            progress(nested, notebook=False)
            check_bar_completed(capsys)
def test_gather_after_failed_worker(loop):
    """Gathering should still return every result after a worker dies.

    Ten ``inc`` results are computed, one worker process is terminated, and
    ``gather`` must return the full list of values.
    """
    with cluster() as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            futures = executor.map(inc, range(10))
            wait(futures)

            # Kill the first worker once all results are in memory.
            a['proc'].terminate()

            expected = list(map(inc, range(10)))
            result = executor.gather(futures)
            assert result == expected
def test_Executor_with_local(loop):
    """An Executor should accept a LocalCluster object as its address.

    The executor must see as many workers as the cluster holds, and its
    repr must include the cluster's scheduler address.
    """
    with LocalCluster(1, scheduler_port=0, silence_logs=False,
                      diagnostic_port=None, loop=loop) as local:
        with Executor(local, loop=loop) as executor:
            worker_count = len(local.workers)
            assert len(executor.ncores()) == worker_count
            assert local.scheduler_address in repr(executor)
def test_futures_to_dask_dataframe(loop):
    """The synchronous helper should wrap remote frames in a dask DataFrame.

    The sum of column ``x`` computed through the executor must equal the
    sum over the local frames in ``dfs``.
    """
    with cluster() as (c, [a, b]):
        address = ('127.0.0.1', c['port'])
        with Executor(address, loop=loop) as executor:
            remote_dfs = executor.map(lambda x: x, dfs)
            ddf = futures_to_dask_dataframe(remote_dfs, divisions=True)

            assert isinstance(ddf, dd.DataFrame)
            expected = sum([df.x.sum() for df in dfs])
            assert ddf.x.sum().compute(get=executor.get) == expected
def test_read_text_sync(loop):
    """``read_text`` on HDFS should return the lines of the matching file.

    A two-line text file is written to HDFS and read back eagerly; the
    upper-cased result must be ``['HELLO', 'WORLD']``.
    """
    with make_hdfs() as hdfs:
        with hdfs.open('/tmp/test/data.txt', 'wb') as f:
            f.write(b'hello\nworld')

        with cluster(nworkers=3) as (s, [a, b, c]):
            address = ('127.0.0.1', s['port'])
            with Executor(address, loop=loop) as executor:
                lines = read_text('/tmp/test/*.txt', lazy=False)
                upper = list(lines.str.upper())
                assert upper == ['HELLO', 'WORLD']
def test_restart_sync_no_center(loop):
    """``restart`` should cancel old futures and leave a usable cluster.

    After restarting, the earlier future must be cancelled, a new
    submission must complete, and both workers must be present.
    """
    with cluster(nanny=True) as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            before = executor.submit(inc, 1)
            executor.restart()
            assert before.cancelled()

            after = executor.submit(inc, 2)
            assert after.result() == 3
            assert len(executor.ncores()) == 2
def test_futures_to_dask_arrays(loop):
    """Futures holding numpy arrays should convert to dask arrays.

    Checks the single-future helper for shape and contents, and the
    multi-future helper for the shapes of every returned array.
    """
    shapes = [(5, i) for i in range(1, 6)]

    with cluster() as (c, [a, b]):
        address = ('127.0.0.1', c['port'])
        with Executor(address, loop=loop) as executor:
            futures = executor.map(np.ones, shapes)

            single = future_to_dask_array(futures[0])
            assert single.shape == (5, 1)
            assert (single.compute(get=executor.get) == 1).all()

            arrays = futures_to_dask_arrays(futures)
            assert [arr.shape for arr in arrays] == shapes
def test_read_text_sync(loop):
    """``read_text`` with ``collection=True`` should return a dask Bag.

    The bag read from the test bucket is filtered, parsed as JSON and the
    ``amount`` fields summed through the executor.
    """
    import dask.bag as db

    with cluster() as (s, [a, b]):
        address = ('127.0.0.1', s['port'])
        with Executor(address, loop=loop) as executor:
            bag = read_text(test_bucket_name+'/test/accounts*', lazy=True,
                            collection=True)
            assert isinstance(bag, db.Bag)

            # ``filter(None)`` drops falsy entries before JSON parsing.
            records = bag.filter(None).map(json.loads)
            total = records.pluck('amount').sum()
            result = total.compute(get=executor.get)

            assert result == (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8) * 100
def test_defaults():
    """``dscheduler --no-bokeh`` should report a running status over HTTP.

    Starts the scheduler subprocess, connects an executor on the default
    scheduler port and checks ``/info.json`` on port 9786.
    """
    # Started outside the ``try`` so cleanup never touches an unbound name
    # (which would raise NameError and hide the real failure).
    proc = Popen(['dscheduler', '--no-bokeh'], stdout=PIPE, stderr=PIPE)
    try:
        e = Executor('127.0.0.1:%d' % Scheduler.default_port)
        try:
            response = requests.get('http://127.0.0.1:9786/info.json')
            assert response.ok
            assert response.json()['status'] == 'running'
        finally:
            e.shutdown()
    finally:
        # Always stop the scheduler, even if the executor could not be
        # created or failed to shut down.
        os.kill(proc.pid, signal.SIGINT)
def test_futures_to_dask_bag(loop):
    """The synchronous helper should wrap scattered lists in a dask Bag.

    Mapping ``x + 1`` over the bag and summing through the executor must
    equal ``sum(range(2, 11))``.
    """
    import dask.bag as db

    with cluster() as (c, [a, b]):
        address = ('127.0.0.1', c['port'])
        with Executor(address, loop=loop) as executor:
            data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
            futures = executor.scatter(data)
            bag = futures_to_dask_bag(futures)

            assert isinstance(bag, db.Bag)
            incremented_total = bag.map(lambda x: x + 1).sum()
            result = incremented_total.compute(get=executor.get)
            assert result == sum(range(2, 11))
def test_multiple_executors_restart(s, a, b):
    """Restarting through one executor should cancel futures on every one.

    Two executors share the same scheduler; after the first restarts the
    cluster, futures held by both executors must be cancelled.
    """
    address = (s.ip, s.port)

    first = Executor(address, start=False)
    yield first._start()
    second = Executor(address, start=False)
    yield second._start()

    x = first.submit(inc, 1)
    y = second.submit(inc, 2)
    x_value = yield x._result()
    y_value = yield y._result()
    assert x_value == 2
    assert y_value == 3

    # Only the first executor asks for the restart.
    yield first._restart()

    assert x.cancelled()
    assert y.cancelled()

    yield first._shutdown(fast=True)
    yield second._shutdown(fast=True)