def test_auto_scatter(loop):  # noqa: F811
    np = pytest.importorskip('numpy')
    data = np.ones(int(1e7), dtype=np.uint8)

    def count_events(event_name, client):
        worker_events = client.run(lambda dask_worker: dask_worker.log)
        event_counts = {}
        for w, events in worker_events.items():
            event_counts[w] = len([event for event in list(events)
                                   if event[1] == event_name])
        return event_counts

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with parallel_backend('dask') as (ba, _):
                # Passing the same data as arg and kwarg triggers a single
                # scatter operation whose result is reused.
                Parallel()(delayed(noop)(data, data, i, opt=data)
                           for i in range(5))
            # By default, large arrays are automatically scattered with
            # broadcast=1, which means that exactly one worker must receive
            # the data directly from the scatter operation.
            counts = count_events('receive-from-scatter', client)
            assert counts[a['address']] + counts[b['address']] == 1

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with parallel_backend('dask') as (ba, _):
                Parallel()(delayed(noop)(data[:3], i) for i in range(5))
            # Small arrays are passed within the task definition without going
            # through a scatter operation.
            counts = count_events('receive-from-scatter', client)
            assert counts[a['address']] == 0
            assert counts[b['address']] == 0
def test_local_client(loop):
    def produce(n):
        with local_client() as c:
            x = c.channel('x')
            for i in range(n):
                future = c.submit(slowinc, i, delay=0.01, key='f-%d' % i)
                x.append(future)
            x.flush()

    def consume():
        with local_client() as c:
            x = c.channel('x')
            y = c.channel('y')
            last = 0
            for i, future in enumerate(x):
                last = c.submit(add, future, last, key='add-' + future.key)
                y.append(last)

    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.channel('x')
            y = c.channel('y')

            producers = (c.submit(produce, 5), c.submit(produce, 10))
            consumer = c.submit(consume)

            results = []
            for i, future in enumerate(take(15, y)):
                result = future.result()
                results.append(result)

            assert len(results) == 15
            assert all(0 < r < 100 for r in results)
def test_stop(loop):
    def produce(n):
        with local_client() as c:
            x = c.channel('x')
            for i in range(n):
                future = c.submit(slowinc, i, delay=0.01, key='f-%d' % i)
                x.append(future)
            x.stop()
            x.flush()

    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.channel('x')
            producer = c.submit(produce, 5)

            futures = list(x)
            assert len(futures) == 5

            with pytest.raises(StopIteration):
                x.append(c.submit(inc, 1))

            with Client(('127.0.0.1', s['port']), loop=loop) as c2:
                xx = c2.channel('x')
                futures = list(xx)
                assert len(futures) == 5
def test_nested_backend_context_manager(loop):  # noqa: F811
    def get_nested_pids():
        pids = set(Parallel(n_jobs=2)(delayed(os.getpid)() for _ in range(2)))
        pids |= set(Parallel(n_jobs=2)(delayed(os.getpid)() for _ in range(2)))
        return pids

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with parallel_backend('dask') as (ba, _):
                pid_groups = Parallel(n_jobs=2)(
                    delayed(get_nested_pids)()
                    for _ in range(10)
                )
                for pid_group in pid_groups:
                    assert len(set(pid_group)) <= 2

        # No deadlocks
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            with parallel_backend('dask') as (ba, _):
                pid_groups = Parallel(n_jobs=2)(
                    delayed(get_nested_pids)()
                    for _ in range(10)
                )
                for pid_group in pid_groups:
                    assert len(set(pid_group)) <= 2
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df["A"] = df.A // 0.1
    df["B"] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)

    with cluster() as (c, [a, b]):
        with Client(("127.0.0.1", c["port"]), loop=loop) as c:
            with dask.set_options(get=c.get):
                for ind in [lambda x: "A", lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any("partd" in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any("partd" in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[["A", "B"]]).apply(len)

                a = df.groupby(["A", "B"]).apply(len)
                b = ddf.groupby(["A", "B"]).apply(len)
                assert_equal(a, b.compute(get=dask.get).sort_index())
def test_dont_assume_function_purity(loop):
    with cluster() as (s, [a, b]):
        with parallel_backend('distributed', loop=loop,
                              scheduler_host=('127.0.0.1', s['port'])):
            x, y = Parallel()(delayed(random2)() for i in range(2))
            assert x != y
def test_dataframe_groupby_tasks(loop):
    df = pd.util.testing.makeTimeDataFrame()
    df['A'] = df.A // 0.1
    df['B'] = df.B // 0.1
    ddf = dd.from_pandas(df, npartitions=10)

    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            with dask.set_options(get=e.get):
                for ind in [lambda x: 'A', lambda x: x.A]:
                    a = df.groupby(ind(df)).apply(len)
                    b = ddf.groupby(ind(ddf)).apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                    a = df.groupby(ind(df)).B.apply(len)
                    b = ddf.groupby(ind(ddf)).B.apply(len)
                    assert_equal(a, b.compute(get=dask.get).sort_index())
                    assert not any('partd' in k[0] for k in b.dask)

                with pytest.raises(NotImplementedError):
                    ddf.groupby(ddf[['A', 'B']]).apply(len)

                a = df.groupby(['A', 'B']).apply(len)
                b = ddf.groupby(['A', 'B']).apply(len)
                assert_equal(a, b.compute(get=dask.get).sort_index())
def test_dataframes(loop):
    dfs = [pd.DataFrame({'x': np.random.random(100),
                         'y': np.random.random(100)},
                        index=list(range(i, i + 100)))
           for i in range(0, 100 * 10, 100)]

    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            remote_dfs = e.map(lambda x: x, dfs)
            rdf = futures_to_dask_dataframe(remote_dfs, divisions=True)
            name = 'foo'
            ldf = dd.DataFrame({(name, i): df for i, df in enumerate(dfs)},
                               name, dfs[0].columns,
                               list(range(0, 1000, 100)) + [999])

            assert rdf.divisions == ldf.divisions

            tm.assert_frame_equal(rdf.compute(get=e.get),
                                  ldf.compute(get=dask.get))

            exprs = [lambda df: df.x.mean(),
                     lambda df: df.y.std(),
                     lambda df: df.assign(z=df.x + df.y).drop_duplicates(),
                     lambda df: df.index,
                     lambda df: df.x,
                     lambda df: df.x.cumsum(),
                     lambda df: df.loc[50:75]]
            for f in exprs:
                local = f(ldf).compute(get=dask.get)
                remote = f(rdf).compute(get=e.get)
                assert_equal(local, remote)
def test_dask_persisted_entityset(entityset, capsys):
    times = list([datetime(2011, 4, 9, 10, 30, i * 6) for i in range(5)] +
                 [datetime(2011, 4, 9, 10, 31, i * 9) for i in range(4)] +
                 [datetime(2011, 4, 9, 10, 40, 0)] +
                 [datetime(2011, 4, 10, 10, 40, i) for i in range(2)] +
                 [datetime(2011, 4, 10, 10, 41, i * 3) for i in range(3)] +
                 [datetime(2011, 4, 10, 11, 10, i * 3) for i in range(2)])
    labels = [False] * 3 + [True] * 2 + [False] * 9 + [True] + [False] * 2
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': range(17)})
    property_feature = IdentityFeature(entityset['log']['value']) > 10

    with cluster() as (scheduler, [a, b]):
        dkwargs = {'cluster': scheduler['address']}
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset=entityset,
                                                  cutoff_time=cutoff_time,
                                                  verbose=True,
                                                  chunk_size=.13,
                                                  dask_kwargs=dkwargs,
                                                  approximate='1 hour')
        assert (feature_matrix == labels).values.all()

        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset=entityset,
                                                  cutoff_time=cutoff_time,
                                                  verbose=True,
                                                  chunk_size=.13,
                                                  dask_kwargs=dkwargs,
                                                  approximate='1 hour')
        captured = capsys.readouterr()
        assert "Using EntitySet persisted on the cluster as dataset " in captured[0]
        assert (feature_matrix == labels).values.all()
def test_scheduler_param_distributed(loop):
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop, set_as_default=False) as client:
            gs = dcv.GridSearchCV(MockClassifier(),
                                  {'foo_param': [0, 1, 2]},
                                  cv=3,
                                  scheduler=client)
            gs.fit(X, y)
def test_directed_scatter_sync(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            e.scatter([1, 2, 3], workers=[('127.0.0.1', b['port'])])
            has_what = sync(e.loop, e.center.has_what)
            assert len(has_what[('127.0.0.1', b['port'])]) == 3
            assert len(has_what[('127.0.0.1', a['port'])]) == 0
def test_directed_scatter_sync(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            futures = e.scatter([1, 2, 3], workers=[('127.0.0.1', b['port'])])
            has_what = sync(loop, e.scheduler.has_what)
            assert len(has_what[('127.0.0.1', b['port'])]) == 3
            assert len(has_what[('127.0.0.1', a['port'])]) == 0
def test_dask_distributed_netcdf_roundtrip(
        loop, tmp_netcdf_filename, engine, nc_format):

    if engine not in ENGINES:
        pytest.skip('engine not available')

    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 6}

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:

            original = create_test_data().chunk(chunks)

            if engine == 'scipy':
                with pytest.raises(NotImplementedError):
                    original.to_netcdf(tmp_netcdf_filename,
                                       engine=engine, format=nc_format)
                return

            original.to_netcdf(tmp_netcdf_filename,
                               engine=engine, format=nc_format)

            with xr.open_dataset(tmp_netcdf_filename,
                                 chunks=chunks, engine=engine) as restored:
                assert isinstance(restored.var1.data, da.Array)
                computed = restored.compute()
                assert_allclose(original, computed)
def test_manual_scatter(loop):  # noqa: F811
    x = CountSerialized(1)
    y = CountSerialized(2)
    z = CountSerialized(3)

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:  # noqa: F841
            with parallel_backend('dask', scatter=[x, y]) as (ba, _):
                f = delayed(add5)
                tasks = [f(x, y, z, d=4, e=5),
                         f(x, z, y, d=5, e=4),
                         f(y, x, z, d=x, e=5),
                         f(z, z, x, d=z, e=y)]
                expected = [func(*args, **kwargs)
                            for func, args, kwargs in tasks]
                results = Parallel()(tasks)

            # Scatter must take a list/tuple
            with pytest.raises(TypeError):
                with parallel_backend('dask', loop=loop, scatter=1):
                    pass

            assert results == expected

            # Scattered variables only serialized once
            assert x.count == 1
            assert y.count == 1
            assert z.count == 4
def test_read_csv_sync(loop):
    import dask.dataframe as dd
    import pandas as pd

    with cluster(nworkers=3) as (s, [a, b, c]):
        with make_hdfs() as (hdfs, basedir):
            with hdfs.open('%s/1.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('%s/2.csv' % basedir, 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Client(('127.0.0.1', s['port']), loop=loop) as e:
                values = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                     lineterminator='\n',
                                     collection=False, header=0)
                futures = e.compute(values)
                assert all(isinstance(f, Future) for f in futures)

                L = e.gather(futures)
                assert isinstance(L[0], pd.DataFrame)
                assert list(L[0].columns) == ['name', 'amount', 'id']

                df = dd.read_csv('hdfs://%s/*.csv' % basedir,
                                 lineterminator='\n',
                                 collection=True, header=0)
                assert isinstance(df, dd.DataFrame)
                assert list(df.head().iloc[0]) == ['Alice', 100, 1]
def test_dask_distributed_read_netcdf_integration_test(loop, engine, autoclose,
                                                       nc_format):

    if engine == 'h5netcdf' and autoclose:
        pytest.skip('h5netcdf does not support autoclose')

    if nc_format not in NC_FORMATS[engine]:
        pytest.skip('invalid format for engine')

    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 6}

    with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as filename:
        with cluster() as (s, [a, b]):
            with Client(s['address'], loop=loop) as c:

                original = create_test_data()
                original.to_netcdf(filename, engine=engine, format=nc_format)

                with xr.open_dataset(filename,
                                     chunks=chunks,
                                     engine=engine,
                                     autoclose=autoclose) as restored:
                    assert isinstance(restored.var1.data, da.Array)
                    computed = restored.compute()
                    assert_allclose(original, computed)
def test_gather_after_failed_worker(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
def test_submit_after_failed_worker_sync(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
def test_gather_after_failed_worker():
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port'])) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = e.gather(L)
            assert result == list(map(inc, range(10)))
def test_submit_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
def test_gather_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
def test_futures_to_delayed_bag(loop):
    db = pytest.importorskip('dask.bag')
    L = [1, 2, 3]

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:  # flake8: noqa
            futures = c.scatter([L, L])
            b = db.from_delayed(futures)
            assert list(b) == L + L
def test_fast(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            L = e.map(inc, range(100))
            L2 = e.map(dec, L)
            L3 = e.map(add, L, L2)
            p = progress(L3, multi=True, complete=True, notebook=True)
            assert set(p.all_keys) == {'inc', 'dec', 'add'}
def test_stress_gc(loop, func, n):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            x = c.submit(func, 1)
            for i in range(n):
                x = c.submit(func, x)

            assert x.result() == n + 2
def test_stress_gc():
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), delete_batch_time=0.5) as e:
            x = e.submit(slowinc, 1)
            for i in range(20):  # this could be increased
                x = e.submit(slowinc, x)

            assert x.result() == 22
def test_progress_function(loop, capsys):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            f = e.submit(lambda: 1)
            g = e.submit(lambda: 2)

            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)
def test_Future_exception_sync(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            x = e.submit(div, 1, 0)
            assert isinstance(x.exception(), ZeroDivisionError)

            x = e.submit(div, 1, 1)
            assert x.exception() is None
def test_submit_after_failed_worker():
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port'])) as e:
            L = e.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = e.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
def test_dask_distributed_rasterio_integration_test(loop):
    with create_tmp_geotiff() as (tmp_file, expected):
        with cluster() as (s, [a, b]):
            with Client(s['address'], loop=loop) as c:
                da_tiff = xr.open_rasterio(tmp_file, chunks={'band': 1})
                assert isinstance(da_tiff.data, da.Array)
                actual = da_tiff.compute()
                assert_allclose(actual, expected)
def test_read_text_bucket_key_inputs(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            a = read_text(test_bucket_name, '/text/accounts', lazy=True)
            b = read_text(test_bucket_name, 'text/accounts', lazy=True)
            c = read_text(test_bucket_name + '/text/accounts', lazy=True)

            assert a._keys() == b._keys() == c._keys()
def test_get_worker_name(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            def f():
                get_client().submit(inc, 1).result()

            c.run(f)

            def func(dask_scheduler):
                return list(dask_scheduler.clients)

            start = time()
            while not any('worker' in n for n in c.run_on_scheduler(func)):
                sleep(0.1)
                assert time() < start + 10
def test_as_completed_repeats(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            ac = as_completed()
            x = c.submit(inc, 1)
            ac.add(x)
            ac.add(x)

            assert next(ac) is x
            assert next(ac) is x

            with pytest.raises(StopIteration):
                next(ac)

            ac.add(x)
            assert next(ac) is x
def test_traceback_sync(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(div, 1, 0)
            tb = x.traceback()
            if sys.version_info[0] >= 3:
                assert any('x / y' in line for line in tb)

            y = e.submit(inc, x)
            tb2 = y.traceback()
            assert set(tb2).issuperset(set(tb))

            z = e.submit(div, 1, 2)
            tb = z.traceback()
            assert tb is None
def test_regressor(xgboost_loop):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=xgboost_loop):
            a = dxgb.XGBRegressor()
            X2 = da.from_array(X, 5)
            y2 = da.from_array(y, 5)
            weight1 = da.from_array(weight, 5)
            a.fit(X2, y2, sample_weight=weight1)
            p1 = a.predict(X2)

    b = xgb.XGBRegressor()
    b.fit(X, y, sample_weight=weight)

    np.testing.assert_array_almost_equal(a.feature_importances_,
                                         b.feature_importances_)
    assert_eq(p1, b.predict(X))
def test_avro_sync(loop):
    avro_files = {'/tmp/test/1.avro': avro_bytes,
                  '/tmp/test/2.avro': avro_bytes}

    with make_hdfs() as hdfs:
        for k, v in avro_files.items():
            with hdfs.open(k, 'wb') as f:
                f.write(v)

        with cluster(nworkers=1) as (s, [a]):
            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                futures = read_avro('/tmp/test/*.avro', lazy=False)
                assert all(isinstance(f, Future) for f in futures)

                L = e.gather(futures)
                assert L[0][:5] == data[:5]
def test_classifier_local_predict(loop, listen_port):  # noqa
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            X, y, w, dX, dy, dw = _create_data('classification', output="array")

            a = dlgbm.LGBMClassifier(local_listen_port=listen_port)
            a = a.fit(dX, dy, sample_weight=dw)
            p1 = a.to_local().predict(dX)

            b = lightgbm.LGBMClassifier()
            b.fit(X, y, sample_weight=w)
            p2 = b.predict(X)

            assert_eq(p1, p2)
            assert_eq(y, p1)
            assert_eq(y, p2)
def test_gather_then_submit_after_failed_workers(loop):
    with cluster(nworkers=4) as (s, [w, x, y, z]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(20))
            wait(L)
            w['proc'].terminate()

            total = e.submit(sum, L)
            wait([total])

            (_, port) = first(e.scheduler.who_has[total.key])
            for d in [x, y, z]:
                if d['port'] == port:
                    d['proc'].terminate()

            result = e.gather([total])
            assert result == [sum(map(inc, range(20)))]
def test_futures_to_dask_array(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            remote_arrays = [[c.submit(np.full, (3, 3), i + j)
                              for i in range(3)]
                             for j in range(3)]

            x = futures_to_dask_array(remote_arrays, client=c)
            assert x.chunks == ((3, 3, 3), (3, 3, 3))
            assert x.dtype == np.full((), 0).dtype

            assert x.sum().compute(get=c.get) == 162
            assert (x + x.T).sum().compute(get=c.get) == 162 * 2

            y = futures_to_collection(remote_arrays, client=c)
            assert x.dask == y.dask
def test_futures_to_dask_array(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port']), loop=loop) as e:
            remote_arrays = [[e.submit(np.full, (3, 3), i + j)
                              for i in range(3)]
                             for j in range(3)]

            x = futures_to_dask_array(remote_arrays, executor=e)
            assert x.chunks == ((3, 3, 3), (3, 3, 3))
            assert x.dtype == np.full((), 0).dtype

            assert x.sum().compute(get=e.get) == 162
            assert (x + x.T).sum().compute(get=e.get) == 162 * 2

            y = futures_to_collection(remote_arrays, executor=e)
            assert x.dask == y.dask
def test_start_ipython_scheduler_magic(loop, zmq_ctx):
    with cluster(1) as (s, [a]):
        with Client(s["address"], loop=loop) as e, mock_ipython() as ip:
            info = e.start_ipython_scheduler()

            expected = [
                {"magic_kind": "line", "magic_name": "scheduler"},
                {"magic_kind": "cell", "magic_name": "scheduler"},
            ]
            call_kwargs_list = [
                kwargs for (args, kwargs) in ip.register_magic_function.call_args_list
            ]
            assert call_kwargs_list == expected

            magic = ip.register_magic_function.call_args_list[0][0][0]
            magic(line="", cell="scheduler")
def test_read_text_bucket_key_inputs(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            a = read_text(test_bucket_name, '/test/accounts*', lazy=True,
                          anon=True)
            b = read_text(test_bucket_name, 'test/accounts*', lazy=True,
                          anon=True)
            c = read_text(test_bucket_name + '/test/accounts*', lazy=True,
                          anon=True)

            assert a._keys() == b._keys() == c._keys()
def test_start_ipython_remote(loop, zmq_ctx):
    from distributed._ipython_utils import remote_magic

    with cluster(1) as (s, [a]):
        with Client(s["address"], loop=loop) as e, mock_ipython() as ip:
            worker = first(e.nthreads())
            ip.user_ns["info"] = e.start_ipython_workers(worker)[worker]
            remote_magic("info 1")  # line magic
            remote_magic("info", "worker")  # cell magic

            expected = [
                ((remote_magic,), {"magic_kind": "line", "magic_name": "remote"}),
                ((remote_magic,), {"magic_kind": "cell", "magic_name": "remote"}),
            ]
            assert ip.register_magic_function.call_args_list == expected
            assert ip.register_magic_function.call_count == 2
def test_start_ipython_workers(loop, zmq_ctx):
    from jupyter_client import BlockingKernelClient

    with cluster(1) as (s, [a]):
        with Client(s["address"], loop=loop) as e:
            info_dict = e.start_ipython_workers()
            info = first(info_dict.values())
            kc = BlockingKernelClient()
            kc.load_connection_info(info)
            kc.start_channels()
            kc.wait_for_ready(timeout=10)
            msg_id = kc.execute("worker")
            reply = kc.get_shell_msg(timeout=10)
            assert reply["parent_header"]["msg_id"] == msg_id
            assert reply["content"]["status"] == "ok"
            kc.stop_channels()
def test_occupancy(loop):
    with cluster(nanny=True) as (s, [a, b]):
        rm = Occupancy(('127.0.0.1', s['port']), interval=0.01)
        for k in ['host', 'processing', 'waiting']:
            assert k in rm.cds.data

        start = time()
        while not rm.cds.data['host']:
            loop.run_sync(lambda: gen.sleep(0.05))
            assert time() < start + 2

        assert (len(rm.cds.data['host']) ==
                len(rm.cds.data['processing']) ==
                len(rm.cds.data['waiting']) == 2)

        assert isinstance(rm.figure, Figure)
        rm.stream.close()
def test_start_ipython_workers_magic(loop, zmq_ctx):
    with cluster(2) as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e, \
                mock_ipython() as ip:
            workers = list(e.ncores())[:2]
            names = ['magic%i' % i for i in range(len(workers))]
            info_dict = e.start_ipython_workers(workers, magic_names=names)

            expected = [
                {'magic_kind': 'line', 'magic_name': 'remote'},
                {'magic_kind': 'cell', 'magic_name': 'remote'},
                {'magic_kind': 'line', 'magic_name': 'magic0'},
                {'magic_kind': 'cell', 'magic_name': 'magic0'},
                {'magic_kind': 'line', 'magic_name': 'magic1'},
                {'magic_kind': 'cell', 'magic_name': 'magic1'},
            ]
            call_kwargs_list = [
                kwargs for (args, kwargs)
                in ip.register_magic_function.call_args_list
            ]
            assert call_kwargs_list == expected
            assert ip.register_magic_function.call_count == 6

            magics = [
                args[0][0]
                for args in ip.register_magic_function.call_args_list[2:]
            ]
            magics[-1](line="", cell="worker")
            [m.client.stop_channels() for m in magics]
def test_start_ipython_workers(loop, zmq_ctx):
    from jupyter_client import BlockingKernelClient

    with cluster(1) as (s, [a]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            info_dict = e.start_ipython_workers()
            info = first(info_dict.values())
            key = info.pop('key')
            kc = BlockingKernelClient(**info)
            kc.session.key = key
            kc.start_channels()
            kc.wait_for_ready(timeout=10)
            msg_id = kc.execute("worker")
            reply = kc.get_shell_msg(timeout=10)
            assert reply['parent_header']['msg_id'] == msg_id
            assert reply['content']['status'] == 'ok'
            kc.stop_channels()
def test_distributed_persist(loop, dask_array):
    """Test persist() for distributed machines."""
    q = ureg.Quantity(dask_array, units_)

    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop):
            comps = add_five(q)
            persisted_q = comps.persist()

            comps_truth = dask_array + 5
            persisted_truth = comps_truth.persist()

            assert np.all(persisted_q.m == persisted_truth)
            assert dask.is_dask_collection(persisted_q)
            assert persisted_q.units == units_

            assert q.magnitude is dask_array
def test_classifier_proba(loop, output, listen_port, centers):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            X, y, w, dX, dy, dw = _create_data('classification',
                                               output=output,
                                               centers=centers)

            a = dlgbm.LGBMClassifier(local_listen_port=listen_port)
            a = a.fit(dX, dy, sample_weight=dw)
            p1 = a.predict_proba(dX, client=client)
            p1 = p1.compute()

            b = lightgbm.LGBMClassifier()
            b.fit(X, y, sample_weight=w)
            p2 = b.predict_proba(X)

            assert_eq(p1, p2, atol=0.3)
def test_restart_sync(loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            x = c.submit(div, 1, 2)
            x.result()

            assert sync(loop, c.scheduler.who_has)
            c.restart()
            assert not sync(loop, c.scheduler.who_has)
            assert x.cancelled()
            assert len(c.nthreads()) == 2

            with pytest.raises(CancelledError):
                x.result()

            y = c.submit(div, 1, 3)
            assert y.result() == 1 / 3
def test_start_ipython_workers_magic(loop, zmq_ctx):
    with cluster(2) as (s, [a, b]):
        with Client(s["address"], loop=loop) as e, mock_ipython() as ip:
            workers = list(e.nthreads())[:2]
            names = ["magic%i" % i for i in range(len(workers))]
            info_dict = e.start_ipython_workers(workers, magic_names=names)

            expected = [
                {"magic_kind": "line", "magic_name": "remote"},
                {"magic_kind": "cell", "magic_name": "remote"},
                {"magic_kind": "line", "magic_name": "magic0"},
                {"magic_kind": "cell", "magic_name": "magic0"},
                {"magic_kind": "line", "magic_name": "magic1"},
                {"magic_kind": "cell", "magic_name": "magic1"},
            ]
            call_kwargs_list = [
                kwargs for (args, kwargs)
                in ip.register_magic_function.call_args_list
            ]
            assert call_kwargs_list == expected
            assert ip.register_magic_function.call_count == 6

            magics = [
                args[0][0]
                for args in ip.register_magic_function.call_args_list[2:]
            ]
            magics[-1](line="", cell="worker")
            [m.client.stop_channels() for m in magics]
def test_futures_to_delayed_array(loop):
    da = pytest.importorskip("dask.array")
    from dask.array.utils import assert_eq

    np = pytest.importorskip("numpy")
    x = np.arange(5)

    with cluster() as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            futures = c.scatter([x, x])
            A = da.concatenate(
                [da.from_delayed(f, shape=x.shape, dtype=x.dtype)
                 for f in futures],
                axis=0,
            )
            assert_eq(A.compute(), np.concatenate([x, x], axis=0))
def test_values(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = [c.submit(inc, i) for i in range(5)]
            wait(L)
            p = MultiProgressWidget(L)
            sync(loop, p.listen)
            assert set(p.bars) == {'inc'}
            assert p.status == 'finished'
            assert p.comm.closed()
            assert '5 / 5' in p.bar_texts['inc'].value
            assert p.bars['inc'].value == 1.0

            x = c.submit(throws, 1)
            p = MultiProgressWidget([x])
            sync(loop, p.listen)
            assert p.status == 'error'
def test_regressor_local_predict(loop, listen_port):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop):
            X, y, w, dX, dy, dw = _create_data('regression', output="array")

            a = dlgbm.LGBMRegressor(local_listen_port=listen_port, seed=42)
            a = a.fit(dX, dy, sample_weight=dw)
            p1 = a.predict(dX)
            p2 = a.to_local().predict(X)
            s1 = r2_score(dy, p1)
            p1 = p1.compute()
            s2 = a.to_local().score(X, y)

            # Predictions and scores should be the same
            assert_eq(p1, p2)
            assert np.isclose(s1, s2)
def test_simple(loop, joblib):
    if joblib is None:
        pytest.skip()
    Parallel = joblib.Parallel
    delayed = joblib.delayed

    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as client:
            with joblib.parallel_backend('dask') as (ba, _):
                seq = Parallel()(delayed(inc)(i) for i in range(10))
                assert seq == [inc(i) for i in range(10)]

                with pytest.raises(ValueError):
                    Parallel()(delayed(slow_raise_value_error)(i == 3)
                               for i in range(10))

                seq = Parallel()(delayed(inc)(i) for i in range(10))
                assert seq == [inc(i) for i in range(10)]
def test_read_csv_sync_compute(loop):
    import dask.dataframe as dd
    import pandas as pd

    with cluster(nworkers=1) as (s, [a]):
        with make_hdfs() as hdfs:
            with hdfs.open('/tmp/test/1.csv', 'wb') as f:
                f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

            with hdfs.open('/tmp/test/2.csv', 'wb') as f:
                f.write(b'name,amount,id\nCharlie,300,3\nDennis,400,4')

            with Executor(('127.0.0.1', s['port']), loop=loop) as e:
                for lazy in [True, False]:
                    df = read_csv('/tmp/test/*.csv', collection=True,
                                  lazy=lazy)
                    assert df.amount.sum().compute(get=e.get) == 1000
def test_values(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = [e.submit(inc, i) for i in range(5)]
            wait(L)
            p = MultiProgressWidget(L)
            sync(loop, p.listen)
            assert set(p.bars) == {'inc'}
            assert p.status == 'finished'
            assert p.stream.closed()
            assert '5 / 5' in p.bar_texts['inc'].value
            assert p.bars['inc'].value == 1.0

            x = e.submit(throws, 1)
            p = MultiProgressWidget([x])
            sync(loop, p.listen)
            assert p.status == 'error'
def test_restart_sync(loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            x = e.submit(div, 1, 2)
            x.result()

            assert sync(loop, e.scheduler.who_has)
            e.restart()
            assert not sync(loop, e.scheduler.who_has)
            assert x.cancelled()
            assert len(e.ncores()) == 2

            with pytest.raises(CancelledError):
                x.result()

            y = e.submit(div, 1, 3)
            assert y.result() == 1 / 3
def test_text_progressbar(capsys, loop):
    with cluster(nanny=True) as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            futures = c.map(inc, range(10))
            p = TextProgressBar(futures, interval=0.01, complete=True)
            c.gather(futures)

            start = time()
            while p.status != 'finished':
                sleep(0.01)
                assert time() - start < 5

            check_bar_completed(capsys)
            assert p._last_response == {'all': 10,
                                        'remaining': 0,
                                        'status': 'finished'}
            assert p.stream.closed()
def test_sync(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            counter = c.submit(Counter, actor=True)
            counter = counter.result()

            assert counter.n == 0

            future = counter.increment()
            n = future.result()
            assert n == 1
            assert counter.n == 1

            assert future.result() == future.result()

            assert 'ActorFuture' in repr(future)
            assert 'distributed.actor' not in repr(future)
def test_as_completed_distributed(loop):  # noqa
    cluster_kwargs = dict(active_rpc_timeout=10, nanny=Nanny)
    if DISTRIBUTED_2_11_0:
        cluster_kwargs["disconnect_timeout"] = 10

    with cluster(**cluster_kwargs) as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            counter_name = "counter_name"
            counter = Variable(counter_name, client=c)
            counter.set(0)
            lock_name = "lock"

            killed_workers_name = "killed_workers"
            killed_workers = Variable(killed_workers_name, client=c)
            killed_workers.set({})

            X, y = make_classification(n_samples=100, n_features=10,
                                       random_state=0)
            gs = dcv.GridSearchCV(
                AsCompletedEstimator(killed_workers_name, lock_name,
                                     counter_name, 7),
                param_grid={"foo_param": [0, 1, 2]},
                cv=3,
                refit=False,
                cache_cv=False,
                scheduler=c,
            )
            gs.fit(X, y)

            def f(dask_scheduler):
                return dask_scheduler.transition_log

            def check_reprocess(transition_log):
                finished = set()
                for transition in transition_log:
                    key, start_state, end_state = (
                        transition[0],
                        transition[1],
                        transition[2],
                    )
                    assert key not in finished
                    if ("score" in key
                            and start_state == "memory"
                            and end_state == "forgotten"):
                        finished.add(key)

            check_reprocess(c.run_on_scheduler(f))