def test_failed_worker_without_warning(c, s, a, b):
    L = c.map(inc, range(10))
    yield wait(L)

    original_pid = a.pid
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[a.worker_address])
    start = time()
    while a.pid == original_pid:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield gen.sleep(0.5)

    start = time()
    while len(s.ncores) < 2:
        yield gen.sleep(0.01)
        assert time() - start < 10

    yield wait(L)

    L2 = c.map(inc, range(10, 20))
    yield wait(L2)

    assert all(len(keys) > 0 for keys in s.has_what.values())
    ncores2 = dict(s.ncores)

    yield c._restart()

    L = c.map(inc, range(10))
    yield wait(L)
    assert all(len(keys) > 0 for keys in s.has_what.values())

    assert not (set(ncores2) & set(s.ncores))  # no overlap
def test_gather_after_failed_worker(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
def test_submit_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
def test_steal_cheap_data_slow_computation(c, s, a, b):
    x = c.submit(slowinc, 100, delay=0.1)  # learn that slowinc is slow
    yield wait(x)

    futures = c.map(slowinc, range(10), delay=0.1, workers=a.address,
                    allow_other_workers=True)
    yield wait(futures)
    assert abs(len(a.data) - len(b.data)) <= 5
def test_gather_after_failed_worker(loop):
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc'].terminate()
            result = c.gather(L)
            assert result == list(map(inc, range(10)))
def test_submit_after_failed_worker_sync(loop):
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(10))
            wait(L)
            a['proc']().terminate()
            total = c.submit(sum, L)
            assert total.result() == sum(map(inc, range(10)))
def _scatter_data(self, job_cluster, context, workers):
    logger.debug("Scattering data to workers started")
    shared_data = {'context': context, 'settings_config': settings.config}
    scattered = job_cluster.scatter(shared_data, broadcast=True, workers=workers)
    # Wait until the data has been scattered to the workers
    distributed.wait(scattered.values())
    logger.debug("Scattering data to workers finished")
    return scattered
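# --- Hypothetical usage sketch (not part of the original code) ---
# Shows one way the futures returned by _scatter_data might be consumed: pass
# the already-scattered objects into submitted tasks so workers reuse their
# local copies instead of re-shipping the data. The names `process_job`,
# `run_with_shared_data`, and `jobs` are assumptions for illustration only.
import distributed


def process_job(job, context=None, settings_config=None):
    # Runs on a worker; the scattered futures arrive as resolved arguments.
    return job, context is not None, settings_config is not None


def run_with_shared_data(client, jobs, context, settings_config):
    shared = client.scatter(
        {'context': context, 'settings_config': settings_config},
        broadcast=True)
    distributed.wait(list(shared.values()))  # wait for the scatter to complete
    futures = [client.submit(process_job, job,
                             context=shared['context'],
                             settings_config=shared['settings_config'])
               for job in jobs]
    distributed.wait(futures)
    return client.gather(futures)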
def test_share_communication(c, s, w1, w2, w3):
    x = c.submit(mul, b'1', int(w3.target_message_size + 1), workers=w1.address)
    y = c.submit(mul, b'2', int(w3.target_message_size + 1), workers=w2.address)
    yield wait([x, y])
    yield c._replicate([x, y], workers=[w1.address, w2.address])
    z = c.submit(add, x, y, workers=w3.address)
    yield wait(z)
    assert len(w3.incoming_transfer_log) == 2
    assert w1.outgoing_transfer_log
    assert w2.outgoing_transfer_log
def test_dont_overlap_communications_to_same_worker(c, s, a, b):
    x = c.submit(mul, b'1', int(b.target_message_size + 1), workers=a.address)
    y = c.submit(mul, b'2', int(b.target_message_size + 1), workers=a.address)
    yield wait([x, y])
    z = c.submit(add, x, y, workers=b.address)
    yield wait(z)
    assert len(b.incoming_transfer_log) == 2
    l1, l2 = b.incoming_transfer_log
    assert l1['stop'] < l2['start']
def test_cancel_stress_sync(loop):
    da = pytest.importorskip('dask.array')
    x = da.random.random((50, 50), chunks=(2, 2))
    with cluster(active_rpc_timeout=10) as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            x = c.persist(x)
            y = (x.sum(axis=0) + x.sum(axis=1) + 1).std()
            wait(x)
            for i in range(5):
                f = c.compute(y)
                sleep(random.random())
                c.cancel(f)
def test_load_balance_map(c, s, *workers):
    class Foo(object):
        def __init__(self, x, y=None):
            pass

    b = c.submit(operator.mul, 'b', 1000000)
    yield wait(b)

    actors = c.map(Foo, range(10), y=b, actor=True)
    yield wait(actors)

    assert all(len(w.actors) == 2 for w in workers)
def test_cancel_stress_sync(loop):
    da = pytest.importorskip('dask.array')
    x = da.random.random((40, 40), chunks=(1, 1))
    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.persist(x)
            y = (x.sum(axis=0) + x.sum(axis=1) + 1).std()
            wait(x)
            for i in range(5):
                f = c.compute(y)
                sleep(1)
                c.cancel(f)
def test_correct_bad_time_estimate(c, s, *workers):
    future = c.submit(slowinc, 1, delay=0)
    yield wait(future)

    futures = [c.submit(slowinc, future, delay=0.1, pure=False)
               for i in range(20)]
    yield gen.sleep(0.5)
    yield wait(futures)
    assert all(w.data for w in workers), [sorted(w.data) for w in workers]
def test_dont_steal_few_saturated_tasks_many_workers(c, s, a, *rest):
    s.extensions['stealing']._pc.callback_time = 20
    x = c.submit(mul, b'0', 100000000, workers=a.address)  # 100 MB
    yield wait(x)
    s.task_duration['slowidentity'] = 0.2

    futures = [c.submit(slowidentity, x, pure=False, delay=0.2)
               for i in range(2)]
    yield wait(futures)

    assert len(a.data) == 3
    assert not any(w.task_state for w in rest)
def test_dont_steal_expensive_data_fast_computation(c, s, a, b):
    np = pytest.importorskip('numpy')
    x = c.submit(np.arange, 1000000, workers=a.address)
    yield wait([x])
    future = c.submit(np.sum, [1], workers=a.address)  # learn that sum is fast
    yield wait([future])

    cheap = [c.submit(np.sum, x, pure=False, workers=a.address,
                      allow_other_workers=True)
             for i in range(10)]
    yield wait(cheap)
    assert len(s.who_has[x.key]) == 1
    assert len(b.data) == 0
    assert len(a.data) == 12
def test_missing_data_errant_worker(c, s, w1, w2, w3):
    with dask.config.set({'distributed.comm.timeouts.connect': '1s'}):
        np = pytest.importorskip('numpy')
        x = c.submit(np.random.random, 10000000, workers=w1.address)
        yield wait(x)
        yield c.replicate(x, workers=[w1.address, w2.address])

        y = c.submit(len, x, workers=w3.address)
        while not w3.tasks:
            yield gen.sleep(0.001)
        w1._close()
        yield wait(y)
def test_steal_expensive_data_slow_computation(c, s, a, b):
    np = pytest.importorskip('numpy')

    x = c.submit(slowinc, 100, delay=0.2, workers=a.address)
    yield wait(x)  # learn that slowinc is slow

    x = c.submit(np.arange, 1000000, workers=a.address)  # put expensive data
    yield wait(x)

    slow = [c.submit(slowinc, x, delay=0.1, pure=False) for i in range(20)]
    yield wait(slow)

    assert len(s.who_has[x.key]) > 1
    assert b.data  # not empty
def test_dont_recompute_if_erred(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(div)(x, 0, dask_key_name='y')

    yy = y.persist()
    yield wait(yy)

    old = list(s.transition_log)

    yyy = y.persist()
    yield wait(yyy)

    yield gen.sleep(0.100)
    assert list(s.transition_log) == old
def test_file_descriptors(c, s):
    yield gen.sleep(0.1)
    psutil = pytest.importorskip('psutil')
    da = pytest.importorskip('dask.array')
    proc = psutil.Process()
    num_fds_1 = proc.num_fds()

    N = 20
    nannies = [Nanny(s.ip, s.port, loop=s.loop) for i in range(N)]
    yield [n._start() for n in nannies]

    while len(s.ncores) < N:
        yield gen.sleep(0.1)

    num_fds_2 = proc.num_fds()

    yield gen.sleep(0.2)

    num_fds_3 = proc.num_fds()
    assert num_fds_3 <= num_fds_2 + N  # add some heartbeats

    x = da.random.random(size=(1000, 1000), chunks=(25, 25))
    x = c.persist(x)
    yield wait(x)

    num_fds_4 = proc.num_fds()
    assert num_fds_4 <= num_fds_2 + 2 * N

    y = c.persist(x + x.T)
    yield wait(y)

    num_fds_5 = proc.num_fds()
    assert num_fds_5 < num_fds_4 + N

    yield gen.sleep(1)

    num_fds_6 = proc.num_fds()
    assert num_fds_6 < num_fds_5 + N

    yield [n._close() for n in nannies]

    assert not s.rpc.open
    assert not c.rpc.open
    assert not s.stream_comms

    start = time()
    while proc.num_fds() > num_fds_1 + N:
        yield gen.sleep(0.01)
        assert time() < start + 3
def test_dont_recompute_if_persisted_3(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(inc)(2, dask_key_name='y')
    z = delayed(inc)(y, dask_key_name='z')
    w = delayed(add)(x, z, dask_key_name='w')

    ww = w.persist()
    yield wait(ww)

    old = list(s.transition_log)

    www = w.persist()
    yield wait(www)
    yield gen.sleep(0.100)
    assert list(s.transition_log) == old
def test_avoid_oversubscription(c, s, *workers):
    np = pytest.importorskip('numpy')
    x = c.submit(np.random.random, 1000000, workers=[workers[0].address])
    yield wait(x)

    futures = [c.submit(len, x, pure=False, workers=[w.address])
               for w in workers[1:]]
    yield wait(futures)

    # Original worker not responsible for all transfers
    assert len(workers[0].outgoing_transfer_log) < len(workers) - 2

    # Some other workers did some work
    assert len([w for w in workers if len(w.outgoing_transfer_log) > 0]) >= 3
def test_dont_steal_fast_tasks(c, s, *workers):
    np = pytest.importorskip('numpy')
    x = c.submit(np.random.random, 10000000, workers=workers[0].address)

    def do_nothing(x, y=None):
        pass

    yield wait(c.submit(do_nothing, 1))

    futures = c.map(do_nothing, range(1000), y=x)

    yield wait(futures)

    assert len(s.who_has[x.key]) == 1
    assert len(s.has_what[workers[0].address]) == 1001
def test_work_steal_no_kwargs(c, s, a, b):
    yield wait(c.submit(slowinc, 1, delay=0.05))

    futures = c.map(slowinc, range(100), workers=a.address,
                    allow_other_workers=True, delay=0.05)

    yield wait(futures)

    assert 20 < len(a.data) < 80
    assert 20 < len(b.data) < 80

    total = c.submit(sum, futures)
    result = yield total

    assert result == sum(map(inc, range(100)))
def test_dont_recompute_if_persisted_2(c, s, a, b):
    x = delayed(inc)(1, dask_key_name='x')
    y = delayed(inc)(x, dask_key_name='y')
    z = delayed(inc)(y, dask_key_name='z')

    yy = y.persist()
    yield wait(yy)

    old = s.story('x', 'y')

    zz = z.persist()
    yield wait(zz)

    yield gen.sleep(0.100)
    assert s.story('x', 'y') == old
def test_worker_who_has_clears_after_failed_connection(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    futures = c.map(slowinc, range(20), delay=0.01,
                    key=['f%d' % i for i in range(20)])
    yield wait(futures)

    result = yield c.submit(sum, futures, workers=a.address)
    for dep in set(a.dep_state) - set(a.task_state):
        a.release_dep(dep, report=True)

    n_worker_address = n.worker_address
    with ignoring(CommClosedError):
        yield c._run(os._exit, 1, workers=[n_worker_address])

    while len(s.workers) > 2:
        yield gen.sleep(0.01)

    total = c.submit(sum, futures, workers=a.address)
    yield total

    assert not a.has_what.get(n_worker_address)
    assert not any(n_worker_address in s for s in a.who_has.values())

    yield n._close()
def test_fail_write_many_to_disk(c, s, a):
    a.validate = False
    yield gen.sleep(0.1)
    assert not a.paused

    class Bad(object):
        def __init__(self, x):
            pass

        def __getstate__(self):
            raise TypeError()

        def __sizeof__(self):
            return int(2e9)

    futures = c.map(Bad, range(11))
    future = c.submit(lambda *args: 123, *futures)

    yield wait(future)

    with pytest.raises(Exception) as info:
        yield future

    # workers still operational
    result = yield c.submit(inc, 1, workers=a.address)
    assert result == 2
def test_spill_by_default(c, s, w):
    da = pytest.importorskip('dask.array')
    x = da.ones(int(TOTAL_MEMORY * 0.7), chunks=10000000, dtype='u1')
    y = c.persist(x)
    yield wait(y)
    assert len(w.data.slow)  # something is on disk
    del x, y
def test_dont_steal_long_running_tasks(c, s, a, b):
    def long(delay):
        with worker_client() as c:
            sleep(delay)

    yield c.submit(long, 0.1)  # learn duration
    yield c.submit(inc, 1)  # learn duration

    long_tasks = c.map(long, [0.5, 0.6], workers=a.address,
                       allow_other_workers=True)
    while sum(map(len, s.processing.values())) < 2:  # let them start
        yield gen.sleep(0.01)

    start = time()
    while any(t.key in s.extensions['stealing'].key_stealable
              for t in long_tasks):
        yield gen.sleep(0.01)
        assert time() < start + 1

    na = len(a.executing)
    nb = len(b.executing)

    incs = c.map(inc, range(100), workers=a.address, allow_other_workers=True)

    yield gen.sleep(0.2)

    yield wait(long_tasks)

    for t in long_tasks:
        assert (sum(log[1] == 'executing' for log in a.story(t)) +
                sum(log[1] == 'executing' for log in b.story(t))) <= 1
def test_cleanup_repeated_tasks(c, s, a, b):
    class Foo(object):
        pass

    s.extensions['stealing']._pc.callback_time = 20
    yield c.submit(slowidentity, -1, delay=0.1)
    objects = [c.submit(Foo, pure=False, workers=a.address)
               for _ in range(50)]

    x = c.map(slowidentity, objects, workers=a.address,
              allow_other_workers=True, delay=0.05)
    del objects
    yield wait(x)
    assert a.data and b.data
    assert len(a.data) + len(b.data) > 10
    ws = weakref.WeakSet()
    ws.update(a.data.values())
    ws.update(b.data.values())
    del x

    start = time()
    while a.data or b.data:
        yield gen.sleep(0.01)
        assert time() < start + 1

    assert not s.who_has
    assert not any(s.has_what.values())

    assert not list(ws)
def test_gather_then_submit_after_failed_workers(loop):
    with cluster(nworkers=4) as (s, [w, x, y, z]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            L = c.map(inc, range(20))
            wait(L)
            w['proc'].terminate()
            total = c.submit(sum, L)
            wait([total])

            (_, port) = first(c.scheduler.who_has[total.key])
            for d in [x, y, z]:
                if d['port'] == port:
                    d['proc'].terminate()

            result = c.gather([total])
            assert result == [sum(map(inc, range(20)))]
def test_new_worker_steals(c, s, a):
    yield wait(c.submit(slowinc, 1, delay=0.01))

    futures = c.map(slowinc, range(100), delay=0.05)
    total = c.submit(sum, futures)
    while len(a.task_state) < 10:
        yield gen.sleep(0.01)

    b = yield Worker(s.address, loop=s.loop, nthreads=1,
                     memory_limit=MEMORY_LIMIT)

    result = yield total
    assert result == sum(map(inc, range(100)))

    for w in [a, b]:
        assert all(isinstance(v, int) for v in w.data.values())

    assert b.data

    yield b.close()
def test_pause_executor(c, s, a):
    memory = psutil.Process().memory_info().rss
    a.memory_limit = memory / 0.8 + 200e6
    np = pytest.importorskip('numpy')

    def f():
        x = np.ones(int(300e6), dtype='u1')
        sleep(1)

    with captured_logger(logging.getLogger('distributed.worker')) as logger:
        future = c.submit(f)
        futures = c.map(slowinc, range(10), delay=0.1)

        yield gen.sleep(0.3)
        assert a.paused
        out = logger.getvalue()
        assert 'memory' in out.lower()
        assert 'stop' in out.lower()

    assert sum(f.status == 'finished' for f in futures) < 4

    yield wait(futures)
def test_steal_more_attractive_tasks(c, s, a, *rest):
    def slow2(x):
        sleep(1)
        return x

    s.extensions['stealing']._pc.callback_time = 20
    x = c.submit(mul, b'0', 100000000, workers=a.address)  # 100 MB
    yield wait(x)

    s.task_duration['slowidentity'] = 0.2
    s.task_duration['slow2'] = 1

    futures = [c.submit(slowidentity, x, pure=False, delay=0.2)
               for i in range(10)]
    future = c.submit(slow2, x, priority=-1)

    while not any(w.task_state for w in rest):
        yield gen.sleep(0.01)

    # good future moves first
    assert any(future.key in w.task_state for w in rest)
def test_device_spill(client, scheduler, worker):
    # There's a known issue with datetime64:
    # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940
    # The same error above happens when spilling datetime64 to disk
    cdf = (dask.datasets.timeseries(dtypes={"x": int, "y": float}, freq="20ms")
           .reset_index(drop=True)
           .map_partitions(cudf.from_pandas))

    sizes = yield client.compute(cdf.map_partitions(lambda df: df.__sizeof__()))
    sizes = sizes.tolist()
    nbytes = sum(sizes)
    part_index_nbytes = (yield client.compute(cdf.partitions[0].index)).__sizeof__()

    cdf2 = cdf.persist()
    yield wait(cdf2)

    del cdf

    host_chunks = yield client.run(lambda: len(get_worker().data.host))
    disk_chunks = yield client.run(lambda: len(get_worker().data.disk))
    for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
        if params["spills_to_disk"]:
            assert dc > 0
        else:
            assert hc > 0
            assert dc == 0

    yield client.run(worker_assert, nbytes, 32, 2048 + part_index_nbytes)

    del cdf2

    yield client.run(delayed_worker_assert, 0, 0, 0)
def test_context_specific_serialization_class(c, s, a, b):
    register_serialization(MyObject, my_dumps, my_loads)

    # Create the object on A, force communication to B
    x = c.submit(MyObject, x=1, y=2, workers=a.address)
    y = c.submit(lambda x: x, x, workers=b.address)
    yield wait(y)

    key = y.key

    def check(dask_worker):
        # Get the context from the object stored on B
        my_obj = dask_worker.data[key]
        return my_obj.context

    result = yield c.run(check, workers=[b.address])
    expected = {"sender": a.address, "recipient": b.address}
    assert result[b.address]["sender"] == a.address  # see origin worker

    z = yield y  # bring object to local process

    assert z.x == 1 and z.y == 2
    assert z.context["sender"] == b.address
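# --- Hypothetical sketch of the helpers this test assumes ---
# MyObject, my_dumps, and my_loads are not defined in this snippet; this is one
# plausible shape for them, assuming distributed passes the communication
# context into a dumps hook that accepts a `context` keyword. Treat it as an
# illustration, not the library's documented contract.
import pickle


class MyObject(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


def my_dumps(obj, context=None):
    # Capture the per-communication context alongside the object's state
    header = {}
    frames = [pickle.dumps(obj.__dict__), pickle.dumps(context or {})]
    return header, frames


def my_loads(header, frames):
    obj = MyObject(**pickle.loads(frames[0]))
    obj.context = pickle.loads(frames[1])  # e.g. sender/recipient addresses
    return obj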
def test_device_spill(client, scheduler, worker):
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random(int(250e6), chunks=10e6)

    xx = x.persist()
    yield wait(xx)

    # Allow up to 1024 bytes overhead per chunk serialized
    yield client.run(worker_assert, x.nbytes, 1024, 1024)

    y = client.compute(x.sum())
    res = yield y

    assert (abs(res / x.size) - 0.5) < 1e-3

    yield client.run(worker_assert, x.nbytes, 1024, 1024)
    host_chunks = yield client.run(lambda: len(get_worker().data.host))
    disk_chunks = yield client.run(lambda: len(get_worker().data.disk))
    for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
        if params["spills_to_disk"]:
            assert dc > 0
        else:
            assert hc > 0
            assert dc == 0
def test_get_client(c, s, a, b):
    def f(x):
        cc = get_client()
        future = cc.submit(inc, x)
        return future.result()

    assert default_client() is c

    future = c.submit(f, 10, workers=a.address)
    result = yield future
    assert result == 11

    assert a._client
    assert not b._client
    assert a._client is c
    assert default_client() is c

    a_client = a._client

    for i in range(10):
        yield wait(c.submit(f, i))

    assert a._client is a_client
def handle(self, *args, **options):
    dataset = self.DATASETS[options['dataset']]()
    example_f, labels_f = _create_dataset(dataset)
    try:
        ds = DataSet.objects.get(name=options['dataset'])
    except DataSet.DoesNotExist:
        ds = DataSet.objects.create(
            name=options['dataset'],
            examples=SimpleUploadedFile(example_f.name, example_f.read()),
            labels=SimpleUploadedFile(labels_f.name, labels_f.read()))

    if options['classifier'] == 'Tree':
        gs_tree = ATGridSearchCV(
            sklearn.tree.DecisionTreeClassifier(),
            {'criterion': ['gini', 'entropy'],
             'max_depth': range(1, 6),
             'max_features': range(1, len(dataset.data[0]))},
            dataset=ds.name,
            webserver_url=options['url'])
        futures = gs_tree.fit(dataset.data, dataset.target)
        distributed.wait(futures)
    elif options['classifier'] == 'Forest':
        gs_forest = ATGridSearchCV(
            sklearn.ensemble.RandomForestClassifier(),
            {'criterion': ['gini', 'entropy'],
             'max_depth': range(1, 6),
             'max_features': range(1, len(dataset.data[0]))},
            dataset=ds.name,
            webserver_url=options['url'])
        distributed.wait(gs_forest.fit(dataset.data, dataset.target))
    else:
        gs_network = ATGridSearchCV(
            sklearn.neural_network.MLPClassifier(),
            {'solver': ['lbfgs', 'sgd', 'adam'],
             'learning_rate': ['constant', 'invscaling', 'adaptive'],
             'max_iter': range(200, 2000, 200)},
            dataset=ds.name,
            webserver_url=options['url'])
        distributed.wait(gs_network.fit(dataset.data, dataset.target))
def test_balance_many_workers_2(c, s, *workers):
    s.extensions['stealing']._pc.callback_time = 100000000
    futures = c.map(slowinc, range(90), delay=0.2)
    yield wait(futures)
    assert set(map(len, s.has_what.values())) == {3}
def test_balance_many_workers(c, s, *workers):
    futures = c.map(slowinc, range(20), delay=0.2)
    yield wait(futures)
    assert set(map(len, s.has_what.values())) == {0, 1}
def test_balance_many_workers(c, s, *workers):
    futures = c.map(slowinc, range(20), delay=0.2)
    yield wait(futures)
    assert {len(w.has_what) for w in s.workers.values()} == {0, 1}
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    # see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas()
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(len(cpu_categories))
    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] + "web_clickstreams/*.parquet")
    task_ls = [
        delayed(etl_wcs)(fn, filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)
    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.repartition(columns=["wcs_user_sk"])
    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(web_sales_df,
                                               left_on=["d_date_sk"],
                                               right_on=["ws_sold_date_sk"],
                                               how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]

    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066
    # lands
    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()

    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [no_q08_review_sales_amount]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
def test_dont_steal_unknown_functions(c, s, a, b):
    futures = c.map(inc, [1, 2], workers=a.address, allow_other_workers=True)
    yield wait(futures)
    assert len(a.data) == 2
    assert len(b.data) == 0
def test_Actors_create_dependencies(c, s, a, b):
    counter = yield c.submit(Counter, actor=True)
    future = c.submit(lambda x: None, counter)
    yield wait(future)
    assert s.tasks[future.key].dependencies == {s.tasks[counter.key]}
def test_worker_task_data(c, s, w):
    x = delayed(2)
    xx = c.persist(x)
    yield wait(xx)
    assert w.data[x.key] == 2
def fwi_gradient(vp_in, nshots, client, solver, shots_container, auth,
                 scale_gradient=None, mute_water=True, exclude_boundaries=True,
                 water_depth=20, checkpointing=False, checkpoint_params=None):
    start_time = time.time()

    reset_cluster(client)

    if not hasattr(fwi_gradient, "obj_fn_cache"):
        fwi_gradient.obj_fn_cache = {}

    if exclude_boundaries:
        vp = np.array(vec2mat(vp_in, solver.model.shape),
                      dtype=solver.model.dtype)
    else:
        vp = np.array(vec2mat(vp_in, solver.model.vp.shape),
                      dtype=solver.model.dtype)

    solver.model.update("vp", vp)

    # Dask enforces this for large objects
    f_solver = client.scatter(solver, broadcast=True)

    futures = []

    for i in range(nshots):
        if checkpointing:
            futures.append(client.submit(process_shot_checkpointed, i, f_solver,
                                         shots_container, auth,
                                         exclude_boundaries, checkpoint_params,
                                         resources={'tasks': 1}))
        else:
            futures.append(client.submit(process_shot, i, f_solver,
                                         shots_container, auth,
                                         exclude_boundaries,
                                         resources={'tasks': 1}))
            # Ensure one task per worker (to run two, tasks=0.5)

    if exclude_boundaries:
        gradient_shape = solver.model.shape
    else:
        gradient_shape = solver.model.vp.shape

    def reduction(*args):
        grad = np.zeros(gradient_shape)  # Closured from above
        objective = 0.

        for a in args:
            o, g = a
            objective += o
            grad += g

        return objective, grad

    reduce_future = client.submit(reduction, *futures)
    wait(reduce_future)

    objective, grad = reduce_future.result()

    if mute_water:
        if exclude_boundaries:
            muted_depth = water_depth
        else:
            muted_depth = water_depth + solver.model.nbl
        grad[:, 0:muted_depth] = 0

    # Scipy LBFGS misbehaves if type is not float64
    grad = mat2vec(grad).astype(np.float64)

    if scale_gradient is not None:
        if scale_gradient == "W":
            if not hasattr(fwi_gradient, "gradient_scaling_factor"):
                fwi_gradient.gradient_scaling_factor = np.max(np.abs(grad))

            grad /= fwi_gradient.gradient_scaling_factor
        elif scale_gradient == "L":
            grad /= np.max(np.abs(grad))
        else:
            raise ValueError("Invalid value %s for gradient scaling. "
                             "Allowed: None, L, W" % scale_gradient)

    fwi_gradient.obj_fn_cache[vp_in.tobytes()] = objective

    elapsed_time = time.time() - start_time
    eprint("Objective function evaluation completed in %f seconds. F=%f"
           % (elapsed_time, objective))

    return objective, -grad
def test_balance_many_workers_2(c, s, *workers):
    s.extensions["stealing"]._pc.callback_time = 100000000
    futures = c.map(slowinc, range(90), delay=0.2)
    yield wait(futures)
    assert {len(w.has_what) for w in s.workers.values()} == {3}
def test_eventually_steal_unknown_functions(c, s, a, b):
    futures = c.map(slowinc, range(10), delay=0.1, workers=a.address,
                    allow_other_workers=True)
    yield wait(futures)
    assert len(a.data) >= 3
    assert len(b.data) >= 3
def main(client):
    item_df, store_sales_df, web_clickstreams_df = read_tables()

    ### Query 0. Filtering item table
    filtered_item_df = string_filter(item_df, "i_category", q12_i_category_IN)
    filtered_item_df = filtered_item_df.persist()
    ### filtered_item_df is a single partition to allow a nx1 merge using map_partitions
    filtered_item_df = filtered_item_df.repartition(npartitions=1)

    ### Query 1
    # We deliberately don't fuse the filtration task with the reading task:
    # fusing them causes more memory pressure, since we read (and spill) the
    # whole thing at once and only then filter.
    ### Below PR has the dashboard snapshot which makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141
    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=web_clickstreams_df["wcs_user_sk"].dtype),
        "wcs_click_date_sk": np.ones(1, dtype=np.int64),
    }
    meta_df = cudf.DataFrame(meta_d)
    filter_wcs_df = web_clickstreams_df.map_partitions(
        filter_wcs_table, filtered_item_df.to_delayed()[0], meta=meta_df)

    ### Query 2
    # Same idea as above: keep the filtration task separate from the reading
    # task to avoid reading (and spilling) everything before filtering.
    ### Below PR has the dashboard snapshot which makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141
    meta_d = {
        "ss_customer_sk": np.ones(1, dtype=store_sales_df["ss_customer_sk"].dtype),
        "ss_sold_date_sk": np.ones(1, dtype=np.int64),
    }
    meta_df = cudf.DataFrame(meta_d)
    filtered_ss_df = store_sales_df.map_partitions(
        filter_ss_table, filtered_item_df.to_delayed()[0], meta=meta_df)

    ### Result Query
    ### SELECT DISTINCT wcs_user_sk
    ### ....
    ### webInRange
    ### storeInRange
    ### WHERE wcs_user_sk = ss_customer_sk
    ### AND wcs_click_date_sk < ss_sold_date_sk -- buy AFTER viewed on website
    ### ORDER BY wcs_user_sk

    ### Note: Below brings it down to a single partition
    filter_wcs_df_d = filter_wcs_df.drop_duplicates()
    filtered_ss_df_d = filtered_ss_df.drop_duplicates()

    ss_wcs_join = filter_wcs_df_d.merge(filtered_ss_df_d,
                                        left_on="wcs_user_sk",
                                        right_on="ss_customer_sk",
                                        how="inner")

    ss_wcs_join = ss_wcs_join[
        ss_wcs_join["wcs_click_date_sk"] < ss_wcs_join["ss_sold_date_sk"]]
    ss_wcs_join = ss_wcs_join["wcs_user_sk"]

    ### todo: check performance of replacing this with a single drop_duplicates call
    ### below decreases memory usage on the single GPU to help with subsequent compute
    ss_wcs_join = ss_wcs_join.map_partitions(lambda sr: sr.drop_duplicates())
    ss_wcs_join = ss_wcs_join.repartition(npartitions=1).persist()
    ss_wcs_join = ss_wcs_join.drop_duplicates().reset_index(drop=True)
    ss_wcs_join = ss_wcs_join.map_partitions(lambda ser: ser.sort_values())

    # todo: check if repartition helps for writing efficiency
    # context: 0.1 seconds on sf-1k
    wait(ss_wcs_join)
    return ss_wcs_join.to_frame()
def test_decide_worker_with_restrictions(client, s, a, b, c):
    x = client.submit(inc, 1, workers=[a.address, b.address])
    yield wait(x)
    assert x.key in a.data or x.key in b.data
    # (fragment: tail of the worker-side function submitted below)
    tfrecords = att.generate(
        csv_file=record,
        HSI_sensor_path=hyperspec_path,
        RGB_sensor_path=rgb_path,
        chunk_size=500,
        train=True,
        domain=numeric_domain,
        site=numeric_site,
        heights=heights,
        elevation=elevation,
        label_column="filtered_taxonID",
        species_label_dict=species_label_dict
    )

    return tfrecords

train_tfrecords = []
for record in records_to_run:
    future = client.submit(run, record=record)
    train_tfrecords.append(future)

wait(train_tfrecords)
for x in train_tfrecords:
    try:
        print(x.result())
    except Exception as e:
        print("{} failed with {}".format(x, e))
        pass
def test_worker_bad_args(c, s, a, b):
    class NoReprObj(object):
        """ This object cannot be properly represented as a string. """

        def __str__(self):
            raise ValueError("I have no str representation.")

        def __repr__(self):
            raise ValueError("I have no repr representation.")

    x = c.submit(NoReprObj, workers=a.address)
    yield wait(x)
    assert not a.executing
    assert a.data

    def bad_func(*args, **kwargs):
        1 / 0

    class MockLoggingHandler(logging.Handler):
        """Mock logging handler to check for expected logs."""

        def __init__(self, *args, **kwargs):
            self.reset()
            logging.Handler.__init__(self, *args, **kwargs)

        def emit(self, record):
            self.messages[record.levelname.lower()].append(record.getMessage())

        def reset(self):
            self.messages = {
                'debug': [],
                'info': [],
                'warning': [],
                'error': [],
                'critical': [],
            }

    hdlr = MockLoggingHandler()
    old_level = logger.level
    logger.setLevel(logging.DEBUG)
    logger.addHandler(hdlr)

    y = c.submit(bad_func, x, k=x, workers=b.address)
    yield wait(y)

    assert not b.executing
    assert y.status == 'error'
    # Make sure job died because of bad func and not because of bad
    # argument.
    with pytest.raises(ZeroDivisionError):
        yield y

    if sys.version_info[0] >= 3:
        tb = yield y._traceback()
        assert any('1 / 0' in line
                   for line in pluck(3, traceback.extract_tb(tb))
                   if line)

    assert "Compute Failed" in hdlr.messages['warning'][0]
    logger.setLevel(old_level)

    # Now we check that both workers are still alive.

    xx = c.submit(add, 1, 2, workers=a.address)
    yy = c.submit(add, 3, 4, workers=b.address)

    results = yield c._gather([xx, yy])

    assert tuple(results) == (3, 7)
def f():
    if futures_of(self):
        yield wait(self)
    raise gen.Return(self)
def test_statistical_profiling(c, s, a, b):
    futures = c.map(slowinc, range(10), delay=0.1)
    yield wait(futures)

    profile = a.profile_keys['slowinc']
    assert profile['count']
def test_work_stealing(c, s, a, b):
    [x] = yield c._scatter([1], workers=a.address)
    futures = c.map(slowadd, range(50), [x] * 50)
    yield wait(futures)
    assert len(a.data) > 10
    assert len(b.data) > 10
def test_move_data_over_break_restrictions(client, s, a, b, c):
    [x] = yield client._scatter([1], workers=b.address)
    y = client.submit(inc, x, workers=[a.address, b.address])
    yield wait(y)
    assert y.key in a.data or y.key in b.data
def main(client, config):
    import cudf
    import dask_cudf

    store_sales, date_dim, store, product_reviews = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### adding a wait call slows this down by 3-4 seconds, removing it for now
    ### Make TEMP_TABLE1

    # filter date table
    q18_startDate_int = np.datetime64(q18_startDate, "ms").astype(int)
    q18_endDate_int = np.datetime64(q18_endDate, "ms").astype(int)

    date_dim_filtered = date_dim.loc[
        (date_dim.d_date.astype("datetime64[ms]").astype("int") >= q18_startDate_int)
        & (date_dim.d_date.astype("datetime64[ms]").astype("int") <= q18_endDate_int)
    ].reset_index(drop=True)

    # build the regression_analysis table
    ss_date_dim_join = left_semi_join(
        store_sales,
        date_dim_filtered,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )

    temp = (ss_date_dim_join
            .groupby(["ss_store_sk", "ss_sold_date_sk"])
            .agg({"ss_net_paid": "sum"})
            .reset_index())

    temp["xx"] = temp.ss_sold_date_sk * temp.ss_sold_date_sk
    temp["xy"] = temp.ss_sold_date_sk * temp.ss_net_paid

    temp.columns = ["ss_store_sk", "x", "y", "xx", "xy"]

    regression_analysis = (temp.groupby(["ss_store_sk"]).agg({
        "x": ["count", "sum"],
        "xy": "sum",
        "y": "sum",
        "xx": "sum",
    }).reset_index(drop=False))

    regression_analysis["slope"] = (
        regression_analysis[("x", "count")] * regression_analysis[("xy", "sum")]
        - regression_analysis[("x", "sum")] * regression_analysis[("y", "sum")]
    ) / (
        regression_analysis[("x", "count")] * regression_analysis[("xx", "sum")]
        - regression_analysis[("x", "sum")] * regression_analysis[("x", "sum")]
    )
    regression_analysis = regression_analysis[["ss_store_sk", "slope"]]
    regression_analysis.columns = ["ss_store_sk", "slope"]

    regression_analysis["ss_store_sk"] = regression_analysis["ss_store_sk"].astype("int32")
    store["s_store_sk"] = store["s_store_sk"].astype("int32")
    temp_table1 = store.merge(
        regression_analysis[["ss_store_sk", "slope"]]
        .query("slope <= 0")
        .reset_index(drop=True),
        left_on="s_store_sk",
        right_on="ss_store_sk",
    )
    temp_table1 = temp_table1[["s_store_sk", "s_store_name"]]

    # repartition this table to be one partition, since it's only 192 at SF1000
    temp_table1 = temp_table1.repartition(npartitions=1)
    temp_table1 = temp_table1.persist()

    ### Make TEMP_TABLE2
    stores_with_regression = temp_table1
    pr = product_reviews

    # known to be small. very few relevant stores (169) at SF1000
    targets = (stores_with_regression.s_store_name.str.lower()
               .unique()
               .compute()
               .tolist())
    n_targets = len(targets)

    no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True)
    no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32")

    ### persisting because no_nulls is used twice
    no_nulls = no_nulls.reset_index(drop=True).persist()

    temp_table2_meta_empty_df = cudf.DataFrame({
        "word": ["a"],
        "pr_review_sk": np.ones(1, dtype=np.int64),
        "pr_review_date": ["a"],
    }).head(0)

    ### get relevant reviews
    combined = no_nulls.map_partitions(
        find_relevant_reviews,
        targets,
        meta=temp_table2_meta_empty_df,
    )

    stores_with_regression["store_ID"] = stores_with_regression.s_store_sk.astype(
        "str").str.cat(stores_with_regression.s_store_name, sep="_")
    stores_with_regression["s_store_name"] = (
        stores_with_regression.s_store_name.str.lower())

    # Keep this commented line to illustrate that we could exactly match Spark
    # temp_table2 = temp_table2[['store_ID', 'pr_review_date', 'pr_review_content']]
    temp_table2 = combined.merge(stores_with_regression,
                                 how="inner",
                                 left_on=["word"],
                                 right_on=["s_store_name"])

    temp_table2 = temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]]
    temp_table2 = temp_table2.persist()

    ### REAL QUERY (PART THREE)
    no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace(
        [". ", "? ", "! "], [EOL_CHAR], regex=False)
    sentences = no_nulls.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] + ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
        # dedupe for one extra record in the source file
        negativeSentiment = list(set(negativeSentiment))

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )
    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    word_sentence_sentiment["sentence_idx_global_pos"] = word_sentence_sentiment[
        "sentence_idx_global_pos"].astype("int64")
    sentences["sentence_tokenized_global_pos"] = sentences[
        "sentence_tokenized_global_pos"].astype("int64")

    word_sentence_sentiment_with_sentence_info = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    temp_table2["pr_review_sk"] = temp_table2["pr_review_sk"].astype("int32")

    final = word_sentence_sentiment_with_sentence_info.merge(
        temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]],
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    keepcols = ["store_ID", "pr_review_date", "sentence", "sentiment", "word"]
    final = final[keepcols]
    final.columns = ["s_name", "r_date", "r_sentence", "sentiment", "sentiment_word"]
    final = final.persist()
    wait(final)
    final = final.sort_values(["s_name", "r_date", "r_sentence", "sentiment_word"])
    final = final.persist()
    wait(final)
    print(len(final))
    return final
def test_log_tasks_during_restart(c, s, a, b):
    future = c.submit(sys.exit, 0)
    yield wait(future)
    assert 'exit' in str(s.events)
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can filter now
    item_df_filtered = item_df.loc[item_df.i_category_id.isin(
        q03_purchased_item_category_IN)].reset_index(drop=True)

    # We deliberately don't fuse the filtration task with the reading task:
    # fusing them causes more memory pressure, since we read (and spill) the
    # whole thing at once and only then filter.
    ### Below PR has the dashboard snapshot which makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    web_clickstream_flist = glob.glob(
        os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn, item_df.to_delayed()[0], wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(reduction_function,
                                          item_df_filtered.to_delayed()[0],
                                          meta=meta_df)

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)

    return result_df
def test_get_task_status(c, s, a, b):
    future = c.submit(inc, 1)
    yield wait(future)

    result = yield a.scheduler.get_task_status(keys=[future.key])
    assert result == {future.key: 'memory'}