def trigger_update(self):
    self.state = profile.get_profile(self.log, start=self.start, stop=self.stop)
    data = profile.plot_data(self.state, profile_interval)
    self.states = data.pop("states")
    update(self.source, data)

    times = [t * 1000 for t, _ in self.log]
    counts = list(toolz.pluck("count", toolz.pluck(1, self.log)))
    self.ts_source.data.update({"time": times, "count": counts})
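# Illustrative sketch (not part of the original module): the nested
# toolz.pluck calls above pull the per-interval "count" field out of a log of
# (timestamp, state-dict) tuples. A minimal standalone reproduction, assuming
# toolz is installed:

import toolz

example_log = [
    (0.0, {"count": 3, "children": {}}),
    (0.5, {"count": 7, "children": {}}),
]

example_times = [t * 1000 for t, _ in example_log]                        # [0.0, 500.0]
example_counts = list(toolz.pluck("count", toolz.pluck(1, example_log)))  # [3, 7]
assert example_counts == [3, 7]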
def slice_slices_and_integers(out_name, in_name, blockdims, index):
    """
    Dask array indexing with slices and integers

    See Also
    --------
    _slice_1d
    """
    from .core import unknown_chunk_message

    shape = tuple(cached_cumsum(dim, initial_zero=True)[-1] for dim in blockdims)

    for dim, ind in zip(shape, index):
        if np.isnan(dim) and ind != slice(None, None, None):
            raise ValueError(
                "Arrays chunk sizes are unknown: %s%s" % (shape, unknown_chunk_message)
            )

    assert all(isinstance(ind, (slice, Integral)) for ind in index)
    assert len(index) == len(blockdims)

    # Get a list (for each dimension) of dicts{blocknum: slice()}
    block_slices = list(map(_slice_1d, shape, blockdims, index))
    sorted_block_slices = [sorted(i.items()) for i in block_slices]

    # (in_name, 1, 1, 2), (in_name, 1, 1, 4), (in_name, 2, 1, 2), ...
    in_names = list(product([in_name], *[pluck(0, s) for s in sorted_block_slices]))

    # (out_name, 0, 0, 0), (out_name, 0, 0, 1), (out_name, 0, 1, 0), ...
    out_names = list(
        product(
            [out_name],
            *[
                range(len(d))[::-1] if i.step and i.step < 0 else range(len(d))
                for d, i in zip(block_slices, index)
                if not isinstance(i, Integral)
            ]
        )
    )

    all_slices = list(product(*[pluck(1, s) for s in sorted_block_slices]))

    dsk_out = {
        out_name: (getitem, in_name, slices)
        for out_name, in_name, slices in zip(out_names, in_names, all_slices)
    }

    new_blockdims = [
        new_blockdim(d, db, i)
        for d, i, db in zip(shape, index, blockdims)
        if not isinstance(i, Integral)
    ]

    return dsk_out, new_blockdims
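# Hedged usage sketch (illustrative, not part of the original module): this
# graph-construction helper is exercised indirectly whenever a chunked dask
# array is indexed with slices and integers through the public API. Assumes
# dask[array] and numpy are installed.

import dask.array as da
import numpy as np

_data = np.arange(100).reshape(10, 10)
_x = da.from_array(_data, chunks=(4, 4))
_y = _x[2:9:2, 3]  # slice + integer index, resolved block-by-block
assert _y.shape == (4,)
np.testing.assert_array_equal(_y.compute(), _data[2:9:2, 3])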
def arg_reduction(x, chunk, combine, agg, axis=None, split_every=None, out=None):
    """Generic function for argreduction.

    Parameters
    ----------
    x : Array
    chunk : callable
        Partialed ``arg_chunk``.
    combine : callable
        Partialed ``arg_combine``.
    agg : callable
        Partialed ``arg_agg``.
    axis : int, optional
    split_every : int or dict, optional
    """
    if axis is None:
        axis = tuple(range(x.ndim))
        ravel = True
    elif isinstance(axis, Integral):
        axis = validate_axis(axis, x.ndim)
        axis = (axis,)
        ravel = x.ndim == 1
    else:
        raise TypeError("axis must be either `None` or int, got '{0}'".format(axis))

    for ax in axis:
        chunks = x.chunks[ax]
        if len(chunks) > 1 and np.isnan(chunks).any():
            raise ValueError(
                "Arg-reductions do not work with arrays that have "
                "unknown chunksizes. At some point in your computation "
                "this array lost chunking information.\n\n"
                "A possible solution is with \n"
                "  x.compute_chunk_sizes()"
            )

    # Map chunk across all blocks
    name = "arg-reduce-{0}".format(tokenize(axis, x, chunk, combine, split_every))
    old = x.name
    keys = list(product(*map(range, x.numblocks)))
    offsets = list(product(*(accumulate(operator.add, bd[:-1], 0) for bd in x.chunks)))
    if ravel:
        offset_info = zip(offsets, repeat(x.shape))
    else:
        offset_info = pluck(axis[0], offsets)

    chunks = tuple((1,) * len(c) if i in axis else c for (i, c) in enumerate(x.chunks))
    dsk = dict(
        ((name,) + k, (chunk, (old,) + k, axis, off))
        for (k, off) in zip(keys, offset_info)
    )
    # The dtype of `tmp` doesn't actually matter, just need to provide something
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
    tmp = Array(graph, name, chunks, dtype=x.dtype)
    dtype = np.argmin([1]).dtype
    result = _tree_reduce(tmp, agg, axis, False, dtype, split_every, combine)
    return handle_out(out, result)
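# Hedged usage sketch (illustrative only): arg_reduction backs the public
# argmin/argmax reductions, which combine per-block winners with their global
# offsets in a tree reduction. Assumes dask[array] and numpy are installed.

import dask.array as da
import numpy as np

_arr = np.array([[3, 1, 4], [1, 5, 9], [2, 6, 5]])
_dx = da.from_array(_arr, chunks=2)

assert _dx.argmax().compute() == _arr.argmax()  # flattened (axis=None) case
np.testing.assert_array_equal(
    _dx.argmin(axis=1).compute(), _arr.argmin(axis=1)  # per-axis case
)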
def test_groupby_tasks():
    b = db.from_sequence(range(160), npartitions=4)
    out = b.groupby(lambda x: x % 10, max_branch=4, shuffle="tasks")
    partitions = dask.get(out.dask, out.__dask_keys__())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

    b = db.from_sequence(range(1000), npartitions=100)
    out = b.groupby(lambda x: x % 123, shuffle="tasks")
    assert len(out.dask) < 100 ** 2
    partitions = dask.get(out.dask, out.__dask_keys__())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

    b = db.from_sequence(range(10000), npartitions=345)
    out = b.groupby(lambda x: x % 2834, max_branch=24, shuffle="tasks")
    partitions = dask.get(out.dask, out.__dask_keys__())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))
async def test_gather_many_small(c, s, a, *workers):
    a.total_out_connections = 2
    futures = await c._scatter(list(range(100)))

    assert all(w.data for w in workers)

    def f(*args):
        return 10

    future = c.submit(f, *futures, workers=a.address)
    await wait(future)

    types = list(pluck(0, a.log))
    req = [i for i, t in enumerate(types) if t == "request-dep"]
    recv = [i for i, t in enumerate(types) if t == "receive-dep"]
    assert min(recv) > max(req)

    assert a.comm_nbytes == 0
def get_profile(history, recent=None, start=None, stop=None, key=None):
    """Collect profile information from a sequence of profile states

    Parameters
    ----------
    history : Sequence[Tuple[time, Dict]]
        A list or deque of profile states
    recent : dict
        The most recent accumulating state
    start : time
    stop : time
    """
    now = time()
    if start is None:
        istart = 0
    else:
        istart = bisect.bisect_left(history, (start,))

    if stop is None:
        istop = None
    else:
        istop = bisect.bisect_right(history, (stop,)) + 1
        if istop >= len(history):
            istop = None  # include end

    if istart == 0 and istop is None:
        history = list(history)
    else:
        iistop = len(history) if istop is None else istop
        history = [history[i] for i in range(istart, iistop)]

    prof = merge(*toolz.pluck(1, history))

    if not history:
        return create()

    if recent:
        prof = merge(prof, recent)

    return prof
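# Illustrative sketch (stdlib only, not the distributed API): the start/stop
# handling above windows a time-sorted history of (timestamp, state) pairs
# with bisect, then merges the selected states. A minimal reproduction of the
# windowing step:

import bisect

_history = [(1.0, {"a": 1}), (2.0, {"b": 2}), (3.0, {"c": 3})]
_start, _stop = 1.5, 2.5

_istart = bisect.bisect_left(_history, (_start,))  # first state at or after `start`
_istop = bisect.bisect_right(_history, (_stop,))   # first state strictly after `stop`
assert [t for t, _ in _history[_istart:_istop]] == [2.0]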
def plot_cache(
    results, dsk, start_time, metric_name, palette="Viridis", label_size=60, **kwargs
):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of CacheProfiler.results
    dsk : dict
        The dask graph being profiled.
    start_time : float
        Start time of the profile.
    metric_name : string
        Metric used to measure cache size
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size : int (optional)
        Maximum size of output labels in plot, defaults to 60
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,wheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update((k, v) for (k, v) in kwargs.items() if k in _get_figure_keywords())

    if results:
        starts, ends = list(zip(*results))[3:]
        tics = sorted(unique(starts + ends))
        groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results)
        data = {}
        for k, vals in groups.items():
            cnts = dict.fromkeys(tics, 0)
            for v in vals:
                cnts[v.cache_time] += v.metric
                cnts[v.free_time] -= v.metric
            data[k] = [0] + list(accumulate(add, pluck(1, sorted(cnts.items()))))

        tics = [0] + [i - start_time for i in tics]
        p = bp.figure(x_range=[0, max(tics)], **defaults)

        for (key, val), color in zip(data.items(), get_colors(palette, data.keys())):
            p.line(
                "x",
                "y",
                line_color=color,
                line_width=3,
                source=bp.ColumnDataSource(
                    {"x": tics, "y": val, "label": [key for i in val]}
                ),
            )
    else:
        p = bp.figure(y_range=[0, 10], x_range=[0, 10], **defaults)
    p.yaxis.axis_label = "Cache Size ({0})".format(metric_name)
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>
        <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span>
    </div>
    """
    return p
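# Hedged usage sketch (illustrative only): plot_cache is normally reached
# through CacheProfiler.visualize(), and its `results` argument is the
# profiler's .results sequence. Assumes dask and bokeh are installed.

import dask.bag as db
from dask.diagnostics import CacheProfiler

_bag = db.from_sequence(range(10), npartitions=2).map(lambda v: v + 1)

with CacheProfiler() as _cprof:
    _bag.compute(scheduler="threads")

# Each entry records a task, its metric value (task count by default), and
# the times it entered and left the cache.
print(_cprof.results[:2])
# _cprof.visualize()  # renders the bokeh plot built by plot_cache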
async def test_worker_bad_args(c, s, a, b):
    class NoReprObj:
        """This object cannot be properly represented as a string."""

        def __str__(self):
            raise ValueError("I have no str representation.")

        def __repr__(self):
            raise ValueError("I have no repr representation.")

    x = c.submit(NoReprObj, workers=a.address)
    await wait(x)
    assert not a.executing
    assert a.data

    def bad_func(*args, **kwargs):
        1 / 0

    class MockLoggingHandler(logging.Handler):
        """Mock logging handler to check for expected logs."""

        def __init__(self, *args, **kwargs):
            self.reset()
            logging.Handler.__init__(self, *args, **kwargs)

        def emit(self, record):
            self.messages[record.levelname.lower()].append(record.getMessage())

        def reset(self):
            self.messages = {
                "debug": [],
                "info": [],
                "warning": [],
                "error": [],
                "critical": [],
            }

    hdlr = MockLoggingHandler()
    old_level = logger.level
    logger.setLevel(logging.DEBUG)
    logger.addHandler(hdlr)
    y = c.submit(bad_func, x, k=x, workers=b.address)
    await wait(y)

    assert not b.executing
    assert y.status == "error"
    # Make sure job died because of bad func and not because of bad
    # argument.
    with pytest.raises(ZeroDivisionError):
        await y

    tb = await y._traceback()
    assert any("1 / 0" in line for line in pluck(3, traceback.extract_tb(tb)) if line)
    assert "Compute Failed" in hdlr.messages["warning"][0]
    logger.setLevel(old_level)

    # Now we check that both workers are still alive.
    xx = c.submit(add, 1, 2, workers=a.address)
    yy = c.submit(add, 3, 4, workers=b.address)
    results = await c._gather([xx, yy])
    assert tuple(results) == (3, 7)