def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like ``np.vectorize``, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``signature`` has to declare each of them.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function. Their number has to
        match the number of inputs declared in ``signature``.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a
        short-cut for ufuncs that operate over a single, shared core dimension,
        equivalent to passing in axes with entries of (axis,) for each
        single-core-dimension argument and ``()`` for all others. For instance,
        for a signature ``"(i),(i)->()"``, it is equivalent to passing in
        ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the
        result as a dimension with size one, so that the result will broadcast
        correctly against the inputs. This option can only be used for generalized
        ufuncs that operate on inputs that all have the same number of core
        dimensions and with outputs that have no core dimensions, i.e., with
        signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``. If used, the
        location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` does not fit the number of outputs declared in
        ``signature``, if the number of positional arguments differs from the
        number of inputs declared in ``signature``, if a dimension has
        incompatible lengths across the inputs, or (with
        ``allow_rechunk=False``) if a core dimension spans several chunks or a
        loop dimension is chunked differently across the inputs.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError("`signature` has to be of type string")
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(
            tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout
        )

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(
                    "Must specify single dtype or list of one dtype "
                    "for `output_dtypes` for function with one output"
                )
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError(
                "Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs"
            )
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(
        axes, axis, keepdims, input_coredimss, output_coredimss
    )

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # The exception has to be raised: the ``zip`` calls below would otherwise
    # silently truncate to the shorter of ``args`` and the signature inputs.
    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(input_coredimss), len(args))
        )

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, _ in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        # Normalize the core axes to negative indices, then move them to the end
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_args.append(arg.transpose(tidc))
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizess.setdefault(dim, []).append(size)
            chunksizess.setdefault(dim, []).append(chunksize)

    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError(
                "Dimension `'{}'` with different lengths in arrays".format(dim)
            )
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. "
                    "To fix, rechunk into a single chunk along this dimension "
                    "or set `allow_rechunk=True`, but beware that this may "
                    "increase memory usage significantly.".format(dim)
                )
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1)
            )
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    "Dimension `'{}'` with different chunksize present".format(dim)
                )

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    try:
        tmp = blockwise(  # First try to compute meta
            func, loop_output_dims, *arginds, concatenate=True, **kwargs
        )
    except ValueError:
        # If computing meta doesn't work, provide it explicitly based on
        # provided dtypes
        sample = arginds[0]._meta
        # ``nout`` decides whether ``output_dtypes`` is one dtype or a
        # sequence with one dtype per output. Testing the container type
        # instead would treat a *list* of dtypes as a single dtype.
        if nout is None:
            coredims_and_dtypes = [(output_coredimss, output_dtypes)]
        else:
            coredims_and_dtypes = zip(output_coredimss, output_dtypes)
        meta = tuple(
            meta_from_array(sample, dtype=odt) for _, odt in coredims_and_dtypes
        )
        tmp = blockwise(
            func, loop_output_dims, *arginds, concatenate=True, meta=meta, **kwargs
        )

    if isinstance(tmp._meta, tuple):
        metas = tmp._meta
    else:
        metas = (tmp._meta,)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split("-")

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, (ocd, oax, meta) in enumerate(zip(output_coredimss, output_axes, metas)):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        # For multiple outputs every leaf picks its element out of the tuple
        # produced by the blockwise task; a single output is just aliased.
        leaf_dsk = {
            (leaf_name,) + key[1:] + core_chunkinds: (
                (getitem, key, i) if nout else key
            )
            for key in keys
        }
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        meta = meta_from_array(meta, len(output_shape))
        leaf_arr = Array(
            graph, leaf_name, chunks=output_chunks, shape=output_shape, meta=meta
        )

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
            leaf_arr = leaf_arr[slices]

        # Build the permutation that moves the trailing core axes to the
        # requested output axes; the remaining slots keep the loop axes in order.
        tidcs = [None] * len(leaf_arr.shape)
        for core_pos, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = core_pos
        next_loop_axis = 0
        for slot, assigned in enumerate(tidcs):
            if assigned is None:
                tidcs[slot] = next_loop_axis
                next_loop_axis += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return (*leaf_arrs,) if nout else leaf_arrs[0]  # Undo *) from above
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> import dask
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    (a, b)
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr,)

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized,)

    # Iterators can only be consumed once, so materialize them first
    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        args, collections = unzip((unpack_collections(e) for e in expr), 2)
        args = list(args)
        # The same collection may appear several times; keep one per object
        collections = tuple(unique(concat(collections), key=id))
        # Ensure output type matches input type
        if typ is not list:
            args = (typ, args)
        return args, collections

    if typ is dict:
        args, collections = unpack_collections([[k, v] for k, v in expr.items()])
        return (dict, args), collections

    if typ is slice:
        args, collections = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(args), collections

    # ``is_dataclass`` is also true for dataclass *classes*. Only instances
    # can be rebuilt from their field values; a class is passed through
    # unchanged like any other literal.
    if is_dataclass(expr) and not isinstance(expr, type):
        args, collections = unpack_collections(
            [[f.name, getattr(expr, f.name)] for f in dataclass_fields(expr)]
        )
        return (apply, typ, (), (dict, args)), collections

    return expr, ()
def __iter__(self):
    """Return an iterator over the distinct elements of all layers.

    Every value of ``self.layers`` is iterated in turn and the results are
    chained together; elements that were already produced are skipped.
    """
    chained = toolz.concat(self.layers.values())
    return toolz.unique(chained)
def test_distinct_with_key():
    """``distinct`` gives the same result for a key name and a key callable."""

    def get_a(record):
        return record["a"]

    values = [0, 1, 2, 1, 2, 3, 2, 3, 4, 5]
    seq = [{"a": value} for value in values]
    bag = db.from_sequence(seq, npartitions=3)

    # Reference result: first occurrence of every distinct ``"a"`` value
    expected = list(unique(seq, key=get_a))

    assert_eq(bag.distinct(key="a"), expected)
    assert_eq(bag.distinct(key=get_a), expected)
def _paths_to_cats(paths, file_scheme): """ Extract categorical fields and labels from hive- or drill-style paths. FixMe: This has been pasted from https://github.com/dask/fastparquet/pull/471 Use fastparquet.api.paths_to_cats from fastparquet>0.3.2 instead. Parameters ---------- paths (Iterable[str]): file paths relative to root file_scheme (str): Returns ------- cats (OrderedDict[str, List[Any]]): a dict of field names and their values """ if file_scheme in ["simple", "flat", "other"]: cats = {} return cats cats = OrderedDict() raw_cats = OrderedDict() s = ex_from_sep("/") paths = toolz.unique(paths) if file_scheme == "hive": partitions = toolz.unique( (k, v) for path in paths for k, v in s.findall(path)) for key, val in partitions: cats.setdefault(key, set()).add(val_to_num(val)) raw_cats.setdefault(key, set()).add(val) else: i_val = toolz.unique((i, val) for path in paths for i, val in enumerate(path.split("/")[:-1])) for i, val in i_val: key = "dir%i" % i cats.setdefault(key, set()).add(val_to_num(val)) raw_cats.setdefault(key, set()).add(val) for key, v in cats.items(): # Check that no partition names map to the same value after transformation by val_to_num raw = raw_cats[key] if len(v) != len(raw): conflicts_by_value = OrderedDict() for raw_val in raw_cats[key]: conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val) conflicts = [ c for k in conflicts_by_value.values() if len(k) > 1 for c in k ] raise ValueError("Partition names map to the same value: %s" % conflicts) vals_by_type = groupby_types(v) # Check that all partition names map to the same type after transformation by val_to_num if len(vals_by_type) > 1: examples = [x[0] for x in vals_by_type.values()] warnings.warn( "Partition names coerce to values of different types, e.g. %s" % examples) cats = OrderedDict([(key, list(v)) for key, v in cats.items()]) return cats
def _find_unique(series):
    """Return the distinct non-null values of ``series`` as a list.

    When the series name is listed in ``self.columns_with_iterables`` (``self``
    comes from the enclosing scope), each entry is iterated and the distinct
    values are taken across the elements of all entries.
    """
    non_null = series.dropna()
    holds_iterables = series.name in self.columns_with_iterables
    candidates = tlz.concat(non_null) if holds_iterables else non_null
    return list(tlz.unique(candidates))
def plot_cache(
    results, dsk, start_time, metric_name, palette="Viridis", label_size=60, **kwargs
):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of CacheProfiler.results
    dsk : dict
        The dask graph being profiled.
    start_time : float
        Start time of the profile.
    metric_name : string
        Metric used to measure cache size
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size: int (optional)
        Maximum size of output labels in plot, defaults to 60
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        the defaults set in this function. ``plot_width`` and ``plot_height``
        are accepted as aliases of ``width`` and ``height``.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    figure_options = dict(
        title="Profile Results",
        tools="hover,save,reset,wheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    for legacy_name, current_name in (("plot_width", "width"), ("plot_height", "height")):
        if legacy_name in kwargs:
            kwargs[current_name] = kwargs.pop(legacy_name)
    figure_options.update(**kwargs)

    if not results:
        p = bp.figure(y_range=[0, 10], x_range=[0, 10], **figure_options)
    else:
        # Columns from the fourth onwards are unpacked as start and end times
        starts, ends = list(zip(*results))[3:]
        tics = sorted(unique(starts + ends))
        groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results)

        data = {}
        for label, records in groups.items():
            # Change of the metric at every tic, accumulated into a running total
            deltas = dict.fromkeys(tics, 0)
            for record in records:
                deltas[record.cache_time] += record.metric
                deltas[record.free_time] -= record.metric
            running_total = accumulate(add, pluck(1, sorted(deltas.items())))
            data[label] = [0] + list(running_total)

        tics = [0] + [tic - start_time for tic in tics]
        p = bp.figure(x_range=[0, max(tics)], **figure_options)

        colors = get_colors(palette, data.keys())
        for (label, series), color in zip(data.items(), colors):
            source = bp.ColumnDataSource(
                {"x": tics, "y": series, "label": [label] * len(series)}
            )
            p.line("x", "y", line_color=color, line_width=3, source=source)

    p.yaxis.axis_label = "Cache Size ({0})".format(metric_name)
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    # Tooltip markup is kept verbatim
    hover.tooltips = """ <div> <span style="font-size: 14px; font-weight: bold;">Task:</span> <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span> </div> """
    return p
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`

    Computes the quantile-based divisions of ``partition_col`` together with
    the per-partition minima and maxima. When ``repartition`` is set (or every
    computed division is null) the number of partitions is re-derived from the
    partition sizes and ``partition_size`` and the divisions are resampled;
    otherwise duplicate divisions are dropped.

    Returns
    -------
    divisions, mins, maxes : lists
        For a categorical ``partition_col`` the minima and maxima are returned
        as category codes.

    Raises
    ------
    NotImplementedError
        If the computation fails with a ``TypeError`` for a non-numeric
        ``partition_col``.
    TypeError
        Re-raised unchanged when ``partition_col`` is numeric.
    """
    if repartition:
        sizes = df.map_partitions(sizeof)
    else:
        sizes = []
    divisions = partition_col._repartition_quantiles(npartitions, upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of
        # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA.
        # For numeric types there shouldn't be problems with nulls, so this particular TypeError is raised as-is.
        if is_numeric_dtype(partition_col.dtype):
            raise e
        # Otherwise hint the user about eliminating nulls beforehand.
        if any(partition_col._name == df[c]._name for c in df):
            obj = "column"
            suggested_method = f"`.dropna(subset=['{partition_col.name}'])`"
        else:
            obj = "series"
            suggested_method = "`.loc[series[~series.isna()]]`"
        raise NotImplementedError(
            f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
            f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
            f"We suggest you try with {suggested_method}."
        ) from e

    divisions = methods.tolist(divisions)
    # ``sizes`` is still the plain empty list when no repartitioning was requested
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if not (repartition or empty_dataframe_detected):
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]
    else:
        total = sum(sizes)
        npartitions = min(max(math.ceil(total / partition_size), 1), df.npartitions)
        n = len(divisions)
        sample_points = np.linspace(0, n - 1, npartitions + 1)
        try:
            divisions = np.interp(
                x=sample_points,
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            # Values that cannot be interpolated are picked by position instead
            indexes = sample_points.astype(int)
            divisions = [divisions[i] for i in indexes]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes