def assert_dask_dtypes(ddf, res, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If `numeric_equal`, integer and floating dtypes compare equal. This is
    useful due to the implicit conversion of integer to floating upon
    encountering missingness, which is hard to infer statically.
    """
    eq_type_sets = [{"O", "S", "U", "a"}]  # treat object and strings alike
    if numeric_equal:
        eq_type_sets.append({"i", "f", "u"})

    def eq_dtypes(a, b):
        return any(
            a.kind in eq_types and b.kind in eq_types for eq_types in eq_type_sets
        ) or (a == b)

    if not is_dask_collection(res) and is_dataframe_like(res):
        for col, a, b in pd.concat([ddf._meta.dtypes, res.dtypes], axis=1).itertuples():
            assert eq_dtypes(a, b)
    elif not is_dask_collection(res) and (is_index_like(res) or is_series_like(res)):
        a = ddf._meta.dtype
        b = res.dtype
        assert eq_dtypes(a, b)
    else:
        if hasattr(ddf._meta, "dtype"):
            a = ddf._meta.dtype
            if not hasattr(res, "dtype"):
                assert np.isscalar(res)
                b = np.dtype(type(res))
            else:
                b = res.dtype
            assert eq_dtypes(a, b)
        else:
            assert type(ddf._meta) == type(res)
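# Illustrative sketch (not part of the snippet above): the kind-based dtype
# comparison used by assert_dask_dtypes, pulled out into a hypothetical
# stand-alone helper named eq_dtypes_demo so the rule is easy to try out.
import numpy as np

def eq_dtypes_demo(a, b, numeric_equal=True):
    eq_type_sets = [{"O", "S", "U", "a"}]  # object and string kinds compare equal
    if numeric_equal:
        eq_type_sets.append({"i", "f", "u"})  # all numeric kinds compare equal
    return any(a.kind in s and b.kind in s for s in eq_type_sets) or (a == b)

assert eq_dtypes_demo(np.dtype("int64"), np.dtype("float64"))             # numeric kinds match
assert not eq_dtypes_demo(np.dtype("int64"), np.dtype("datetime64[ns]"))  # different kinds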
def test_is_dask_collection():
    class DummyCollection(object):
        def __init__(self, dsk=None):
            self.dask = dsk

        def __dask_graph__(self):
            return self.dask

    x = delayed(1) + 2
    assert is_dask_collection(x)
    assert not is_dask_collection(2)
    assert is_dask_collection(DummyCollection({}))
    assert not is_dask_collection(DummyCollection())
def test_is_dask_collection():
    class DummyCollection(object):
        def __init__(self, dsk=None):
            self.dask = dsk

        def __dask_graph__(self):
            return self.dask

    x = delayed(1) + 2
    assert is_dask_collection(x)
    assert not is_dask_collection(2)
    assert is_dask_collection(DummyCollection({}))
    assert not is_dask_collection(DummyCollection())
    assert not is_dask_collection(DummyCollection)
def __check_dly_processing_prereq(self, inputs: dict):
    '''At least one input must be a dask DataFrame type. Output types must
    be specified as cudf.DataFrame or dask_cudf.DataFrame. (Functionality
    could also be extended to support dask dataframes of pandas, but
    currently only cudf/dask_cudf dataframes are supported.)
    '''
    # check if dask future or delayed
    ivals = inputs.values()
    if not any((is_dask_collection(iv) for iv in ivals)) and \
            not any((isinstance(iv, Future) for iv in ivals)):
        # None of the inputs are Delayed or Futures, so there is no intention
        # of using delayed processing. Return False and avoid printing the
        # non-applicable warning.
        return False

    use_delayed = False
    for ival in ivals:
        if isinstance(ival, DaskDataFrame):
            use_delayed = True
            break

    # NOTE: Currently delayed processing is only supported when one of the
    # inputs is a dask_cudf.DataFrame. In the future this might be
    # generalized to support dask processing of other delayed/future type
    # inputs.
    if not use_delayed:
        warn_msg = \
            'None of the Node "{}" inputs '\
            'is a dask_cudf.DataFrame. Ignoring '\
            '"delayed_process" setting.'.format(self.uid)
        warnings.warn(warn_msg)

    return use_delayed
def is_dask_collection(x):
    if dsk.available:
        from dask.base import is_dask_collection

        return is_dask_collection(x)
    else:
        return False
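# Minimal usage sketch of the guarded helper above, assuming dask is
# importable in the current environment (otherwise the helper returns False).
import dask.array as da

x = da.ones((4,), chunks=2)
print(is_dask_collection(x))          # True: dask arrays implement __dask_graph__
print(is_dask_collection([1, 2, 3]))  # False: plain Python objects are not collections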
def outer(self, A, B, **kwargs):
    if self.nin != 2:
        raise ValueError("outer product only supported for binary functions")
    if "out" in kwargs:
        raise ValueError("`out` kwarg not supported")

    A_is_dask = is_dask_collection(A)
    B_is_dask = is_dask_collection(B)
    if not A_is_dask and not B_is_dask:
        return self._ufunc.outer(A, B, **kwargs)
    elif (
        A_is_dask and not isinstance(A, Array)
        or B_is_dask and not isinstance(B, Array)
    ):
        raise NotImplementedError(
            "Dask objects besides `dask.array.Array` "
            "are not supported at this time."
        )

    A = asarray(A)
    B = asarray(B)
    ndim = A.ndim + B.ndim
    out_inds = tuple(range(ndim))
    A_inds = out_inds[: A.ndim]
    B_inds = out_inds[A.ndim :]

    dtype = apply_infer_dtype(
        self._ufunc.outer, [A, B], kwargs, "ufunc.outer", suggest_dtype=False
    )

    if "dtype" in kwargs:
        func = partial(self._ufunc.outer, dtype=kwargs.pop("dtype"))
    else:
        func = self._ufunc.outer

    return blockwise(
        func,
        out_inds,
        A,
        A_inds,
        B,
        B_inds,
        dtype=dtype,
        token=self.__name__ + ".outer",
        **kwargs,
    )
def __subclasshook__(cls, C):
    if cls is DaskImage:
        try:
            if (
                is_dask_collection(C)
                and any("__daskmeta__" in B.__dict__ for B in C.__mro__)
            ):
                return True
        except AttributeError:
            pass
    return NotImplemented
def finalize(collection):
    assert is_dask_collection(collection)

    name = "finalize-" + tokenize(collection)
    keys = collection.__dask_keys__()
    finalize, args = collection.__dask_postcompute__()
    layer = {name: (finalize, keys) + args}
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[collection])
    return Delayed(name, graph)
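# Rough illustration of what finalize enables: the public dask.delayed entry
# point finalizes a whole collection into a single key, so a downstream task
# receives the concrete (computed) object. This sketch uses only public dask
# APIs and is not taken from the snippet above.
import dask.array as da
from dask import delayed

x = da.arange(6, chunks=3)
total = delayed(sum)(x)   # x is finalized to one key; sum sees a concrete array
print(total.compute())    # 15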
def from_collections(cls, name, layer, dependencies=()):
    """Construct a HighLevelGraph from a new layer and a set of collections

    This constructs a HighLevelGraph in the common case where we have a single
    new layer and a set of old collections on which we want to depend.

    This pulls out the ``__dask_layers__()`` method of the collections if they
    exist, and adds them to the dependencies for this new layer. It also
    merges all of the layers from all of the dependent collections together
    into the new layers for this graph.

    Parameters
    ----------
    name : str
        The name of the new layer
    layer : Mapping
        The graph layer itself
    dependencies : List of Dask collections
        A list of other dask collections (like arrays or dataframes) that
        have graphs themselves

    Examples
    --------
    In typical usage we make a new task layer, and then pass that layer along
    with all dependent collections to this method.

    >>> def add(self, other):
    ...     name = 'add-' + tokenize(self, other)
    ...     layer = {(name, i): (add, input_key, other)
    ...              for i, input_key in enumerate(self.__dask_keys__())}
    ...     graph = HighLevelGraph.from_collections(name, layer, dependencies=[self])
    ...     return new_collection(name, graph)
    """
    if len(dependencies) == 1:
        return cls._from_collection(name, layer, dependencies[0])
    layers = {name: layer}
    deps = {name: set()}
    for collection in toolz.unique(dependencies, key=id):
        if is_dask_collection(collection):
            graph = collection.__dask_graph__()
            if isinstance(graph, HighLevelGraph):
                layers.update(graph.layers)
                deps.update(graph.dependencies)
                deps[name] |= set(collection.__dask_layers__())
            else:
                key = _get_some_layer_name(collection)
                layers[key] = graph
                deps[name].add(key)
                deps[key] = set()
        else:
            raise TypeError(type(collection))

    return cls(layers, deps)
def concrete_fill(
    self, *args: Any, weight: Any | None = None, sample=None, threads=None
) -> Histogram:
    """Fill the histogram with concrete data (not a Dask collection).

    Calls the super class fill function
    :py:func:`boost_histogram.Histogram.fill`.

    Parameters
    ----------
    *args : array_like
        Provide one value or array per dimension
    weight : array_like, optional
        Provide weights (only if the storage supports them)
    sample : array_like
        Provide samples (only if the storage supports them)
    threads : int, optional
        Fill with threads. Defaults to None, which does not activate threaded
        filling. Using 0 will automatically pick the number of available
        threads (usually two per core).

    Returns
    -------
    dask_histogram.Histogram
        Class instance now filled with concrete data.

    """
    if any(is_dask_collection(a) for a in args) or is_dask_collection(weight):
        raise TypeError(
            "concrete_fill does not support Dask collections, only materialized "
            "data; use the Histogram.fill method."
        )
    return super().fill(*args, weight=weight, sample=sample, threads=threads)
def __call__(self, a: AggHistogram, b: AggHistogram) -> AggHistogram:
    name = f"{self.__name__}-hist-{tokenize(a, b)}"
    deps = []
    # Use the collection's graph key when an operand is a dask collection,
    # otherwise pass the concrete value straight into the task.
    if is_dask_collection(a):
        deps.append(a)
        k1 = a.name
    else:
        k1 = a  # type: ignore
    if is_dask_collection(b):
        deps.append(b)
        k2 = b.name
    else:
        k2 = b  # type: ignore
    llg = {name: (self.func, k1, k2)}
    g = HighLevelGraph.from_collections(name, llg, dependencies=deps)
    try:
        ref = a.histref
    except AttributeError:
        ref = b.histref
    return AggHistogram(g, name, histref=ref)
def to_keys(dsk, *args):
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, da.Array):
            x = delayed(x)
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = "array-" + tokenize(x)
            dsk[key] = x
            yield key
def to_keys(dsk, *args):
    for x in args:
        if x is None:
            yield None
        elif isinstance(x, (da.Array, dd.DataFrame)):
            x = delayed(x)
            dsk.update(x.dask)
            yield x.key
        elif isinstance(x, Delayed):
            dsk.update(x.dask)
            yield x.key
        else:
            assert not is_dask_collection(x)
            key = type(x).__name__ + "-" + tokenize(x)
            dsk[key] = x
            yield key
def _from_collection(cls, name, layer, collection):
    """`from_collections` optimized for a single collection"""
    if not is_dask_collection(collection):
        raise TypeError(type(collection))

    graph = collection.__dask_graph__()
    if isinstance(graph, HighLevelGraph):
        layers = ensure_dict(graph.layers, copy=True)
        layers[name] = layer
        deps = ensure_dict(graph.dependencies, copy=True)
        deps[name] = set(collection.__dask_layers__())
    else:
        key = _get_some_layer_name(collection)
        layers = {name: layer, key: graph}
        deps = {name: {key}, key: set()}

    return cls(layers, deps)
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.
    """
    if kwargs.get("allow_scalars", False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, da.Array):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x)
def test_custom_collection():
    dsk = {"a": 1, "b": 2}
    dsk2 = {"c": (add, "a", "b"), "d": (add, "c", 1)}
    dsk2.update(dsk)
    dsk3 = {"e": (add, "a", 4), "f": (inc, "e")}
    dsk3.update(dsk)

    x = Tuple(dsk, ["a", "b"])
    y = Tuple(dsk2, ["c", "d"])
    z = Tuple(dsk3, ["e", "f"])

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(x)

    # tokenize
    assert tokenize(x) == tokenize(x)
    assert tokenize(x) != tokenize(y)

    # compute
    assert x.compute() == (1, 2)
    assert dask.compute(x, [y, z]) == ((1, 2), [(3, 4), (5, 6)])
    t = x + y + z
    assert t.compute() == (1, 2, 3, 4, 5, 6)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._dask == dict(zip("abcdef", range(1, 7)))
    assert t2.compute() == (1, 2, 3, 4, 5, 6)
    x2, y2, z2 = dask.persist(x, y, z)
    t3 = x2 + y2 + z2
    assert t2._dask == t3._dask
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle dask
        collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> import dask
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task
    ['a', 'b', 3]
    >>> collections
    (Delayed('a'), Delayed('b'))

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task
    (<class 'dict'>, [['a', 1], ['b', 2]])
    >>> collections
    (Delayed('a'), Delayed('b'))
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr,)

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized,)

    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        args, collections = unzip((unpack_collections(e) for e in expr), 2)
        args = list(args)
        collections = tuple(unique(concat(collections), key=id))
        # Ensure output type matches input type
        if typ is not list:
            args = (typ, args)
        return args, collections

    if typ is dict:
        args, collections = unpack_collections([[k, v] for k, v in expr.items()])
        return (dict, args), collections

    if typ is slice:
        args, collections = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(args), collections

    if is_dataclass(expr):
        args, collections = unpack_collections(
            [
                [f.name, getattr(expr, f.name)]
                for f in fields(expr)
                if hasattr(expr, f.name)  # if init=False, field might not exist
            ]
        )
        return (apply, typ, (), (dict, args)), collections

    return expr, ()
def histogramdd(
    a: DaskCollection | tuple[DaskCollection, ...],
    bins: BinArg = 10,
    range: RangeArg = None,
    normed: bool | None = None,
    weights: DaskCollection | None = None,
    density: bool = False,
    *,
    histogram: Any | None = None,
    storage: storage.Storage = storage.Double(),
    threads: int | None = None,
) -> Histogram | tuple[da.Array, ...] | tuple[da.Array, list[da.Array]]:
    """Histogram Dask data in multiple dimensions.

    Parameters
    ----------
    a : dask collection or tuple of dask collections
        Data to histogram. Acceptable input data can be of the form:

        * A dask.array.Array of shape (N, D) where each row is a sample and
          each column is a specific coordinate for the data.
        * A sequence of dask collections where each collection (e.g. array or
          series) contains all values for one coordinate of all data.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification. The possible binning configurations are:

        * A sequence of arrays describing the monotonically increasing bin
          edges along each dimension.
        * A single int describing the total number of bins that will be used
          in each dimension (this requires the `range` argument to be defined).
        * A sequence of ints describing the total number of bins to be used
          in each dimension (this requires the `range` argument to be defined).

        When bins are described by arrays, the rightmost edge is included.
        Bins described by arrays also allow for non-uniform bin widths.
    range : tuple(tuple(float, float), ...), optional
        A sequence of length D, each a (min, max) tuple giving the outer bin
        edges to be used if the edges are not given explicitly in `bins`. If
        defined, this argument is required to have an entry for each
        dimension. Unlike :func:`numpy.histogramdd`, if `bins` does not
        define bin edges, this argument is required (this function will not
        automatically use the min and max of the values in a given dimension
        because the input data may be lazy in dask).
    normed : bool, optional
        An unsupported argument that has been deprecated in the NumPy API
        (preserved to maintain calls dependent on argument order).
    weights : dask.array.Array or dask.dataframe.Series, optional
        An array of values weighing each sample in the input data. The chunks
        of the weights must be identical to the chunking along the 0th (row)
        axis of the data sample.
    density : bool
        If ``False`` (default), the returned array represents the number of
        samples in each bin. If ``True``, the returned array represents the
        probability density function at each bin.
    histogram : dask_histogram.Histogram, optional
        If `dh.Histogram`, object based output is enabled.
    storage : boost_histogram.storage.Storage
        Define the storage used by the :py:class:`Histogram` object.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram. We let
        Dask have complete control over threads.

    Returns
    -------
    tuple(dask.array.Array, tuple(dask.array.Array)) or Histogram
        The default return is the style of :func:`dask.array.histogramdd`: an
        array of bin contents and a tuple of edges arrays (one for each
        dimension). If the `histogram` argument is used then the return is a
        :obj:`dask_histogram.Histogram` object.

    See Also
    --------
    histogram
    histogram2d

    Examples
    --------
    Creating a three dimensional histogram with variable width bins in each
    dimension. First, using three 1D arrays for each coordinate:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...     [-3, -2, 0, 1, 3],
    ...     [-3, -1, 1, 2, 3],
    ...     [-3, -2, 0, 2, 3],
    ... ]
    >>> h, edges = dhb.histogramdd((x, y, z), bins=bins)
    >>> type(h)
    <class 'dask.array.core.Array'>
    >>> h.shape
    (4, 4, 4)
    >>> len(edges)
    3

    Now the same histogram, but instead of a
    :py:func:`dask.array.histogramdd` style return (which mirrors the return
    style of :py:func:`numpy.histogramdd`), we use the `histogram` argument
    to trigger the return of a :obj:`dask_histogram.Histogram` object:

    >>> h = dhb.histogramdd((x, y, z), bins=bins, histogram=dhb.Histogram)
    >>> h
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h  # doctest: +SKIP
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # Sum: 9919.0 (10000.0 with flow)

    Another 3D histogram example, but with an alternative dataset form (a
    single array with three columns), fixed bin widths, sample weights, and
    usage of the boost-histogram ``Weight()`` storage:

    >>> a = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
    >>> w = da.random.uniform(0.5, 0.7, size=(10000,), chunks=2000)
    >>> bins = (7, 5, 6)
    >>> range = ((-3, 3), (-2.9, 2.9), (-3.1, 3.1))
    >>> h = dhb.histogramdd(
    ...     a,
    ...     bins=bins,
    ...     range=range,
    ...     weights=w,
    ...     histogram=dhb.Histogram,
    ...     storage=dhb.storage.Weight()
    ... )
    >>> h
    Histogram(
      Regular(7, -3, 3),
      Regular(5, -2.9, 2.9),
      Regular(6, -3.1, 3.1),
      storage=Weight()) # Sum: WeightedSum(value=0, variance=0) (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h.staged_fills()
    False

    """
    # Check for invalid argument combinations.
    if normed is not None:
        raise KeyError(
            "normed=True is deprecated in NumPy and not supported by dask-histogram."
        )
    if density and histogram is not None:
        raise KeyError(
            "dask-histogram does not support the density keyword when returning a "
            "dask-histogram object."
        )

    # If input is a multidimensional array or dataframe, we wrap it in
    # a tuple that will be passed to fill and unrolled in the backend.
    if (is_arraylike(a) and a.ndim > 1) or is_dataframe_like(a):  # type: ignore
        ndim = a.shape[1]  # type: ignore
        a = (a,)  # type: ignore
    else:
        ndim = len(a)
        for entry in a:
            if not is_dask_collection(entry):
                raise ValueError(
                    "non-dask collection was passed; this function only supports dask "
                    "collections as input"
                )

    bins, range = normalize_bins_range(ndim, bins, range)

    # Create the axes based on the bins and range values.
    axes = []
    for _, (b, r) in enumerate(zip(bins, range)):  # type: ignore
        if r is None:
            axes.append(axis.Variable(b))  # type: ignore
        else:
            axes.append(axis.Regular(bins=b, start=r[0], stop=r[1]))  # type: ignore

    # Finally create and fill the histogram object.
    hist = Histogram(*axes, storage=storage).fill(*a, weight=weights)

    if histogram != Histogram:
        return hist.to_dask_array(flow=False, dd=True)

    return hist
def is_awkward_like(x: Any) -> bool:
    return is_dask_collection(x) and hasattr(x, "_typetracer")
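# Duck-typing sketch for is_awkward_like: any dask collection exposing a
# `_typetracer` attribute passes. FakeAwkward is a made-up stand-in for
# illustration, not a real dask-awkward type.
class FakeAwkward:
    _typetracer = object()      # attribute sniffed by is_awkward_like

    def __dask_graph__(self):   # minimal dask collection protocol
        return {}

    def __dask_keys__(self):
        return []

print(is_awkward_like(FakeAwkward()))  # True
print(is_awkward_like([1, 2, 3]))      # False: not a dask collection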
def check_meta(x, meta, funcname=None, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to
        be more helpful to users.
    numeric_equal : bool, optional
        If True, integer and floating dtypes compare equal. This is useful
        due to pandas's implicit conversion of integer to floating upon
        encountering missingness, which is hard to infer statically.
    """
    eq_types = {"i", "f", "u"} if numeric_equal else set()

    def equal_dtypes(a, b):
        if is_categorical_dtype(a) != is_categorical_dtype(b):
            return False
        if isinstance(a, str) and a == "-" or isinstance(b, str) and b == "-":
            return False
        if is_categorical_dtype(a) and is_categorical_dtype(b):
            if UNKNOWN_CATEGORIES in a.categories or UNKNOWN_CATEGORIES in b.categories:
                return True
            return a == b
        return (a.kind in eq_types and b.kind in eq_types) or is_dtype_equal(a, b)

    if not (
        is_dataframe_like(meta) or is_series_like(meta) or is_index_like(meta)
    ) or is_dask_collection(meta):
        raise TypeError(
            "Expected partition to be DataFrame, Series, or "
            "Index, got `%s`" % typename(type(meta))
        )

    # Notice, we use .__class__ as opposed to type() in order to support
    # object proxies see <https://github.com/dask/dask/pull/6981>
    if x.__class__ != meta.__class__:
        errmsg = "Expected partition of type `{}` but got `{}`".format(
            typename(type(meta)),
            typename(type(x)),
        )
    elif is_dataframe_like(meta):
        dtypes = pd.concat([x.dtypes, meta.dtypes], axis=1, sort=True)
        bad_dtypes = [
            (repr(col), a, b)
            for col, a, b in dtypes.fillna("-").itertuples()
            if not equal_dtypes(a, b)
        ]
        if bad_dtypes:
            errmsg = "Partition type: `{}`\n{}".format(
                typename(type(meta)),
                asciitable(["Column", "Found", "Expected"], bad_dtypes),
            )
        else:
            check_matching_columns(meta, x)
            return x
    else:
        if equal_dtypes(x.dtype, meta.dtype):
            return x
        errmsg = "Partition type: `{}`\n{}".format(
            typename(type(meta)),
            asciitable(["", "dtype"], [("Found", x.dtype), ("Expected", meta.dtype)]),
        )

    raise ValueError(
        "Metadata mismatch found%s.\n\n"
        "%s" % ((" in `%s`" % funcname if funcname else ""), errmsg)
    )
def fill(  # type: ignore
    self,
    *args: DaskCollection,
    weight: DaskCollection | None = None,
    sample: Any | None = None,
    threads: Any | None = None,
) -> Histogram:
    """Stage a fill call using a Dask collection as input.

    If materialized NumPy arrays are passed to this function, all arguments
    are forwarded to :func:`concrete_fill`.

    Parameters
    ----------
    *args : one or more Dask collections
        Provide one dask collection per dimension, or a single columnar Dask
        collection (DataFrame or 2D Array) where the total number of columns
        equals the total number of histogram dimensions.

        * A single one dimensional collection (:obj:`dask.array.Array` or
          :obj:`dask.dataframe.Series`)
        * Multiple one dimensional collections, each representing an array of
          one coordinate of the dataset to be histogrammed.
        * A single two dimensional collection (:obj:`dask.array.Array` or
          :obj:`dask.dataframe.DataFrame`), each column representing one
          coordinate of the dataset to be histogrammed.

        If multiple one dimensional arguments are passed (i.e. an `x` array
        and a `y` array for a two dimensional histogram), the collections
        must have equal chunking/partitioning.

        If a single two dimensional array is passed (i.e. an array of shape
        ``(2000, 3)`` for a three dimensional histogram), chunking can only
        exist along the 0th (row) axis (coordinates cannot be separated by a
        chunk boundary; only whole individual samples can be separated).
    weight : dask.array.Array or dask.dataframe.Series, optional
        Weights associated with each sample. The weights must be
        chunked/partitioned in a way compatible with the dataset.
    sample : dask.array.Array or dask.dataframe.Series, optional
        Provide samples if the histogram storage allows it. The
        partitioning/chunking of the samples must be compatible with the
        input data.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram. We let
        Dask have complete control over threads.

    Returns
    -------
    dask_histogram.Histogram
        Class instance with a staged (delayed) fill added.

    """
    # Pass to concrete fill if non-dask-collection
    if all(not is_dask_collection(a) for a in args):
        return self.concrete_fill(
            *args,
            weight=weight,
            sample=sample,
            threads=None,
        )

    if len(args) == 1 and args[0].ndim == 1:
        pass
    elif len(args) == 1 and args[0].ndim == 2:
        pass
    elif len(args) > 1:
        pass
    else:
        raise ValueError(f"Cannot interpret input data: {args}")

    new_fill = factory(*args, histref=self, weights=weight, sample=sample)
    if self._staged is not None:
        self._staged += new_fill
    else:
        self._staged = new_fill  # type: ignore
    return self
def is_duck_dask_array(x):
    return is_duck_array(x) and is_dask_collection(x)
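# Usage sketch, assuming is_duck_array checks for a NumPy-like interface
# (shape/dtype/ndim plus the array protocols) as in xarray's utilities.
import numpy as np
import dask.array as da

print(is_duck_dask_array(da.zeros((3,), chunks=3)))  # True: numpy-like and a dask collection
print(is_duck_dask_array(np.zeros(3)))               # False: not a dask collection
print(is_duck_dask_array([0.0, 0.0, 0.0]))           # False: a list is not a duck array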
def delayed(obj, name=None, pure=None, nout=None, traverse=True):
    """Wraps a function or object to produce a ``Delayed``.

    ``Delayed`` objects act as proxies for the object they wrap, but all
    operations on them are done lazily by building up a dask graph internally.

    Parameters
    ----------
    obj : object
        The function or object to wrap
    name : string or hashable, optional
        The key to use in the underlying graph for the wrapped object.
        Defaults to hashing content. Note that this only affects the name of
        the object wrapped by this call to delayed, and *not* the output of
        delayed function calls - for that use ``dask_key_name=`` as described
        below.

        .. note::

           Because this ``name`` is used as the key in task graphs, you
           should ensure that it uniquely identifies ``obj``. If you'd like
           to provide a descriptive name that is still unique, combine the
           descriptive name with :func:`dask.base.tokenize` of the
           ``array_like``. See :ref:`graphs` for more.

    pure : bool, optional
        Indicates whether calling the resulting ``Delayed`` object is a pure
        operation. If True, arguments to the call are hashed to produce
        deterministic keys. If not provided, the default is to check the
        global ``delayed_pure`` setting, and fallback to ``False`` if unset.
    nout : int, optional
        The number of outputs returned from calling the resulting ``Delayed``
        object. If provided, the ``Delayed`` output of the call can be
        iterated into ``nout`` objects, allowing for unpacking of results. By
        default iteration over ``Delayed`` objects will error. Note, that
        ``nout=1`` expects ``obj`` to return a tuple of length 1, and
        consequently for ``nout=0``, ``obj`` should return an empty tuple.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``delayed``. For large collections this can be
        expensive. If ``obj`` doesn't contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.

    Examples
    --------
    Apply to functions to delay execution:

    >>> from dask import delayed
    >>> def inc(x):
    ...     return x + 1

    >>> inc(10)
    11

    >>> x = delayed(inc, pure=True)(10)
    >>> type(x) == Delayed
    True
    >>> x.compute()
    11

    Can be used as a decorator:

    >>> @delayed(pure=True)
    ... def add(a, b):
    ...     return a + b
    >>> add(1, 2).compute()
    3

    ``delayed`` also accepts an optional keyword ``pure``. If False, then
    subsequent calls will always produce a different ``Delayed``. This is
    useful for non-pure functions (such as ``time`` or ``random``).

    >>> from random import random
    >>> out1 = delayed(random, pure=False)()
    >>> out2 = delayed(random, pure=False)()
    >>> out1.key == out2.key
    False

    If you know a function is pure (output only depends on the input, with no
    global state), then you can set ``pure=True``. This will attempt to apply
    a consistent name to the output, but will fallback on the same behavior
    of ``pure=False`` if this fails.

    >>> @delayed(pure=True)
    ... def add(a, b):
    ...     return a + b
    >>> out1 = add(1, 2)
    >>> out2 = add(1, 2)
    >>> out1.key == out2.key
    True

    Instead of setting ``pure`` as a property of the callable, you can also
    set it contextually using the ``delayed_pure`` setting. Note that this
    influences the *call* and not the *creation* of the callable:

    >>> @delayed
    ... def mul(a, b):
    ...     return a * b
    >>> import dask
    >>> with dask.config.set(delayed_pure=True):
    ...     print(mul(1, 2).key == mul(1, 2).key)
    True
    >>> with dask.config.set(delayed_pure=False):
    ...     print(mul(1, 2).key == mul(1, 2).key)
    False

    The key name of the result of calling a delayed object is determined by
    hashing the arguments by default. To explicitly set the name, you can use
    the ``dask_key_name`` keyword when calling the function:

    >>> add(1, 2)   # doctest: +SKIP
    Delayed('add-3dce7c56edd1ac2614add714086e950f')
    >>> add(1, 2, dask_key_name='three')
    Delayed('three')

    Note that objects with the same key name are assumed to have the same
    result. If you set the names explicitly you should make sure your key
    names are different for different results.

    >>> add(1, 2, dask_key_name='three')
    Delayed('three')
    >>> add(2, 1, dask_key_name='three')
    Delayed('three')
    >>> add(2, 2, dask_key_name='four')
    Delayed('four')

    ``delayed`` can also be applied to objects to make operations on them lazy:

    >>> a = delayed([1, 2, 3])
    >>> isinstance(a, Delayed)
    True
    >>> a.compute()
    [1, 2, 3]

    The key name of a delayed object is hashed by default if ``pure=True`` or
    is generated randomly if ``pure=False`` (default). To explicitly set the
    name, you can use the ``name`` keyword. To ensure that the key is unique
    you should include the tokenized value as well, or otherwise ensure that
    it's unique:

    >>> from dask.base import tokenize
    >>> data = [1, 2, 3]
    >>> a = delayed(data, name='mylist-' + tokenize(data))
    >>> a  # doctest: +SKIP
    Delayed('mylist-55af65871cb378a4fa6de1660c3e8fb7')

    Delayed results act as a proxy to the underlying object. Many operators
    are supported:

    >>> (a + [1, 2]).compute()
    [1, 2, 3, 1, 2]
    >>> a[1].compute()
    2

    Method and attribute access also works:

    >>> a.count(2).compute()
    1

    Note that if a method doesn't exist, no error will be thrown until runtime:

    >>> res = a.not_a_real_method()  # doctest: +SKIP
    >>> res.compute()  # doctest: +SKIP
    AttributeError("'list' object has no attribute 'not_a_real_method'")

    "Magic" methods (e.g. operators and attribute access) are assumed to be
    pure, meaning that subsequent calls must return the same results. This
    behavior is not overrideable through the ``delayed`` call, but can be
    modified using other ways as described below.

    To invoke an impure attribute or operator, you'd need to use it in a
    delayed function with ``pure=False``:

    >>> class Incrementer:
    ...     def __init__(self):
    ...         self._n = 0
    ...     @property
    ...     def n(self):
    ...         self._n += 1
    ...         return self._n
    ...
    >>> x = delayed(Incrementer())
    >>> x.n.key == x.n.key
    True
    >>> get_n = delayed(lambda x: x.n, pure=False)
    >>> get_n(x).key == get_n(x).key
    False

    In contrast, methods are assumed to be impure by default, meaning that
    subsequent calls may return different results. To assume purity, set
    ``pure=True``. This allows sharing of any intermediate values.

    >>> a.count(2, pure=True).key == a.count(2, pure=True).key
    True

    As with function calls, method calls also respect the global
    ``delayed_pure`` setting and support the ``dask_key_name`` keyword:

    >>> a.count(2, dask_key_name="count_2")
    Delayed('count_2')
    >>> import dask
    >>> with dask.config.set(delayed_pure=True):
    ...     print(a.count(2).key == a.count(2).key)
    True
    """
    if isinstance(obj, Delayed):
        return obj

    if is_dask_collection(obj) or traverse:
        task, collections = unpack_collections(obj)
    else:
        task = quote(obj)
        collections = set()

    if not (nout is None or (type(nout) is int and nout >= 0)):
        raise ValueError("nout must be None or a non-negative integer, got %s" % nout)
    if task is obj:
        if not name:
            try:
                prefix = obj.__name__
            except AttributeError:
                prefix = type(obj).__name__
            token = tokenize(obj, nout, pure=pure)
            name = f"{prefix}-{token}"
        return DelayedLeaf(obj, name, pure=pure, nout=nout)
    else:
        if not name:
            name = f"{type(obj).__name__}-{tokenize(task, pure=pure)}"
        layer = {name: task}
        graph = HighLevelGraph.from_collections(name, layer, dependencies=collections)
        return Delayed(name, graph, nout)
def test_custom_collection():
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    # Collections with 2+ keys must have all keys in the format of tuples where the
    # first element is the same string, referred to as collection name, and the rest
    # are arbitrary hashables
    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {("y", h1): (add, ("x", h1), ("x", h2)), ("y", h2): (add, ("y", h1), 1)}
    dsk2.update(dsk)
    # If and only if there is only one top-level key, it can be just a string
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}
    dsk3.update(dsk2)

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert len({tokenize(coll) for coll in (w, x, y, z)}) == 4

    # get_collection_name
    with pytest.raises(KeyError):
        get_collection_name(w)
    assert get_collection_name(x) == "x"
    assert get_collection_name(y) == "y"
    assert get_collection_name(z) == "z"

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7,)
    assert dask.compute(w, [{"x": x}, y, z]) == ((), [{"x": (1, 2)}, (3, 4), (7,)])
    t = w + x + y + z
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)
    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, name="w3")
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, name="x3")
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, name="z3")
    assert z3.compute() == (70,)
def shuffle(
    df,
    index,
    shuffle=None,
    npartitions=None,
    max_branch=32,
    ignore_index=False,
    compute=None,
):
    """Group DataFrame by index

    Hash grouping of elements. After this operation all elements that have
    the same index will be in the same partition. Note that this requires
    full dataset read, serialization and shuffle. This is expensive. If
    possible you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is
    not deterministic if done in parallel.

    See Also
    --------
    set_index
    set_partition
    shuffle_disk
    """
    list_like = pd.api.types.is_list_like(index) and not is_dask_collection(index)
    if shuffle == "tasks" and (isinstance(index, str) or list_like):
        # Avoid creating the "_partitions" column if possible.
        # We currently do this if the user is passing in
        # specific column names (and shuffle == "tasks").
        if isinstance(index, str):
            index = [index]
        else:
            index = list(index)
        nset = set(index)
        if nset & set(df.columns) == nset:
            return rearrange_by_column(
                df,
                index,
                npartitions=npartitions,
                max_branch=max_branch,
                shuffle=shuffle,
                ignore_index=ignore_index,
                compute=compute,
            )

    if not isinstance(index, _Frame):
        if list_like:
            # Make sure we don't try to select with pd.Series/pd.Index
            index = list(index)
        index = df._select_columns_or_index(index)
    elif hasattr(index, "to_frame"):
        # If this is an index, we should still convert to a
        # DataFrame. Otherwise, the hashed values of a column
        # selection will not match (important when merging).
        index = index.to_frame()

    partitions = index.map_partitions(
        partitioning_index,
        npartitions=npartitions or df.npartitions,
        meta=df._meta._constructor_sliced([0]),
        transform_divisions=False,
    )
    df2 = df.assign(_partitions=partitions)
    df2._meta.index.name = df._meta.index.name
    df3 = rearrange_by_column(
        df2,
        "_partitions",
        npartitions=npartitions,
        max_branch=max_branch,
        shuffle=shuffle,
        compute=compute,
        ignore_index=ignore_index,
    )
    del df3["_partitions"]
    return df3
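# Orientation sketch for the public entry point that eventually reaches a
# helper like the shuffle above; assumes a dask.dataframe version where
# DataFrame.shuffle is available. The point is the behavior (co-locating
# equal keys), not the exact internals.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"k": [1, 2, 1, 2, 3], "v": range(5)})
ddf = dd.from_pandas(pdf, npartitions=2)

# After shuffling on "k", all rows sharing a key live in the same partition.
shuffled = ddf.shuffle("k")
print(shuffled.compute())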
def test_custom_collection():
    # Arbitrary hashables
    h1 = object()
    h2 = object()

    dsk = {("x", h1): 1, ("x", h2): 2}
    dsk2 = {
        ("y", h1): (add, ("x", h1), ("x", h2)),
        ("y", h2): (add, ("y", h1), 1),
    }
    dsk2.update(dsk)
    dsk3 = {"z": (add, ("y", h1), ("y", h2))}
    dsk3.update(dsk2)

    w = Tuple({}, [])  # A collection can have no keys at all
    x = Tuple(dsk, [("x", h1), ("x", h2)])
    y = Tuple(dsk2, [("y", h1), ("y", h2)])
    z = Tuple(dsk3, ["z"])
    # Collection with multiple names
    t = w + x + y + z

    # __slots__ defined on base mixin class propagates
    with pytest.raises(AttributeError):
        x.foo = 1

    # is_dask_collection
    assert is_dask_collection(w)
    assert is_dask_collection(x)
    assert is_dask_collection(y)
    assert is_dask_collection(z)
    assert is_dask_collection(t)

    # tokenize
    assert tokenize(w) == tokenize(w)
    assert tokenize(x) == tokenize(x)
    assert tokenize(y) == tokenize(y)
    assert tokenize(z) == tokenize(z)
    assert tokenize(t) == tokenize(t)
    # All tokens are unique
    assert len({tokenize(coll) for coll in (w, x, y, z, t)}) == 5

    # get_collection_names
    assert get_collection_names(w) == set()
    assert get_collection_names(x) == {"x"}
    assert get_collection_names(y) == {"y"}
    assert get_collection_names(z) == {"z"}
    assert get_collection_names(t) == {"x", "y", "z"}

    # compute
    assert w.compute() == ()
    assert x.compute() == (1, 2)
    assert y.compute() == (3, 4)
    assert z.compute() == (7,)
    assert dask.compute(w, [{"x": x}, y, z]) == ((), [{"x": (1, 2)}, (3, 4), (7,)])
    assert t.compute() == (1, 2, 3, 4, 7)

    # persist
    t2 = t.persist()
    assert isinstance(t2, Tuple)
    assert t2._keys == t._keys
    assert sorted(t2._dask.values()) == [1, 2, 3, 4, 7]
    assert t2.compute() == (1, 2, 3, 4, 7)

    w2, x2, y2, z2 = dask.persist(w, x, y, z)
    assert y2._keys == y._keys
    assert y2._dask == {("y", h1): 3, ("y", h2): 4}
    assert y2.compute() == (3, 4)
    t3 = x2 + y2 + z2
    assert t3.compute() == (1, 2, 3, 4, 7)

    # __dask_postpersist__ with name change
    rebuild, args = w.__dask_postpersist__()
    w3 = rebuild({}, *args, rename={"w": "w3"})
    assert w3.compute() == ()

    rebuild, args = x.__dask_postpersist__()
    x3 = rebuild({("x3", h1): 10, ("x3", h2): 20}, *args, rename={"x": "x3"})
    assert x3.compute() == (10, 20)

    rebuild, args = z.__dask_postpersist__()
    z3 = rebuild({"z3": 70}, *args, rename={"z": "z3"})
    assert z3.compute() == (70,)
def to_task_dask(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        ``Delayed``s, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    dask : a merged dask graph that forms the dag for this task

    Examples
    --------
    >>> import dask
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, dask = to_task_dask([a, b, 3])  # doctest: +SKIP
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> dict(dask)  # doctest: +SKIP
    {'a': 1, 'b': 2}

    >>> task, dasks = to_task_dask({a: 1, b: 2})  # doctest: +SKIP
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> dict(dask)  # doctest: +SKIP
    {'a': 1, 'b': 2}
    """
    warnings.warn(
        "The dask.delayed.to_task_dask function has been "
        "deprecated in favor of unpack_collections",
        stacklevel=2,
    )

    if isinstance(expr, Delayed):
        return expr.key, expr.dask

    if is_dask_collection(expr):
        name = "finalize-" + tokenize(expr, pure=True)
        keys = expr.__dask_keys__()
        opt = getattr(expr, "__dask_optimize__", dont_optimize)
        finalize, args = expr.__dask_postcompute__()
        dsk = {name: (finalize, keys) + args}
        dsk.update(opt(expr.__dask_graph__(), keys))
        return name, dsk

    if isinstance(expr, Iterator):
        expr = list(expr)
    typ = type(expr)

    if typ in (list, tuple, set):
        args, dasks = unzip((to_task_dask(e) for e in expr), 2)
        args = list(args)
        dsk = merge(dasks)
        # Ensure output type matches input type
        return (args, dsk) if typ is list else ((typ, args), dsk)

    if typ is dict:
        args, dsk = to_task_dask([[k, v] for k, v in expr.items()])
        return (dict, args), dsk

    if is_dataclass(expr):
        args, dsk = to_task_dask(
            [
                [f.name, getattr(expr, f.name)]
                for f in fields(expr)
                if hasattr(expr, f.name)  # if init=False, field might not exist
            ]
        )
        return (apply, typ, (), (dict, args)), dsk

    if typ is slice:
        args, dsk = to_task_dask([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(args), dsk

    return expr, {}