def concat(objs: List):
    """Concatenate the results of partitioned dask task executions.

    This function guesses the type of the resulting list, then calls the
    corresponding native dask concat function.

    Parameters
    ----------
    objs: List
        List of the partitioned dask task execution results, which will be
        concatenated.

    Returns
    -------
    obj: The concatenated result.
    """
    if is_arraylike(objs[0]):
        res = array_concat(objs, axes=[0])  # TODO: Add concat with args support
    elif any(
        (is_dataframe_like(objs[0]), is_series_like(objs[0]), is_index_like(objs[0]))
    ):
        res = df_concat(objs)
    else:
        res = objs
    return res.compute() if is_dask_collection(res) else res
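# A minimal usage sketch for ``concat`` above (illustrative only): two dask
# array partitions take the array path and are computed eagerly. Assumes
# dask is installed and that ``array_concat`` joins along the leading axis,
# as the ``axes=[0]`` hint suggests; the helper name below is hypothetical.
def _example_concat():  # pragma: no cover
    import dask.array as da

    # Two lazy chunks standing in for partitioned task results.
    parts = [da.ones((2, 3)), da.zeros((2, 3))]
    res = concat(parts)  # array-like first element -> array concat path
    print(res.shape)  # expected (4, 3): joined along the 0th axis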
def assert_dask_dtypes(ddf, res, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If `numeric_equal`, integer and floating dtypes compare equal. This is
    useful due to the implicit conversion of integer to floating upon
    encountering missingness, which is hard to infer statically.
    """
    eq_type_sets = [{"O", "S", "U", "a"}]  # treat object and strings alike
    if numeric_equal:
        eq_type_sets.append({"i", "f", "u"})

    def eq_dtypes(a, b):
        return any(
            a.kind in eq_types and b.kind in eq_types for eq_types in eq_type_sets
        ) or (a == b)

    if not is_dask_collection(res) and is_dataframe_like(res):
        for col, a, b in pd.concat([ddf._meta.dtypes, res.dtypes], axis=1).itertuples():
            assert eq_dtypes(a, b)
    elif not is_dask_collection(res) and (is_index_like(res) or is_series_like(res)):
        a = ddf._meta.dtype
        b = res.dtype
        assert eq_dtypes(a, b)
    else:
        if hasattr(ddf._meta, "dtype"):
            a = ddf._meta.dtype
            if not hasattr(res, "dtype"):
                assert np.isscalar(res)
                b = np.dtype(type(res))
            else:
                b = res.dtype
            assert eq_dtypes(a, b)
        else:
            assert type(ddf._meta) == type(res)
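# A hedged sketch of ``assert_dask_dtypes`` (names below are illustrative):
# with numeric_equal=True an int64 meta compares equal to a float64 result,
# the exact situation caused by missingness promoting integers to floats.
def _example_assert_dask_dtypes():  # pragma: no cover
    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({"x": [1, 2, 3]})  # int64 column
    ddf = dd.from_pandas(pdf, npartitions=2)
    res = pdf.astype({"x": "float64"})  # simulate int -> float promotion
    assert_dask_dtypes(ddf, res, numeric_equal=True)  # passes: 'i' ~ 'f'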
def _maybe_sort(a, check_index: bool):
    # sort by value, then index
    try:
        if is_dataframe_like(a):
            if set(a.index.names) & set(a.columns):
                a.index.names = [
                    "-overlapped-index-name-%d" % i for i in range(len(a.index.names))
                ]
            a = a.sort_values(by=methods.tolist(a.columns))
        else:
            a = a.sort_values()
    except (TypeError, IndexError, ValueError):
        pass
    return a.sort_index() if check_index else a
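# An illustrative sketch of ``_maybe_sort`` on a frame whose index name
# collides with a column name; the index is renamed before the value sort
# so ``sort_values`` does not see an ambiguous key. Assumes pandas is
# importable; the helper name below is hypothetical.
def _example_maybe_sort():  # pragma: no cover
    import pandas as pd

    df = pd.DataFrame({"a": [3, 1, 2]}, index=pd.Index([9, 8, 7], name="a"))
    out = _maybe_sort(df, check_index=True)
    print(out.index.names)  # ['-overlapped-index-name-0'] after the rename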
def _input_to_dask_cupy_array(self, X):
    if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):
        if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
            raise TypeError(
                "Please convert your Dask DataFrame"
                " to a Dask-cuDF DataFrame using dask_cudf."
            )
        X = X.values
        X._meta = cp.asarray(X._meta)
    elif is_arraylike(X) and hasattr(X, "dask"):
        if not isinstance(X._meta, cp.ndarray):
            raise TypeError(
                "Please convert your CPU Dask Array"
                " to a GPU Dask Array using"
                " arr.map_blocks(cp.asarray)."
            )
    else:
        raise TypeError(
            "Please pass a GPU backed Dask DataFrame or Dask Array."
        )

    X.compute_chunk_sizes()
    return X
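# A hedged sketch of the happy path through ``_input_to_dask_cupy_array``:
# a GPU-backed dask array passes through unchanged apart from chunk-size
# materialization. Assumes cupy and dask are installed on a GPU machine;
# ``estimator`` stands in for whatever object owns this method.
def _example_input_to_dask_cupy_array(estimator):  # pragma: no cover
    import cupy as cp
    import dask.array as da

    X = da.ones((100, 4), chunks=(50, 4)).map_blocks(cp.asarray)
    return estimator._input_to_dask_cupy_array(X)  # meta is cp.ndarray -> ok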
def histogramdd(
    a: DaskCollection | tuple[DaskCollection, ...],
    bins: BinArg = 10,
    range: RangeArg = None,
    normed: bool | None = None,
    weights: DaskCollection | None = None,
    density: bool = False,
    *,
    histogram: Any | None = None,
    storage: storage.Storage = storage.Double(),
    threads: int | None = None,
) -> Histogram | tuple[da.Array, ...] | tuple[da.Array, list[da.Array]]:
    """Histogram Dask data in multiple dimensions.

    Parameters
    ----------
    a : dask collection or tuple of dask collections
        Data to histogram. Acceptable input data can be of the form:

        * A dask.array.Array of shape (N, D) where each row is a sample and
          each column is a specific coordinate for the data.
        * A sequence of dask collections where each collection (e.g. array
          or series) contains all values for one coordinate of all data.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification. The possible binning configurations are:

        * A sequence of arrays describing the monotonically increasing bin
          edges along each dimension.
        * A single int describing the total number of bins that will be
          used in each dimension (this requires the `range` argument to be
          defined).
        * A sequence of ints describing the total number of bins to be used
          in each dimension (this requires the `range` argument to be
          defined).

        When bins are described by arrays, the rightmost edge is included.
        Bins described by arrays also allow for non-uniform bin widths.
    range : tuple(tuple(float, float), ...), optional
        A sequence of length D, each a (min, max) tuple giving the outer
        bin edges to be used if the edges are not given explicitly in
        `bins`. If defined, this argument is required to have an entry for
        each dimension. Unlike :func:`numpy.histogramdd`, if `bins` does
        not define bin edges, this argument is required (this function will
        not automatically use the min and max of the values in a given
        dimension because the input data may be lazy in dask).
    normed : bool, optional
        An unsupported argument that has been deprecated in the NumPy API
        (preserved to maintain calls dependent on argument order).
    weights : dask.array.Array or dask.dataframe.Series, optional
        An array of values weighting each sample in the input data. The
        chunks of the weights must be identical to the chunking along the
        0th (row) axis of the data sample.
    density : bool
        If ``False`` (default), the returned array represents the number of
        samples in each bin. If ``True``, the returned array represents the
        probability density function at each bin.
    histogram : dask_histogram.Histogram, optional
        If `dh.Histogram`, object based output is enabled.
    storage : boost_histogram.storage.Storage
        Define the storage used by the :py:class:`Histogram` object.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram. We
        let Dask have complete control over threads.

    Returns
    -------
    tuple(dask.array.Array, tuple(dask.array.Array)) or Histogram
        The default return is the style of :func:`dask.array.histogramdd`:
        an array of bin contents and a tuple of edges arrays (one for each
        dimension). If the `histogram` argument is used then the return is
        a :obj:`dask_histogram.Histogram` object.

    See Also
    --------
    histogram
    histogram2d

    Examples
    --------
    Creating a three dimensional histogram with variable width bins in
    each dimension.

    First, using three 1D arrays for each coordinate:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...     [-3, -2, 0, 1, 3],
    ...     [-3, -1, 1, 2, 3],
    ...     [-3, -2, 0, 2, 3],
    ... ]
    >>> h, edges = dhb.histogramdd((x, y, z), bins=bins)
    >>> type(h)
    <class 'dask.array.core.Array'>
    >>> h.shape
    (4, 4, 4)
    >>> len(edges)
    3

    Now the same histogram, but instead of a
    :py:func:`dask.array.histogramdd` style return (which mirrors the
    return style of :py:func:`numpy.histogramdd`), we use the `histogram`
    argument to trigger the return of a :obj:`dask_histogram.Histogram`
    object:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...     [-3, -2, 0, 1, 3],
    ...     [-3, -1, 1, 2, 3],
    ...     [-3, -2, 0, 2, 3],
    ... ]
    >>> h = dhb.histogramdd((x, y, z), bins=bins, histogram=dhb.Histogram)
    >>> h
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h # doctest: +SKIP
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # Sum: 9919.0 (10000.0 with flow)

    Another 3D histogram example, but with an alternative dataset form (a
    single array with three columns), fixed bin widths, sample weights,
    and usage of the boost-histogram ``Weight()`` storage:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> a = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
    >>> w = da.random.uniform(0.5, 0.7, size=(10000,), chunks=2000)
    >>> bins = (7, 5, 6)
    >>> range = ((-3, 3), (-2.9, 2.9), (-3.1, 3.1))
    >>> h = dhb.histogramdd(
    ...     a,
    ...     bins=bins,
    ...     range=range,
    ...     weights=w,
    ...     histogram=dhb.Histogram,
    ...     storage=dhb.storage.Weight()
    ... )
    >>> h
    Histogram(
      Regular(7, -3, 3),
      Regular(5, -2.9, 2.9),
      Regular(6, -3.1, 3.1),
      storage=Weight()) # Sum: WeightedSum(value=0, variance=0) (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h.staged_fills()
    False

    """
    # Check for invalid argument combinations.
    if normed is not None:
        raise KeyError(
            "normed=True is deprecated in NumPy and not supported by dask-histogram."
        )
    if density and histogram is not None:
        raise KeyError(
            "dask-histogram does not support the density keyword when returning a "
            "dask-histogram object."
        )

    # If the input is a multidimensional array or a dataframe, wrap it in a
    # tuple that will be passed to fill and unrolled in the backend.
    if (is_arraylike(a) and a.ndim > 1) or is_dataframe_like(a):  # type: ignore
        ndim = a.shape[1]  # type: ignore
        a = (a,)  # type: ignore
    else:
        ndim = len(a)
        for entry in a:
            if not is_dask_collection(entry):
                raise ValueError(
                    "non-dask collection was passed; this function only supports "
                    "dask collections as input"
                )

    bins, range = normalize_bins_range(ndim, bins, range)

    # Create the axes based on the bins and range values.
    axes = []
    for b, r in zip(bins, range):  # type: ignore
        if r is None:
            axes.append(axis.Variable(b))  # type: ignore
        else:
            axes.append(axis.Regular(bins=b, start=r[0], stop=r[1]))  # type: ignore

    # Finally create and fill the histogram object.
    hist = Histogram(*axes, storage=storage).fill(*a, weight=weights)
    if histogram != Histogram:
        return hist.to_dask_array(flow=False, dd=True)
    return hist
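# A brief sketch of the integer-``bins`` path above, where ``range`` must be
# given explicitly because dask will not scan lazy data for min/max. Follows
# the docstring examples; the helper name below is hypothetical.
def _example_histogramdd_int_bins():  # pragma: no cover
    import dask.array as da

    a = da.random.standard_normal(size=(1000, 2), chunks=(250, 2))
    h, edges = histogramdd(a, bins=(4, 5), range=((-3, 3), (-3, 3)))
    print(h.shape, len(edges))  # (4, 5) counts, one edges array per dim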
def _partitioned_histogram(
    *data: DaskCollection,
    histref: bh.Histogram,
    weights: DaskCollection | None = None,
    sample: DaskCollection | None = None,
    split_every: int | None = None,
) -> PartitionedHistogram:
    name = f"hist-on-block-{tokenize(data, histref, weights, sample, split_every)}"
    data_is_df = is_dataframe_like(data[0])
    data_is_dak = is_awkward_like(data[0])
    _weight_sample_check(*data, weights=weights)

    # Single awkward array object.
    if len(data) == 1 and data_is_dak:
        from dask_awkward.core import partitionwise_layer as dak_pwl

        x = data[0]
        if weights is not None and sample is not None:
            raise NotImplementedError()
        elif weights is not None and sample is None:
            raise NotImplementedError()
        elif weights is None and sample is not None:
            raise NotImplementedError()
        else:
            g = dak_pwl(_blocked_dak, name, x, histref=histref)

    # Single object, not a dataframe.
    elif len(data) == 1 and not data_is_df:
        x = data[0]
        if weights is not None and sample is not None:
            g = partitionwise(_blocked_sa_w_s, name, x, weights, sample, histref=histref)
        elif weights is not None and sample is None:
            g = partitionwise(_blocked_sa_w, name, x, weights, histref=histref)
        elif weights is None and sample is not None:
            g = partitionwise(_blocked_sa_s, name, x, sample, histref=histref)
        else:
            g = partitionwise(_blocked_sa, name, x, histref=histref)

    # Single object, is a dataframe.
    elif len(data) == 1 and data_is_df:
        x = data[0]
        if weights is not None and sample is not None:
            g = partitionwise(_blocked_df_w_s, name, x, weights, sample, histref=histref)
        elif weights is not None and sample is None:
            g = partitionwise(_blocked_df_w, name, x, weights, histref=histref)
        elif weights is None and sample is not None:
            g = partitionwise(_blocked_df_s, name, x, sample, histref=histref)
        else:
            g = partitionwise(_blocked_df, name, x, histref=histref)

    # Multiple objects.
    else:
        # Awkward array collection detected as first argument.
        if data_is_dak:
            from dask_awkward.core import partitionwise_layer as dak_pwl

            if weights is None and sample is None:
                g = dak_pwl(_blocked_dak_ma, name, *data, histref=histref)
            else:
                raise NotImplementedError()
        # Not an awkward array collection.
        elif weights is not None and sample is not None:
            g = partitionwise(_blocked_ma_w_s, name, *data, weights, sample, histref=histref)
        elif weights is not None and sample is None:
            g = partitionwise(_blocked_ma_w, name, *data, weights, histref=histref)
        elif weights is None and sample is not None:
            g = partitionwise(_blocked_ma_s, name, *data, sample, histref=histref)
        else:
            g = partitionwise(_blocked_ma, name, *data, histref=histref)

    dependencies = _dependencies(*data, weights=weights, sample=sample)
    hlg = HighLevelGraph.from_collections(name, g, dependencies=dependencies)
    return PartitionedHistogram(hlg, name, data[0].npartitions, histref=histref)
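# A hedged sketch of the dispatch above (illustrative names): one dask array
# plus weights selects the ``_blocked_sa_w`` branch, staging one fill per
# partition. Assumes boost_histogram and dask are installed.
def _example_partitioned_histogram():  # pragma: no cover
    import boost_histogram as bh
    import dask.array as da

    x = da.random.uniform(size=(1000,), chunks=(250,))
    w = da.random.uniform(size=(1000,), chunks=(250,))
    ref = bh.Histogram(bh.axis.Regular(10, 0, 1))
    parts = _partitioned_histogram(x, histref=ref, weights=w)
    print(parts.npartitions)  # 4: one staged fill per input partition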
def check_meta(x, meta, funcname=None, numeric_equal=True):
    """Check that the dask metadata matches the result.

    If metadata matches, ``x`` is passed through unchanged. A nice error is
    raised if metadata doesn't match.

    Parameters
    ----------
    x : DataFrame, Series, or Index
    meta : DataFrame, Series, or Index
        The expected metadata that ``x`` should match
    funcname : str, optional
        The name of the function in which the metadata was specified. If
        provided, the function name will be included in the error message to
        be more helpful to users.
    numeric_equal : bool, optional
        If True, integer and floating dtypes compare equal. This is useful
        due to pandas' implicit conversion of integer to floating upon
        encountering missingness, which is hard to infer statically.
    """
    eq_types = {"i", "f", "u"} if numeric_equal else set()

    def equal_dtypes(a, b):
        if is_categorical_dtype(a) != is_categorical_dtype(b):
            return False
        if isinstance(a, str) and a == "-" or isinstance(b, str) and b == "-":
            return False
        if is_categorical_dtype(a) and is_categorical_dtype(b):
            if UNKNOWN_CATEGORIES in a.categories or UNKNOWN_CATEGORIES in b.categories:
                return True
            return a == b
        return (a.kind in eq_types and b.kind in eq_types) or is_dtype_equal(a, b)

    if not (
        is_dataframe_like(meta) or is_series_like(meta) or is_index_like(meta)
    ) or is_dask_collection(meta):
        raise TypeError(
            "Expected partition to be DataFrame, Series, or "
            "Index, got `%s`" % typename(type(meta))
        )

    # Notice, we use .__class__ as opposed to type() in order to support
    # object proxies see <https://github.com/dask/dask/pull/6981>
    if x.__class__ != meta.__class__:
        errmsg = "Expected partition of type `{}` but got `{}`".format(
            typename(type(meta)),
            typename(type(x)),
        )
    elif is_dataframe_like(meta):
        dtypes = pd.concat([x.dtypes, meta.dtypes], axis=1, sort=True)
        bad_dtypes = [
            (repr(col), a, b)
            for col, a, b in dtypes.fillna("-").itertuples()
            if not equal_dtypes(a, b)
        ]
        if bad_dtypes:
            errmsg = "Partition type: `{}`\n{}".format(
                typename(type(meta)),
                asciitable(["Column", "Found", "Expected"], bad_dtypes),
            )
        else:
            check_matching_columns(meta, x)
            return x
    else:
        if equal_dtypes(x.dtype, meta.dtype):
            return x
        errmsg = "Partition type: `{}`\n{}".format(
            typename(type(meta)),
            asciitable(["", "dtype"], [("Found", x.dtype), ("Expected", meta.dtype)]),
        )

    raise ValueError(
        "Metadata mismatch found%s.\n\n"
        "%s" % ((" in `%s`" % funcname if funcname else ""), errmsg)
    )
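# A short sketch of ``check_meta`` accepting a numeric dtype drift between
# the declared meta and an actual partition; with numeric_equal=True the
# int64/float64 pair below compares equal. Assumes pandas is importable;
# the helper name below is hypothetical.
def _example_check_meta():  # pragma: no cover
    import pandas as pd

    meta = pd.DataFrame({"x": pd.Series([], dtype="int64")})
    part = pd.DataFrame({"x": [1.0, 2.0]})  # floats after a missing-value fill
    assert check_meta(part, meta, funcname="map_partitions") is part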