def test_is_arraylike():
    np = pytest.importorskip("numpy")

    assert is_arraylike(0) is False
    assert is_arraylike(()) is False
    assert is_arraylike([]) is False
    assert is_arraylike([0]) is False

    assert is_arraylike(np.empty(())) is True
    assert is_arraylike(np.empty((0,))) is True
    assert is_arraylike(np.empty((0, 0))) is True
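# For reference, a minimal sketch (an assumption, not dask's exact
# implementation) of the duck-typing check these assertions imply: anything
# carrying a tuple-valued `shape` and a `dtype` counts as array-like, so
# scalars, tuples, and plain lists do not.
def _is_arraylike_sketch(x):
    return (
        hasattr(x, "shape")
        and isinstance(x.shape, tuple)
        and hasattr(x, "dtype")
    )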
def _blockwise_comparison_dnf(op, indices: list, dsk: RegenerableGraph):
    # Return DNF expression pattern for a simple comparison
    left = _get_blockwise_input(0, indices, dsk)
    right = _get_blockwise_input(1, indices, dsk)

    def _inv(symbol: str):
        return {
            ">": "<",
            "<": ">",
            ">=": "<=",
            "<=": ">=",
        }.get(symbol, symbol)

    if is_arraylike(left) and hasattr(left, "item") and left.size == 1:
        left = left.item()
        # Need inverse comparison in read_parquet
        return to_dnf((right, _inv(_comparison_symbols[op]), left))
    if is_arraylike(right) and hasattr(right, "item") and right.size == 1:
        right = right.item()
    return to_dnf((left, _comparison_symbols[op], right))
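# Illustration of why the inversion above is needed (the symbol mapping here
# is an assumption for the example, not the real `_comparison_symbols`).
# A predicate built from `5 < df.x` has the scalar on the left, but
# pyarrow-style filters expect `(column, op, value)`, so the operator is
# flipped: `5 < x` selects the same rows as `x > 5`.
_example_symbols = {"lt": "<", "le": "<=", "gt": ">", "ge": ">=", "eq": "=="}

def _inv_example(symbol):
    return {">": "<", "<": ">", ">=": "<=", "<=": ">="}.get(symbol, symbol)

assert _inv_example(_example_symbols["lt"]) == ">"   # 5 < x  ->  ("x", ">", 5)
assert _inv_example(_example_symbols["eq"]) == "=="  # equality is symmetric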
def _data_to_source(arr, path, component=None, storage_options=None, **kwargs):
    from dask.utils import is_arraylike
    from dask.array import to_zarr, from_array
    from ..source.zarr import ZarrArraySource

    if not is_arraylike(arr):
        raise NotImplementedError
    if not hasattr(arr, 'npartitions'):
        arr = from_array(arr, chunks='auto')
    to_zarr(arr, path, component=component, storage_options=storage_options,
            **kwargs)
    source = ZarrArraySource(path, storage_options, component)
    return source
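# A minimal round-trip sketch of what `_data_to_source` does under the hood,
# using only public dask APIs (the "example.zarr" path and "x" component are
# hypothetical; writing requires the `zarr` package):
import dask.array as da

arr = da.ones((100, 100), chunks=(25, 25))
da.to_zarr(arr, "example.zarr", component="x")
restored = da.from_zarr("example.zarr", component="x")
assert restored.shape == arr.shape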
def _input_to_dask_cupy_array(self, X):
    if (is_dataframe_like(X) or is_series_like(X)) and hasattr(X, "dask"):
        if not isinstance(X._meta, (cudf.Series, cudf.DataFrame)):
            raise TypeError("Please convert your Dask DataFrame"
                            " to a Dask-cuDF DataFrame using dask_cudf.")
        X = X.values
        X._meta = cp.asarray(X._meta)
    elif is_arraylike(X) and hasattr(X, "dask"):
        if not isinstance(X._meta, cp.ndarray):
            raise TypeError("Please convert your CPU Dask Array"
                            " to a GPU Dask Array using"
                            " arr.map_blocks(cp.asarray).")
    else:
        raise TypeError("Please pass a GPU backed Dask DataFrame"
                        " or Dask Array.")

    X.compute_chunk_sizes()
    return X
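# Sketch of an input that satisfies the checks above (hypothetical shapes):
# a CPU dask array is moved to the GPU with map_blocks(cp.asarray), exactly
# as the TypeError message suggests; this also makes `_meta` a cp.ndarray.
import cupy as cp
import dask.array as da

x_cpu = da.random.random((1000, 10), chunks=(250, 10))
x_gpu = x_cpu.map_blocks(cp.asarray)  # _meta inferred as cp.ndarray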
def make_meta_object(x, index=None):
    """Create an empty pandas object containing the desired metadata.

    Parameters
    ----------
    x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar
        To create a DataFrame, provide a `dict` mapping of `{name: dtype}`, or
        an iterable of `(name, dtype)` tuples. To create a `Series`, provide a
        tuple of `(name, dtype)`. If a pandas object, names, dtypes, and index
        should match the desired output. If a dtype or scalar, a scalar of the
        same dtype is returned.
    index : pd.Index, optional
        Any pandas index to use in the metadata. If none provided, a
        `RangeIndex` will be used.

    Examples
    --------
    >>> make_meta_object([('a', 'i8'), ('b', 'O')])
    Empty DataFrame
    Columns: [a, b]
    Index: []
    >>> make_meta_object(('a', 'f8'))
    Series([], Name: a, dtype: float64)
    >>> make_meta_object('i8')
    1
    """
    if is_arraylike(x) and x.shape:
        return x[:0]

    if index is not None:
        index = make_meta_dispatch(index)

    if isinstance(x, dict):
        return pd.DataFrame(
            {c: _empty_series(c, d, index=index) for (c, d) in x.items()},
            index=index,
        )
    if isinstance(x, tuple) and len(x) == 2:
        return _empty_series(x[0], x[1], index=index)
    elif isinstance(x, Iterable) and not isinstance(x, str):
        if not all(isinstance(i, tuple) and len(i) == 2 for i in x):
            raise ValueError(
                f"Expected iterable of tuples of (name, dtype), got {x}"
            )
        return pd.DataFrame(
            {c: _empty_series(c, d, index=index) for (c, d) in x},
            columns=[c for c, d in x],
            index=index,
        )
    elif not hasattr(x, "dtype") and x is not None:
        # could be a string, a dtype object, or a python type. Skip `None`,
        # because it is implicitly converted to `dtype('f8')`, which we don't
        # want here.
        try:
            dtype = np.dtype(x)
            return _scalar_from_dtype(dtype)
        except Exception:
            # Continue on to next check
            pass

    if is_scalar(x):
        return _nonempty_scalar(x)

    raise TypeError(f"Don't know how to create metadata from {x}")
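# The dict form described in the docstring above, shown explicitly: a
# {name: dtype} mapping yields an empty frame with the requested schema
# (assumes make_meta_object is importable from this module's namespace).
meta = make_meta_object({"a": "i8", "b": "f8"})
assert list(meta.columns) == ["a", "b"]
assert len(meta) == 0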
def histogramdd(
    a: DaskCollection | tuple[DaskCollection, ...],
    bins: BinArg = 10,
    range: RangeArg = None,
    normed: bool | None = None,
    weights: DaskCollection | None = None,
    density: bool = False,
    *,
    histogram: Any | None = None,
    storage: storage.Storage = storage.Double(),
    threads: int | None = None,
) -> Histogram | tuple[da.Array, ...] | tuple[da.Array, list[da.Array]]:
    """Histogram Dask data in multiple dimensions.

    Parameters
    ----------
    a : dask collection or tuple of dask collections
        Data to histogram. Acceptable input data can be of the form:

        * A dask.array.Array of shape (N, D) where each row is a
          sample and each column is a specific coordinate for the
          data.
        * A sequence of dask collections where each collection (e.g.
          array or series) contains all values for one coordinate of
          all data.
    bins : sequence of arrays, int, or sequence of ints
        The bin specification. The possible binning configurations are:

        * A sequence of arrays describing the monotonically increasing
          bin edges along each dimension.
        * A single int describing the total number of bins that will
          be used in each dimension (this requires the `range`
          argument to be defined).
        * A sequence of ints describing the total number of bins to
          be used in each dimension (this requires the `range`
          argument to be defined).

        When bins are described by arrays, the rightmost edge is
        included. Bins described by arrays also allow for non-uniform
        bin widths.
    range : tuple(tuple(float, float), ...), optional
        A sequence of length D, each a (min, max) tuple giving the
        outer bin edges to be used if the edges are not given
        explicitly in `bins`. If defined, this argument is required to
        have an entry for each dimension. Unlike
        :func:`numpy.histogramdd`, if `bins` does not define bin
        edges, this argument is required (this function will not
        automatically use the min and max of the values in a given
        dimension because the input data may be lazy in dask).
    normed : bool, optional
        An unsupported argument that has been deprecated in the NumPy
        API (preserved to maintain calls dependent on argument order).
    weights : dask.array.Array or dask.dataframe.Series, optional
        An array of values weighing each sample in the input data. The
        chunks of the weights must be identical to the chunking along
        the 0th (row) axis of the data sample.
    density : bool
        If ``False`` (default), the returned array represents the
        number of samples in each bin. If ``True``, the returned array
        represents the probability density function at each bin.
    histogram : dask_histogram.Histogram, optional
        If `dh.Histogram`, object based output is enabled.
    storage : boost_histogram.storage.Storage
        Define the storage used by the :py:class:`Histogram` object.
    threads : int, optional
        Ignored argument kept for compatibility with boost-histogram.
        We let Dask have complete control over threads.

    Returns
    -------
    tuple(dask.array.Array, tuple(dask.array.Array)) or Histogram
        The default return is the style of
        :func:`dask.array.histogramdd`: an array of bin contents and a
        tuple of edges arrays (one for each dimension). If the
        `histogram` argument is used then the return is a
        :obj:`dask_histogram.Histogram` object.

    See Also
    --------
    histogram
    histogram2d

    Examples
    --------
    Creating a three dimensional histogram with variable width bins in
    each dimension.
    First, using three 1D arrays for each coordinate:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...     [-3, -2, 0, 1, 3],
    ...     [-3, -1, 1, 2, 3],
    ...     [-3, -2, 0, 2, 3],
    ... ]
    >>> h, edges = dhb.histogramdd((x, y, z), bins=bins)
    >>> type(h)
    <class 'dask.array.core.Array'>
    >>> h.shape
    (4, 4, 4)
    >>> len(edges)
    3

    Now the same histogram, but instead of a
    :py:func:`dask.array.histogramdd` style return (which mirrors the
    return style of :py:func:`numpy.histogramdd`), we use the
    `histogram` argument to trigger the return of a
    :obj:`dask_histogram.Histogram` object:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> x = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> y = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> z = da.random.standard_normal(size=(10000,), chunks=(2000,))
    >>> bins = [
    ...     [-3, -2, 0, 1, 3],
    ...     [-3, -1, 1, 2, 3],
    ...     [-3, -2, 0, 2, 3],
    ... ]
    >>> h = dhb.histogramdd((x, y, z), bins=bins, histogram=dhb.Histogram)
    >>> h
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h  # doctest: +SKIP
    Histogram(
      Variable([-3, -2, 0, 1, 3]),
      Variable([-3, -1, 1, 2, 3]),
      Variable([-3, -2, 0, 2, 3]),
      storage=Double()) # Sum: 9919.0 (10000.0 with flow)

    Another 3D histogram example, but with an alternative dataset form
    (a single array with three columns), fixed bin widths, sample
    weights, and usage of the boost-histogram ``Weight()`` storage:

    >>> import dask.array as da
    >>> import dask_histogram.boost as dhb
    >>> a = da.random.standard_normal(size=(10000, 3), chunks=(2000, 3))
    >>> w = da.random.uniform(0.5, 0.7, size=(10000,), chunks=2000)
    >>> bins = (7, 5, 6)
    >>> range = ((-3, 3), (-2.9, 2.9), (-3.1, 3.1))
    >>> h = dhb.histogramdd(
    ...     a,
    ...     bins=bins,
    ...     range=range,
    ...     weights=w,
    ...     histogram=dhb.Histogram,
    ...     storage=dhb.storage.Weight()
    ... )
    >>> h
    Histogram(
      Regular(7, -3, 3),
      Regular(5, -2.9, 2.9),
      Regular(6, -3.1, 3.1),
      storage=Weight()) # Sum: WeightedSum(value=0, variance=0) (has staged fills)
    >>> h.staged_fills()
    True
    >>> h = h.compute()
    >>> h.staged_fills()
    False

    """
    # Check for invalid argument combinations.
    if normed is not None:
        raise KeyError(
            "normed=True is deprecated in NumPy and not supported by "
            "dask-histogram."
        )
    if density and histogram is not None:
        raise KeyError(
            "dask-histogram does not support the density keyword when "
            "returning a dask-histogram object."
        )

    # If input is a multidimensional array or dataframe, we wrap it in
    # a tuple that will be passed to fill and unrolled in the backend.
    if (is_arraylike(a) and a.ndim > 1) or is_dataframe_like(a):  # type: ignore
        ndim = a.shape[1]  # type: ignore
        a = (a,)  # type: ignore
    else:
        ndim = len(a)
        for entry in a:
            if not is_dask_collection(entry):
                raise ValueError(
                    "non-dask collection was passed; this function only "
                    "supports dask collections as input"
                )

    bins, range = normalize_bins_range(ndim, bins, range)

    # Create the axes based on the bins and range values.
    axes = []
    for _, (b, r) in enumerate(zip(bins, range)):  # type: ignore
        if r is None:
            axes.append(axis.Variable(b))  # type: ignore
        else:
            axes.append(axis.Regular(bins=b, start=r[0], stop=r[1]))  # type: ignore

    # Finally create and fill the histogram object.
    hist = Histogram(*axes, storage=storage).fill(*a, weight=weights)

    if histogram != Histogram:
        return hist.to_dask_array(flow=False, dd=True)
    return hist
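# Stand-alone illustration of the axis selection in the loop above: explicit
# edges map to a boost-histogram Variable axis, while an integer bin count
# plus a (start, stop) range maps to a Regular axis.
import boost_histogram as bh

ax_from_edges = bh.axis.Variable([-3, -2, 0, 1, 3])  # r is None -> Variable
ax_from_range = bh.axis.Regular(7, -3.0, 3.0)        # bins=7, range=(-3, 3)
assert ax_from_edges.size == 4
assert ax_from_range.size == 7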
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables
        must be the same length. This length determines the number of
        partitions in the output collection (only one element of each
        iterable will be passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values. Assumes that the indexes are mutually sorted.
        If None, then won't use index information.
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition, and will
        raise an error if this doesn't work or types don't match.
    **kwargs:
        Keyword arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output
        partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                           name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function to any
    number of iterable objects, it can be a very convenient means of
    implementing functionality that may be missing from other
    DataFrame-creation methods. For example, if you happen to have a
    priori knowledge about the number of rows in each of the files in a
    dataset, you can generate a DataFrame collection with a global
    RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset + len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """

    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True"
            )
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(
        func, meta, inputs, args, divisions, enforce_metadata, **kwargs
    )
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = (
        func.columns if isinstance(func, DataFrameIOFunction) else None
    )

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing "
                "the (name, dtype) of the columns."
            )
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
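# Hedged sketch of a `func` that opts in to the column projection mentioned
# above. The import path and protocol surface (a `columns` property plus a
# `project_columns` method) reflect dask's DataFrameIOFunction at the time of
# writing; treat both as assumptions, and the class itself as a hypothetical
# helper rather than a dask API.
import pandas as pd
from dask.dataframe.io.utils import DataFrameIOFunction

class ProjectableParquetReader(DataFrameIOFunction):
    """Read a parquet file, honoring column projection (hypothetical helper)."""

    def __init__(self, columns=None):
        self._columns = columns

    @property
    def columns(self):
        return self._columns

    def project_columns(self, columns):
        # Return a new reader restricted to the requested columns
        return ProjectableParquetReader(columns=columns)

    def __call__(self, path):
        return pd.read_parquet(path, columns=self._columns)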