Example #1
def test_ensure_dict():
    d = {'x': 1}
    assert ensure_dict(d) is d
    hlg = HighLevelGraph.from_collections('x', d)
    assert type(ensure_dict(hlg)) is dict
    assert ensure_dict(hlg) == d

    class mydict(dict):
        pass

    md = mydict()
    md['x'] = 1
    assert type(ensure_dict(md)) is dict
    assert ensure_dict(md) == d
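A minimal sketch of what this test exercises, assuming ``ensure_dict`` comes from ``dask.utils`` as in the dask test suite (the layer name and tasks below are illustrative): ``HighLevelGraph.from_collections`` wraps a plain task dict as a single named layer, and ``ensure_dict`` flattens the graph back into an ordinary ``dict``.

from dask.highlevelgraph import HighLevelGraph
from dask.utils import ensure_dict

layer = {"x": 1, "y": (sum, [1, 2])}   # illustrative low-level tasks
hlg = HighLevelGraph.from_collections("xy", layer, dependencies=[])
flat = ensure_dict(hlg)                # Mapping -> plain dict of tasks
assert type(flat) is dict and flat == layer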
Example #2
def test_blockwise_non_blockwise_output():
    x = da.ones(10, chunks=(5,))
    y = (((x + 1) + 2) + 3)
    w = y.sum()
    z = (((y * 2) * 3) * 4)

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (zz,) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    dsk = optimize_blockwise(z.dask, keys=list(dask.core.flatten(z.__dask_keys__())))
    assert isinstance(dsk, HighLevelGraph)
    assert len([layer for layer in dsk.dicts.values() if isinstance(layer, Blockwise)]) == 1

    dsk = optimize_blockwise(HighLevelGraph.merge(w.dask, z.dask),
                             keys=list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()])))
    assert isinstance(dsk, HighLevelGraph)
    assert len([layer for layer in z.dask.dicts.values() if isinstance(layer, Blockwise)]) >= 1
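A smaller, hedged sketch of the fusion this test checks (assuming ``optimize_blockwise`` and ``Blockwise`` from ``dask.blockwise``, and a dask version where array arithmetic builds Blockwise layers): chained elementwise operations collapse into far fewer layers after optimization.

import dask
import dask.array as da
from dask.blockwise import Blockwise, optimize_blockwise

x = da.ones(10, chunks=(5,))
y = ((x + 1) * 2) - 3   # three chained blockwise operations
dsk = optimize_blockwise(y.dask, keys=list(dask.core.flatten(y.__dask_keys__())))
n_blockwise = len([l for l in dsk.layers.values() if isinstance(l, Blockwise)])
print(len(y.dask.layers), "layers before;", len(dsk.layers), "after;", n_blockwise, "Blockwise")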
Example #3
def _groupby_to_disk(
    ddf,
    write_func,
    col_groups,
    agg_cols,
    agg_list,
    out_path,
    freq_limit,
    tree_width,
    on_host,
    stat_name="categories",
    concat_groups=False,
    name_sep="_",
):
    if not col_groups:
        return {}

    if concat_groups:
        if agg_list and agg_list != ["count"]:
            raise ValueError("Cannot use concat_groups=True with aggregations other than count")
        if agg_cols:
            raise ValueError("Cannot aggregate continuous-column stats with concat_groups=True")

    # Update tree_width
    tw = {}
    for col in col_groups:
        col = [col] if isinstance(col, str) else col
        col_str = _make_name(*col, sep=name_sep)
        if tree_width is None:
            tw[col_str] = 8
        elif isinstance(tree_width, int):
            tw[col_str] = tree_width
        else:
            tw[col_str] = tree_width.get(col_str, None) or 8
    tree_width = tw

    # Make dedicated output directory for the categories
    fs = get_fs_token_paths(out_path)[0]
    out_path = fs.sep.join([out_path, stat_name])
    fs.mkdirs(out_path, exist_ok=True)

    dsk = {}
    token = tokenize(ddf, col_groups, out_path, freq_limit, tree_width, on_host)
    level_1_name = "level_1-" + token
    split_name = "split-" + token
    level_2_name = "level_2-" + token
    level_3_name = "level_3-" + token
    finalize_labels_name = stat_name + "-" + token
    for p in range(ddf.npartitions):
        dsk[(level_1_name, p)] = (
            _top_level_groupby,
            (ddf._name, p),
            col_groups,
            tree_width,
            agg_cols,
            agg_list,
            on_host,
            concat_groups,
            name_sep,
        )
        k = 0
        for c, col in enumerate(col_groups):
            col = [col] if isinstance(col, str) else col
            col_str = _make_name(*col, sep=name_sep)
            for s in range(tree_width[col_str]):
                dsk[(split_name, p, c, s)] = (getitem, (level_1_name, p), k)
                k += 1

    col_groups_str = []
    for c, col in enumerate(col_groups):
        col = [col] if isinstance(col, str) else col
        col_str = _make_name(*col, sep=name_sep)
        col_groups_str.append(col_str)
        freq_limit_val = None
        if freq_limit:
            freq_limit_val = freq_limit[col_str] if isinstance(freq_limit, dict) else freq_limit
        for s in range(tree_width[col_str]):
            dsk[(level_2_name, c, s)] = (
                _mid_level_groupby,
                [(split_name, p, c, s) for p in range(ddf.npartitions)],
                col,
                agg_cols,
                agg_list,
                freq_limit_val,
                on_host,
                concat_groups,
                name_sep,
            )

        dsk[(level_3_name, c)] = (
            write_func,
            [(level_2_name, c, s) for s in range(tree_width[col_str])],
            out_path,
            col,
            on_host,
            concat_groups,
            name_sep,
        )

    dsk[finalize_labels_name] = (
        _finish_labels,
        [(level_3_name, c) for c, col in enumerate(col_groups)],
        col_groups_str,
    )
    graph = HighLevelGraph.from_collections(finalize_labels_name, dsk, dependencies=[ddf])
    return graph, finalize_labels_name
Example #4
def blockwise(
    func,
    out_ind,
    *args,
    name=None,
    token=None,
    dtype=None,
    adjust_chunks=None,
    new_axes=None,
    align_arrays=True,
    concatenate=None,
    meta=None,
    **kwargs,
):
    """Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``blockwise`` function applies an in-memory
    function across multiple blocks of multiple inputs in a variety of ways.
    Many dask.array operations are special cases of blockwise including
    elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths
    align_arrays: bool
        Whether or not to align chunks along equally sized dimensions when
        multiple arrays are provided.  This allows for larger chunks in some
        arrays to be broken into smaller ones that match chunk sizes in other
        arrays such that they are compatible for block function mapping. If
        this is false, then an error will be thrown if arrays do not already
        have the same number of blocks in each dimension.

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> import operator, numpy as np, dask.array as da
    >>> x = da.from_array([[1, 2],
    ...                    [3, 4]], chunks=(1, 2))
    >>> y = da.from_array([[10, 20],
    ...                    [0, 0]])
    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')
    >>> z.compute()
    array([[11, 22],
           [ 3,  4]])

    Outer product multiplying a by b, two 1-d vectors

    >>> a = da.from_array([0, 1, 2], chunks=1)
    >>> b = da.from_array([10, 50, 100], chunks=1)
    >>> z = blockwise(np.outer, 'ij', a, 'i', b, 'j', dtype='f8')
    >>> z.compute()
    array([[  0,   0,   0],
           [ 10,  50, 100],
           [ 20, 100, 200]])

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)
    >>> z.compute()
    array([[1, 3],
           [2, 4]])

    The transpose case above is illustrative because it does transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')
    >>> z.compute()
    array([[11,  2],
           [23,  4]])

    Any index, like ``i``, that is missing from the output index is interpreted
    as a contraction (note that this differs from the Einstein convention;
    repeated indices do not imply contraction).  In the case of a contraction the passed
    function should expect an iterable of blocks on any array that holds that
    index.  To receive arrays concatenated along contracted dimensions instead
    pass ``concatenate=True``.

    Inner product multiplying a by b, two 1-d vectors

    >>> def sequence_dot(a_blocks, b_blocks):
    ...     result = 0
    ...     for a, b in zip(a_blocks, b_blocks):
    ...         result += a.dot(b)
    ...     return result

    >>> z = blockwise(sequence_dot, '', a, 'i', b, 'i', dtype='f8')
    >>> z.compute()
    250

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(a):
    ...     return a[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': 5}, dtype=a.dtype)

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes.  This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)
    >>> z.chunks
    ((1, 1, 1), (5, 5))

    If the applied function changes the size of each chunk you can specify this
    with a ``adjust_chunks={...}`` dictionary holding a function for each index
    that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)
    >>> y.chunks
    ((2, 2), (2,))

    Include literals by indexing with None

    >>> z = blockwise(operator.add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)
    >>> z.compute()
    array([[1235, 1236],
           [1237, 1238]])
    """
    out = name
    new_axes = new_axes or {}

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError(
            "Repeated elements not allowed in output index",
            [k for k, v in toolz.frequencies(out_ind).items() if v > 1],
        )
    new = (set(out_ind) -
           {a
            for arg in args[1::2] if arg is not None
            for a in arg} - set(new_axes or ()))
    if new:
        raise ValueError("Unknown dimension", new)

    from dask.array.core import normalize_arg, unify_chunks

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        arginds = [(a, i) for (a, i) in toolz.partition(2, args)
                   if i is not None]
        chunkss = {}
        # For each dimension, use the input chunking that has the most blocks;
        # this will ensure that broadcasting works as expected, and in
        # particular the number of blocks should be correct if the inputs are
        # consistent.
        for arg, ind in arginds:
            for c, i in zip(arg.chunks, ind):
                if i not in chunkss or len(c) > len(chunkss[i]):
                    chunkss[i] = c
        arrays = args[::2]

    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v, )
        chunkss[k] = v

    arginds = zip(arrays, args[1::2])
    numblocks = {}

    dependencies = []
    arrays = []

    # Normalize arguments
    argindsstr = []

    for arg, ind in arginds:
        if ind is None:
            arg = normalize_arg(arg)
            arg, collections = unpack_collections(arg)
            dependencies.extend(collections)
        else:
            if (hasattr(arg, "ndim") and hasattr(ind, "__len__")
                    and arg.ndim != len(ind)):
                raise ValueError(
                    "Index string %s does not match array dimension %d" %
                    (ind, arg.ndim))
            numblocks[arg.name] = arg.numblocks
            arrays.append(arg)
            arg = arg.name
        argindsstr.extend((arg, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = "{}-{}".format(
            token or utils.funcname(func).strip("_"),
            base.tokenize(func, out_ind, argindsstr, dtype, **kwargs),
        )

    graph = core_blockwise(
        func,
        out,
        out_ind,
        *argindsstr,
        numblocks=numblocks,
        dependencies=dependencies,
        new_axes=new_axes,
        concatenate=concatenate,
        **kwargs2,
    )
    graph = HighLevelGraph.from_collections(out,
                                            graph,
                                            dependencies=arrays + dependencies)

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    if len(adjust_chunks[ind]) != len(chunks[i]):
                        raise ValueError(
                            f"Dimension {i} has {len(chunks[i])} blocks, adjust_chunks "
                            f"specified with {len(adjust_chunks[ind])} blocks")
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    if meta is None:
        from dask.array.utils import compute_meta

        meta = compute_meta(func, dtype, *args[::2], **kwargs)
    return new_da_object(graph, out, chunks, meta=meta, dtype=dtype)
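A short, hedged usage sketch (the arrays below are illustrative): the collection returned by ``blockwise`` is backed by a ``HighLevelGraph`` with one layer for the new operation on top of the layers of its inputs.

import operator
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

x = da.arange(6, chunks=3)
y = da.arange(6, chunks=3)
z = da.blockwise(operator.add, "i", x, "i", y, "i", dtype=x.dtype)
assert isinstance(z.dask, HighLevelGraph)
print(len(z.dask.layers))   # layers for x, y, and the add
print(z.compute())          # [ 0  2  4  6  8 10]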
Example #5
def rearrange_by_column_tasks(df,
                              column,
                              max_branch=32,
                              npartitions=None,
                              ignore_index=False):
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle.  It contains most of the tricky logic
    around the complex network of tasks.  Typically before this function is
    called a new column, ``"_partitions"`` has been added to the dataframe,
    containing the output partition number of every row.  This function
    produces a new dataframe where every row is in the proper partition.  It
    accomplishes this by splitting each input partition into several pieces,
    and then concatenating pieces from different input partitions into output
    partitions.  If there are enough partitions then it does this work in
    stages to avoid scheduling overhead.

    Let's explain the motivation for this further.  Imagine that we have 1000
    input partitions and 1000 output partitions. In theory we could split each
    input into 1000 pieces, move the 1,000,000 resulting pieces around, and
    then concatenate them all into 1000 output groups.  This would be fine,
    but the central scheduling overhead of 1,000,000 tasks would become a
    bottleneck.  Instead we do this in stages: we split each of the 1000
    inputs into 30 pieces (we now have 30,000 pieces), move those around,
    concatenate back down to 1000, and then repeat the process.  This has the
    same result as the full transfer, but we have now moved the data twice
    (expensive) while using only 60,000 tasks (cheap).

    Note that the `column` input may correspond to a list of columns (rather
    than just a single column name).  In this case, the `shuffle_group` and
    `shuffle_group_2` functions will use hashing to map each row to an output
    partition. This approach may require the same rows to be hashed multiple
    times, but avoids the need to assign a new "_partitions" column.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
    column: str or list
        A column name on which we want to split, commonly ``"_partitions"``
        which is assigned by functions upstream.  This could also be a list of
        columns (in which case shuffle_group will create a hash array/column).
    max_branch: int
        The maximum number of splits per input partition.  Defaults to 32.
        If there are more partitions than this then the shuffling will occur in
        stages in order to avoid creating npartitions**2 tasks.
        Increasing this number increases scheduling overhead but decreases the
        number of full-dataset transfers that we have to make.
    npartitions: Optional[int]
        The desired number of output partitions

    Returns
    -------
    df3: dask.dataframe.DataFrame

    See also
    --------
    rearrange_by_column_disk: same operation, but uses partd
    rearrange_by_column: parent function that calls this or rearrange_by_column_disk
    shuffle_group: does the actual splitting per-partition
    """

    max_branch = max_branch or 32

    if (npartitions or df.npartitions) <= max_branch:
        # We are creating a small number of output partitions.
        # No need for staged shuffling. Staged shuffling will
        # sometimes require extra work/communication in this case.
        token = tokenize(df, column, npartitions)
        shuffle_name = f"simple-shuffle-{token}"
        npartitions = npartitions or df.npartitions
        shuffle_layer = SimpleShuffleLayer(
            shuffle_name,
            column,
            npartitions,
            df.npartitions,
            ignore_index,
            df._name,
            df._meta,
        )
        graph = HighLevelGraph.from_collections(shuffle_name,
                                                shuffle_layer,
                                                dependencies=[df])
        return new_dd_object(graph, shuffle_name, df._meta,
                             [None] * (npartitions + 1))

    n = df.npartitions
    stages = int(math.ceil(math.log(n) / math.log(max_branch)))
    if stages > 1:
        k = int(math.ceil(n**(1 / stages)))
    else:
        k = n

    inputs = [
        tuple(digit(i, j, k) for j in range(stages)) for i in range(k**stages)
    ]

    npartitions_orig = df.npartitions
    token = tokenize(df, stages, column, n, k)
    for stage in range(stages):
        stage_name = f"shuffle-{stage}-{token}"
        stage_layer = ShuffleLayer(
            stage_name,
            column,
            inputs,
            stage,
            npartitions,
            n,
            k,
            ignore_index,
            df._name,
            df._meta,
        )
        graph = HighLevelGraph.from_collections(stage_name,
                                                stage_layer,
                                                dependencies=[df])
        df = new_dd_object(graph, stage_name, df._meta, df.divisions)

    if npartitions is not None and npartitions != npartitions_orig:
        token = tokenize(df, npartitions)
        repartition_group_token = "repartition-group-" + token

        dsk = {(repartition_group_token, i): (
            shuffle_group_2,
            k,
            column,
            ignore_index,
            npartitions,
        )
               for i, k in enumerate(df.__dask_keys__())}

        repartition_get_name = "repartition-get-" + token

        for p in range(npartitions):
            dsk[(repartition_get_name, p)] = (
                shuffle_group_get,
                (repartition_group_token, p % npartitions_orig),
                p,
            )

        graph2 = HighLevelGraph.from_collections(repartition_get_name,
                                                 dsk,
                                                 dependencies=[df])
        df2 = new_dd_object(graph2, repartition_get_name, df._meta,
                            [None] * (npartitions + 1))
    else:
        df2 = df
        df2.divisions = (None, ) * (npartitions_orig + 1)

    return df2
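To make the staging arithmetic above concrete, here is the same calculation the function performs, worked for the 1000-partition scenario described in the docstring:

import math

n, max_branch = 1000, 32
stages = int(math.ceil(math.log(n) / math.log(max_branch)))   # -> 2 stages
k = int(math.ceil(n ** (1 / stages)))                         # -> 32 splits per input partition
print(stages, k)
# Two stages of ~32-way splits move the data twice, but create on the
# order of 2 * n * k tasks rather than n * n.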
Example #6
def _approximate_quantile(df, q):
    """Approximate quantiles of DataFrame or Series.
    [NOTE: Same logic as dask.dataframe Series quantile]
    """
    # current implementation needs q to be sorted so
    # sort if array-like, otherwise leave it alone
    q_ndarray = np.array(q)
    if q_ndarray.ndim > 0:
        q_ndarray.sort(kind="mergesort")
        q = q_ndarray

    # Let's assume we are dealing with a DataFrame throughout
    if isinstance(df, (Series, Index)):
        df = df.to_frame()
    assert isinstance(df, DataFrame)
    final_type = df._meta._constructor

    # Create metadata
    meta = df._meta_nonempty.quantiles(q=q)

    # Define final action (create df with quantiles as index)
    def finalize_tsk(tsk):
        return (final_type, tsk)

    return_type = df.__class__

    # pandas/cudf uses quantile in [0, 1]
    # numpy / cupy uses [0, 100]
    qs = np.asarray(q)
    token = tokenize(df, qs)

    if len(qs) == 0:
        name = "quantiles-" + token
        empty_index = gd.Index([], dtype=float)
        return Series(
            {
                (name, 0):
                final_type(
                    {col: []
                     for col in df.columns},
                    name=df.name,
                    index=empty_index,
                )
            },
            name,
            df._meta,
            [None, None],
        )
    else:
        new_divisions = [np.min(q), np.max(q)]

    name = "quantiles-1-" + token
    val_dsk = {(name, i): (_quantile, key, qs)
               for i, key in enumerate(df.__dask_keys__())}

    name2 = "quantiles-2-" + token
    merge_dsk = {
        (name2, 0):
        finalize_tsk(
            (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)))
    }
    dsk = toolz.merge(val_dsk, merge_dsk)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df])
    df = return_type(graph, name2, meta, new_divisions)

    def set_quantile_index(df):
        df.index = q
        return df

    df = df.map_partitions(set_quantile_index, meta=meta)
    return df
Example #7
    def _apply_offset(self, df: dd.DataFrame, offset: int,
                      end: int) -> dd.DataFrame:
        """
        Limit the dataframe to the window [offset, end].
        That is unfortunately, not so simple as we do not know how many
        items we have in each partition. We have therefore no other way than to
        calculate (!!!) the sizes of each partition
        (this means we need to compute the dataframe already here).

        After that, we can create a new dataframe from the old
        dataframe by calculating for each partition if and how much
        it should be used.
        We do this via generating our own dask computation graph as
        we need to pass the partition number to the selection
        function, which is not possible with normal "map_partitions".
        """
        # As we need to calculate the partition size, we better persist
        # the df. I think...
        # TODO: check if this is the best thing to do
        df = df.persist()

        # First, we need to find out which partitions we want to use.
        # Therefore we count the total number of entries
        partition_borders = df.map_partitions(lambda x: len(x)).compute()
        partition_borders = partition_borders.cumsum().to_dict()

        # Now we let each of the partitions figure out how much it needs to
        # return, using these partition borders.
        # For this, we generate our own dask computation graph (as it does not
        # really fit well with any of the already present methods).

        # (a) we define a method to be calculated on each partition
        # This method returns the part of the partition which falls into the window [offset, end]
        def select_from_to(df, partition_index):
            this_partition_border_left = (partition_borders[partition_index -
                                                            1]
                                          if partition_index > 0 else 0)
            this_partition_border_right = partition_borders[partition_index]

            if (end and end < this_partition_border_left) or (
                    offset and offset >= this_partition_border_right):
                return df.iloc[0:0]

            from_index = max(offset -
                             this_partition_border_left, 0) if offset else 0
            to_index = (
                min(end, this_partition_border_right) if end else
                this_partition_border_right) - this_partition_border_left

            return df.iloc[from_index:to_index]

        # Then we (b) define a task graph. It should calculate the function above on each of the partitions of
        # df (specified by (df._name, i) for each partition i). As an argument, we pass the partition_index.
        dask_graph_name = df._name + "-limit"
        dask_graph_dict = {}

        for partition_index in range(df.npartitions):
            dask_graph_dict[(dask_graph_name, partition_index)] = (
                select_from_to,
                (df._name, partition_index),
                partition_index,
            )

        # We replace df with our new graph
        graph = HighLevelGraph.from_collections(dask_graph_name,
                                                dask_graph_dict,
                                                dependencies=[df])
        return new_dd_object(graph, dask_graph_name, df._meta, df.divisions)
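For illustration only (the partition sizes and window below are made up), this is the boundary arithmetic that ``select_from_to`` performs with the cumulative ``partition_borders``:

partition_borders = {0: 3, 1: 7, 2: 10}   # cumulative row counts of 3 partitions
offset, end = 4, 8                        # keep global rows [4, 8)

for partition_index in range(3):
    left = partition_borders[partition_index - 1] if partition_index > 0 else 0
    right = partition_borders[partition_index]
    if (end and end < left) or (offset and offset >= right):
        print(partition_index, "-> empty")
        continue
    from_index = max(offset - left, 0) if offset else 0
    to_index = (min(end, right) if end else right) - left
    print(partition_index, "-> iloc[%d:%d]" % (from_index, to_index))
# prints: 0 -> empty, 1 -> iloc[1:4], 2 -> iloc[0:1]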
Example #8
def map_blocks(
    func: Callable[..., T_DSorDA],
    obj: Union[DataArray, Dataset],
    args: Sequence[Any] = (),
    kwargs: Mapping[str, Any] = None,
    template: Union[DataArray, Dataset] = None,
) -> T_DSorDA:
    """Apply a function to each block of a DataArray or Dataset.

    .. warning::
        This function is experimental and its signature may change.

    Parameters
    ----------
    func : callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter ``obj``. The function will receive a subset or 'block' of ``obj`` (see below),
        corresponding to one chunk along each chunked dimension. ``func`` will be
        executed as ``func(subset_obj, *subset_args, **kwargs)``.

        This function must return either a single DataArray or a single Dataset.

        This function cannot add a new chunked dimension.
    obj : DataArray, Dataset
        Passed to the function as its first argument, one block at a time.
    args : sequence
        Passed to func after unpacking and subsetting any xarray objects by blocks.
        xarray objects in args must be aligned with obj, otherwise an error is raised.
    kwargs : mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        subset to blocks. Passing dask collections in kwargs is not allowed.
    template : DataArray or Dataset, optional
        xarray object representing the final result after compute is called. If not provided,
        the function will first be run on mocked-up data that looks like ``obj`` but
        has sizes 0, to determine properties of the returned object such as dtype,
        variable names, attributes, new dimensions and new indexes (if any).
        ``template`` must be provided if the function changes the size of existing dimensions.
        When provided, ``attrs`` on variables in `template` are copied over to the result. Any
        ``attrs`` set by ``func`` will be ignored.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of the
    function.

    Notes
    -----
    This function is designed for when ``func`` needs to manipulate a whole xarray object
    subset to each block. Each block is loaded into memory. In the more common case where
    ``func`` can work on numpy arrays, it is recommended to use ``apply_ufunc``.

    If none of the variables in ``obj`` is backed by dask arrays, calling this function is
    equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks
    xarray.DataArray.map_blocks

    Examples
    --------
    Calculate an anomaly from climatology using ``.groupby()``. Using
    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
    its indices, and its methods like ``.groupby()``.

    >>> def calculate_anomaly(da, groupby_type="time.month"):
    ...     gb = da.groupby(groupby_type)
    ...     clim = gb.mean(dim="time")
    ...     return gb - clim
    ...
    >>> time = xr.cftime_range("1990-01", "1992-01", freq="M")
    >>> month = xr.DataArray(time.month, coords={"time": time}, dims=["time"])
    >>> np.random.seed(123)
    >>> array = xr.DataArray(
    ...     np.random.rand(len(time)),
    ...     dims=["time"],
    ...     coords={"time": time, "month": month},
    ... ).chunk()
    >>> array.map_blocks(calculate_anomaly, template=array).compute()
    <xarray.DataArray (time: 24)>
    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 1 2 3 4 5 6 7 8 9 10 11 12

    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
    to the function being applied in ``xr.map_blocks()``:

    >>> array.map_blocks(
    ...     calculate_anomaly,
    ...     kwargs={"groupby_type": "time.year"},
    ...     template=array,
    ... )  # doctest: +ELLIPSIS
    <xarray.DataArray (time: 24)>
    dask.array<<this-array>-calculate_anomaly, shape=(24,), dtype=float64, chunksize=(24,), chunktype=numpy.ndarray>
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
        month    (time) int64 dask.array<chunksize=(24,), meta=np.ndarray>
    """

    def _wrapper(
        func: Callable,
        args: List,
        kwargs: dict,
        arg_is_array: Iterable[bool],
        expected: dict,
    ):
        """
        Wrapper function that receives datasets in args; converts to dataarrays when necessary;
        passes these to the user function `func` and checks returned objects for expected shapes/sizes/etc.
        """

        converted_args = [
            dataset_to_dataarray(arg) if is_array else arg
            for is_array, arg in zip(arg_is_array, args)
        ]

        result = func(*converted_args, **kwargs)

        # check all dims are present
        missing_dimensions = set(expected["shapes"]) - set(result.sizes)
        if missing_dimensions:
            raise ValueError(
                f"Dimensions {missing_dimensions} missing on returned object."
            )

        # check that index lengths and values are as expected
        for name, index in result.xindexes.items():
            if name in expected["shapes"]:
                if len(index) != expected["shapes"][name]:
                    raise ValueError(
                        f"Received dimension {name!r} of length {len(index)}. Expected length {expected['shapes'][name]}."
                    )
            if name in expected["indexes"]:
                expected_index = expected["indexes"][name]
                if not index.equals(expected_index):
                    raise ValueError(
                        f"Expected index {name!r} to be {expected_index!r}. Received {index!r} instead."
                    )

        # check that all expected variables were returned
        check_result_variables(result, expected, "coords")
        if isinstance(result, Dataset):
            check_result_variables(result, expected, "data_vars")

        return make_dict(result)

    if template is not None and not isinstance(template, (DataArray, Dataset)):
        raise TypeError(
            f"template must be a DataArray or Dataset. Received {type(template).__name__} instead."
        )
    if not isinstance(args, Sequence):
        raise TypeError("args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in kwargs.values():
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in kwargs yet. Please compute or "
                "load values before passing to map_blocks."
            )

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    all_args = [obj] + list(args)
    is_xarray = [isinstance(arg, (Dataset, DataArray)) for arg in all_args]
    is_array = [isinstance(arg, DataArray) for arg in all_args]

    # there should be a better way to group this. partition?
    xarray_indices, xarray_objs = unzip(
        (index, arg) for index, arg in enumerate(all_args) if is_xarray[index]
    )
    others = [
        (index, arg) for index, arg in enumerate(all_args) if not is_xarray[index]
    ]

    # all xarray objects must be aligned. This is consistent with apply_ufunc.
    aligned = align(*xarray_objs, join="exact")
    xarray_objs = tuple(
        dataarray_to_dataset(arg) if is_da else arg
        for is_da, arg in zip(is_array, aligned)
    )

    _, npargs = unzip(
        sorted(list(zip(xarray_indices, xarray_objs)) + others, key=lambda x: x[0])
    )

    # check that chunk sizes are compatible
    input_chunks = dict(npargs[0].chunks)
    input_indexes = dict(npargs[0].xindexes)
    for arg in xarray_objs[1:]:
        assert_chunks_compatible(npargs[0], arg)
        input_chunks.update(arg.chunks)
        input_indexes.update(arg.xindexes)

    if template is None:
        # infer template by providing zero-shaped arrays
        template = infer_template(func, aligned[0], *args, **kwargs)
        template_indexes = set(template.xindexes)
        preserved_indexes = template_indexes & set(input_indexes)
        new_indexes = template_indexes - set(input_indexes)
        indexes = {dim: input_indexes[dim] for dim in preserved_indexes}
        indexes.update({k: template.xindexes[k] for k in new_indexes})
        output_chunks = {
            dim: input_chunks[dim] for dim in template.dims if dim in input_chunks
        }

    else:
        # template xarray object has been provided with proper sizes and chunk shapes
        indexes = dict(template.xindexes)
        if isinstance(template, DataArray):
            output_chunks = dict(
                zip(template.dims, template.chunks)  # type: ignore[arg-type]
            )
        else:
            output_chunks = dict(template.chunks)

    for dim in output_chunks:
        if dim in input_chunks and len(input_chunks[dim]) != len(output_chunks[dim]):
            raise ValueError(
                "map_blocks requires that one block of the input maps to one block of output. "
                f"Expected number of output chunks along dimension {dim!r} to be {len(input_chunks[dim])}. "
                f"Received {len(output_chunks[dim])} instead. Please provide template if not provided, or "
                "fix the provided template."
            )

    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}"
        )

    # We're building a new HighLevelGraph hlg. We'll have one new layer
    # for each variable in the dataset, which is the result of the
    # func applied to the values.

    graph: Dict[Any, Any] = {}
    new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict)
    gname = "{}-{}".format(
        dask.utils.funcname(func), dask.base.tokenize(npargs[0], args, kwargs)
    )

    # map dims to list of chunk indexes
    ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()}
    # mapping from chunk index to slice bounds
    input_chunk_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items()
    }
    output_chunk_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in output_chunks.items()
    }

    def subset_dataset_to_block(
        graph: dict, gname: str, dataset: Dataset, input_chunk_bounds, chunk_index
    ):
        """
        Creates a task that subsets an xarray dataset to a block determined by chunk_index.
        Block extents are determined by input_chunk_bounds.
        It also creates subtasks that subset the constituent variables of a dataset.
        """

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        chunk_tuple = tuple(chunk_index.values())
        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index[dim]]

                chunk_variable_task = (f"{name}-{gname}-{chunk[0]}",) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array possibly with dimensions chunked on other variables
                # index into variable appropriately
                subsetter = {
                    dim: _get_chunk_slicer(dim, chunk_index, input_chunk_bounds)
                    for dim in variable.dims
                }
                subset = variable.isel(subsetter)
                chunk_variable_task = (
                    f"{name}-{gname}-{dask.base.tokenize(subset)}",
                ) + chunk_tuple
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs)

    # iterate over all possible chunk combinations
    for chunk_tuple in itertools.product(*ichunk.values()):
        # mapping from dimension name to chunk index
        chunk_index = dict(zip(ichunk.keys(), chunk_tuple))

        blocked_args = [
            subset_dataset_to_block(graph, gname, arg, input_chunk_bounds, chunk_index)
            if isxr
            else arg
            for isxr, arg in zip(is_xarray, npargs)
        ]

        # expected["shapes", "coords", "data_vars", "indexes"] are used to
        # raise nice error messages in _wrapper
        expected = {}
        # input chunk 0 along a dimension maps to output chunk 0 along the same dimension
        # even if length of dimension is changed by the applied function
        expected["shapes"] = {
            k: output_chunks[k][v] for k, v in chunk_index.items() if k in output_chunks
        }
        expected["data_vars"] = set(template.data_vars.keys())  # type: ignore[assignment]
        expected["coords"] = set(template.coords.keys())  # type: ignore[assignment]
        # TODO: benbovy - flexible indexes: clean this up
        # for now assumes pandas index (thus can be indexed) but it won't be the case for
        # all indexes
        expected_indexes = {}
        for dim in indexes:
            idx = indexes[dim].to_pandas_index()[
                _get_chunk_slicer(dim, chunk_index, output_chunk_bounds)
            ]
            expected_indexes[dim] = PandasIndex(idx)
        expected["indexes"] = expected_indexes

        from_wrapper = (gname,) + chunk_tuple
        graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected)

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{name}-{gname}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l,)
            for dim in variable.dims:
                if dim in chunk_index:
                    key += (chunk_index[dim],)
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    # output can have new dimensions with exactly one chunk
                    key += (0,)

            # We're adding multiple new layers to the graph:
            # The first new layer is the result of the computation on
            # the array.
            # Then we add one layer per variable, which extracts the
            # result for that variable, and depends on just the first new
            # layer.
            new_layers[gname_l][key] = (operator.getitem, from_wrapper, name)

    hlg = HighLevelGraph.from_collections(
        gname,
        graph,
        dependencies=[arg for arg in npargs if dask.is_dask_collection(arg)],
    )

    # This adds in the getitems for each variable in the dataset.
    hlg = HighLevelGraph(
        {**hlg.layers, **new_layers},
        dependencies={
            **hlg.dependencies,
            **{name: {gname} for name in new_layers.keys()},
        },
    )

    result = Dataset(coords=indexes, attrs=template.attrs)
    for index in result.xindexes:
        result[index].attrs = template[index].attrs
        result[index].encoding = template[index].encoding

    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in output_chunks:
                var_chunks.append(output_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]),))
            elif dim in template.dims:
                # new unindexed dimension
                var_chunks.append((template.sizes[dim],))

        data = dask.array.Array(
            hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype
        )
        result[name] = (dims, data, template[name].attrs)
        result[name].encoding = template[name].encoding

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore[return-value]
    return result  # type: ignore[return-value]
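A brief, hedged usage sketch (the dataset and ``demean`` function below are made up for illustration): with a chunked Dataset, ``map_blocks`` applies the function once per block and reassembles the result, here with an explicit ``template``.

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"t": ("time", np.arange(10.0))},
    coords={"time": np.arange(10)},
).chunk({"time": 5})

def demean(block):
    # operates on one in-memory Dataset block at a time
    return block - block.mean()

out = ds.map_blocks(demean, template=ds)
print(out.compute())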
Example #9
def test_dicts_deprecated():
    a = {"x": 1, "y": (inc, "x")}
    hg = HighLevelGraph({"a": a}, {"a": set()})
    with pytest.warns(FutureWarning, match="HighLevelGraph.layers"):
        assert hg.dicts == hg.layers
Example #10
def map_blocks(
    func: Callable[..., T_DSorDA],
    obj: Union[DataArray, Dataset],
    args: Sequence[Any] = (),
    kwargs: Mapping[str, Any] = None,
) -> T_DSorDA:
    """Apply a function to each chunk of a DataArray or Dataset. This function is
    experimental and its signature may change.

    Parameters
    ----------
    func: callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter. The function will receive a subset of 'obj' (see below),
        corresponding to one chunk along each chunked dimension. ``func`` will be
        executed as ``func(obj_subset, *args, **kwargs)``.

        The function will first be run on mocked-up data that looks like 'obj' but
        has sizes 0, to determine properties of the returned object such as dtype,
        variable names, new dimensions and new indexes (if any).

        This function must return either a single DataArray or a single Dataset.

        This function cannot change size of existing dimensions, or add new chunked
        dimensions.
    obj: DataArray, Dataset
        Passed to the function as its first argument, one dask chunk at a time.
    args: Sequence
        Passed verbatim to func after unpacking, after the sliced obj. xarray objects,
        if any, will not be split by chunks. Passing dask collections is not allowed.
    kwargs: Mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        split by chunks. Passing dask collections is not allowed.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of the
    function.

    Notes
    -----
    This function is designed for when one needs to manipulate a whole xarray object
    within each chunk. In the more common case where one can work on numpy arrays, it is
    recommended to use apply_ufunc.

    If none of the variables in obj is backed by dask, calling this function is
    equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks,
    xarray.DataArray.map_blocks

    Examples
    --------

    Calculate an anomaly from climatology using ``.groupby()``. Using
    ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``,
    its indices, and its methods like ``.groupby()``.

    >>> def calculate_anomaly(da, groupby_type="time.month"):
    ...     # Necessary workaround to xarray's check with zero dimensions
    ...     # https://github.com/pydata/xarray/issues/3575
    ...     if sum(da.shape) == 0:
    ...         return da
    ...     gb = da.groupby(groupby_type)
    ...     clim = gb.mean(dim="time")
    ...     return gb - clim
    >>> time = xr.cftime_range("1990-01", "1992-01", freq="M")
    >>> np.random.seed(123)
    >>> array = xr.DataArray(
    ...     np.random.rand(len(time)), dims="time", coords=[time]
    ... ).chunk()
    >>> xr.map_blocks(calculate_anomaly, array).compute()
    <xarray.DataArray (time: 24)>
    array([ 0.12894847,  0.11323072, -0.0855964 , -0.09334032,  0.26848862,
            0.12382735,  0.22460641,  0.07650108, -0.07673453, -0.22865714,
           -0.19063865,  0.0590131 , -0.12894847, -0.11323072,  0.0855964 ,
            0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108,
            0.07673453,  0.22865714,  0.19063865, -0.0590131 ])
    Coordinates:
      * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00

    Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments
    to the function being applied in ``xr.map_blocks()``:

    >>> xr.map_blocks(
    ...     calculate_anomaly, array, kwargs={"groupby_type": "time.year"},
    ... )
    <xarray.DataArray (time: 24)>
    array([ 0.15361741, -0.25671244, -0.31600032,  0.008463  ,  0.1766172 ,
           -0.11974531,  0.43791243,  0.14197797, -0.06191987, -0.15073425,
           -0.19967375,  0.18619794, -0.05100474, -0.42989909, -0.09153273,
            0.24841842, -0.30708526, -0.31412523,  0.04197439,  0.0422506 ,
            0.14482397,  0.35985481,  0.23487834,  0.12144652])
    Coordinates:
        * time     (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00
    """

    def _wrapper(func, obj, to_array, args, kwargs):
        if to_array:
            obj = dataset_to_dataarray(obj)

        result = func(obj, *args, **kwargs)

        for name, index in result.indexes.items():
            if name in obj.indexes:
                if len(index) != len(obj.indexes[name]):
                    raise ValueError(
                        "Length of the %r dimension has changed. This is not allowed."
                        % name
                    )

        return make_dict(result)

    if not isinstance(args, Sequence):
        raise TypeError("args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in list(args) + list(kwargs.values()):
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in args or kwargs yet. Please compute or "
                "load values before passing to map_blocks."
            )

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    if isinstance(obj, DataArray):
        # only using _to_temp_dataset would break
        # func = lambda x: x.to_dataset()
        # since that relies on preserving name.
        if obj.name is None:
            dataset = obj._to_temp_dataset()
        else:
            dataset = obj.to_dataset()
        input_is_array = True
    else:
        dataset = obj
        input_is_array = False

    input_chunks = dataset.chunks

    template: Union[DataArray, Dataset] = infer_template(func, obj, *args, **kwargs)
    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}"
        )

    template_indexes = set(template.indexes)
    dataset_indexes = set(dataset.indexes)
    preserved_indexes = template_indexes & dataset_indexes
    new_indexes = template_indexes - dataset_indexes
    indexes = {dim: dataset.indexes[dim] for dim in preserved_indexes}
    indexes.update({k: template.indexes[k] for k in new_indexes})

    # We're building a new HighLevelGraph hlg. We'll have one new layer
    # for each variable in the dataset, which is the result of the
    # func applied to the values.

    graph: Dict[Any, Any] = {}
    new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict)
    gname = "{}-{}".format(
        dask.utils.funcname(func), dask.base.tokenize(dataset, args, kwargs)
    )

    # map dims to list of chunk indexes
    ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()}
    # mapping from chunk index to slice bounds
    chunk_index_bounds = {
        dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items()
    }

    # iterate over all possible chunk combinations
    for v in itertools.product(*ichunk.values()):
        chunk_index_dict = dict(zip(dataset.dims, v))

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index_dict[dim]]

                chunk_variable_task = (f"{gname}-{chunk[0]}",) + v
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array with possibly chunked dimensions
                # index into variable appropriately
                subsetter = {}
                for dim in variable.dims:
                    if dim in chunk_index_dict:
                        which_chunk = chunk_index_dict[dim]
                        subsetter[dim] = slice(
                            chunk_index_bounds[dim][which_chunk],
                            chunk_index_bounds[dim][which_chunk + 1],
                        )

                subset = variable.isel(subsetter)
                chunk_variable_task = (
                    "{}-{}".format(gname, dask.base.tokenize(subset)),
                ) + v
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        from_wrapper = (gname,) + v
        graph[from_wrapper] = (
            _wrapper,
            func,
            (Dataset, (dict, data_vars), (dict, coords), dataset.attrs),
            input_is_array,
            args,
            kwargs,
        )

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{gname}-{name}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l,)
            for dim in variable.dims:
                if dim in chunk_index_dict:
                    key += (chunk_index_dict[dim],)
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    key += (0,)

            # We're adding multiple new layers to the graph:
            # The first new layer is the result of the computation on
            # the array.
            # Then we add one layer per variable, which extracts the
            # result for that variable, and depends on just the first new
            # layer.
            new_layers[gname_l][key] = (operator.getitem, from_wrapper, name)

    hlg = HighLevelGraph.from_collections(gname, graph, dependencies=[dataset])

    for gname_l, layer in new_layers.items():
        # This adds in the getitems for each variable in the dataset.
        hlg.dependencies[gname_l] = {gname}
        hlg.layers[gname_l] = layer

    result = Dataset(coords=indexes, attrs=template.attrs)
    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in input_chunks:
                var_chunks.append(input_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]),))
            elif dim in template.dims:
                # new unindexed dimension
                var_chunks.append((template.sizes[dim],))

        data = dask.array.Array(
            hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype
        )
        result[name] = (dims, data, template[name].attrs)

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore
    return result  # type: ignore
Example #11
    def _wrap(self,
              funcname,
              *args,
              size=None,
              chunks="auto",
              extra_chunks=(),
              **kwargs):
        """Wrap numpy random function to produce dask.array random function

        extra_chunks should be a chunks tuple to append to the end of chunks
        """
        if size is not None and not isinstance(size, (tuple, list)):
            size = (size, )

        shapes = list({
            ar.shape
            for ar in chain(args, kwargs.values())
            if isinstance(ar, (Array, np.ndarray))
        })
        if size is not None:
            shapes.append(size)
        # broadcast to the final size(shape)
        size = broadcast_shapes(*shapes)
        chunks = normalize_chunks(
            chunks,
            size,  # ideally would use dtype here
            dtype=kwargs.get("dtype", np.float64),
        )
        slices = slices_from_chunks(chunks)

        def _broadcast_any(ar, shape, chunks):
            if isinstance(ar, Array):
                return broadcast_to(ar, shape).rechunk(chunks)
            if isinstance(ar, np.ndarray):
                return np.ascontiguousarray(np.broadcast_to(ar, shape))

        # Broadcast all arguments, get tiny versions as well
        # Start adding the relevant bits to the graph
        dsk = {}
        lookup = {}
        small_args = []
        dependencies = []
        for i, ar in enumerate(args):
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[i] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[i] = name
                    dsk[name] = res
                small_args.append(ar[tuple(0 for _ in ar.shape)])
            else:
                small_args.append(ar)

        small_kwargs = {}
        for key, ar in kwargs.items():
            if isinstance(ar, (np.ndarray, Array)):
                res = _broadcast_any(ar, size, chunks)
                if isinstance(res, Array):
                    dependencies.append(res)
                    lookup[key] = res.name
                elif isinstance(res, np.ndarray):
                    name = f"array-{tokenize(res)}"
                    lookup[key] = name
                    dsk[name] = res
                small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
            else:
                small_kwargs[key] = ar

        sizes = list(product(*chunks))
        seeds = random_state_data(len(sizes), self._numpy_state)
        token = tokenize(seeds, size, chunks, args, kwargs)
        name = f"{funcname}-{token}"

        keys = product([name],
                       *([range(len(bd))
                          for bd in chunks] + [[0]] * len(extra_chunks)))
        blocks = product(*[range(len(bd)) for bd in chunks])

        vals = []
        for seed, size, slc, block in zip(seeds, sizes, slices, blocks):
            arg = []
            for i, ar in enumerate(args):
                if i not in lookup:
                    arg.append(ar)
                else:
                    if isinstance(ar, Array):
                        arg.append((lookup[i], ) + block)
                    else:  # np.ndarray
                        arg.append((getitem, lookup[i], slc))
            kwrg = {}
            for k, ar in kwargs.items():
                if k not in lookup:
                    kwrg[k] = ar
                else:
                    if isinstance(ar, Array):
                        kwrg[k] = (lookup[k], ) + block
                    else:  # np.ndarray
                        kwrg[k] = (getitem, lookup[k], slc)
            vals.append((_apply_random, self._RandomState, funcname, seed,
                         size, arg, kwrg))

        meta = _apply_random(
            self._RandomState,
            funcname,
            seed,
            (0, ) * len(size),
            small_args,
            small_kwargs,
        )

        dsk.update(dict(zip(keys, vals)))

        graph = HighLevelGraph.from_collections(name,
                                                dsk,
                                                dependencies=dependencies)
        return Array(graph, name, chunks + extra_chunks, meta=meta)
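
As a rough usage sketch (illustrative values only), the wrapped methods are reached through dask.array's RandomState; scalar parameters produce one task per output chunk, while array-valued parameters are broadcast per chunk exactly as in the code above.

import numpy as np
import dask.array as da

rs = da.random.RandomState(5)

# Scalar parameters: one task per output chunk.
x = rs.normal(0, 1, size=(10, 10), chunks=(5, 5))

# Array-valued parameter: broadcast against the output shape, chunk by chunk.
means = np.arange(10)
y = rs.normal(means, 1, size=(10, 10), chunks=(5, 5))

print(x.compute().shape, y.compute().shape)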
Example no. 12
0
def test_keyset_deprecated():
    a = {"x": 1, "y": (inc, "x")}
    hg = HighLevelGraph({"a": a}, {"a": set()})
    with pytest.warns(FutureWarning, match="HighLevelGraph.keys"):
        assert hg.keyset() == hg.keys()
Example no. 13
0
        def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
            dependencies = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a)
                a = a.rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dependencies.append(a)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dependencies.append(p)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size, dtype=np.float64)
            if not replace and len(chunks[0]) > 1:
                err_msg = ("replace=False is not currently supported for "
                           "dask.array.choice with multi-chunk output "
                           "arrays")
                raise NotImplementedError(err_msg)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = "da.random.choice-%s" % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            graph = HighLevelGraph.from_collections(name,
                                                    dsk,
                                                    dependencies=dependencies)
            return Array(graph, name, chunks, dtype=dtype)
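
A short, hedged usage sketch of this choice method through dask.array's RandomState (values are illustrative; p must sum to 1, and replace=False would be rejected here because the output has more than one chunk).

import dask.array as da

state = da.random.RandomState(42)

# Sample 8 values from range(5) with non-uniform probabilities,
# split into two output chunks of 4.
sample = state.choice(5, size=8, replace=True,
                      p=[0.1, 0.1, 0.2, 0.3, 0.3], chunks=4)
print(sample.compute())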
Example no. 14
0
def fit(model,
        x,
        y,
        compute=True,
        shuffle_blocks=True,
        random_state=None,
        **kwargs):
    """ Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By default,
    this function will step through chunks of the arrays in random order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can use
    the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()
    assert x.ndim == 2
    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()

        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]

    assert hasattr(model, "partial_fit")
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (
        _partial_fit,
        (name, i - 1),
        (x.name, order[i], 0),
        (getattr(y, "name", ""), order[i]),
        kwargs,
    )
                for i in range(nblocks)})

    graphs = {x.name: x.__dask_graph__(), name: dsk}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    try:
        from dask.highlevelgraph import HighLevelGraph

        new_dsk = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict

        new_dsk = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
Example no. 15
0
def dask_reproject(
    src: da.Array,
    src_geobox: GeoBox,
    dst_geobox: GeoBox,
    resampling: str = "nearest",
    chunks: Optional[Tuple[int, int]] = None,
    src_nodata: Optional[NodataType] = None,
    dst_nodata: Optional[NodataType] = None,
    axis: int = 0,
    name: str = "reproject",
) -> da.Array:
    """
    Reproject to GeoBox as dask operation

    :param src       : Input src[(time,) y,x (, band)]
    :param src_geobox: GeoBox of the source array
    :param dst_geobox: GeoBox of the destination
    :param resampling: Resampling strategy as a string: nearest, bilinear, average, mode ...
    :param chunks    : In Y,X dimensions only, default is to use same input chunk size
    :param axis      : Index of Y axis (default is 0)
    :param src_nodata: nodata marker for source image
    :param dst_nodata: nodata marker for dst image
    :param name      : Dask graph name, "reproject" is the default
    """
    if chunks is None:
        chunks = src.chunksize[axis:axis + 2]

    if dst_nodata is None:
        dst_nodata = src_nodata

    assert src.shape[axis:axis + 2] == src_geobox.shape
    yx_shape = dst_geobox.shape
    yx_chunks = unpack_chunks(chunks, yx_shape)

    dst_chunks = src.chunks[:axis] + yx_chunks + src.chunks[axis + 2:]
    dst_shape = src.shape[:axis] + yx_shape + src.shape[axis + 2:]

    #  tuple(*dims1, y, x, *dims2) -- complete shape in blocks
    dims1 = tuple(map(len, dst_chunks[:axis]))
    dims2 = tuple(map(len, dst_chunks[axis + 2:]))
    assert dims2 == ()
    deps = [src]

    tile_shape = (yx_chunks[0][0], yx_chunks[1][0])
    gbt = GeoboxTiles(dst_geobox, tile_shape)
    xy_chunks_with_data = list(gbt.tiles(src_geobox.extent))

    name = randomize(name)
    dsk: Dict[Any, Any] = {}

    block_impl = (_reproject_block_bool_impl
                  if src.dtype == "bool" else _reproject_block_impl)

    for idx in xy_chunks_with_data:
        _dst_geobox = gbt[idx]
        rr = compute_reproject_roi(src_geobox, _dst_geobox)
        _src = crop_2d_dense(src, rr.roi_src, axis=axis)
        _src_geobox = src_geobox[rr.roi_src]

        deps.append(_src)

        for ii1 in np.ndindex(dims1):
            # TODO: band dims
            dsk[(name, *ii1, *idx)] = (
                block_impl,
                (_src.name, *ii1, 0, 0),
                _src_geobox,
                _dst_geobox,
                resampling,
                src_nodata,
                dst_nodata,
                axis,
            )

    fill_value = 0 if dst_nodata is None else dst_nodata
    shape_in_blocks = tuple(map(len, dst_chunks))

    mk_empty = empty_maker(fill_value, src.dtype, dsk)

    for idx in np.ndindex(shape_in_blocks):
        # TODO: other dims
        k = (name, *idx)
        if k not in dsk:
            bshape = tuple(ch[i] for ch, i in zip(dst_chunks, idx))
            dsk[k] = mk_empty(bshape)

    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps)

    return da.Array(dsk,
                    name,
                    chunks=dst_chunks,
                    dtype=src.dtype,
                    shape=dst_shape)
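
The core graph trick above, where only blocks overlapping the source get real tasks and every other block comes from an "empty maker", can be shown in isolation with plain dask and numpy (a sketch of the pattern, not the reproject itself).

import numpy as np
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

name = "sparse-blocks"
chunks = ((2, 2), (2, 2))  # a 4x4 array split into 2x2 blocks

dsk = {}
for i in range(2):
    for j in range(2):
        if i == j:
            # "Real work" only on the diagonal blocks.
            dsk[(name, i, j)] = (np.full, (2, 2), i + 1)
        else:
            # Everything else is a pre-filled constant block.
            dsk[(name, i, j)] = np.zeros((2, 2))

graph = HighLevelGraph.from_collections(name, dsk, dependencies=[])
arr = da.Array(graph, name, chunks=chunks, dtype="float64")
print(arr.compute())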
Example no. 16
0
    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        # Joining is a bit more complicated, so let's do it in steps:

        # 1. We now have two inputs (from left and right), so we fetch them both
        dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context)
        cc_lhs = dc_lhs.column_container
        cc_rhs = dc_rhs.column_container

        # 2. dask's merge will do some smart things with columns that have the same name
        # on lhs and rhs (which also includes reordering).
        # However, that would confuse our column numbering in SQL.
        # So we make our life easier by converting the column names into unique names.
        # We will convert back at the end.
        cc_lhs_renamed = cc_lhs.make_unique("lhs")
        cc_rhs_renamed = cc_rhs.make_unique("rhs")

        dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed)
        dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed)

        df_lhs_renamed = dc_lhs_renamed.assign()
        df_rhs_renamed = dc_rhs_renamed.assign()

        join_type = rel.getJoinType()
        join_type = self.JOIN_TYPE_MAPPING[str(join_type)]

        # 3. The join condition can take two forms that we can understand:
        # (a) a = b
        # (b) X AND Y AND a = b AND Z ... (can also contain multiple a = b)
        # The first case is very simple and we do not need any additional filter.
        # In the second case we do a merge on all of the a = b conditions
        # and then apply a filter using the other expressions.
        # In all other cases, we need to do a full table cross join and filter afterwards.
        # This is probably nonsensical for large tables, but there is no other
        # known solution so far.
        join_condition = rel.getCondition()
        lhs_on, rhs_on, filter_condition = self._split_join_condition(
            join_condition)

        logger.debug(
            f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.")

        # lhs_on and rhs_on are the indices of the columns to merge on.
        # The given column indices are for the full, merged table which consists
        # of lhs and rhs put side-by-side (in this order)
        # We therefore need to normalize the rhs indices relative to the rhs table.
        rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on]

        # 4. dask can only merge on the same column names.
        # We therefore create new columns on purpose, which have a distinct name.
        assert len(lhs_on) == len(rhs_on)
        if lhs_on:
            # 5. Now we can finally merge on these columns
            # The resulting dataframe will contain all (renamed) columns from the lhs and rhs
            # plus the added columns
            df = self._join_on_columns(
                df_lhs_renamed,
                df_rhs_renamed,
                lhs_on,
                rhs_on,
                join_type,
            )
        else:
            # 5. We are in the complex join case
            # where we have no column to merge on.
            # This means we have no choice but to merge
            # everything with everything...

            # TODO: we should implement a shortcut
            # for filter conditions that are always false

            def merge_single_partitions(lhs_partition, rhs_partition):
                # Do a cross join with the two partitions
                # TODO: it would be nice to apply the filter already here
                # problem: this would mean we need to ship the rex to the
                # workers (as this is executed on the workers),
                # which is definitely not possible (java dependency, JVM start...)
                lhs_partition = lhs_partition.assign(common=1)
                rhs_partition = rhs_partition.assign(common=1)

                return lhs_partition.merge(rhs_partition,
                                           on="common").drop(columns="common")

            # Iterate nested over all partitions from lhs and rhs and merge them
            name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed)
            dsk = {(name, i * df_rhs_renamed.npartitions + j): (
                merge_single_partitions,
                (df_lhs_renamed._name, i),
                (df_rhs_renamed._name, j),
            )
                   for i in range(df_lhs_renamed.npartitions)
                   for j in range(df_rhs_renamed.npartitions)}

            graph = HighLevelGraph.from_collections(
                name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed])

            meta = dd.dispatch.concat(
                [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty],
                axis=1)
            # TODO: Do we know the divisions in any way here?
            divisions = [None] * (len(dsk) + 1)
            df = dd.DataFrame(graph, name, meta=meta, divisions=divisions)

            warnings.warn(
                "Need to do a cross-join, which is typically very resource heavy",
                ResourceWarning,
            )

        # 6. So the next step is to make sure
        # we have the correct column order (and to remove the temporary join columns)
        correct_column_order = list(df_lhs_renamed.columns) + list(
            df_rhs_renamed.columns)
        cc = ColumnContainer(df.columns).limit_to(correct_column_order)

        # and to rename them like the rel specifies
        row_type = rel.getRowType()
        field_specifications = [str(f) for f in row_type.getFieldNames()]
        cc = cc.rename({
            from_col: to_col
            for from_col, to_col in zip(cc.columns, field_specifications)
        })
        cc = self.fix_column_to_row_type(cc, row_type)
        dc = DataContainer(df, cc)

        # 7. Last but not least we apply any filters by and-chaining together the filters
        if filter_condition:
            # This is a bit of code duplication with RexCallPlugin, but it is probably worth keeping it separate
            filter_condition = reduce(
                operator.and_,
                [
                    RexConverter.convert(rex, dc, context=context)
                    for rex in filter_condition
                ],
            )
            logger.debug(f"Additionally applying filter {filter_condition}")
            df = filter_or_scalar(df, filter_condition)
            dc = DataContainer(df, cc)

        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc
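
The fallback above is the classic constant-key cross join; the same trick in isolation with plain dask.dataframe (column names are illustrative).

import pandas as pd
import dask.dataframe as dd

lhs = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
rhs = dd.from_pandas(pd.DataFrame({"b": ["x", "y"]}), npartitions=1)

# Cross join: add a constant column to both sides, merge on it, drop it.
cross = (
    lhs.assign(common=1)
    .merge(rhs.assign(common=1), on="common")
    .drop(columns="common")
)
print(cross.compute())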
Example no. 17
0
def test_highlevelgraph_dicts_deprecation():
    with pytest.warns(FutureWarning):
        layers = {"a": BasicLayer({"x": 1, "y": (inc, "x")})}
        hg = HighLevelGraph(layers, {"a": set()})
        assert hg.dicts == layers
Example no. 18
0
def groupby_agg(
    ddf,
    gb_cols,
    aggs_in,
    split_every=None,
    split_out=None,
    dropna=True,
    sep="___",
    sort=False,
    as_index=True,
):
    """ Optimized groupby aggregation for Dask-CuDF.

        This aggregation algorithm only supports the following options:

        {"count", "mean", "std", "var", "sum", "min", "max"}

        This "optimized" approach is more performant than the algorithm
        in `dask.dataframe`, because it allows the cudf backend to
        perform multiple aggregations at once.
    """

    # Deal with default split_out and split_every params
    if split_every is False:
        split_every = ddf.npartitions
    split_every = split_every or 8
    split_out = split_out or 1

    # Standardize `gb_cols` and `columns` lists
    aggs = aggs_in.copy()
    if isinstance(gb_cols, str):
        gb_cols = [gb_cols]
    columns = [c for c in ddf.columns if c not in gb_cols]
    str_cols_out = False
    if isinstance(aggs, dict):
        # Use `str_cols_out` to specify if the output columns
        # will have str (rather than MultiIndex/tuple) names.
        # This happens when all values in the `aggs` dict are
        # strings (no lists)
        str_cols_out = True
        for col in aggs:
            if isinstance(aggs[col], str):
                aggs[col] = [aggs[col]]
            else:
                str_cols_out = False
            if col in gb_cols:
                columns.append(col)

    # Assert that aggregations are supported
    _supported = {"count", "mean", "std", "var", "sum", "min", "max"}
    if not _is_supported(aggs, _supported):
        raise ValueError(
            f"Supported aggs include {_supported} for groupby_agg API. "
            f"Aggregations must be specified with dict or list syntax.")

    # Always convert aggs to dict for consistency
    if isinstance(aggs, list):
        aggs = {col: aggs for col in columns}

    # Begin graph construction
    dsk = {}
    token = tokenize(ddf, gb_cols, aggs)
    partition_agg_name = "groupby_partition_agg-" + token
    tree_reduce_name = "groupby_tree_reduce-" + token
    gb_agg_name = "groupby_agg-" + token
    for p in range(ddf.npartitions):
        # Perform groupby aggregation on each partition.
        # Split each result into `split_out` chunks (by hashing `gb_cols`)
        dsk[(partition_agg_name, p)] = (
            _groupby_partition_agg,
            (ddf._name, p),
            gb_cols,
            aggs,
            columns,
            split_out,
            dropna,
            sort,
            sep,
        )
        # Pick out each chunk using `getitem`
        for s in range(split_out):
            dsk[(tree_reduce_name, p, s, 0)] = (
                getitem,
                (partition_agg_name, p),
                s,
            )

    # Build reduction tree
    parts = ddf.npartitions
    widths = [parts]
    while parts > 1:
        parts = math.ceil(parts / split_every)
        widths.append(parts)
    height = len(widths)
    for s in range(split_out):
        for depth in range(1, height):
            for group in range(widths[depth]):

                p_max = widths[depth - 1]
                lstart = split_every * group
                lstop = min(lstart + split_every, p_max)
                node_list = [(tree_reduce_name, p, s, depth - 1)
                             for p in range(lstart, lstop)]

                dsk[(tree_reduce_name, group, s, depth)] = (
                    _tree_node_agg,
                    node_list,
                    gb_cols,
                    split_out,
                    dropna,
                    sort,
                    sep,
                )

    # Final output partitions.
    _aggs = aggs.copy()
    if str_cols_out:
        # Metadata should use `str` for dict values if that is
        # what the user originally specified (column names will
        # be str, rather than tuples).
        for col in aggs:
            _aggs[col] = _aggs[col][0]
    _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs)
    for s in range(split_out):
        dsk[(gb_agg_name, s)] = (
            _finalize_gb_agg,
            (tree_reduce_name, 0, s, height - 1),
            gb_cols,
            aggs,
            columns,
            _meta.columns,
            as_index,
            sort,
            sep,
            str_cols_out,
        )

    divisions = [None] * (split_out + 1)
    graph = HighLevelGraph.from_collections(gb_agg_name,
                                            dsk,
                                            dependencies=[ddf])
    return new_dd_object(graph, gb_agg_name, _meta, divisions)
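
The shape of the reduction tree built above is determined purely by the number of partitions and split_every; a small standalone sketch of that bookkeeping (no cudf required).

import math

def reduction_widths(npartitions, split_every=8):
    # Mirrors the width computation used when building the reduction tree.
    parts = npartitions
    widths = [parts]
    while parts > 1:
        parts = math.ceil(parts / split_every)
        widths.append(parts)
    return widths

# 100 partitions reduced 8-at-a-time: 100 -> 13 -> 2 -> 1
print(reduction_widths(100, split_every=8))  # [100, 13, 2, 1]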
Example no. 19
0
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to the warp_geometry on chunks

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

        Returns:
            daskarray: a warped image as deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds)
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
            current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5)
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = img_md["dataType"]
        except Exception:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
        daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks),
                                                list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt)
        return image[box(*output_bounds)]
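
The pixel/world bookkeeping above hinges on the affine geotransform; a small standalone sketch with the affine package showing how ~gtf maps world coordinates back to pixel indices (the origin and GSD values are illustrative).

from affine import Affine

# gtf maps (col, row) pixel coordinates to world coordinates.
gsd = 30.0
gtf = Affine.from_gdal(500000.0, gsd, 0.0, 7000000.0, 0.0, -gsd)

world = gtf * (10, 20)  # pixel (col=10, row=20) -> world (x, y)
pixel = ~gtf * world    # inverse transform: world -> pixel
print(world, pixel)     # (500300.0, 6999400.0) (10.0, 20.0)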
Example no. 20
0
def add_row_order_factory(table_proxy, datasets):
    """
    Generate arrays which add the appropriate rows for each array row chunk
    of a dataset, as well as returning the appropriate row ordering
    for that chunk

    Each array chunk (and by implication dataset) is linked by a call to
    :func:`daskms.writes.add_row_orders` to a previous chunk,
    either in the same array or the previous array.

    This establishes an order on how:

    1. Rows are added to the table.
    2. Column writes are performed.

    Returns
    -------
    list of :class:`dask.array.Array`
        row orderings for each dataset
    """
    prev_key = None
    prev_deps = []
    row_add_ops = []

    for di, ds in enumerate(datasets):
        data_vars = ds.data_vars
        found = False

        for k, v in data_vars.items():
            dims = v.dims
            array = v.data

            # Need something with a row dimension
            if dims[0] != 'row':
                continue

            found = True
            token = dask.base.tokenize(array)
            name = '-'.join(('add-rows', str(di), token))
            layers = {}

            for b in range(array.numblocks[0]):
                key = (name, b)
                array_key = (array.name, b) + (0, ) * (array.ndim - 1)
                layers[key] = (add_row_orders, array_key, table_proxy,
                               prev_key)
                prev_key = key

            graph = HighLevelGraph.from_collections(name, layers,
                                                    prev_deps + [array])
            chunks = (array.chunks[0], )
            row_adds = da.Array(graph, name, chunks, dtype=object)
            row_add_ops.append(row_adds)
            prev_deps = [row_adds]

            break

        if not found:
            raise ValueError("Couldn't find an array with "
                             "which to establish a row ordering "
                             "in dataset %d" % di)

    return row_add_ops
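
The ordering mechanism above, where each chunk's task carries the key of the previous chunk so the scheduler serialises them, can be sketched with plain dask (illustrative names, no table I/O involved).

import dask
from dask.highlevelgraph import HighLevelGraph

def do_step(step, _previous=None):
    # `_previous` is unused; it exists only so each task depends on the
    # previous one, which forces the scheduler to run them in order.
    return step

name = "ordered-steps"
dsk = {}
prev_key = None
for i in range(4):
    key = (name, i)
    dsk[key] = (do_step, i, prev_key)
    prev_key = key

graph = HighLevelGraph.from_collections(name, dsk, dependencies=[])
print(dask.get(graph, prev_key))  # steps 0..3 run strictly in order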
Example no. 21
0
def rearrange_by_column_p2p(
    df: DataFrame,
    column: str,
    npartitions: int | None = None,
):
    from dask.dataframe import DataFrame

    npartitions = npartitions or df.npartitions
    token = tokenize(df, column, npartitions)

    setup = delayed(shuffle_setup, pure=True)(NewShuffleMetadata(
        ShuffleId(token),
        df._meta,
        column,
        npartitions,
    ))

    transferred = df.map_partitions(
        shuffle_transfer,
        token,
        setup,
        meta=df,
        enforce_metadata=False,
        transform_divisions=False,
    )

    barrier_key = "shuffle-barrier-" + token
    barrier_dsk = {
        barrier_key: (shuffle_barrier, token, transferred.__dask_keys__())
    }
    barrier = Delayed(
        barrier_key,
        HighLevelGraph.from_collections(barrier_key,
                                        barrier_dsk,
                                        dependencies=[transferred]),
    )

    name = "shuffle-unpack-" + token
    dsk = {(name, i): (shuffle_unpack, token, i, barrier_key)
           for i in range(npartitions)}
    # TODO: update to use blockwise.
    # Changes task names, so breaks setting worker restrictions at the moment.
    # Also maybe would be nice if the `DataFrameIOLayer` interface supported this?
    # dsk = blockwise(
    #     shuffle_unpack,
    #     name,
    #     "i",
    #     token,
    #     None,
    #     BlockwiseDepDict({(i,): i for i in range(npartitions)}),
    #     "i",
    #     barrier_key,
    #     None,
    #     numblocks={},
    # )

    return DataFrame(
        HighLevelGraph.from_collections(name, dsk, [barrier]),
        name,
        df._meta,
        [None] * (npartitions + 1),
    )
Example no. 22
0
def sjoin(left, right, how="inner", predicate="intersects", **kwargs):
    """
    Spatial join of two GeoDataFrames.

    Parameters
    ----------
    left, right : geopandas or dask_geopandas GeoDataFrames
        If a geopandas.GeoDataFrame is passed, it is considered as a
        dask_geopandas.GeoDataFrame with 1 partition (without spatial
        partitioning information).
    how : string, default 'inner'
        The type of join. Currently only 'inner' is supported.
    predicate : string, default 'intersects'
        Binary predicate how to match corresponding rows of the left and right
        GeoDataFrame. Possible values: 'contains', 'contains_properly',
        'covered_by', 'covers', 'crosses', 'intersects', 'overlaps',
        'touches', 'within'.

    Returns
    -------
    dask_geopandas.GeoDataFrame

    Notes
    -----
    If both the left and right GeoDataFrame have spatial partitioning
    information available (the ``spatial_partitions`` attribute is set),
    the output partitions are determined based on intersection of the
    spatial partitions. In all other cases, the output partitions are
    all combinations (cartesian/cross product) of all input partitions
    of the left and right GeoDataFrame.
    """
    if "op" in kwargs:
        predicate = kwargs.pop("op")
        deprecation_message = (
            "The `op` parameter is deprecated and will be removed"
            " in a future release. Please use the `predicate` parameter"
            " instead."
        )
        warnings.warn(deprecation_message, FutureWarning, stacklevel=2)
    if how != "inner":
        raise NotImplementedError("Only how='inner' is supported right now")

    if isinstance(left, geopandas.GeoDataFrame):
        left = from_geopandas(left, npartitions=1)
    if isinstance(right, geopandas.GeoDataFrame):
        right = from_geopandas(right, npartitions=1)

    name = "sjoin-" + tokenize(left, right, how, predicate)
    meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate)

    if left.spatial_partitions is not None and right.spatial_partitions is not None:
        # Spatial partitions are known -> use them to trim down the list of
        # partitions that need to be joined
        parts = geopandas.sjoin(
            left.spatial_partitions.to_frame("geometry"),
            right.spatial_partitions.to_frame("geometry"),
            how="inner",
            predicate="intersects",
        )
        parts_left = np.asarray(parts.index)
        parts_right = np.asarray(parts["index_right"].values)
        using_spatial_partitions = True
    else:
        # Unknown spatial partitions -> full cartesian (cross) product of all
        # combinations of the partitions of the left and right dataframe
        n_left = left.npartitions
        n_right = right.npartitions
        parts_left = np.repeat(np.arange(n_left), n_right)
        parts_right = np.tile(np.arange(n_right), n_left)
        using_spatial_partitions = False

    dsk = {}
    new_spatial_partitions = []
    for i, (l, r) in enumerate(zip(parts_left, parts_right)):
        dsk[(name, i)] = (
            geopandas.sjoin,
            (left._name, l),
            (right._name, r),
            how,
            predicate,
        )
        # TODO preserve spatial partitions of the output if only left has spatial
        # partitions
        if using_spatial_partitions:
            lr = left.spatial_partitions.iloc[l]
            rr = right.spatial_partitions.iloc[r]
            # extent = lr.intersection(rr).buffer(buffer).intersection(lr.union(rr))
            extent = lr.intersection(rr)
            new_spatial_partitions.append(extent)

    divisions = [None] * (len(dsk) + 1)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[left, right])
    if not using_spatial_partitions:
        new_spatial_partitions = None
    return GeoDataFrame(graph, name, meta, divisions, new_spatial_partitions)
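
A hedged usage sketch of this sjoin through dask_geopandas (the geometries and partition counts are illustrative).

import geopandas
import dask_geopandas
from shapely.geometry import Point, box

pts = geopandas.GeoDataFrame(
    {"value": [1, 2]},
    geometry=[Point(0.5, 0.5), Point(5.0, 5.0)],
)
polys = geopandas.GeoDataFrame(
    {"name": ["unit square"]},
    geometry=[box(0, 0, 1, 1)],
)

dpts = dask_geopandas.from_geopandas(pts, npartitions=2)
dpolys = dask_geopandas.from_geopandas(polys, npartitions=1)

joined = dask_geopandas.sjoin(dpts, dpolys, predicate="intersects")
print(joined.compute())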
Example no. 23
0
def to_csv(x: xarray.DataArray, path: str, *, nogil: bool = True, **kwargs):
    """Print DataArray to CSV.

    When x has a numpy backend, this function is functionally equivalent to
    (but much faster than)::

        x.to_pandas().to_csv(path_or_buf, **kwargs)

    When x has a dask backend, this function returns a dask delayed object which
    will write to the disk only when its .compute() method is invoked.

    Formatting and optional compression are parallelised across all available
    CPUs, using one dask task per chunk on the first dimension. Chunks on other
    dimensions will be merged ahead of computation.

    :param x:
        :class:`~xarray.DataArray` with one or two dimensions
    :param str path:
        Output file path
    :param bool nogil:
        If True, use accelerated C implementation. Several kwargs won't be
        processed correctly (see limitations below). If False, use pandas
        to_csv method (slow, and does not release the GIL).
        nogil=True exclusively supports float and integer value dtypes (but
        the coords can be anything). In case of an incompatible dtype, nogil
        is automatically switched to False.
    :param kwargs:
        Passed verbatim to :meth:`pandas.DataFrame.to_csv` or
        :meth:`pandas.Series.to_csv`

    **Limitations**

    - Fancy URIs are not (yet) supported.
    - compression='zip' is not supported. All other compression methods (gzip,
      bz2, xz) are supported.
    - When running with nogil=True, the following parameters are ignored:
      columns, quoting, quotechar, doublequote, escapechar, chunksize, decimal

    **Distributed computing**

    This function supports `dask distributed`_, with the caveat that all workers
    must write to the same shared mountpoint and that the shared filesystem
    must strictly guarantee **close-open coherency**, meaning that one must be
    able to call write() and then close() on a file descriptor from one host
    and then immediately afterwards open() from another host and see the output
    from the first host. Note that, for performance reasons, most network
    filesystems do not enable this feature by default.

    Alternatively, one may write to local mountpoints and then manually collect
    and concatenate the partial outputs.
    """
    if not isinstance(x, xarray.DataArray):
        raise ValueError("first argument must be a DataArray")

    # Health checks
    if not isinstance(path, str):
        raise ValueError("path_or_buf must be a file path")

    if x.ndim not in (1, 2):
        raise ValueError("cannot convert arrays with %d dimensions into "
                         "pandas objects" % x.ndim)

    if nogil and x.dtype.kind not in "if":
        nogil = False

    # Extract row and columns indices
    indices = [x.get_index(dim) for dim in x.dims]
    if x.ndim == 2:
        index, columns = indices
    else:
        index = indices[0]
        columns = None

    compression = kwargs.pop("compression", "infer")
    compress = _compress_func(path, compression)
    mode = kwargs.pop("mode", "w")
    if mode not in "wa":
        raise ValueError('mode: expected w or a; got "%s"' % mode)

    # Fast exit for numpy backend
    if not x.chunks:
        bdata = kernels.to_csv(x.values, index, columns, True, nogil, kwargs)
        if compress:
            bdata = compress(bdata)
        with open(path, mode + "b") as fh:
            fh.write(bdata)
        return None

    # Merge chunks on all dimensions beyond the first
    x = x.chunk((x.chunks[0], ) + tuple((s, ) for s in x.shape[1:]))

    # Manually define the dask graph
    tok = tokenize(x.data, index, columns, compression, path, kwargs)
    name1 = "to_csv_encode-" + tok
    name2 = "to_csv_compress-" + tok
    name3 = "to_csv_write-" + tok
    name4 = "to_csv-" + tok

    dsk: dict[str | tuple, tuple] = {}

    assert x.chunks
    assert x.chunks[0]
    offset = 0
    for i, size in enumerate(x.chunks[0]):
        # Slice index
        index_i = index[offset:offset + size]
        offset += size

        x_i = (x.data.name, i) + (0, ) * (x.ndim - 1)

        # Step 1: convert to CSV and encode to binary blob
        if i == 0:
            # First chunk: print header
            dsk[name1, i] = (kernels.to_csv, x_i, index_i, columns, True,
                             nogil, kwargs)
        else:
            kwargs_i = kwargs.copy()
            kwargs_i["header"] = False
            dsk[name1, i] = (kernels.to_csv, x_i, index_i, None, False, nogil,
                             kwargs_i)

        # Step 2 (optional): compress
        if compress:
            prevname = name2
            dsk[name2, i] = compress, (name1, i)
        else:
            prevname = name1

        # Step 3: write to file
        if i == 0:
            # First chunk: overwrite file if it already exists
            dsk[name3, i] = kernels.to_file, path, mode + "b", (prevname, i)
        else:
            # Next chunks: wait for previous chunk to complete and append
            dsk[name3, i] = (kernels.to_file, path, "ab", (prevname, i),
                             (name3, i - 1))

    # Rename final key
    dsk[name4] = dsk.pop((name3, i))

    hlg = HighLevelGraph.from_collections(name4, dsk, (x, ))
    return Delayed(name4, hlg)
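
A hedged usage sketch of the to_csv defined above (its module path is not shown here, so it is simply called directly); on a dask-backed DataArray nothing is written until the returned Delayed is computed.

import numpy as np
import xarray as xr

# Two-dimensional, dask-backed DataArray chunked along the first dimension.
arr = xr.DataArray(
    np.random.rand(6, 3),
    dims=("row", "col"),
    coords={"row": list("abcdef"), "col": ["x", "y", "z"]},
).chunk({"row": 2})

delayed_write = to_csv(arr, "out.csv")
delayed_write.compute()  # one encode/compress/write chain per row chunk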
Example no. 24
0
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to the warp_geometry on chunks

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

        Returns:
            daskarray: a warped image as deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds))
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
            current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5 )
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = RDA_TO_DTYPE[img_md["dataType"]]
        except Exception:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
        daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks), list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta, __geo_interface__ = gi, __geo_transform__ = gt)
        return image[box(*output_bounds)]
Example no. 25
0
def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(
        arg.npartitions for arg in args if isinstance(arg, _Frame)
    )
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = "{0}-chunk-{1}".format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {
            (a, 0, i): (chunk, key)
            for i, key in enumerate(args[0].__dask_keys__())
        }
    else:
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = "{0}-combine-{1}".format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = dd.core.make_meta(meta)

    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
    return dd.core.new_dd_object(graph, b, meta, (None, None))
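
A hedged usage sketch of the reduction defined above with a plain dask DataFrame: chunk produces a per-partition partial sum and aggregate/combine fold the partials together (names and values are illustrative).

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(100)}), npartitions=10)

total = reduction(
    ddf,
    chunk=lambda part: part[["x"]].sum(),  # per-partition partial sums
    aggregate=lambda parts: pd.concat(parts).groupby(level=0).sum(),
    meta=pd.Series([0], index=["x"], dtype="int64"),
    split_every=4,
)
print(total.compute())  # x    4950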
Example no. 26
0
def reshape_yxbt(
    xx: xr.Dataset,
    name: str = "reshape_yxbt",
    yx_chunks: Union[int, Tuple[int, int]] = -1,
) -> xr.DataArray:
    """
    Reshape Dask-backed ``xr.Dataset[Time,Y,X]`` into
    ``xr.DataArray[Y,X,Band,Time]``. On the output DataArray there is
    exactly one chunk along both Time and Band dimensions.

    :param xx: Dataset with 3 dimensional bands, dimension order (time, y, x)

    :param name: Dask name of the output operation

    :param yx_chunks: If supplied, subdivide the Y/X chunks of the input into
                      smaller sections; note that this can only make yx chunks
                      smaller, not bigger. Every output chunk depends on exactly
                      one input chunk, so output chunks might not be regular: for
                      example, with input chunk sizes of 10 and yx_chunks=3 you
                      get chunks sized 3,3,3,1,3,3,3,1,... (example only, never
                      use chunks that small)

    .. note::

       Chunks along the first dimension ought to be of size exactly 1 (the
       default for the time dimension when using dc.load).
    """
    if isinstance(yx_chunks, int):
        yx_chunks = (yx_chunks, yx_chunks)

    if not is_dask_collection(xx):
        raise ValueError("Currently this code works only on Dask inputs")

    if not all(dv.data.numblocks[0] == dv.data.shape[0]
               for dv in xx.data_vars.values()):
        raise ValueError(
            "All input bands should have chunk=1 for the first dimension")

    name0 = name
    name = randomize(name)

    blocks, _ = _get_chunks_for_all_bands(xx)
    b0, *_ = xx.data_vars.values()

    attrs = dict(b0.attrs)
    nb = len(xx.data_vars.values())
    nt, ny, nx = b0.shape

    deps = [dv.data for dv in xx.data_vars.values()]
    shape = (ny, nx, nb, nt)
    dtype = b0.dtype
    dims = b0.dims[1:] + ("band", b0.dims[0])

    maxy, maxx = yx_chunks
    ychunks, xchunks = b0.data.chunks[1:3]
    _yy = list(_split_chunks(ychunks, maxy))
    _xx = list(_split_chunks(xchunks, maxx))
    ychunks = tuple(roi.stop - roi.start for _, _, roi in _yy)
    xchunks = tuple(roi.stop - roi.start for _, _, roi in _xx)

    chunks = [ychunks, xchunks, (nb, ), (nt, )]

    dsk = {}
    for iy, iy_src, y_roi in _yy:
        for ix, ix_src, x_roi in _xx:
            crop_yx = (y_roi, x_roi)
            _blocks = blocks[:, :, iy_src, ix_src].tolist()
            dsk[(name, iy, ix, 0, 0)] = (
                functools.partial(_reshape_yxbt_impl, crop_yx=crop_yx),
                _blocks,
            )

    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps)
    data = da.Array(dsk, name, chunks=chunks, dtype=dtype, shape=shape)

    coords: Dict[Hashable, Any] = {k: c for k, c in xx.coords.items()}
    coords["band"] = list(xx.data_vars)

    return xr.DataArray(data=data,
                        dims=dims,
                        coords=coords,
                        name=name0,
                        attrs=attrs)
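
A hedged usage sketch of reshape_yxbt, assuming the private helpers it relies on are importable alongside it; the input must be dask-backed with time chunks of size 1, as noted above.

import numpy as np
import xarray as xr

nt, ny, nx = 3, 8, 8
ds = xr.Dataset(
    {
        "red": (("time", "y", "x"), np.zeros((nt, ny, nx), dtype="uint16")),
        "nir": (("time", "y", "x"), np.ones((nt, ny, nx), dtype="uint16")),
    }
).chunk({"time": 1, "y": 4, "x": 4})

yxbt = reshape_yxbt(ds, yx_chunks=4)
print(yxbt.dims, yxbt.shape)  # ('y', 'x', 'band', 'time') (8, 8, 2, 3)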
Example no. 27
0
def map_blocks(
        func: Callable[..., T_DSorDA],
        obj: Union[DataArray, Dataset],
        args: Sequence[Any] = (),
        kwargs: Mapping[str, Any] = None,
) -> T_DSorDA:
    """Apply a function to each chunk of a DataArray or Dataset. This function is
    experimental and its signature may change.

    Parameters
    ----------
    func: callable
        User-provided function that accepts a DataArray or Dataset as its first
        parameter. The function will receive a subset of 'obj' (see below),
        corresponding to one chunk along each chunked dimension. ``func`` will be
        executed as ``func(obj_subset, *args, **kwargs)``.

        The function will first be run on mocked-up data that looks like 'obj'
        but has sizes 0, to determine properties of the returned object such as
        dtype, variable names, new dimensions and new indexes (if any).

        This function must return either a single DataArray or a single Dataset.

        This function cannot change size of existing dimensions, or add new chunked
        dimensions.
    obj: DataArray, Dataset
        Passed to the function as its first argument, one dask chunk at a time.
    args: Sequence
        Passed verbatim to func after unpacking, after the sliced obj. xarray objects,
        if any, will not be split by chunks. Passing dask collections is not allowed.
    kwargs: Mapping
        Passed verbatim to func after unpacking. xarray objects, if any, will not be
        split by chunks. Passing dask collections is not allowed.

    Returns
    -------
    A single DataArray or Dataset with dask backend, reassembled from the outputs of the
    function.

    Notes
    -----
    This function is designed for when one needs to manipulate a whole xarray object
    within each chunk. In the more common case where one can work on numpy arrays, it is
    recommended to use apply_ufunc.

    If none of the variables in obj is backed by dask, calling this function is
    equivalent to calling ``func(obj, *args, **kwargs)``.

    See Also
    --------
    dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks,
    xarray.DataArray.map_blocks
    """
    def _wrapper(func, obj, to_array, args, kwargs):
        if to_array:
            obj = dataset_to_dataarray(obj)

        result = func(obj, *args, **kwargs)

        for name, index in result.indexes.items():
            if name in obj.indexes:
                if len(index) != len(obj.indexes[name]):
                    raise ValueError(
                        "Length of the %r dimension has changed. This is not allowed."
                        % name)

        return make_dict(result)

    if not isinstance(args, Sequence):
        raise TypeError(
            "args must be a sequence (for example, a list or tuple).")
    if kwargs is None:
        kwargs = {}
    elif not isinstance(kwargs, Mapping):
        raise TypeError("kwargs must be a mapping (for example, a dict)")

    for value in list(args) + list(kwargs.values()):
        if dask.is_dask_collection(value):
            raise TypeError(
                "Cannot pass dask collections in args or kwargs yet. Please compute or "
                "load values before passing to map_blocks.")

    if not dask.is_dask_collection(obj):
        return func(obj, *args, **kwargs)

    if isinstance(obj, DataArray):
        # only using _to_temp_dataset would break
        # func = lambda x: x.to_dataset()
        # since that relies on preserving name.
        if obj.name is None:
            dataset = obj._to_temp_dataset()
        else:
            dataset = obj.to_dataset()
        input_is_array = True
    else:
        dataset = obj
        input_is_array = False

    input_chunks = dataset.chunks

    template: Union[DataArray,
                    Dataset] = infer_template(func, obj, *args, **kwargs)
    if isinstance(template, DataArray):
        result_is_array = True
        template_name = template.name
        template = template._to_temp_dataset()
    elif isinstance(template, Dataset):
        result_is_array = False
    else:
        raise TypeError(
            f"func output must be DataArray or Dataset; got {type(template)}")

    template_indexes = set(template.indexes)
    dataset_indexes = set(dataset.indexes)
    preserved_indexes = template_indexes & dataset_indexes
    new_indexes = template_indexes - dataset_indexes
    indexes = {dim: dataset.indexes[dim] for dim in preserved_indexes}
    indexes.update({k: template.indexes[k] for k in new_indexes})

    graph: Dict[Any, Any] = {}
    gname = "{}-{}".format(dask.utils.funcname(func),
                           dask.base.tokenize(dataset, args, kwargs))

    # map dims to list of chunk indexes
    ichunk = {
        dim: range(len(chunks_v))
        for dim, chunks_v in input_chunks.items()
    }
    # mapping from chunk index to slice bounds
    chunk_index_bounds = {
        dim: np.cumsum((0, ) + chunks_v)
        for dim, chunks_v in input_chunks.items()
    }
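    # e.g. chunks of (2, 2, 1) along a dimension give bounds [0, 2, 4, 5],
    # so chunk index i spans rows bounds[i]:bounds[i + 1] of that dimension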

    # iterate over all possible chunk combinations
    for v in itertools.product(*ichunk.values()):
        chunk_index_dict = dict(zip(dataset.dims, v))

        # this will become [[name1, variable1],
        #                   [name2, variable2],
        #                   ...]
        # which is passed to dict and then to Dataset
        data_vars = []
        coords = []

        for name, variable in dataset.variables.items():
            # make a task that creates tuple of (dims, chunk)
            if dask.is_dask_collection(variable.data):
                # recursively index into dask_keys nested list to get chunk
                chunk = variable.__dask_keys__()
                for dim in variable.dims:
                    chunk = chunk[chunk_index_dict[dim]]

                chunk_variable_task = (f"{gname}-{chunk[0]}", ) + v
                graph[chunk_variable_task] = (
                    tuple,
                    [variable.dims, chunk, variable.attrs],
                )
            else:
                # non-dask array with possibly chunked dimensions
                # index into variable appropriately
                subsetter = {}
                for dim in variable.dims:
                    if dim in chunk_index_dict:
                        which_chunk = chunk_index_dict[dim]
                        subsetter[dim] = slice(
                            chunk_index_bounds[dim][which_chunk],
                            chunk_index_bounds[dim][which_chunk + 1],
                        )

                subset = variable.isel(subsetter)
                chunk_variable_task = ("{}-{}".format(
                    gname, dask.base.tokenize(subset)), ) + v
                graph[chunk_variable_task] = (
                    tuple,
                    [subset.dims, subset, subset.attrs],
                )

            # this task creates dict mapping variable name to above tuple
            if name in dataset._coord_names:
                coords.append([name, chunk_variable_task])
            else:
                data_vars.append([name, chunk_variable_task])

        from_wrapper = (gname, ) + v
        graph[from_wrapper] = (
            _wrapper,
            func,
            (Dataset, (dict, data_vars), (dict, coords), dataset.attrs),
            input_is_array,
            args,
            kwargs,
        )

        # mapping from variable name to dask graph key
        var_key_map: Dict[Hashable, str] = {}
        for name, variable in template.variables.items():
            if name in indexes:
                continue
            gname_l = f"{gname}-{name}"
            var_key_map[name] = gname_l

            key: Tuple[Any, ...] = (gname_l, )
            for dim in variable.dims:
                if dim in chunk_index_dict:
                    key += (chunk_index_dict[dim], )
                else:
                    # unchunked dimensions in the input have one chunk in the result
                    key += (0, )

            graph[key] = (operator.getitem, from_wrapper, name)

    graph = HighLevelGraph.from_collections(gname,
                                            graph,
                                            dependencies=[dataset])

    result = Dataset(coords=indexes, attrs=template.attrs)
    for name, gname_l in var_key_map.items():
        dims = template[name].dims
        var_chunks = []
        for dim in dims:
            if dim in input_chunks:
                var_chunks.append(input_chunks[dim])
            elif dim in indexes:
                var_chunks.append((len(indexes[dim]), ))

        data = dask.array.Array(graph,
                                name=gname_l,
                                chunks=var_chunks,
                                dtype=template[name].dtype)
        result[name] = (dims, data, template[name].attrs)

    result = result.set_coords(template._coord_names)

    if result_is_array:
        da = dataset_to_dataarray(result)
        da.name = template_name
        return da  # type: ignore
    return result  # type: ignore
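

# Illustrative usage sketch: a minimal, hedged demonstration of the
# map_blocks behaviour implemented above, assuming xarray and dask are
# installed (``xr.map_blocks`` is the public entry point for this code path).
# The toy ``demean`` function is a hypothetical example; it keeps dimension
# sizes unchanged and adds no new chunked dimensions, as the docstring
# requires.
import numpy as np
import xarray as xr


def demean(block):
    # receives one dask chunk of the DataArray, wrapped as a DataArray
    return block - block.mean()


arr = xr.DataArray(
    np.arange(12, dtype=float).reshape(4, 3), dims=("x", "y")
).chunk({"x": 2})
lazy = xr.map_blocks(demean, arr)  # dask-backed, evaluated lazily
lazy.compute()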
Exemplo n.º 28
0
def _parallel_var(ddf, meta, skipna, split_every, out):
    def _local_var(x, skipna):
        if skipna:
            n = x.count(skipna=skipna)
            avg = x.mean(skipna=skipna)
        else:
            # Not skipping nulls, so might as well
            # avoid the full `count` operation
            n = len(x)
            avg = x.sum(skipna=skipna) / n
        m2 = ((x - avg) ** 2).sum(skipna=skipna)
        return n, avg, m2

    def _aggregate_var(parts):
        n, avg, m2 = parts[0]
        for i in range(1, len(parts)):
            n_a, avg_a, m2_a = n, avg, m2
            n_b, avg_b, m2_b = parts[i]
            n = n_a + n_b
            avg = (n_a * avg_a + n_b * avg_b) / n
            delta = avg_b - avg_a
            m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
        return n, avg, m2

    def _finalize_var(vals):
        n, _, m2 = vals
        return m2 / (n - 1)

    # Build graph
    nparts = ddf.npartitions
    if not split_every:
        split_every = nparts
    name = "var-" + tokenize(skipna, split_every, out)
    local_name = "local-" + name
    num = ddf._get_numeric_data()
    dsk = {
        (local_name, n, 0): (_local_var, (num._name, n), skipna)
        for n in range(nparts)
    }

    # Use reduction tree
    widths = [nparts]
    while nparts > 1:
        nparts = math.ceil(nparts / split_every)
        widths.append(nparts)
    height = len(widths)
    for depth in range(1, height):
        for group in range(widths[depth]):
            p_max = widths[depth - 1]
            lstart = split_every * group
            lstop = min(lstart + split_every, p_max)
            node_list = [
                (local_name, p, depth - 1) for p in range(lstart, lstop)
            ]
            dsk[(local_name, group, depth)] = (_aggregate_var, node_list)
    if height == 1:
        group = depth = 0
    dsk[(name, 0)] = (_finalize_var, (local_name, group, depth))

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf])
    result = dd.core.new_dd_object(graph, name, meta, (None, None))
    if isinstance(ddf, DataFrame):
        result.divisions = (min(ddf.columns), max(ddf.columns))
    return handle_out(out, result)
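

# Illustrative sketch: the pairwise update in ``_aggregate_var`` above is the
# standard parallel-variance merge of (count, mean, M2) summaries. The
# hypothetical ``merge_var_stats`` helper below repeats that formula with
# plain NumPy and checks it against a direct ddof=1 variance.
import numpy as np


def merge_var_stats(a, b):
    # combine two (n, mean, M2) triples exactly as _aggregate_var does
    n_a, avg_a, m2_a = a
    n_b, avg_b, m2_b = b
    n = n_a + n_b
    avg = (n_a * avg_a + n_b * avg_b) / n
    delta = avg_b - avg_a
    m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
    return n, avg, m2


x = np.random.default_rng(0).normal(size=1000)
acc = None
for p in np.array_split(x, 4):
    stats = (len(p), p.mean(), ((p - p.mean()) ** 2).sum())
    acc = stats if acc is None else merge_var_stats(acc, stats)
n, _, m2 = acc
assert np.isclose(m2 / (n - 1), x.var(ddof=1))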
Exemplo n.º 29
0
def grid(vis,
         uvw,
         flags,
         weights,
         frequencies,
         grid_config,
         wmin=-1e30,
         wmax=1e30,
         streams=None):
    """
    Grids the supplied visibilities in parallel. Note that
    a grid is created for each visibility chunk.

    Parameters
    ----------
    vis : :class:`dask.array.Array`
        visibilities of shape :code:`(row, chan, corr)`
    uvw : :class:`dask.array.Array`
        uvw coordinates of shape :code:`(row, 3)`
    flags : :class:`dask.array.Array`
        flags of shape :code:`(row, chan, corr)`
    weights : :class:`dask.array.Array`
        weights of shape :code:`(row, chan, corr)`.
    frequencies : :class:`dask.array.Array`
        frequencies of shape :code:`(chan,)`
    grid_config : :class:`GridderConfigWrapper`
        Gridding Configuration
    wmin : float
        Minimum W coordinate to grid. Defaults to -1e30.
    wmax : float
        Maximum W coordinate to grid. Defaults to 1e30.
    streams : int, optional
        Number of parallel gridding operations. Defaults to None,
        in which case as many grids as visibility chunks will
        be created.

    Returns
    -------
    grid : :class:`dask.array.Array`
        grid of shape :code:`(ny, nx, corr)`
    """
    if len(frequencies.chunks[0]) != 1:
        raise ValueError("Chunking in channel currently unsupported")

    # Create a baseline object per row chunk
    baselines = da.blockwise(_nifty_baselines, ("row", ),
                             uvw, ("row", "uvw"),
                             frequencies, ("chan", ),
                             dtype=object)

    gc = grid_config.object
    grids = []

    for corr in range(vis.shape[2]):
        corr_flags = flags[:, :, corr]
        corr_vis = vis[:, :, corr]
        corr_weights = weights[:, :, corr]

        indices = da.blockwise(
            _nifty_indices,
            ("row", ),
            baselines,
            ("row", ),
            gc,
            None,
            corr_flags,
            ("row", "chan"),
            -1,
            None,  # channel begin
            -1,
            None,  # channel end
            wmin,
            None,
            wmax,
            None,
            dtype=np.int32)

        if streams is None:
            # Standard parallel reduction, possibly memory hungry
            # if many threads (and thus grids) are gridding
            # parallel
            grid = da.blockwise(_nifty_grid, ("row", "nu", "nv"),
                                baselines, ("row", ),
                                gc,
                                None,
                                indices, ("row", ),
                                corr_vis, ("row", "chan"),
                                corr_weights, ("row", "chan"),
                                new_axes={
                                    "nu": gc.Nu(),
                                    "nv": gc.Nv()
                                },
                                adjust_chunks={"row": 1},
                                dtype=np.complex128)

            grids.append(grid.sum(axis=0))
        else:
            # Stream reduction
            layers = GridStreamReduction(baselines, indices, gc, corr_vis,
                                         corr_weights, corr, streams)
            deps = [baselines, indices, corr_vis, corr_weights]
            graph = HighLevelGraph.from_collections(layers.name, layers, deps)
            chunks = corr_vis.chunks
            grid_stream_red = da.Array(graph, layers.name, chunks, vis.dtype)

            layers = FinalGridReduction(layers)
            deps = [grid_stream_red]
            graph = HighLevelGraph.from_collections(layers.name, layers, deps)
            chunks = ((gc.Nu(), ), (gc.Nv(), ))
            corr_grid = da.Array(graph, layers.name, chunks, vis.dtype)

            grids.append(corr_grid)

    return da.stack(grids, axis=2)
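

# Illustrative sketch of the reduction pattern used in the ``streams is None``
# branch above: ``da.blockwise`` maps every row chunk to a full-size partial
# grid (``new_axes`` adds the grid axes, ``adjust_chunks`` collapses "row" to
# one entry per chunk), and the partial grids are then summed over the row
# axis. ``_partial_grid`` is a hypothetical stand-in for the nifty gridder
# call; only the dask pattern is being demonstrated.
import dask.array as da
import numpy as np

NU, NV = 8, 8


def _partial_grid(rows):
    # produce one (1, NU, NV) partial grid per chunk of rows
    out = np.zeros((1, NU, NV), dtype=np.complex128)
    out[0, 0, 0] = rows.sum()
    return out


rows = da.ones(100, chunks=25)
partial = da.blockwise(_partial_grid, ("row", "nu", "nv"),
                       rows, ("row",),
                       new_axes={"nu": NU, "nv": NV},
                       adjust_chunks={"row": 1},
                       dtype=np.complex128)
full_grid = partial.sum(axis=0)  # same reduction as grid.sum(axis=0) above
full_grid.compute()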
Exemplo n.º 30
0
def affine_transform(image,
                     matrix,
                     offset=0.0,
                     output_shape=None,
                     order=1,
                     output_chunks=None,
                     **kwargs):
    """Apply an affine transform using Dask. For every
    output chunk, only the slice containing the relevant part
    of the image is processed. Chunkwise processing is performed
    either using `ndimage.affine_transform` or
    `cupyx.scipy.ndimage.affine_transform`, depending on the input type.

    Notes
    -----
        Differences to `ndimage.affine_transform`:
        - currently, prefiltering is not supported
          (affecting the output in case of interpolation `order > 1`)
        - the default interpolation order is 1
        - the modes 'reflect', 'mirror' and 'wrap' are not supported

        The arguments are the same as for `ndimage.affine_transform`,
        except for the additional `output_chunks`.

    Parameters
    ----------
    image : array_like (Numpy Array, Cupy Array, Dask Array...)
        The image array.
    matrix : array (ndim,), (ndim, ndim), (ndim, ndim+1) or (ndim+1, ndim+1)
        Transformation matrix.
    offset : float or sequence, optional
        The offset into the array where the transform is applied. If a float,
        `offset` is the same for each axis. If a sequence, `offset` should
        contain one value for each axis.
    output_shape : tuple of ints, optional
        The shape of the array to be returned.
    order : int, optional
        The order of the spline interpolation. Note that for order > 1,
        scipy's affine_transform applies spline prefiltering, which is not
        yet supported and is skipped in this implementation.
    output_chunks : tuple of ints, optional
        The shape of the chunks of the output Dask Array.

    Returns
    -------
    affine_transform : Dask Array
        A dask array representing the transformed output

    """

    if not isinstance(image, da.core.Array):
        image = da.from_array(image)

    if output_shape is None:
        output_shape = image.shape

    if output_chunks is None:
        output_chunks = image.shape

    # Perform test run to ensure parameter validity.
    ndimage_affine_transform(np.zeros([0] * image.ndim), matrix, offset)

    # Make sure parameters contained in matrix and offset
    # are not overlapping, i.e. that the offset is valid as
    # it needs to be modified for each chunk.
    # Further parameter checks are performed directly by
    # `ndimage.affine_transform`.

    matrix = np.asarray(matrix)
    offset = np.asarray(offset).squeeze()

    # these lines were copied and adapted from `ndimage.affine_transform`
    if (matrix.ndim == 2 and matrix.shape[1] == image.ndim + 1
            and (matrix.shape[0] in [image.ndim, image.ndim + 1])):

        # assume input is homogeneous coordinate transformation matrix
        offset = matrix[:image.ndim, image.ndim]
        matrix = matrix[:image.ndim, :image.ndim]

    # process kwargs
    # prefilter is not yet supported
    if 'prefilter' in kwargs:
        if kwargs['prefilter'] and order > 1:
            warnings.warn(
                'Currently, `dask_image.ndinterp.affine_transform` '
                'doesn\'t support `prefilter=True`. Proceeding with'
                ' `prefilter=False`, which if order > 1 can lead '
                'to the output containing more blur than with '
                'prefiltering.', UserWarning)
        del kwargs['prefilter']

    if 'mode' in kwargs:
        if kwargs['mode'] in ['wrap', 'reflect', 'mirror']:
            raise NotImplementedError(
                "Mode %s is not currently supported." % kwargs['mode'])

    n = image.ndim
    image_shape = image.shape

    # calculate output array properties
    normalized_chunks = da.core.normalize_chunks(output_chunks,
                                                 tuple(output_shape))
    block_indices = product(*(range(len(bds)) for bds in normalized_chunks))
    block_offsets = [np.cumsum((0, ) + bds[:-1]) for bds in normalized_chunks]

    # use dispatching mechanism to determine backend
    affine_transform_method = dispatch_affine_transform(image)
    asarray_method = dispatch_asarray(image)

    # construct dask graph for output array
    # using unique and deterministic identifier
    output_name = 'affine_transform-' + tokenize(
        image, matrix, offset, output_shape, output_chunks, kwargs)
    output_layer = {}
    rel_images = []
    for ib, block_ind in enumerate(block_indices):

        out_chunk_shape = [
            normalized_chunks[dim][block_ind[dim]] for dim in range(n)
        ]
        out_chunk_offset = [
            block_offsets[dim][block_ind[dim]] for dim in range(n)
        ]

        out_chunk_edges = np.array([i for i in np.ndindex(tuple([2] * n))])\
            * np.array(out_chunk_shape) + np.array(out_chunk_offset)

        # map output chunk edges onto input image coordinates
        # to define the input region relevant for the current chunk
        if matrix.ndim == 1 and len(matrix) == image.ndim:
            rel_image_edges = matrix * out_chunk_edges + offset
        else:
            rel_image_edges = np.dot(matrix, out_chunk_edges.T).T + offset

        rel_image_i = np.min(rel_image_edges, 0)
        rel_image_f = np.max(rel_image_edges, 0)

        # Calculate edge coordinates required for the footprint of the
        # spline kernel according to
        # https://github.com/scipy/scipy/blob/9c0d08d7d11fc33311a96d2ac3ad73c8f6e3df00/scipy/ndimage/src/ni_interpolation.c#L412-L419 # noqa: E501
        # Also see this discussion:
        # https://github.com/dask/dask-image/issues/24#issuecomment-706165593 # noqa: E501
        for dim in range(n):

            if order % 2 == 0:
                rel_image_i[dim] += 0.5
                rel_image_f[dim] += 0.5

            rel_image_i[dim] = np.floor(rel_image_i[dim]) - order // 2
            rel_image_f[dim] = np.floor(rel_image_f[dim]) - order // 2 + order

            if order == 0:  # required for consistency with scipy.ndimage
                rel_image_i[dim] -= 1

        # clip image coordinates to image extent
        for dim, s in zip(range(n), image_shape):
            rel_image_i[dim] = np.clip(rel_image_i[dim], 0, s - 1)
            rel_image_f[dim] = np.clip(rel_image_f[dim], 0, s - 1)

        rel_image_slice = tuple([
            slice(int(rel_image_i[dim]),
                  int(rel_image_f[dim]) + 2) for dim in range(n)
        ])

        rel_image = image[rel_image_slice]
        """Block comment for future developers explaining how `offset` is
        transformed into `offset_prime` for each output chunk.
        Modify offset to point into cropped image.
        y = Mx + o
        Coordinate substitution:
        y' = y - y0(min_coord_px)
        x' = x - x0(chunk_offset)
        Then:
        y' = Mx' + o + Mx0 - y0
        M' = M
        o' = o + Mx0 - y0
        """

        offset_prime = offset + np.dot(matrix, out_chunk_offset) - rel_image_i

        output_layer[(output_name, ) + block_ind] = (
            affine_transform_method,
            (da.core.concatenate3, rel_image.__dask_keys__()),
            asarray_method(matrix),
            offset_prime,
            tuple(out_chunk_shape),  # output_shape
            None,  # out
            order,
            'constant' if 'mode' not in kwargs else kwargs['mode'],
            0. if 'cval' not in kwargs else kwargs['cval'],
            False  # prefilter
        )

        rel_images.append(rel_image)

    graph = HighLevelGraph.from_collections(output_name,
                                            output_layer,
                                            dependencies=[image] + rel_images)

    meta = dispatch_asarray(image)([0]).astype(image.dtype)

    transformed = da.Array(
        graph,
        output_name,
        shape=tuple(output_shape),
        # chunks=output_chunks,
        chunks=normalized_chunks,
        meta=meta)

    return transformed
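

# Illustrative usage sketch, assuming the function above is available as
# ``dask_image.ndinterp.affine_transform``: a 2-D down-scaling transform is
# applied chunk by chunk and compared with the direct scipy result. With
# order=1 and prefiltering disabled the two outputs are expected to agree.
import numpy as np
import dask.array as da
from scipy import ndimage
from dask_image.ndinterp import affine_transform as da_affine_transform

image = np.random.default_rng(0).random((64, 64))
matrix = np.array([[0.5, 0.0],
                   [0.0, 0.5]])
offset = [5.0, 10.0]

chunked = da_affine_transform(
    da.from_array(image, chunks=32),
    matrix,
    offset,
    output_shape=image.shape,
    output_chunks=(32, 32),
    order=1,
).compute()

direct = ndimage.affine_transform(image, matrix, offset, order=1,
                                  prefilter=False)
print(np.abs(chunked - direct).max())  # expected to be ~0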
Exemplo n.º 31
0
    def regenerate_dataset(
        cls,
        dataset,
        output_path,
        columns=None,
        file_size=None,
        part_size=None,
        cats=None,
        conts=None,
        labels=None,
        storage_options=None,
    ):
        """Regenerate an NVTabular Dataset for efficient processing.

        Example Usage::

            dataset = Dataset("/path/to/data_pq", engine="parquet")
            dataset.regenerate_dataset(
                out_path, part_size="1MiB", file_size="10MiB"
            )

        Parameters
        ----------
        dataset : Dataset
            Input `Dataset` object (to be regenerated).
        output_path : string
            Root directory path to use for the new (regenerated) dataset.
        columns : list[string], optional
            Subset of columns to include in the regenerated dataset.
        file_size : int or string, optional
            Desired size of each output file.
        part_size : int or string, optional
            Desired partition size to use within the regeneration algorithm.
            Note that this is effectively the size of each contiguous write
            operation in cudf.
        cats : list[string], optional
            Categorical column list.
        conts : list[string], optional
            Continuous column list.
        labels : list[string], optional
            Label column list.
        storage_options : dict, optional
            Storage-option kwargs to pass through to the `fsspec` file-system
            interface.

        Returns
        -------
        result : int or Delayed
            If `compute=True` (default), the return value will be an integer
            corresponding to the number of generated data files.  If `False`,
            the returned value will be a `Delayed` object.
        """

        # Specify ideal file size and partition size
        row_group_size = 128_000_000
        file_size = parse_bytes(file_size) or row_group_size * 100
        part_size = parse_bytes(part_size) or row_group_size * 10
        part_size = min(part_size, file_size)

        fs, _, _ = get_fs_token_paths(output_path,
                                      mode="wb",
                                      storage_options=storage_options)

        # Start by converting the original dataset to a Dask-Dataframe
        # object in CPU memory.  We avoid GPU memory in case the original
        # dataset is prone to OOM errors.
        _ddf = dataset.engine.to_ddf(columns=columns, cpu=True)

        # Prepare general metadata (gmd)
        gmd = {}
        cats = cats or []
        conts = conts or []
        labels = labels or []
        if not len(cats + conts + labels):
            warnings.warn(
                "General-metadata information not detected! "
                "Please pass lists for `cats`, `conts`, and `labels` as"
                "arguments to `regenerate_dataset` to ensure a complete "
                "and correct _metadata.json file.")
        col_idx = {str(name): i for i, name in enumerate(_ddf.columns)}
        gmd["cats"] = [{"col_name": c, "index": col_idx[c]} for c in cats]
        gmd["conts"] = [{"col_name": c, "index": col_idx[c]} for c in conts]
        gmd["labels"] = [{"col_name": c, "index": col_idx[c]} for c in labels]

        # Get list of partition lengths
        token = tokenize(
            dataset,
            output_path,
            columns,
            part_size,
            file_size,
            cats,
            conts,
            labels,
            storage_options,
        )
        getlen_name = "getlen-" + token
        name = "all-" + getlen_name
        dsk = {(getlen_name, i): (len, (_ddf._name, i))
               for i in range(_ddf.npartitions)}
        dsk[name] = [(getlen_name, i) for i in range(_ddf.npartitions)]
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[_ddf])
        size_list = Delayed(name, graph).compute()

        # Get memory usage per row using first partition
        p0_mem_size = _ddf.partitions[0].memory_usage(
            deep=True, index=True).sum().compute()
        mem_per_row = int(float(p0_mem_size) / float(size_list[0]))

        # Determine the number of rows to assign to each output partition
        # and the number of output partitions to assign to each output file
        rows_per_part = int(part_size / mem_per_row)
        parts_per_file = int(file_size / part_size)

        # Construct re-partition graph
        dsk2 = {}
        repartition_name = "repartition-" + token
        split_name = "split-" + repartition_name
        getitem_name = "getitem-" + repartition_name

        gets = defaultdict(list)
        out_parts = 0
        remaining_out_part_rows = rows_per_part
        for i, in_part_size in enumerate(size_list):

            # The `split` dictionary will be passed to this input
            # partition to dictate how that partition will be split
            # into different output partitions/files.  The "key" of
            # this dict is the output partition, and the value is a
            # tuple specifying the (start, end) row range.
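            # For example, split = {3: (0, 100), 4: (100, 250)} would send
            # rows [0, 100) of this input partition to output partition 3
            # and rows [100, 250) to output partition 4.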
            split = {}
            last = 0
            while in_part_size >= remaining_out_part_rows:

                gets[out_parts].append(i)
                split[out_parts] = (last, last + remaining_out_part_rows)
                last += remaining_out_part_rows
                in_part_size = in_part_size - remaining_out_part_rows

                remaining_out_part_rows = rows_per_part
                out_parts += 1

            if in_part_size:
                gets[out_parts].append(i)
                split[out_parts] = (last, last + in_part_size)
                remaining_out_part_rows -= in_part_size

            if remaining_out_part_rows == 0:
                remaining_out_part_rows = rows_per_part
                out_parts += 1

            dsk2[(split_name, i)] = (_split_part, (_ddf._name, i), split)
        npartitions = max(gets) + 1

        for k, v_list in gets.items():
            last = None
            _concat_list = []
            for v in v_list:
                key = (getitem_name, v, k)
                _concat_list.append(key)
                dsk2[key] = (operator.getitem, (split_name, v), k)

            ignore_index = True
            dsk2[(repartition_name, k)] = (_concat, _concat_list, ignore_index)

        graph2 = HighLevelGraph.from_collections(repartition_name,
                                                 dsk2,
                                                 dependencies=[_ddf])
        divisions = [None] * (npartitions + 1)
        _ddf2 = new_dd_object(graph2, repartition_name, _ddf._meta, divisions)

        # Make sure the root directory exists
        fs.mkdirs(output_path, exist_ok=True)

        # Construct rewrite graph
        dsk3 = {}
        rewrite_name = "rewrite-" + token
        write_data_name = "write-data-" + rewrite_name
        write_metadata_name = "write-metadata-" + rewrite_name
        inputs = []
        final_inputs = []
        for i in range(_ddf2.npartitions):
            index = i // parts_per_file
            next_index = (i + 1) // parts_per_file
            package_task = (index != next_index) or (
                i == (_ddf2.npartitions - 1))
            fn = f"part.{index}.parquet"
            inputs.append((repartition_name, i))
            if package_task:
                final_inputs.append((write_data_name, i))
                dsk3[(write_data_name, i)] = (
                    _write_data,
                    inputs,
                    output_path,
                    fs,
                    fn,
                )
                inputs = []

        # Final task collects and writes all metadata
        dsk3[write_metadata_name] = (
            _write_metadata_file,
            final_inputs,
            fs,
            output_path,
            gmd,
        )
        graph3 = HighLevelGraph.from_collections(write_metadata_name,
                                                 dsk3,
                                                 dependencies=[_ddf2])

        return Delayed(write_metadata_name, graph3)
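

# Illustrative sketch of the "getlen" graph pattern used above: one ``len``
# task per partition of a Dask DataFrame, a collector key gathering them into
# a list, and a ``Delayed`` wrapping the resulting HighLevelGraph. Assumes
# only pandas and dask; names such as ``getlen_name`` mirror the code above.
import pandas as pd
import dask.dataframe as dd
from dask.base import tokenize
from dask.delayed import Delayed
from dask.highlevelgraph import HighLevelGraph

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=3)

token = tokenize(ddf)
getlen_name = "getlen-" + token
name = "all-" + getlen_name
dsk = {(getlen_name, i): (len, (ddf._name, i)) for i in range(ddf.npartitions)}
dsk[name] = [(getlen_name, i) for i in range(ddf.npartitions)]
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[ddf])
size_list = Delayed(name, graph).compute()  # per-partition lengths, e.g. [4, 4, 2]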
Exemplo n.º 32
0
def _ddf_to_dataset(
    ddf,
    fs,
    output_path,
    shuffle,
    file_partition_map,
    out_files_per_proc,
    cat_names,
    cont_names,
    label_names,
    output_format,
    client,
    num_threads,
    cpu,
    suffix="",
    partition_on=None,
):

    # Construct graph for Dask-based dataset write
    token = tokenize(ddf, shuffle, out_files_per_proc, cat_names, cont_names,
                     label_names, suffix, partition_on)
    name = "write-processed-" + token
    write_name = name + "-partition" + token

    # Check that the data is in the correct place
    assert isinstance(ddf._meta, pd.DataFrame) is cpu

    dsk = {}
    task_list = []
    if partition_on:
        # Use hive partitioning to write the data
        cached_writers = False
        for idx in range(ddf.npartitions):
            task_list.append((write_name, idx))
            dsk[task_list[-1]] = (
                _write_partitioned,
                (ddf._name, idx),
                f"part.{idx}{suffix}",
                output_path,
                partition_on,
                shuffle,
                fs,
                cat_names,
                cont_names,
                label_names,
                output_format,
                num_threads,
                cpu,
            )
        dsk[name] = (
            _write_metadata_files,
            task_list,
            output_path,
            output_format,
            cpu,
        )
    elif file_partition_map is not None:
        # Use specified mapping of data to output files
        cached_writers = False
        full_graph = ddf.dask
        for fn, parts in file_partition_map.items():
            # Isolate subgraph for this output file
            subgraph = DaskSubgraph(full_graph, ddf._name, parts)
            task_list.append((write_name, fn))
            dsk[task_list[-1]] = (
                _write_subgraph,
                subgraph,
                fn,
                output_path,
                shuffle,
                fs,
                cat_names,
                cont_names,
                label_names,
                output_format,
                num_threads,
                cpu,
                suffix,
            )
        dsk[name] = (
            _write_metadata_files,
            task_list,
            output_path,
            output_format,
            cpu,
        )
    else:
        cached_writers = True
        for idx in range(ddf.npartitions):
            key = (write_name, idx)
            dsk[key] = (
                _write_output_partition,
                (ddf._name, idx),
                output_path,
                shuffle,
                out_files_per_proc,
                fs,
                cat_names,
                cont_names,
                label_names,
                output_format,
                num_threads,
                cpu,
                suffix,
            )
            task_list.append(key)
        dsk[name] = (lambda x: x, task_list)

    graph = _ensure_optimize_dataframe_graph(
        dsk=HighLevelGraph.from_collections(name, dsk, dependencies=[ddf]),
        keys=[name],
    )
    out = Delayed(name, graph)

    # Trigger write execution
    if client:
        out = client.compute(out).result()
    else:
        out = dask.compute(out, scheduler="synchronous")[0]

    if cached_writers:
        # Follow-up Shuffling and _metadata creation
        _finish_dataset(client, ddf, output_path, fs, output_format, cpu)
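

# Illustrative sketch of the per-partition write graph built in the final
# branch above: one task per partition, a barrier task that simply collects
# the per-partition results, and execution through ``Delayed``. The
# ``_toy_write`` function is a hypothetical stand-in for
# ``_write_output_partition``.
import dask
import pandas as pd
import dask.dataframe as dd
from dask.base import tokenize
from dask.delayed import Delayed
from dask.highlevelgraph import HighLevelGraph


def _toy_write(df, idx):
    # pretend to write partition `idx`; report how many rows were "written"
    return len(df)


ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=3)
name = "write-processed-" + tokenize(ddf)
write_name = name + "-partition"

dsk = {}
task_list = []
for idx in range(ddf.npartitions):
    key = (write_name, idx)
    dsk[key] = (_toy_write, (ddf._name, idx), idx)
    task_list.append(key)
dsk[name] = (lambda x: x, task_list)  # barrier collecting all write tasks

graph = HighLevelGraph.from_collections(name, dsk, dependencies=[ddf])
rows_written = dask.compute(Delayed(name, graph), scheduler="synchronous")[0]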
Exemplo n.º 33
0
def test_bind(layers):
    dsk1 = {("a-1", h1): 1, ("a-1", h2): 2}
    dsk2 = {"b-1": (add, ("a-1", h1), ("a-1", h2))}
    dsk3 = {"c-1": "b-1"}
    cnt = NodeCounter()
    dsk4 = {("d-1", h1): (cnt.f, 1), ("d-1", h2): (cnt.f, 2)}
    dsk4b = {"e": (cnt.f, 3)}

    if layers:
        dsk1 = HighLevelGraph({"a-1": dsk1}, {"a-1": set()})
        dsk2 = HighLevelGraph({
            "a-1": dsk1,
            "b-1": dsk2
        }, {
            "a-1": set(),
            "b-1": {"a-1"}
        })
        dsk3 = HighLevelGraph(
            {
                "a-1": dsk1,
                "b-1": dsk2,
                "c-1": dsk3
            },
            {
                "a-1": set(),
                "b-1": {"a-1"},
                "c-1": {"b-1"}
            },
        )
        dsk4 = HighLevelGraph({
            "d-1": dsk4,
            "e": dsk4b
        }, {
            "d-1": set(),
            "e": set()
        })
    else:
        dsk2.update(dsk1)
        dsk3.update(dsk2)
        dsk4.update(dsk4b)

    # t1 = Tuple(dsk1, [("a", h1), ("a", h2)])
    t2 = Tuple(dsk2, ["b-1"])
    t3 = Tuple(dsk3, ["c-1"])
    t4 = Tuple(dsk4, [("d-1", h1), ("d-1", h2), "e"])  # Multiple names

    bound1 = bind(t3, t4, seed=1, assume_layers=layers)
    cloned_a_name = clone_key("a-1", seed=1)
    assert bound1.__dask_graph__()[cloned_a_name, h1][0] is chunks.bind
    assert bound1.__dask_graph__()[cloned_a_name, h2][0] is chunks.bind
    assert bound1.compute() == (3, )
    assert cnt.n == 3

    bound2 = bind(t3, t4, omit=t2, seed=1, assume_layers=layers)
    cloned_c_name = clone_key("c-1", seed=1)
    assert bound2.__dask_graph__()[cloned_c_name][0] is chunks.bind
    assert bound2.compute() == (3, )
    assert cnt.n == 6

    bound3 = bind(t4, t3, seed=1, assume_layers=layers)
    cloned_d_name = clone_key("d-1", seed=1)
    cloned_e_name = clone_key("e", seed=1)
    assert bound3.__dask_graph__()[cloned_d_name, h1][0] is chunks.bind
    assert bound3.__dask_graph__()[cloned_d_name, h2][0] is chunks.bind
    assert bound3.__dask_graph__()[cloned_e_name][0] is chunks.bind
    assert bound3.compute() == (1, 2, 3)
    assert cnt.n == 9
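

# Illustrative sketch of what the test above exercises, at the dask.array
# level: ``dask.graph_manipulation.bind`` clones the keys of one collection
# so that its tasks run only after another collection's tasks have finished,
# while computing the same values.
import dask.array as da
from dask.graph_manipulation import bind

x = da.ones(10, chunks=5)
y = da.zeros(10, chunks=5)

xb = bind(x, y)            # xb computes like x, but depends on y's tasks
assert xb.name != x.name   # keys are cloned (cf. clone_key in the test)
assert (xb.compute() == x.compute()).all()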