Example #1
File: dask.py  Project: cggh/scikit-allel
def da_compress(condition, a, axis=None):
    """Temporary workaround for https://github.com/dask/dask/issues/4940"""
    from dask.array.core import asarray
    from dask.array.utils import validate_axis
    from dask.utils import is_arraylike

    if not is_arraylike(condition):
        # Allow `condition` to be anything array-like, otherwise ensure `condition`
        # is a dask array.
        condition = asarray(condition)
    condition = condition.astype(bool)
    a = asarray(a)

    if condition.ndim != 1:
        raise ValueError("Condition must be one dimensional")

    if axis is None:
        a = a.ravel()
        axis = 0
    axis = validate_axis(axis, a.ndim)

    # Treat `condition` as filled with `False` (if it is too short)
    a = a[tuple(
        slice(None, len(condition)) if i == axis else slice(None)
        for i in range(a.ndim))]

    # Use `condition` to select along 1 dimension
    a = a[tuple(condition if i == axis else slice(None)
                for i in range(a.ndim))]

    return a
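
A minimal usage sketch (not part of the original source), assuming da_compress is imported from the module above; it follows np.compress semantics, and the condition may be shorter than the selected axis:

import dask.array as da
import numpy as np

x = da.arange(10, chunks=5)
cond = np.array([True, False, True, False, True])  # shorter than x is fine
result = da_compress(cond, x)  # axis=None ravels `x` first
print(result.compute())        # [0 2 4]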
Example #2
def to_dask_array(
    agghist: AggHistogram,
    flow: bool = False,
    dd: bool = False,
) -> tuple[DaskArray, ...] | tuple[DaskArray, list[DaskArray]]:
    """Convert `agghist` to a `dask.array` return style.

    Parameters
    ----------
    agghist : AggHistogram
        The aggregated histogram collection to convert.
    flow : bool
        If ``True``, include under- and over-flow bins.
    dd : bool
        If ``True``, use ``histogramdd`` style return.

    See Also
    --------
    dask_histogram.AggHistogram.to_dask_array

    Returns
    -------
    Union[Tuple[DaskCollection, List[DaskCollection]], Tuple[DaskCollection, ...]]
        The first return is always the bin counts. If `dd` is ``True``
        the second return is a list where each element is an array of
        bin edges for each axis. If `dd` is ``False``, the bin edge
        arrays will not be stored in a list (`histogram2d` style
        return).

    """
    name = f"to-dask-array-{tokenize(agghist)}"
    zeros = (0, ) * agghist.histref.ndim
    dsk = {
        (name, *zeros):
        (lambda x, f: x.to_numpy(flow=f)[0], agghist.name, flow)
    }
    graph = HighLevelGraph.from_collections(name,
                                            dsk,
                                            dependencies=(agghist, ))
    shape = agghist.histref.shape
    if flow:
        shape = tuple(i + 2 for i in shape)
    int_storage = agghist.histref._storage_type in (
        bh.storage.Int64,
        bh.storage.AtomicInt64,
    )
    dt = int if int_storage else float
    c = DaskArray(graph, name=name, shape=shape, chunks=shape, dtype=dt)
    axes = agghist.histref.axes

    if flow:
        edges = [
            asarray(np.concatenate([[-np.inf], ax.edges, [np.inf]]))
            for ax in axes
        ]
    else:
        edges = [asarray(ax.edges) for ax in axes]
    if dd:
        return c, edges
    return (c, *tuple(edges))
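
A usage sketch (not from the original source); it assumes dask_histogram.factory is available to build an AggHistogram from a dask array, which may differ across dask_histogram versions:

import boost_histogram as bh
import dask.array as da
import dask_histogram as dh

x = da.random.standard_normal(size=(1000,), chunks=250)
h = dh.factory(x, axes=(bh.axis.Regular(10, -3, 3),))  # AggHistogram
counts, edges = to_dask_array(h, flow=False, dd=True)
counts.compute().shape               # (10,)
[e.compute().shape for e in edges]   # [(11,)]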
Example #3
File: creation.py  Project: m-rossi/dask
def meshgrid(*xi, sparse=False, indexing="xy", **kwargs):
    sparse = bool(sparse)

    if "copy" in kwargs:
        raise NotImplementedError("`copy` not supported")

    if kwargs:
        raise TypeError("unsupported keyword argument(s) provided")

    if indexing not in ("ij", "xy"):
        raise ValueError("`indexing` must be `'ij'` or `'xy'`")

    xi = [asarray(e) for e in xi]
    xi = [e.flatten() for e in xi]

    if indexing == "xy" and len(xi) > 1:
        xi[0], xi[1] = xi[1], xi[0]

    grid = []
    for i in range(len(xi)):
        s = len(xi) * [None]
        s[i] = slice(None)
        s = tuple(s)

        r = xi[i][s]

        grid.append(r)

    if not sparse:
        grid = broadcast_arrays(*grid)

    if indexing == "xy" and len(xi) > 1:
        grid[0], grid[1] = grid[1], grid[0]

    return grid
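
A usage sketch via the public API, where this function is exposed as dask.array.meshgrid:

import dask.array as da

x = da.arange(3, chunks=3)
y = da.arange(2, chunks=2)
xx, yy = da.meshgrid(x, y)               # default indexing="xy"
xx.shape, yy.shape                       # ((2, 3), (2, 3))
xs, ys = da.meshgrid(x, y, sparse=True)  # skips broadcast_arrays
xs.shape, ys.shape                       # ((1, 3), (2, 1))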
Example #4
File: creation.py  Project: m-rossi/dask
def full_like(a,
              fill_value,
              order="C",
              dtype=None,
              chunks=None,
              name=None,
              shape=None):
    """
    Return a full array with the same shape and type as a given array.

    Parameters
    ----------
    a : array_like
        The shape and data-type of `a` define these same attributes of
        the returned array.
    fill_value : scalar
        Fill value.
    dtype : data-type, optional
        Overrides the data type of the result.
    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous
        (row- or column-wise) order in memory.
    chunks : sequence of ints
        The number of samples on each block. Note that the last block will have
        fewer samples if ``len(array) % chunks != 0``.
    name : str, optional
        An optional keyname for the array. Defaults to hashing the input
        keyword arguments.
    shape : int or sequence of ints, optional.
        Overrides the shape of the result.

    Returns
    -------
    out : ndarray
        Array of `fill_value` with the same shape and type as `a`.

    See Also
    --------
    zeros_like : Return an array of zeros with shape and type of input.
    ones_like : Return an array of ones with shape and type of input.
    empty_like : Return an empty array with shape and type of input.
    zeros : Return a new array setting values to zero.
    ones : Return a new array setting values to one.
    empty : Return a new uninitialized array.
    full : Fill a new array.
    """

    a = asarray(a, name=False)
    shape, chunks = _get_like_function_shapes_chunks(a, chunks, shape)
    return full(
        shape,
        fill_value,
        dtype=(dtype or a.dtype),
        order=order,
        chunks=chunks,
        name=name,
        meta=a._meta,
    )
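
A usage sketch via dask.array.full_like; chunks and dtype follow the input unless overridden:

import dask.array as da

x = da.ones((4, 4), chunks=(2, 2))
y = da.full_like(x, 7)
y.chunks            # ((2, 2), (2, 2)) -- inherited from `x`
y.compute()[0, 0]   # 7.0, since `x` is float64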
Example #5
    def outer(self, A, B, **kwargs):
        if self.nin != 2:
            raise ValueError(
                "outer product only supported for binary functions")
        if "out" in kwargs:
            raise ValueError("`out` kwarg not supported")

        A_is_dask = is_dask_collection(A)
        B_is_dask = is_dask_collection(B)
        if not A_is_dask and not B_is_dask:
            return self._ufunc.outer(A, B, **kwargs)
        elif (A_is_dask and not isinstance(A, Array)
              or B_is_dask and not isinstance(B, Array)):
            raise NotImplementedError(
                "Dask objects besides `dask.array.Array` "
                "are not supported at this time.")

        A = asarray(A)
        B = asarray(B)
        ndim = A.ndim + B.ndim
        out_inds = tuple(range(ndim))
        A_inds = out_inds[:A.ndim]
        B_inds = out_inds[A.ndim:]

        dtype = apply_infer_dtype(self._ufunc.outer, [A, B],
                                  kwargs,
                                  "ufunc.outer",
                                  suggest_dtype=False)

        if "dtype" in kwargs:
            func = partial(self._ufunc.outer, dtype=kwargs.pop("dtype"))
        else:
            func = self._ufunc.outer

        return blockwise(
            func,
            out_inds,
            A,
            A_inds,
            B,
            B_inds,
            dtype=dtype,
            token=self.__name__ + ".outer",
            **kwargs,
        )
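
A usage sketch; dask ufuncs such as dask.array.add expose this outer method, which dispatches to blockwise when either operand is a dask array:

import dask.array as da

a = da.arange(4, chunks=2)
b = da.arange(3, chunks=3)
out = da.add.outer(a, b)  # blockwise outer sum, shape (4, 3)
out.compute()[3, 2]       # 5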
Example #6
File: creation.py  Project: m-rossi/dask
def empty_like(a, dtype=None, order="C", chunks=None, name=None, shape=None):
    """
    Return a new array with the same shape and type as a given array.

    Parameters
    ----------
    a : array_like
        The shape and data-type of `a` define these same attributes of the
        returned array.
    dtype : data-type, optional
        Overrides the data type of the result.
    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous
        (row- or column-wise) order in memory.
    chunks : sequence of ints
        The number of samples on each block. Note that the last block will have
        fewer samples if ``len(array) % chunks != 0``.
    name : str, optional
        An optional keyname for the array. Defaults to hashing the input
        keyword arguments.
    shape : int or sequence of ints, optional.
        Overrides the shape of the result.

    Returns
    -------
    out : ndarray
        Array of uninitialized (arbitrary) data with the same
        shape and type as `a`.

    See Also
    --------
    ones_like : Return an array of ones with shape and type of input.
    zeros_like : Return an array of zeros with shape and type of input.
    empty : Return a new uninitialized array.
    ones : Return a new array setting values to one.
    zeros : Return a new array setting values to zero.

    Notes
    -----
    This function does *not* initialize the returned array; to do that use
    `zeros_like` or `ones_like` instead.  It may be marginally faster than
    the functions that do set the array values.
    """

    a = asarray(a, name=False)
    shape, chunks = _get_like_function_shapes_chunks(a, chunks, shape)
    return empty(
        shape,
        dtype=(dtype or a.dtype),
        order=order,
        chunks=chunks,
        name=name,
        meta=a._meta,
    )
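
A usage sketch via dask.array.empty_like; the values are uninitialized, so only shape, dtype, and chunks are meaningful:

import dask.array as da

x = da.ones((4, 4), chunks=(2, 2))
y = da.empty_like(x, dtype="i8")
y.shape, y.dtype, y.chunks  # (4, 4), dtype('int64'), ((2, 2), (2, 2))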
Example #7
File: creation.py  Project: m-rossi/dask
def pad(array, pad_width, mode="constant", **kwargs):
    array = asarray(array)

    pad_width = expand_pad_value(array, pad_width)

    if callable(mode):
        return pad_udf(array, pad_width, mode, **kwargs)

    # Make sure that no unsupported keywords were passed for the current mode
    allowed_kwargs = {
        "empty": [],
        "edge": [],
        "wrap": [],
        "constant": ["constant_values"],
        "linear_ramp": ["end_values"],
        "maximum": ["stat_length"],
        "mean": ["stat_length"],
        "median": ["stat_length"],
        "minimum": ["stat_length"],
        "reflect": ["reflect_type"],
        "symmetric": ["reflect_type"],
    }
    try:
        unsupported_kwargs = set(kwargs) - set(allowed_kwargs[mode])
    except KeyError as e:
        raise ValueError(f"mode '{mode}' is not supported") from e
    if unsupported_kwargs:
        raise ValueError(
            "unsupported keyword arguments for mode '{}': {}".format(
                mode, unsupported_kwargs))

    if mode in {"maximum", "mean", "median", "minimum"}:
        stat_length = kwargs.get("stat_length",
                                 tuple((n, n) for n in array.shape))
        return pad_stats(array, pad_width, mode, stat_length)
    elif mode == "constant":
        kwargs.setdefault("constant_values", 0)
        return pad_edge(array, pad_width, mode, **kwargs)
    elif mode == "linear_ramp":
        kwargs.setdefault("end_values", 0)
        return pad_edge(array, pad_width, mode, **kwargs)
    elif mode in {"edge", "empty"}:
        return pad_edge(array, pad_width, mode)
    elif mode in ["reflect", "symmetric", "wrap"]:
        return pad_reuse(array, pad_width, mode, **kwargs)

    assert False, "unreachable"
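
A usage sketch via dask.array.pad, exercising two of the modes dispatched above:

import dask.array as da

x = da.arange(6, chunks=3)
da.pad(x, (2, 1), mode="constant", constant_values=-1).compute()
# array([-1, -1,  0,  1,  2,  3,  4,  5, -1])
da.pad(x, 2, mode="reflect").compute()
# array([2, 1, 0, 1, 2, 3, 4, 5, 4, 3])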
Example #8
File: creation.py  Project: m-rossi/dask
def tile(A, reps):
    try:
        tup = tuple(reps)
    except TypeError:
        tup = (reps, )
    if any(i < 0 for i in tup):
        raise ValueError("Negative `reps` are not allowed.")
    c = asarray(A)

    if all(tup):
        for nrep in tup[::-1]:
            c = nrep * [c]
        return block(c)

    d = len(tup)
    if d < c.ndim:
        tup = (1, ) * (c.ndim - d) + tup
    if c.ndim < d:
        shape = (1, ) * (d - c.ndim) + c.shape
    else:
        shape = c.shape
    shape_out = tuple(s * t for s, t in zip(shape, tup))
    return empty(shape=shape_out, dtype=c.dtype)
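
A usage sketch via dask.array.tile; note that a zero rep takes the fall-through path and returns an empty array via empty() rather than block():

import dask.array as da

x = da.arange(3, chunks=3)
da.tile(x, 2).compute()   # array([0, 1, 2, 0, 1, 2])
da.tile(x, (2, 2)).shape  # (2, 6)
da.tile(x, 0).shape       # (0,) -- the `not all(tup)` branch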
Example #9
        def choice(self, a, size=None, replace=True, p=None, chunks=None):
            dsks = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a).rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dsks.append(a.dask)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dsks.append(p.dask)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = 'da.random.choice-%s' % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            return Array(sharedict.merge((name, dsk), *dsks),
                         name,
                         chunks,
                         dtype=dtype)
Example #10
File: random.py  Project: m-rossi/dask
        def choice(self, a, size=None, replace=True, p=None, chunks="auto"):
            dependencies = []
            # Normalize and validate `a`
            if isinstance(a, Integral):
                # On windows the output dtype differs if p is provided or
                # absent, see https://github.com/numpy/numpy/issues/9867
                dummy_p = np.array([1]) if p is not None else p
                dtype = np.random.choice(1, size=(), p=dummy_p).dtype
                len_a = a
                if a < 0:
                    raise ValueError("a must be greater than 0")
            else:
                a = asarray(a)
                a = a.rechunk(a.shape)
                dtype = a.dtype
                if a.ndim != 1:
                    raise ValueError("a must be one dimensional")
                len_a = len(a)
                dependencies.append(a)
                a = a.__dask_keys__()[0]

            # Normalize and validate `p`
            if p is not None:
                if not isinstance(p, Array):
                    # If p is not a dask array, first check the sum is close
                    # to 1 before converting.
                    p = np.asarray(p)
                    if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                        raise ValueError("probabilities do not sum to 1")
                    p = asarray(p)
                else:
                    p = p.rechunk(p.shape)

                if p.ndim != 1:
                    raise ValueError("p must be one dimensional")
                if len(p) != len_a:
                    raise ValueError("a and p must have the same size")

                dependencies.append(p)
                p = p.__dask_keys__()[0]

            if size is None:
                size = ()
            elif not isinstance(size, (tuple, list)):
                size = (size, )

            chunks = normalize_chunks(chunks, size, dtype=np.float64)
            if not replace and len(chunks[0]) > 1:
                err_msg = ("replace=False is not currently supported for "
                           "dask.array.choice with multi-chunk output "
                           "arrays")
                raise NotImplementedError(err_msg)
            sizes = list(product(*chunks))
            state_data = random_state_data(len(sizes), self._numpy_state)

            name = "da.random.choice-%s" % tokenize(state_data, size, chunks,
                                                    a, replace, p)
            keys = product([name], *(range(len(bd)) for bd in chunks))
            dsk = {
                k: (_choice, state, a, size, replace, p)
                for k, state, size in zip(keys, state_data, sizes)
            }

            graph = HighLevelGraph.from_collections(name,
                                                    dsk,
                                                    dependencies=dependencies)
            return Array(graph, name, chunks, dtype=dtype)
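
A usage sketch via dask.array.random.RandomState, which exposes this method (the same call shape applies to the older Example #9 version):

import dask.array as da

rs = da.random.RandomState(42)
s = rs.choice(5, size=1000, chunks=250)  # ints drawn from range(5)
vals = s.compute()
vals.min(), vals.max()                   # (0, 4)
rs.choice(3, size=8, p=[0.7, 0.2, 0.1], chunks=8).compute()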
Example #11
File: gufunc.py  Project: m-rossi/dask
def apply_gufunc(
    func,
    signature,
    *args,
    axes=None,
    axis=None,
    keepdims=False,
    output_dtypes=None,
    output_sizes=None,
    vectorize=None,
    allow_rechunk=False,
    meta=None,
    **kwargs,
):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like ``np.vectorize``, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    meta: Optional, tuple, keyword only
        tuple of empty ndarrays describing the shape and dtype of the output of the gufunc.
        Defaults to ``None``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api/generalized-ufuncs.html
    """
    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError("`signature` has to be of type string")
    # NumPy versions before https://github.com/numpy/numpy/pull/19627
    # would not ignore whitespace characters in `signature` like they
    # are supposed to. We remove the whitespace here as a workaround.
    signature = re.sub(r"\s+", "", signature)
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss,
                                  list) else len(output_coredimss)

    ## Consolidate onto `meta`
    if meta is not None and output_dtypes is not None:
        raise ValueError(
            "Only one of `meta` and `output_dtypes` should be given (`meta` is preferred)."
        )
    if meta is None:
        if output_dtypes is None:
            ## Infer `output_dtypes`
            if vectorize:
                tempfunc = np.vectorize(func, signature=signature)
            else:
                tempfunc = func
            output_dtypes = apply_infer_dtype(tempfunc, args, kwargs,
                                              "apply_gufunc", "output_dtypes",
                                              nout)

        ## Turn `output_dtypes` into `meta`
        if (nout is None and isinstance(output_dtypes, (tuple, list))
                and len(output_dtypes) == 1):
            output_dtypes = output_dtypes[0]
        sample = args[0] if args else None
        if nout is None:
            meta = meta_from_array(sample, dtype=output_dtypes)
        else:
            meta = tuple(
                meta_from_array(sample, dtype=odt) for odt in output_dtypes)

    ## Normalize `meta` format
    meta = meta_from_array(meta)
    if isinstance(meta, list):
        meta = tuple(meta)

    ## Validate `meta`
    if nout is None:
        if isinstance(meta, tuple):
            if len(meta) == 1:
                meta = meta[0]
            else:
                raise ValueError(
                    "For a function with one output, must give a single item for `output_dtypes`/`meta`, "
                    "not a tuple or list.")
    else:
        if not isinstance(meta, tuple):
            raise ValueError(
                f"For a function with {nout} outputs, must give a tuple or list for `output_dtypes`/`meta`, "
                "not a single item.")
        if len(meta) != nout:
            raise ValueError(
                f"For a function with {nout} outputs, must give a tuple or list of {nout} items for "
                f"`output_dtypes`/`meta`, not {len(meta)}.")

    ## Vectorize function, if required
    if vectorize:
        otypes = [x.dtype
                  for x in meta] if isinstance(meta, tuple) else [meta.dtype]
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims,
                                                       input_coredimss,
                                                       output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i
                     for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d
              for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss,
                           key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes,
                                       input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes) | {1} != {1, max(sizes)}:
            raise ValueError(
                f"Dimension `'{dim}'` with different lengths in arrays")
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    f"Dimension `'{dim}'` with different chunksize present")

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(func,
                    loop_output_dims,
                    *arginds,
                    concatenate=True,
                    meta=meta,
                    **kwargs)

    # NOTE: we likely could just use `meta` instead of `tmp._meta`,
    # but we use it and validate it anyway just to be sure nothing odd has happened.
    metas = tmp._meta
    if nout is None:
        assert not isinstance(
            metas, (list, tuple)
        ), f"meta changed from single output to multiple output during blockwise: {meta} -> {metas}"
        metas = (metas, )
    else:
        assert isinstance(
            metas, (list, tuple)
        ), f"meta changed from multiple output to single output during blockwise: {meta} -> {metas}"
        assert (
            len(metas) == nout
        ), f"Number of outputs changed from {nout} to {len(metas)} during blockwise"

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split("-")

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]

    ## Split output
    leaf_arrs = []
    for i, (ocd, oax,
            meta) in enumerate(zip(output_coredimss, output_axes, metas)):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0, )
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name, ) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name,
                                                leaf_dsk,
                                                dependencies=[tmp])
        meta = meta_from_array(meta, len(output_shape))
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         meta=meta)

        ### Axes:
        if keepdims:
            slices = len(
                leaf_arr.shape) * (slice(None), ) + len(oax) * (np.newaxis, )
            leaf_arr = leaf_arr[slices]

        tidcs = [None] * len(leaf_arr.shape)
        for ii, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = ii
        j = 0
        for ii in range(len(tidcs)):
            if tidcs[ii] is None:
                tidcs[ii] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return (*leaf_arrs, ) if nout else leaf_arrs[0]  # Undo *) from above
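
A sketch of the `axis` shortcut documented above, for a "(i),(i)->()" signature; the function and array sizes here are illustrative:

import dask.array as da
import numpy as np

def pearson(x, y):
    return np.corrcoef(x, y)[0, 1]

a = da.random.normal(size=(8, 100), chunks=(4, 100))
b = da.random.normal(size=(8, 100), chunks=(4, 100))
# axis=-1 expands to axes=[(-1,), (-1,), ()]
r = da.apply_gufunc(pearson, "(i),(i)->()", a, b,
                    vectorize=True, axis=-1, output_dtypes=float)
r.compute().shape  # (8,)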
Example #12
File: creation.py  Project: m-rossi/dask
def diagonal(a, offset=0, axis1=0, axis2=1):
    name = "diagonal-" + tokenize(a, offset, axis1, axis2)

    if a.ndim < 2:
        # NumPy uses `diag` as we do here.
        raise ValueError("diag requires an array of at least two dimensions")

    def _axis_fmt(axis, name, ndim):
        if axis < 0:
            t = ndim + axis
            if t < 0:
                msg = "{}: axis {} is out of bounds for array of dimension {}"
                raise np.AxisError(msg.format(name, axis, ndim))
            axis = t
        return axis

    def pop_axes(chunks, axis1, axis2):
        chunks = list(chunks)
        chunks.pop(axis2)
        chunks.pop(axis1)
        return tuple(chunks)

    axis1 = _axis_fmt(axis1, "axis1", a.ndim)
    axis2 = _axis_fmt(axis2, "axis2", a.ndim)

    if axis1 == axis2:
        raise ValueError("axis1 and axis2 cannot be the same")

    a = asarray(a)
    k = offset
    if axis1 > axis2:
        axis1, axis2 = axis2, axis1
        k = -offset

    free_axes = set(range(a.ndim)) - {axis1, axis2}
    free_indices = list(product(*(range(a.numblocks[i]) for i in free_axes)))
    ndims_free = len(free_axes)

    # equation of diagonal: i = j - k
    kdiag_row_start = max(0, -k)
    kdiag_col_start = max(0, k)
    kdiag_row_stop = min(a.shape[axis1], a.shape[axis2] - k)
    len_kdiag = kdiag_row_stop - kdiag_row_start

    if len_kdiag <= 0:
        xp = np

        if is_cupy_type(a._meta):
            import cupy

            xp = cupy

        out_chunks = pop_axes(a.chunks, axis1, axis2) + ((0, ), )
        dsk = dict()
        for free_idx in free_indices:
            shape = tuple(out_chunks[axis][free_idx[axis]]
                          for axis in range(ndims_free))
            dsk[(name, ) + free_idx + (0, )] = (
                partial(xp.empty, dtype=a.dtype),
                shape + (0, ),
            )

        meta = meta_from_array(a, ndims_free + 1)
        return Array(dsk, name, out_chunks, meta=meta)

    # compute row index ranges for chunks along axis1:
    row_stops_ = np.cumsum(a.chunks[axis1])
    row_starts = np.roll(row_stops_, 1)
    row_starts[0] = 0

    # compute column index ranges for chunks along axis2:
    col_stops_ = np.cumsum(a.chunks[axis2])
    col_starts = np.roll(col_stops_, 1)
    col_starts[0] = 0

    # locate first chunk containing diagonal:
    row_blockid = np.arange(a.numblocks[axis1])
    col_blockid = np.arange(a.numblocks[axis2])

    row_filter = (row_starts <= kdiag_row_start) & (kdiag_row_start <
                                                    row_stops_)
    col_filter = (col_starts <= kdiag_col_start) & (kdiag_col_start <
                                                    col_stops_)
    (I, ) = row_blockid[row_filter]
    (J, ) = col_blockid[col_filter]

    # follow k-diagonal through chunks while constructing dask graph:
    dsk = dict()
    i = 0
    kdiag_chunks = ()
    while kdiag_row_start < a.shape[axis1] and kdiag_col_start < a.shape[axis2]:
        # localize block info:
        nrows, ncols = a.chunks[axis1][I], a.chunks[axis2][J]
        kdiag_row_start -= row_starts[I]
        kdiag_col_start -= col_starts[J]
        k = -kdiag_row_start if kdiag_row_start > 0 else kdiag_col_start
        kdiag_row_end = min(nrows, ncols - k)
        kdiag_len = kdiag_row_end - kdiag_row_start

        # increment dask graph:
        for free_idx in free_indices:
            input_idx = (free_idx[:axis1] + (I, ) + free_idx[axis1:axis2 - 1] +
                         (J, ) + free_idx[axis2 - 1:])
            output_idx = free_idx + (i, )
            dsk[(name, ) + output_idx] = (
                np.diagonal,
                (a.name, ) + input_idx,
                k,
                axis1,
                axis2,
            )

        kdiag_chunks += (kdiag_len, )
        # prepare for next iteration:
        i += 1
        kdiag_row_start = kdiag_row_end + row_starts[I]
        kdiag_col_start = min(ncols, nrows + k) + col_starts[J]
        I = I + 1 if kdiag_row_start == row_stops_[I] else I
        J = J + 1 if kdiag_col_start == col_stops_[J] else J

    out_chunks = pop_axes(a.chunks, axis1, axis2) + (kdiag_chunks, )
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[a])
    meta = meta_from_array(a, ndims_free + 1)
    return Array(graph, name, out_chunks, meta=meta)
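
A usage sketch via dask.array.diagonal:

import dask.array as da
import numpy as np

x = da.from_array(np.arange(16).reshape(4, 4), chunks=2)
da.diagonal(x).compute()            # array([ 0,  5, 10, 15])
da.diagonal(x, offset=1).compute()  # array([ 1,  6, 11])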
Example #13
def parse_einsum_input(operands):
    """
    A reproduction of NumPy's ``_parse_einsum_input()``,
    which is itself a Python reproduction of the
    C-side einsum parsing.

    Returns
    -------
    input_strings : str
        Parsed input strings
    output_string : str
        Parsed output string
    operands : list of array_like
        The operands to use in the numpy contraction

    Examples
    --------
    The operand list is simplified to reduce printing:
    >> a = np.random.rand(4, 4)
    >> b = np.random.rand(4, 4, 4)
    >> parse_einsum_input(('...a,...a->...', a, b))
    ('za,xza', 'xz', [a, b])
    >> parse_einsum_input((a, [Ellipsis, 0], b, [Ellipsis, 0]))
    ('za,xza', 'xz', [a, b])
    """

    if len(operands) == 0:
        raise ValueError("No input operands")

    if isinstance(operands[0], str):  # `basestring` existed only on Python 2
        subscripts = operands[0].replace(" ", "")
        operands = [asarray(o) for o in operands[1:]]

        # Ensure all characters are valid
        for s in subscripts:
            if s in ".,->":
                continue
            if s not in einsum_symbols_set:
                raise ValueError("Character %s is not a valid symbol." % s)

    else:
        tmp_operands = list(operands)
        operand_list = []
        subscript_list = []
        for p in range(len(operands) // 2):
            operand_list.append(tmp_operands.pop(0))
            subscript_list.append(tmp_operands.pop(0))

        output_list = tmp_operands[-1] if len(tmp_operands) else None
        operands = [asarray(v) for v in operand_list]
        subscripts = ""
        last = len(subscript_list) - 1
        for num, sub in enumerate(subscript_list):
            for s in sub:
                if s is Ellipsis:
                    subscripts += "..."
                elif isinstance(s, int):
                    subscripts += einsum_symbols[s]
                else:
                    raise TypeError("For this input type lists must contain "
                                    "either int or Ellipsis")
            if num != last:
                subscripts += ","

        if output_list is not None:
            subscripts += "->"
            for s in output_list:
                if s is Ellipsis:
                    subscripts += "..."
                elif isinstance(s, int):
                    subscripts += einsum_symbols[s]
                else:
                    raise TypeError("For this input type lists must contain "
                                    "either int or Ellipsis")
    # Check for proper "->"
    if ("-" in subscripts) or (">" in subscripts):
        invalid = (subscripts.count("-") > 1) or (subscripts.count(">") > 1)
        if invalid or (subscripts.count("->") != 1):
            raise ValueError("Subscripts can only contain one '->'.")

    # Parse ellipses
    if "." in subscripts:
        used = subscripts.replace(".", "").replace(",", "").replace("->", "")
        unused = list(einsum_symbols_set - set(used))
        ellipse_inds = "".join(unused)
        longest = 0

        if "->" in subscripts:
            input_tmp, output_sub = subscripts.split("->")
            split_subscripts = input_tmp.split(",")
            out_sub = True
        else:
            split_subscripts = subscripts.split(",")
            out_sub = False

        for num, sub in enumerate(split_subscripts):
            if "." in sub:
                if (sub.count(".") != 3) or (sub.count("...") != 1):
                    raise ValueError("Invalid Ellipses.")

                # Take into account numerical values
                if operands[num].shape == ():
                    ellipse_count = 0
                else:
                    ellipse_count = max(operands[num].ndim, 1)
                    ellipse_count -= len(sub) - 3

                if ellipse_count > longest:
                    longest = ellipse_count

                if ellipse_count < 0:
                    raise ValueError("Ellipses lengths do not match.")
                elif ellipse_count == 0:
                    split_subscripts[num] = sub.replace("...", "")
                else:
                    rep_inds = ellipse_inds[-ellipse_count:]
                    split_subscripts[num] = sub.replace("...", rep_inds)

        subscripts = ",".join(split_subscripts)
        if longest == 0:
            out_ellipse = ""
        else:
            out_ellipse = ellipse_inds[-longest:]

        if out_sub:
            subscripts += "->" + output_sub.replace("...", out_ellipse)
        else:
            # Special care for outputless ellipses
            output_subscript = ""
            tmp_subscripts = subscripts.replace(",", "")
            for s in sorted(set(tmp_subscripts)):
                if s not in einsum_symbols_set:
                    raise ValueError("Character %s is not a valid symbol." % s)
                if tmp_subscripts.count(s) == 1:
                    output_subscript += s
            normal_inds = "".join(
                sorted(set(output_subscript) - set(out_ellipse)))

            subscripts += "->" + out_ellipse + normal_inds

    # Build output string if does not exist
    if "->" in subscripts:
        input_subscripts, output_subscript = subscripts.split("->")
    else:
        input_subscripts = subscripts
        # Build output subscripts
        tmp_subscripts = subscripts.replace(",", "")
        output_subscript = ""
        for s in sorted(set(tmp_subscripts)):
            if s not in einsum_symbols_set:
                raise ValueError("Character %s is not a valid symbol." % s)
            if tmp_subscripts.count(s) == 1:
                output_subscript += s

    # Make sure output subscripts are in the input
    for char in output_subscript:
        if char not in input_subscripts:
            raise ValueError(
                "Output character %s did not appear in the input" % char)

    # Make sure number operands is equivalent to the number of terms
    if len(input_subscripts.split(",")) != len(operands):
        raise ValueError(
            "Number of einsum subscripts must be equal to the number of operands."
        )

    return (input_subscripts, output_subscript, operands)
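
A sketch of the two accepted input styles; this parser is internal and is normally reached through dask.array.einsum:

import dask.array as da

a = da.ones((4, 4), chunks=2)
b = da.ones((4,), chunks=2)
parse_einsum_input(("ij,j->i", a, b))         # ('ij,j', 'i', [a, b])
parse_einsum_input((a, [0, 1], b, [1], [0]))  # interleaved operand/subscript-list form
da.einsum("ij,j->i", a, b).compute()          # array([4., 4., 4., 4.])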