def test_meta_from_array_literal(meta, dtype):
    """Check the dtype ``meta_from_array`` reports for a literal ``meta``.

    When no ``dtype`` is requested the resulting dtype must be of a string
    kind (``"S"`` or ``"U"``); otherwise it must equal the dtype NumPy gives
    to an empty array created with the same ``dtype``.
    """
    result_dtype = meta_from_array(meta, dtype=dtype).dtype

    if dtype is not None:
        expected_dtype = np.array([], dtype=dtype).dtype
        assert result_dtype == expected_dtype
        return

    assert result_dtype.kind in "SU"
def wrap_func_like(func, *args, **kwargs):
    """
    Transform np creation function into blocked version

    The first positional argument is the array the result is modelled on:
    its meta is reused and, unless ``shape`` is passed as a keyword, so is
    its shape. One task is generated per output chunk, each calling
    ``func`` with ``dtype`` and with that chunk's own ``shape``.
    """
    x = args[0]
    meta = meta_from_array(x)
    shape = kwargs.get("shape", x.shape)

    parsed = _parse_wrap_args(func, args, kwargs, shape)
    dtype = parsed["dtype"]
    chunks = parsed["chunks"]
    name = parsed["name"]
    kwargs = parsed["kwargs"]

    keys = product([name], *[range(len(bd)) for bd in chunks])
    shapes = product(*chunks)

    # Build a fresh keyword dict for every chunk. Re-using a single dict for
    # all chunks (and mutating its "shape" entry in a loop) leaves every task
    # with the shape of the *last* chunk, which is wrong for uneven chunks.
    vals = (
        (partial(func, dtype=dtype, **{**kwargs, "shape": s}),) + args
        for s in shapes
    )

    dsk = dict(zip(keys, vals))

    return Array(dsk, name, chunks, meta=meta.astype(dtype))
def diag(v, k=0):
    """Extract a diagonal or construct a diagonal array.

    Parameters
    ----------
    v : numpy.ndarray or dask Array
        1-d input builds a 2-d array with ``v`` on the ``k``-th diagonal;
        2-d input extracts the ``k``-th diagonal.
    k : int, optional
        Diagonal offset. Defaults to 0 (the main diagonal).

    Returns
    -------
    dask Array

    Raises
    ------
    TypeError
        If ``v`` is neither a NumPy array nor a dask Array.
    ValueError
        If ``v`` is not 1-d or 2-d.
    """
    if not isinstance(v, np.ndarray) and not isinstance(v, Array):
        raise TypeError(f"v must be a dask array or numpy array, got {type(v)}")

    name = "diag-" + tokenize(v, k)

    meta = meta_from_array(v, 2 if v.ndim == 1 else 1)

    # After the type guard above ``v`` is either an ndarray or a dask Array,
    # so checking for ndarray is enough to select the single-task path.
    if isinstance(v, np.ndarray):
        if v.ndim == 1:
            m = abs(k)
            chunks = ((v.shape[0] + m,), (v.shape[0] + m,))
            dsk = {(name, 0, 0): (np.diag, v, k)}
        elif v.ndim == 2:
            # Length of the k-th diagonal of a (rows, cols) matrix.
            kdiag_row_start = max(0, -k)
            kdiag_row_stop = min(v.shape[0], v.shape[1] - k)
            len_kdiag = kdiag_row_stop - kdiag_row_start
            chunks = ((0,),) if len_kdiag <= 0 else ((len_kdiag,),)
            dsk = {(name, 0): (np.diag, v, k)}
        else:
            raise ValueError("Array must be 1d or 2d only")
        return Array(dsk, name, chunks, meta=meta)

    if v.ndim != 1:
        if v.ndim != 2:
            raise ValueError("Array must be 1d or 2d only")
        if k == 0 and v.chunks[0] == v.chunks[1]:
            # Identical chunking on both axes: the main diagonal lives
            # entirely in the diagonal blocks.
            dsk = {
                (name, i): (np.diag, row[i])
                for i, row in enumerate(v.__dask_keys__())
            }
            graph = HighLevelGraph.from_collections(name, dsk, dependencies=[v])
            return Array(graph, name, (v.chunks[0],), meta=meta)
        else:
            return diagonal(v, k)

    if k == 0:
        chunks_1d = v.chunks[0]
        blocks = v.__dask_keys__()
        dsk = {}
        for i, m in enumerate(chunks_1d):
            for j, n in enumerate(chunks_1d):
                key = (name, i, j)
                if i == j:
                    dsk[key] = (np.diag, blocks[i])
                else:
                    # Off-diagonal blocks are zeros created "like" the meta.
                    dsk[key] = (partial(np.zeros_like, shape=(m, n)), meta)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[v])
        return Array(graph, name, (chunks_1d, chunks_1d), meta=meta)

    elif k > 0:
        return pad(diag(v), [[0, k], [k, 0]], mode="constant")
    elif k < 0:
        return pad(diag(v), [[-k, 0], [0, -k]], mode="constant")
def test_meta_from_array_type_inputs():
    """``meta`` may be given as an array *type* instead of an instance."""
    x = meta_from_array(np.ndarray, ndim=2, dtype=np.float32)
    assert isinstance(x, np.ndarray)
    assert x.ndim == 2
    assert x.dtype == np.float32

    # ``np.float`` (an alias of the builtin ``float``) was deprecated in
    # NumPy 1.20 and removed in 1.24; ``np.float64`` is the same dtype.
    x = da.Array(
        {("x", 0, 0): (np.ones, (5, 5))},
        name="x",
        chunks=(5, 5),
        shape=(5, 5),
        meta=np.ndarray,
        dtype=np.float64,
    )
    assert_eq(x, x)

    assert (
        da.from_array(np.ones(5).astype(np.int32), meta=np.ndarray).dtype
        == np.int32
    )
def slice_with_int_dask_array(x, idx, offset, x_size, axis):
    """Chunk function of `slice_with_int_dask_array_on_axis`.

    Slice one chunk of x by one chunk of idx.

    Parameters
    ----------
    x: ndarray, any dtype, any shape
        i-th chunk of x
    idx: ndarray, ndim=1, dtype=any integer
        j-th chunk of idx (cartesian product with the chunks of x)
    offset: ndarray, shape=(1, ), dtype=int64
        Index of the first element along axis of the current chunk of x
    x_size: int
        Total size of the x da.Array along axis
    axis: int
        normalized axis to take elements from (0 <= axis < x.ndim)

    Returns
    -------
    x sliced along axis, using only the elements of idx that fall inside the
    current chunk.
    """
    from dask.array.utils import asarray_safe, meta_from_array

    # Convert idx using the meta of x as the ``like`` template, then cast to
    # a signed 64-bit integer (needed when idx is unsigned).
    positions = asarray_safe(idx, like=meta_from_array(x)).astype(np.int64)

    # Negative indices count from the end of the *whole* array along axis.
    positions = np.where(positions < 0, positions + x_size, positions)

    # ``offset`` holds the global index of this chunk's first element along
    # axis; subtracting it makes the positions relative to the chunk.
    positions = positions - offset

    # Keep only the positions that land inside the current chunk of x.
    inside_chunk = (positions >= 0) & (positions < x.shape[axis])
    positions = positions[inside_chunk]

    # np.take does not support slice indices, so index with a tuple instead.
    selector = tuple(
        positions if dim == axis else slice(None) for dim in range(x.ndim)
    )
    return x[selector]
def make_meta(obj):
    """Build a meta object for a DataArray or Dataset.

    For a DataArray or Dataset, return a new object of the same type whose
    variables carry the same names, dims and attrs, but whose data is
    replaced by the result of ``meta_from_array`` on the original data.
    Any other object is returned unaltered.
    """
    is_data_array = isinstance(obj, DataArray)
    if not is_data_array and not isinstance(obj, Dataset):
        return obj

    # A DataArray is processed through its temporary Dataset form and
    # converted back at the end.
    dataset = obj._to_temp_dataset() if is_data_array else obj

    meta = Dataset()
    for var_name, var in dataset.variables.items():
        meta[var_name] = (
            var.dims,
            meta_from_array(var.data, ndim=var.ndim),
            var.attrs,
        )
    meta.attrs = dataset.attrs
    meta = meta.set_coords(dataset.coords)

    if is_data_array:
        return obj._from_temp_dataset(meta)
    return meta
def test_meta_from_array(asarray):
    """Exercise ``meta_from_array`` on arrays, dask arrays and non-arrays."""
    # A 0-d array can be asked for a higher-dimensional meta.
    scalar = np.array(1)
    assert meta_from_array(scalar, ndim=1).shape == (0,)

    # Wrap a NumPy array with the array constructor under test.
    wrapped = asarray(np.ones((1, 2, 3), dtype="float32"))
    assert meta_from_array(wrapped).shape == (0, 0, 0)
    assert meta_from_array(wrapped).dtype == "float32"
    assert type(meta_from_array(wrapped)) is type(wrapped)

    # ndim and dtype overrides.
    assert meta_from_array(wrapped, ndim=2).shape == (0, 0)
    assert meta_from_array(wrapped, ndim=4).shape == (0, 0, 0, 0)
    assert meta_from_array(wrapped, dtype="float64").dtype == "float64"

    # The meta of a dask array is a NumPy array.
    lazy = da.ones((1,))
    assert isinstance(meta_from_array(lazy), np.ndarray)

    # Non-array inputs come back equal to what went in.
    assert meta_from_array(123) == 123
    assert meta_from_array("foo") == "foo"
    assert meta_from_array(np.dtype("float32")) == np.dtype("float32")
def test_meta_from_array(asarray):
    """Exercise ``meta_from_array`` on arrays, dask arrays and non-arrays.

    The ``COO.from_numpy`` constructor is xfailed when ``_numpy_120`` is set
    (tracked as "sparse-383").
    """
    is_sparse_coo = "COO.from_numpy" in str(asarray)
    if is_sparse_coo and _numpy_120:
        raise pytest.xfail(reason="sparse-383")

    # A 0-d array can be asked for a higher-dimensional meta.
    scalar = np.array(1)
    assert meta_from_array(scalar, ndim=1).shape == (0,)

    # Wrap a NumPy array with the array constructor under test.
    wrapped = asarray(np.ones((1, 2, 3), dtype="float32"))
    assert meta_from_array(wrapped).shape == (0, 0, 0)
    assert meta_from_array(wrapped).dtype == "float32"
    assert type(meta_from_array(wrapped)) is type(wrapped)

    # ndim and dtype overrides.
    assert meta_from_array(wrapped, ndim=2).shape == (0, 0)
    assert meta_from_array(wrapped, ndim=4).shape == (0, 0, 0, 0)
    assert meta_from_array(wrapped, dtype="float64").dtype == "float64"

    # The meta of a dask array is a NumPy array.
    lazy = da.ones((1,))
    assert isinstance(meta_from_array(lazy), np.ndarray)

    # Non-array inputs come back equal to what went in.
    assert meta_from_array(123) == 123
    assert meta_from_array("foo") == "foo"
    assert meta_from_array(np.dtype("float32")) == np.dtype("float32")
def percentile(a, q, method="linear", internal_method="default", **kwargs):
    """Approximate percentile of 1-D array

    Parameters
    ----------
    a : Array
    q : array_like of float
        Percentile or sequence of percentiles to compute, which must be between
        0 and 100 inclusive.
    method : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        The interpolation method to use when the desired percentile lies
        between two data points ``i < j``. Only used by the ``'dask'``
        internal method; t-digest is only selected when this is ``'linear'``.

        - 'linear': ``i + (j - i) * fraction``, where ``fraction`` is the
          fractional part of the index surrounded by ``i`` and ``j``.
        - 'lower': ``i``.
        - 'higher': ``j``.
        - 'nearest': ``i`` or ``j``, whichever is nearest.
        - 'midpoint': ``(i + j) / 2``.

        .. versionchanged:: 2022.1.0
            This argument was previously called "interpolation"

    internal_method : {'default', 'dask', 'tdigest'}, optional
        What internal method to use. By default will use dask's internal custom
        algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest for
        floats and ints and fallback to the ``'dask'`` otherwise.

        .. versionchanged:: 2022.1.0
            This argument was previously called “method”.

    interpolation : str, optional
        Deprecated name for the method keyword argument.

        .. deprecated:: 2022.1.0

    Raises
    ------
    TypeError
        If unexpected keyword arguments are passed.
    NotImplementedError
        If ``a`` is not 1-dimensional.
    ValueError
        If ``internal_method`` is not one of the allowed values.

    See Also
    --------
    numpy.percentile : Numpy's equivalent Percentile function
    """
    from dask.array.dispatch import percentile_lookup as _percentile
    from dask.array.utils import array_safe, meta_from_array

    allowed_internal_methods = ["default", "dask", "tdigest"]
    if method in allowed_internal_methods:
        warnings.warn(
            "In Dask 2022.1.0, the `method=` argument was renamed to `internal_method=`",
            FutureWarning,
        )
        internal_method = method
        # The legacy value named the internal algorithm, not an interpolation
        # method, so it must not leak into the interpolation argument. Fall
        # back to the default; a legacy ``interpolation=`` overrides it below.
        method = "linear"

    if "interpolation" in kwargs:
        if _numpy_122:
            warnings.warn(
                "In Dask 2022.1.0, the `interpolation=` argument to percentile was renamed to "
                "`method= ` ",
                FutureWarning,
            )
        method = kwargs.pop("interpolation")

    if kwargs:
        raise TypeError(
            f"percentile() got an unexpected keyword argument {kwargs.keys()}"
        )

    if a.ndim != 1:
        raise NotImplementedError("Percentiles only implemented for 1-d arrays")
    if isinstance(q, Number):
        q = [q]
    q = array_safe(q, like=meta_from_array(a))
    token = tokenize(a, q, method)

    dtype = a.dtype
    if np.issubdtype(dtype, np.integer):
        # Use the dtype that dividing an integer array by a float produces.
        dtype = (array_safe([], dtype=dtype, like=meta_from_array(a)) / 0.5).dtype
    meta = meta_from_array(a, dtype=dtype)

    if internal_method not in allowed_internal_methods:
        raise ValueError(
            f"`internal_method=` must be one of {allowed_internal_methods}"
        )

    # Allow using t-digest if method is allowed and dtype is of floating or integer type
    if (
        internal_method == "tdigest"
        and method == "linear"
        and (np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer))
    ):
        from dask.utils import import_required

        import_required(
            "crick", "crick is a required dependency for using the t-digest method."
        )

        name = "percentile_tdigest_chunk-" + token
        dsk = {
            (name, i): (_tdigest_chunk, key)
            for i, key in enumerate(a.__dask_keys__())
        }

        name2 = "percentile_tdigest-" + token

        dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))}

    # Otherwise use the custom percentile algorithm
    else:
        # Add 0 and 100 during calculation for more robust behavior (hopefully)
        calc_q = np.pad(q, 1, mode="constant")
        calc_q[-1] = 100
        name = "percentile_chunk-" + token
        dsk = {
            (name, i): (_percentile, key, calc_q, method)
            for i, key in enumerate(a.__dask_keys__())
        }

        name2 = "percentile-" + token
        dsk2 = {
            (name2, 0): (
                merge_percentiles,
                q,
                [calc_q] * len(a.chunks[0]),
                sorted(dsk),
                method,
            )
        }

    dsk = merge(dsk, dsk2)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a])
    return Array(graph, name2, chunks=((len(q),),), meta=meta)
def apply_gufunc(
    func,
    signature,
    *args,
    axes=None,
    axis=None,
    keepdims=False,
    output_dtypes=None,
    output_sizes=None,
    vectorize=None,
    allow_rechunk=False,
    meta=None,
    **kwargs,
):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like ``np.vectorize``, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``signature`` has to declare the core dimensions of each of them.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    meta: Optional, tuple, keyword only
        tuple of empty ndarrays describing the shape and dtype of the output of the gufunc.
        Defaults to ``None``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If both ``meta`` and ``output_dtypes`` are given, if ``meta`` /
        ``output_dtypes`` do not match the number of outputs, if the number
        of arguments does not match ``signature``, or if dimension lengths
        or chunk sizes are inconsistent.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://numpy.org/doc/stable/reference/ufuncs.html
    .. [2] https://numpy.org/doc/stable/reference/c-api/generalized-ufuncs.html
    """
    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError("`signature` has to be of type string")
    # NumPy versions before https://github.com/numpy/numpy/pull/19627
    # would not ignore whitespace characters in `signature` like they
    # are supposed to. We remove the whitespace here as a workaround.
    signature = re.sub(r"\s+", "", signature)
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Consolidate onto `meta`
    if meta is not None and output_dtypes is not None:
        raise ValueError(
            "Only one of `meta` and `output_dtypes` should be given (`meta` is preferred)."
        )
    if meta is None:
        if output_dtypes is None:
            ## Infer `output_dtypes`
            if vectorize:
                tempfunc = np.vectorize(func, signature=signature)
            else:
                tempfunc = func
            output_dtypes = apply_infer_dtype(
                tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout
            )

        ## Turn `output_dtypes` into `meta`
        if (
            nout is None
            and isinstance(output_dtypes, (tuple, list))
            and len(output_dtypes) == 1
        ):
            output_dtypes = output_dtypes[0]
        sample = args[0] if args else None
        if nout is None:
            meta = meta_from_array(sample, dtype=output_dtypes)
        else:
            meta = tuple(meta_from_array(sample, dtype=odt) for odt in output_dtypes)

    ## Normalize `meta` format
    meta = meta_from_array(meta)
    if isinstance(meta, list):
        meta = tuple(meta)

    ## Validate `meta`
    if nout is None:
        if isinstance(meta, tuple):
            if len(meta) == 1:
                meta = meta[0]
            else:
                raise ValueError(
                    "For a function with one output, must give a single item for `output_dtypes`/`meta`, "
                    "not a tuple or list."
                )
    else:
        if not isinstance(meta, tuple):
            raise ValueError(
                f"For a function with {nout} outputs, must give a tuple or list for `output_dtypes`/`meta`, "
                "not a single item."
            )
        if len(meta) != nout:
            raise ValueError(
                f"For a function with {nout} outputs, must give a tuple or list of {nout} items for "
                f"`output_dtypes`/`meta`, not {len(meta)}."
            )

    ## Vectorize function, if required
    if vectorize:
        otypes = [x.dtype for x in meta] if isinstance(meta, tuple) else [meta.dtype]
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(
        axes, axis, keepdims, input_coredimss, output_coredimss
    )

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(input_coredimss), len(args))
        )

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        # Express the core axes as negative indices and move them last.
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes) | {1} != {1, max(sizes)}:
            raise ValueError(f"Dimension `'{dim}'` with different lengths in arrays")
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(
                        dim
                    )
                )
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1)
            )
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    f"Dimension `'{dim}'` with different chunksize present"
                )

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(
        func, loop_output_dims, *arginds, concatenate=True, meta=meta, **kwargs
    )

    # NOTE: we likely could just use `meta` instead of `tmp._meta`,
    # but we use it and validate it anyway just to be sure nothing odd has happened.
    metas = tmp._meta
    if nout is None:
        assert not isinstance(
            metas, (list, tuple)
        ), f"meta changed from single output to multiple output during blockwise: {meta} -> {metas}"
        metas = (metas,)
    else:
        assert isinstance(
            metas, (list, tuple)
        ), f"meta changed from multiple output to single output during blockwise: {meta} -> {metas}"
        assert (
            len(metas) == nout
        ), f"Number of outputs changed from {nout} to {len(metas)} during blockwise"

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    # Split on the LAST hyphen only: the name part may itself contain
    # hyphens, in which case a plain ``split("-")`` fails to unpack.
    name, token = keys[0][0].rsplit("-", 1)

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]

    ## Split output
    leaf_arrs = []
    for i, (ocd, oax, meta) in enumerate(zip(output_coredimss, output_axes, metas)):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {
            (leaf_name,)
            + key[1:]
            + core_chunkinds: ((getitem, key, i) if nout else key)
            for key in keys
        }
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        meta = meta_from_array(meta, len(output_shape))
        leaf_arr = Array(
            graph, leaf_name, chunks=output_chunks, shape=output_shape, meta=meta
        )

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
            leaf_arr = leaf_arr[slices]

        # Move the trailing core axes to the positions requested in ``oax``;
        # the remaining axes keep their relative order.
        tidcs = [None] * len(leaf_arr.shape)
        for ii, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = ii
        j = 0
        for ii in range(len(tidcs)):
            if tidcs[ii] is None:
                tidcs[ii] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return (*leaf_arrs,) if nout else leaf_arrs[0]  # Undo *) from above
def pad_edge(array, pad_width, mode, **kwargs):
    """
    Helper function for padding edges.

    Handles the cases where only the values on the edge are needed. The
    array is padded one axis at a time; for each axis a leading and a
    trailing block are built according to ``mode`` and concatenated around
    the running result.
    """
    expanded = {key: expand_pad_value(array, val) for key, val in kwargs.items()}

    padded = array
    for axis in range(array.ndim):
        shapes, chunkss = get_pad_shapes_chunks(padded, pad_width, (axis,))
        # Default blocks, used when ``mode`` matches none of the branches.
        borders = [padded, padded]

        if mode == "constant":
            from dask.array.utils import asarray_safe

            fill_values = [
                asarray_safe(c, like=meta_from_array(array), dtype=padded.dtype)
                for c in expanded["constant_values"][axis]
            ]
            borders = [
                broadcast_to(v, s, c) for v, s, c in zip(fill_values, shapes, chunkss)
            ]
        elif mode in ("edge", "linear_ramp"):
            # Select the first and the last element along the current axis.
            leading = padded.ndim * [slice(None)]
            trailing = padded.ndim * [slice(None)]
            leading[axis] = slice(None, 1, None)
            trailing[axis] = slice(-1, None, None)
            borders = [padded[tuple(leading)], padded[tuple(trailing)]]

            if mode == "edge":
                borders = [
                    broadcast_to(a, s, c) for a, s, c in zip(borders, shapes, chunkss)
                ]
            elif mode == "linear_ramp":
                end_values = expanded["end_values"][axis]
                borders = [
                    a.map_blocks(
                        linear_ramp_chunk,
                        ev,
                        pw,
                        chunks=c,
                        dtype=padded.dtype,
                        dim=axis,
                        step=(2 * i - 1),
                    )
                    for i, (a, ev, pw, c) in enumerate(
                        zip(borders, end_values, pad_width[axis], chunkss)
                    )
                ]
        elif mode == "empty":
            borders = [
                empty_like(array, shape=s, dtype=array.dtype, chunks=c)
                for s, c in zip(shapes, chunkss)
            ]

        before, after = borders[0], borders[1]
        padded = concatenate([before, padded, after], axis=axis)

    return padded
def diagonal(a, offset=0, axis1=0, axis2=1):
    """Return the ``offset``-th diagonals of ``a`` taken over two axes.

    Parameters
    ----------
    a : dask Array
        Input with at least two dimensions.
    offset : int, optional
        Offset of the diagonal from the main diagonal. Defaults to 0.
    axis1, axis2 : int, optional
        The two axes the diagonal is taken over; negative values count from
        the end. They must differ. Default to 0 and 1.

    Returns
    -------
    dask Array
        ``axis1`` and ``axis2`` are removed and the diagonal becomes the
        last axis.

    Raises
    ------
    ValueError
        If ``a`` has fewer than two dimensions or ``axis1 == axis2``.
    AxisError
        If an axis is out of bounds for ``a``.
    """
    name = "diagonal-" + tokenize(a, offset, axis1, axis2)

    if a.ndim < 2:
        # NumPy uses `diag` as we do here.
        raise ValueError("diag requires an array of at least two dimensions")

    def _axis_fmt(axis, name, ndim):
        # Normalize ``axis`` to ``0 <= axis < ndim`` or raise AxisError.
        if not -ndim <= axis < ndim:
            # ``np.AxisError`` was removed from the main namespace in
            # NumPy 2.0; it lives in ``np.exceptions`` since NumPy 1.25.
            axis_error = getattr(np, "exceptions", np).AxisError
            msg = "{}: axis {} is out of bounds for array of dimension {}"
            raise axis_error(msg.format(name, axis, ndim))
        return axis + ndim if axis < 0 else axis

    def pop_axes(chunks, axis1, axis2):
        # Drop both axes; pop the larger index first so the smaller stays valid.
        chunks = list(chunks)
        chunks.pop(axis2)
        chunks.pop(axis1)
        return tuple(chunks)

    axis1 = _axis_fmt(axis1, "axis1", a.ndim)
    axis2 = _axis_fmt(axis2, "axis2", a.ndim)

    if axis1 == axis2:
        raise ValueError("axis1 and axis2 cannot be the same")

    a = asarray(a)
    k = offset
    if axis1 > axis2:
        # Swapping the axes mirrors the diagonal, hence the negated offset.
        axis1, axis2 = axis2, axis1
        k = -offset

    # Explicitly ordered so block indices line up with the output axes.
    free_axes = [ax for ax in range(a.ndim) if ax not in (axis1, axis2)]
    free_indices = list(product(*(range(a.numblocks[i]) for i in free_axes)))
    ndims_free = len(free_axes)

    # equation of diagonal: i = j - k
    kdiag_row_start = max(0, -k)
    kdiag_col_start = max(0, k)
    kdiag_row_stop = min(a.shape[axis1], a.shape[axis2] - k)
    len_kdiag = kdiag_row_stop - kdiag_row_start

    if len_kdiag <= 0:
        # The requested diagonal lies outside the array: empty result.
        xp = np

        if is_cupy_type(a._meta):
            import cupy

            xp = cupy

        out_chunks = pop_axes(a.chunks, axis1, axis2) + ((0,),)
        dsk = dict()
        for free_idx in free_indices:
            shape = tuple(
                out_chunks[axis][free_idx[axis]] for axis in range(ndims_free)
            )
            dsk[(name,) + free_idx + (0,)] = (
                partial(xp.empty, dtype=a.dtype),
                shape + (0,),
            )

        meta = meta_from_array(a, ndims_free + 1)
        return Array(dsk, name, out_chunks, meta=meta)

    # compute row index ranges for chunks along axis1:
    row_stops_ = np.cumsum(a.chunks[axis1])
    row_starts = np.roll(row_stops_, 1)
    row_starts[0] = 0

    # compute column index ranges for chunks along axis2:
    col_stops_ = np.cumsum(a.chunks[axis2])
    col_starts = np.roll(col_stops_, 1)
    col_starts[0] = 0

    # locate first chunk containing diagonal:
    row_blockid = np.arange(a.numblocks[axis1])
    col_blockid = np.arange(a.numblocks[axis2])

    row_filter = (row_starts <= kdiag_row_start) & (kdiag_row_start < row_stops_)
    col_filter = (col_starts <= kdiag_col_start) & (kdiag_col_start < col_stops_)
    (I,) = row_blockid[row_filter]
    (J,) = col_blockid[col_filter]

    # follow k-diagonal through chunks while constructing dask graph:
    dsk = dict()
    i = 0
    kdiag_chunks = ()
    while kdiag_row_start < a.shape[axis1] and kdiag_col_start < a.shape[axis2]:
        # localize block info:
        nrows, ncols = a.chunks[axis1][I], a.chunks[axis2][J]
        kdiag_row_start -= row_starts[I]
        kdiag_col_start -= col_starts[J]
        k = -kdiag_row_start if kdiag_row_start > 0 else kdiag_col_start
        kdiag_row_end = min(nrows, ncols - k)
        kdiag_len = kdiag_row_end - kdiag_row_start

        # increment dask graph:
        for free_idx in free_indices:
            input_idx = (
                free_idx[:axis1]
                + (I,)
                + free_idx[axis1 : axis2 - 1]
                + (J,)
                + free_idx[axis2 - 1 :]
            )
            output_idx = free_idx + (i,)
            dsk[(name,) + output_idx] = (
                np.diagonal,
                (a.name,) + input_idx,
                k,
                axis1,
                axis2,
            )

        kdiag_chunks += (kdiag_len,)
        # prepare for next iteration:
        i += 1
        kdiag_row_start = kdiag_row_end + row_starts[I]
        kdiag_col_start = min(ncols, nrows + k) + col_starts[J]
        I = I + 1 if kdiag_row_start == row_stops_[I] else I
        J = J + 1 if kdiag_col_start == col_stops_[J] else J

    out_chunks = pop_axes(a.chunks, axis1, axis2) + (kdiag_chunks,)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[a])
    meta = meta_from_array(a, ndims_free + 1)
    return Array(graph, name, out_chunks, meta=meta)
def arange(*args, chunks="auto", like=None, dtype=None, **kwargs):
    """
    Return evenly spaced values from `start` to `stop` with step size `step`.

    The values are half-open [start, stop), so including start and excluding
    stop. This is basically the same as python's range function but for dask
    arrays.

    When using a non-integer step, such as 0.1, the results will often not be
    consistent. It is better to use linspace for these cases.

    Parameters
    ----------
    start : int, optional
        The starting value of the sequence. The default is 0.
    stop : int
        The end of the interval, this value is excluded from the interval.
    step : int, optional
        The spacing between the values. The default is 1 when not specified.
    chunks :  int
        The number of samples on each block. Note that the last block will have
        fewer samples if ``len(array) % chunks != 0``.
        Defaults to "auto" which will automatically determine chunk sizes.
    dtype : numpy.dtype
        Output dtype. Omit to infer it from start, stop, step
        Defaults to ``None``.
    like : array type or ``None``
        Array to extract meta from. Defaults to ``None``.

    Returns
    -------
    samples : dask array

    See Also
    --------
    dask.array.linspace
    """
    n_positional = len(args)
    if n_positional == 1:
        start, stop, step = 0, args[0], 1
    elif n_positional == 2:
        start, stop = args
        step = 1
    elif n_positional == 3:
        start, stop, step = args
    else:
        raise TypeError(
            """
        arange takes 3 positional arguments: arange([start], stop, [step])
        """
        )

    # Number of elements in the half-open interval; never negative.
    num = int(max(np.ceil((stop - start) / step), 0))

    meta = None if like is None else meta_from_array(like)

    if dtype is None:
        # Let NumPy infer the dtype from a tiny stand-in range.
        dtype = np.arange(start, stop, step * num if num else step).dtype

    chunks = normalize_chunks(chunks, (num,), dtype=dtype)

    if kwargs:
        raise TypeError(
            "Unexpected keyword argument(s): %s" % ",".join(kwargs.keys())
        )

    name = "arange-" + tokenize((start, stop, step, chunks, dtype))

    dsk = {}
    consumed = 0  # elements covered by the blocks generated so far
    for block_id, block_len in enumerate(chunks[0]):
        block_first = start + (consumed * step)
        block_last = start + ((consumed + block_len) * step)
        dsk[(name, block_id)] = (
            partial(chunk.arange, like=like),
            block_first,
            block_last,
            step,
            block_len,
            dtype,
        )
        consumed += block_len

    return Array(dsk, name, chunks, dtype=dtype, meta=meta)
def reshape(x, shape, merge_chunks=True, limit=None):
    """Reshape array to new shape

    Parameters
    ----------
    shape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    merge_chunks : bool, default True
        Whether to merge chunks using the logic in :meth:`dask.array.rechunk`
        when communication is necessary given the input array chunking and
        the output shape. With ``merge_chunks==False``, the input array will
        be rechunked to a chunksize of 1, which can create very many tasks.
    limit: int (optional)
        The maximum block size to target in bytes. If no limit is provided,
        it defaults to using the ``array.chunk-size`` Dask config value.

    Raises
    ------
    ValueError
        If more than one dimension is ``-1``, if the shape or chunk sizes of
        ``x`` are unknown, or if the total size would change.

    Notes
    -----
    This is a parallelized version of the ``np.reshape`` function with the
    following limitations:

    1.  It assumes that the array is stored in `row-major order`_
    2.  It only allows for reshapings that collapse or merge dimensions like
        ``(1, 2, 3, 4) -> (1, 6, 4)`` or ``(64,) -> (4, 4, 4)``

    .. _`row-major order`: https://en.wikipedia.org/wiki/Row-_and_column-major_order

    When communication is necessary this algorithm depends on the logic within
    rechunk.  It endeavors to keep chunk sizes roughly the same when possible.

    See :ref:`array-chunks.reshaping` for a discussion of the tradeoffs of
    ``merge_chunks``.

    See Also
    --------
    dask.array.rechunk
    numpy.reshape
    """
    # Sanitize inputs, look for -1 in shape
    from numbers import Integral

    from dask.array.core import PerformanceWarning
    from dask.array.slicing import sanitize_index

    # A bare integer is documented as "1-D array of that length".
    if isinstance(shape, Integral):
        shape = (shape,)

    shape = tuple(map(sanitize_index, shape))
    known_sizes = [s for s in shape if s != -1]
    if len(known_sizes) < len(shape):
        if len(shape) - len(known_sizes) > 1:
            raise ValueError("can only specify one unknown dimension")

        # Fastpath for x.reshape(-1) on 1D arrays, allows unknown shape in x
        # for this case only.
        if len(shape) == 1 and x.ndim == 1:
            return x

        missing_size = sanitize_index(x.size / reduce(mul, known_sizes, 1))
        shape = tuple(missing_size if s == -1 else s for s in shape)

    if np.isnan(sum(x.shape)):
        raise ValueError(
            "Array chunk size or shape is unknown. shape: %s\n\n"
            "Possible solution with x.compute_chunk_sizes()" % str(x.shape)
        )

    if reduce(mul, shape, 1) != x.size:
        raise ValueError("total size of new array must be unchanged")

    if x.shape == shape:
        return x

    meta = meta_from_array(x, len(shape))

    name = "reshape-" + tokenize(x, shape)

    if x.npartitions == 1:
        # Single chunk: one task reshapes the whole array.
        key = next(flatten(x.__dask_keys__()))
        dsk = {(name,) + (0,) * len(shape): (M.reshape, key, shape)}
        chunks = tuple((d,) for d in shape)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
        return Array(graph, name, chunks, meta=meta)

    # Logic or how to rechunk
    din = len(x.shape)
    dout = len(shape)
    if not merge_chunks and din > dout:
        x = x.rechunk({i: 1 for i in range(din - dout)})

    inchunks, outchunks = reshape_rechunk(x.shape, shape, x.chunks)
    # Check output chunks are not too large
    max_chunksize_in_bytes = reduce(mul, [max(i) for i in outchunks]) * x.dtype.itemsize

    if limit is None:
        limit = parse_bytes(config.get("array.chunk-size"))
        split = config.get("array.slicing.split-large-chunks", None)
    else:
        limit = parse_bytes(limit)
        split = True

    if max_chunksize_in_bytes > limit:
        if split is None:
            msg = (
                "Reshaping is producing a large chunk. To accept the large\n"
                "chunk and silence this warning, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n"
                "    ...     array.reshape(shape)\n\n"
                "To avoid creating the large chunks, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n"
                "    ...     array.reshape(shape)\n\n"
                "Explicitly passing ``limit`` to ``reshape`` will also silence this warning\n"
                "    >>> array.reshape(shape, limit='128 MiB')"
            )
            warnings.warn(msg, PerformanceWarning, stacklevel=6)
        elif split:
            # Leave chunk sizes unaltered where possible
            matching_chunks = Counter(inchunks) & Counter(outchunks)
            chunk_plan = []
            for out in outchunks:
                if matching_chunks[out] > 0:
                    chunk_plan.append(out)
                    matching_chunks[out] -= 1
                else:
                    chunk_plan.append("auto")
            outchunks = normalize_chunks(
                chunk_plan,
                shape=shape,
                limit=limit,
                dtype=x.dtype,
                previous_chunks=inchunks,
            )

    x2 = x.rechunk(inchunks)

    # Construct graph
    in_keys = list(product([x2.name], *[range(len(c)) for c in inchunks]))
    out_keys = list(product([name], *[range(len(c)) for c in outchunks]))
    shapes = list(product(*outchunks))
    dsk = {
        out_key: (M.reshape, in_key, chunk_shape)
        for out_key, in_key, chunk_shape in zip(out_keys, in_keys, shapes)
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x2])
    return Array(graph, name, outchunks, meta=meta)