def test_ensure_dict():
    d = {'x': 1}
    assert ensure_dict(d) is d

    hlg = HighLevelGraph.from_collections('x', d)
    assert type(ensure_dict(hlg)) is dict
    assert ensure_dict(hlg) == d

    class mydict(dict):
        pass

    md = mydict()
    md['x'] = 1
    assert type(ensure_dict(md)) is dict
    assert ensure_dict(md) == d
def test_blockwise_non_blockwise_output():
    x = da.ones(10, chunks=(5,))
    y = (((x + 1) + 2) + 3)
    w = y.sum()
    z = (((y * 2) * 3) * 4)

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (zz,) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    dsk = optimize_blockwise(z.dask, keys=list(dask.core.flatten(z.__dask_keys__())))
    assert isinstance(dsk, HighLevelGraph)
    assert len([layer for layer in dsk.dicts.values() if isinstance(layer, Blockwise)]) == 1

    dsk = optimize_blockwise(
        HighLevelGraph.merge(w.dask, z.dask),
        keys=list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()])),
    )
    assert isinstance(dsk, HighLevelGraph)
    assert len([layer for layer in z.dask.dicts.values() if isinstance(layer, Blockwise)]) >= 1
def _groupby_to_disk( ddf, write_func, col_groups, agg_cols, agg_list, out_path, freq_limit, tree_width, on_host, stat_name="categories", concat_groups=False, name_sep="_", ): if not col_groups: return {} if concat_groups: if agg_list and agg_list != ["count"]: raise ValueError("Cannot use concat_groups=True with aggregations other than count") if agg_cols: raise ValueError("Cannot aggregate continuous-column stats with concat_groups=True") # Update tree_width tw = {} for col in col_groups: col = [col] if isinstance(col, str) else col col_str = _make_name(*col, sep=name_sep) if tree_width is None: tw[col_str] = 8 elif isinstance(tree_width, int): tw[col_str] = tree_width else: tw[col_str] = tree_width.get(col_str, None) or 8 tree_width = tw # Make dedicated output directory for the categories fs = get_fs_token_paths(out_path)[0] out_path = fs.sep.join([out_path, stat_name]) fs.mkdirs(out_path, exist_ok=True) dsk = {} token = tokenize(ddf, col_groups, out_path, freq_limit, tree_width, on_host) level_1_name = "level_1-" + token split_name = "split-" + token level_2_name = "level_2-" + token level_3_name = "level_3-" + token finalize_labels_name = stat_name + "-" + token for p in range(ddf.npartitions): dsk[(level_1_name, p)] = ( _top_level_groupby, (ddf._name, p), col_groups, tree_width, agg_cols, agg_list, on_host, concat_groups, name_sep, ) k = 0 for c, col in enumerate(col_groups): col = [col] if isinstance(col, str) else col col_str = _make_name(*col, sep=name_sep) for s in range(tree_width[col_str]): dsk[(split_name, p, c, s)] = (getitem, (level_1_name, p), k) k += 1 col_groups_str = [] for c, col in enumerate(col_groups): col = [col] if isinstance(col, str) else col col_str = _make_name(*col, sep=name_sep) col_groups_str.append(col_str) freq_limit_val = None if freq_limit: freq_limit_val = freq_limit[col_str] if isinstance(freq_limit, dict) else freq_limit for s in range(tree_width[col_str]): dsk[(level_2_name, c, s)] = ( _mid_level_groupby, [(split_name, p, c, s) for p in range(ddf.npartitions)], col, agg_cols, agg_list, freq_limit_val, on_host, concat_groups, name_sep, ) dsk[(level_3_name, c)] = ( write_func, [(level_2_name, c, s) for s in range(tree_width[col_str])], out_path, col, on_host, concat_groups, name_sep, ) dsk[finalize_labels_name] = ( _finish_labels, [(level_3_name, c) for c, col in enumerate(col_groups)], col_groups_str, ) graph = HighLevelGraph.from_collections(finalize_labels_name, dsk, dependencies=[ddf]) return graph, finalize_labels_name
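# A minimal, hedged sketch of the graph-building pattern used in _groupby_to_disk
# above: hand-write a dict of tasks keyed by (layer_name, index), then wrap it as
# one HighLevelGraph layer that depends on the input collection via
# from_collections. The helper names (per_partition, combine, "toy-reduction")
# are made up for illustration and are not part of the code above.
import dask
import dask.dataframe as dd
import pandas as pd
from dask.highlevelgraph import HighLevelGraph

ddf = dd.from_pandas(pd.DataFrame({"x": range(8)}), npartitions=4)

def per_partition(df):          # stands in for _top_level_groupby
    return df["x"].sum()

def combine(parts):             # stands in for the mid-level/final reductions
    return sum(parts)

name = "toy-reduction"
dsk = {("toy-part", p): (per_partition, (ddf._name, p)) for p in range(ddf.npartitions)}
dsk[name] = (combine, [("toy-part", p) for p in range(ddf.npartitions)])

# from_collections also pulls in ddf's own layers, so the graph is complete
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[ddf])
print(dask.get(graph, name))    # 0 + 1 + ... + 7 == 28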
def blockwise( func, out_ind, *args, name=None, token=None, dtype=None, adjust_chunks=None, new_axes=None, align_arrays=True, concatenate=None, meta=None, **kwargs, ): """Tensor operation: Generalized inner and outer products A broad class of blocked algorithms and patterns can be specified with a concise multi-index notation. The ``blockwise`` function applies an in-memory function across multiple blocks of multiple inputs in a variety of ways. Many dask.array operations are special cases of blockwise including elementwise, broadcasting, reductions, tensordot, and transpose. Parameters ---------- func : callable Function to apply to individual tuples of blocks out_ind : iterable Block pattern of the output, something like 'ijk' or (1, 2, 3) *args : sequence of Array, index pairs Sequence like (x, 'ij', y, 'jk', z, 'i') **kwargs : dict Extra keyword arguments to pass to function dtype : np.dtype Datatype of resulting array. concatenate : bool, keyword only If true concatenate arrays along dummy indices, else provide lists adjust_chunks : dict Dictionary mapping index to function to be applied to chunk sizes new_axes : dict, keyword only New indexes and their dimension lengths align_arrays: bool Whether or not to align chunks along equally sized dimensions when multiple arrays are provided. This allows for larger chunks in some arrays to be broken into smaller ones that match chunk sizes in other arrays such that they are compatible for block function mapping. If this is false, then an error will be thrown if arrays do not already have the same number of blocks in each dimension. Examples -------- 2D embarrassingly parallel operation from two arrays, x, and y. >>> import operator, numpy as np, dask.array as da >>> x = da.from_array([[1, 2], ... [3, 4]], chunks=(1, 2)) >>> y = da.from_array([[10, 20], ... [0, 0]]) >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8') >>> z.compute() array([[11, 22], [ 3, 4]]) Outer product multiplying a by b, two 1-d vectors >>> a = da.from_array([0, 1, 2], chunks=1) >>> b = da.from_array([10, 50, 100], chunks=1) >>> z = blockwise(np.outer, 'ij', a, 'i', b, 'j', dtype='f8') >>> z.compute() array([[ 0, 0, 0], [ 10, 50, 100], [ 20, 100, 200]]) z = x.T >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype) >>> z.compute() array([[1, 3], [2, 4]]) The transpose case above is illustrative because it does transposition both on each in-memory block by calling ``np.transpose`` and on the order of the blocks themselves, by switching the order of the index ``ij -> ji``. We can compose these same patterns with more variables and more complex in-memory functions z = X + Y.T >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8') >>> z.compute() array([[11, 2], [23, 4]]) Any index, like ``i`` missing from the output index is interpreted as a contraction (note that this differs from Einstein convention; repeated indices do not imply contraction.) In the case of a contraction the passed function should expect an iterable of blocks on any array that holds that index. To receive arrays concatenated along contracted dimensions instead pass ``concatenate=True``. Inner product multiplying a by b, two 1-d vectors >>> def sequence_dot(a_blocks, b_blocks): ... result = 0 ... for a, b in zip(a_blocks, b_blocks): ... result += a.dot(b) ... return result >>> z = blockwise(sequence_dot, '', a, 'i', b, 'i', dtype='f8') >>> z.compute() 250 Add new single-chunk dimensions with the ``new_axes=`` keyword, including the length of the new dimension. 
New dimensions will always be in a single chunk. >>> def f(a): ... return a[:, None] * np.ones((1, 5)) >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': 5}, dtype=a.dtype) New dimensions can also be multi-chunk by specifying a tuple of chunk sizes. This has limited utility as is (because the chunks are all the same), but the resulting graph can be modified to achieve more useful results (see ``da.map_blocks``). >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype) >>> z.chunks ((1, 1, 1), (5, 5)) If the applied function changes the size of each chunk you can specify this with a ``adjust_chunks={...}`` dictionary holding a function for each index that modifies the dimension size in that index. >>> def double(x): ... return np.concatenate([x, x]) >>> y = blockwise(double, 'ij', x, 'ij', ... adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype) >>> y.chunks ((2, 2), (2,)) Include literals by indexing with None >>> z = blockwise(operator.add, 'ij', x, 'ij', 1234, None, dtype=x.dtype) >>> z.compute() array([[1235, 1236], [1237, 1238]]) """ out = name new_axes = new_axes or {} # Input Validation if len(set(out_ind)) != len(out_ind): raise ValueError( "Repeated elements not allowed in output index", [k for k, v in toolz.frequencies(out_ind).items() if v > 1], ) new = (set(out_ind) - {a for arg in args[1::2] if arg is not None for a in arg} - set(new_axes or ())) if new: raise ValueError("Unknown dimension", new) from dask.array.core import normalize_arg, unify_chunks if align_arrays: chunkss, arrays = unify_chunks(*args) else: arginds = [(a, i) for (a, i) in toolz.partition(2, args) if i is not None] chunkss = {} # For each dimension, use the input chunking that has the most blocks; # this will ensure that broadcasting works as expected, and in # particular the number of blocks should be correct if the inputs are # consistent. 
for arg, ind in arginds: for c, i in zip(arg.chunks, ind): if i not in chunkss or len(c) > len(chunkss[i]): chunkss[i] = c arrays = args[::2] for k, v in new_axes.items(): if not isinstance(v, tuple): v = (v, ) chunkss[k] = v arginds = zip(arrays, args[1::2]) numblocks = {} dependencies = [] arrays = [] # Normalize arguments argindsstr = [] for arg, ind in arginds: if ind is None: arg = normalize_arg(arg) arg, collections = unpack_collections(arg) dependencies.extend(collections) else: if (hasattr(arg, "ndim") and hasattr(ind, "__len__") and arg.ndim != len(ind)): raise ValueError( "Index string %s does not match array dimension %d" % (ind, arg.ndim)) numblocks[arg.name] = arg.numblocks arrays.append(arg) arg = arg.name argindsstr.extend((arg, ind)) # Normalize keyword arguments kwargs2 = {} for k, v in kwargs.items(): v = normalize_arg(v) v, collections = unpack_collections(v) dependencies.extend(collections) kwargs2[k] = v # Finish up the name if not out: out = "{}-{}".format( token or utils.funcname(func).strip("_"), base.tokenize(func, out_ind, argindsstr, dtype, **kwargs), ) graph = core_blockwise( func, out, out_ind, *argindsstr, numblocks=numblocks, dependencies=dependencies, new_axes=new_axes, concatenate=concatenate, **kwargs2, ) graph = HighLevelGraph.from_collections(out, graph, dependencies=arrays + dependencies) chunks = [chunkss[i] for i in out_ind] if adjust_chunks: for i, ind in enumerate(out_ind): if ind in adjust_chunks: if callable(adjust_chunks[ind]): chunks[i] = tuple(map(adjust_chunks[ind], chunks[i])) elif isinstance(adjust_chunks[ind], numbers.Integral): chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i]) elif isinstance(adjust_chunks[ind], (tuple, list)): if len(adjust_chunks[ind]) != len(chunks[i]): raise ValueError( f"Dimension {i} has {len(chunks[i])} blocks, adjust_chunks " f"specified with {len(adjust_chunks[ind])} blocks") chunks[i] = tuple(adjust_chunks[ind]) else: raise NotImplementedError( "adjust_chunks values must be callable, int, or tuple") chunks = tuple(chunks) if meta is None: from dask.array.utils import compute_meta meta = compute_meta(func, dtype, *args[::2], **kwargs) return new_da_object(graph, out, chunks, meta=meta, dtype=dtype)
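# A small, hedged check of what the blockwise wrapper above produces: the
# returned collection carries a HighLevelGraph whose top layer is, in recent
# dask versions, a Blockwise layer (the thing optimize_blockwise fuses). The
# literal-with-None-index form is taken from the docstring above.
import operator
import dask.array as da
from dask.blockwise import Blockwise

x = da.ones((4, 4), chunks=(2, 2))
z = da.blockwise(operator.add, "ij", x, "ij", 10, None, dtype=x.dtype)

print(type(z.dask).__name__)   # HighLevelGraph
print(any(isinstance(layer, Blockwise) for layer in z.dask.layers.values()))  # True
print(z.compute()[0, 0])       # 11.0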
def rearrange_by_column_tasks(df, column, max_branch=32, npartitions=None, ignore_index=False):
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle. It contains most of the tricky logic
    around the complex network of tasks. Typically before this function is
    called a new column, ``"_partitions"``, has been added to the dataframe,
    containing the output partition number of every row. This function
    produces a new dataframe where every row is in the proper partition. It
    accomplishes this by splitting each input partition into several pieces,
    and then concatenating pieces from different input partitions into output
    partitions. If there are enough partitions then it does this work in
    stages to avoid scheduling overhead.

    Let's explain the motivation for this further. Imagine that we have 1000
    input partitions and 1000 output partitions. In theory we could split each
    input into 1000 pieces, move the 1 000 000 resulting pieces around, and
    then concatenate them all into 1000 output groups. This would be fine, but
    the central scheduling overhead of 1 000 000 tasks would become a
    bottleneck. Instead we do this in stages: we split each of the 1000 inputs
    into 30 pieces (we now have 30 000 pieces), move those around, concatenate
    back down to 1000, and then do the same process again. This has the same
    result as the full transfer, but although the data now moves twice
    (expensive), it only requires about 60 000 tasks (cheap).

    Note that the `column` input may correspond to a list of columns (rather
    than just a single column name). In this case, the `shuffle_group` and
    `shuffle_group_2` functions will use hashing to map each row to an output
    partition. This approach may require the same rows to be hashed multiple
    times, but avoids the need to assign a new "_partitions" column.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
    column: str or list
        A column name on which we want to split, commonly ``"_partitions"``,
        which is assigned by functions upstream. This could also be a list of
        columns (in which case shuffle_group will create a hash array/column).
    max_branch: int
        The maximum number of splits per input partition. Defaults to 32.
        If there are more partitions than this then the shuffling will occur
        in stages in order to avoid creating npartitions**2 tasks.
        Increasing this number increases scheduling overhead but decreases the
        number of full-dataset transfers that we have to make.
    npartitions: Optional[int]
        The desired number of output partitions

    Returns
    -------
    df3: dask.dataframe.DataFrame

    See also
    --------
    rearrange_by_column_disk: same operation, but uses partd
    rearrange_by_column: parent function that calls this or rearrange_by_column_disk
    shuffle_group: does the actual splitting per-partition
    """
    max_branch = max_branch or 32

    if (npartitions or df.npartitions) <= max_branch:
        # We are creating a small number of output partitions.
        # No need for staged shuffling. Staged shuffling will
        # sometimes require extra work/communication in this case.
        token = tokenize(df, column, npartitions)
        shuffle_name = f"simple-shuffle-{token}"
        npartitions = npartitions or df.npartitions
        shuffle_layer = SimpleShuffleLayer(
            shuffle_name,
            column,
            npartitions,
            df.npartitions,
            ignore_index,
            df._name,
            df._meta,
        )
        graph = HighLevelGraph.from_collections(
            shuffle_name, shuffle_layer, dependencies=[df]
        )
        return new_dd_object(graph, shuffle_name, df._meta, [None] * (npartitions + 1))

    n = df.npartitions
    stages = int(math.ceil(math.log(n) / math.log(max_branch)))
    if stages > 1:
        k = int(math.ceil(n ** (1 / stages)))
    else:
        k = n

    inputs = [tuple(digit(i, j, k) for j in range(stages)) for i in range(k ** stages)]

    npartitions_orig = df.npartitions
    token = tokenize(df, stages, column, n, k)
    for stage in range(stages):
        stage_name = f"shuffle-{stage}-{token}"
        stage_layer = ShuffleLayer(
            stage_name,
            column,
            inputs,
            stage,
            npartitions,
            n,
            k,
            ignore_index,
            df._name,
            df._meta,
        )
        graph = HighLevelGraph.from_collections(
            stage_name, stage_layer, dependencies=[df]
        )
        df = new_dd_object(graph, stage_name, df._meta, df.divisions)

    if npartitions is not None and npartitions != npartitions_orig:
        token = tokenize(df, npartitions)
        repartition_group_token = "repartition-group-" + token

        dsk = {
            (repartition_group_token, i): (
                shuffle_group_2,
                k,
                column,
                ignore_index,
                npartitions,
            )
            for i, k in enumerate(df.__dask_keys__())
        }

        repartition_get_name = "repartition-get-" + token

        for p in range(npartitions):
            dsk[(repartition_get_name, p)] = (
                shuffle_group_get,
                (repartition_group_token, p % npartitions_orig),
                p,
            )

        graph2 = HighLevelGraph.from_collections(
            repartition_get_name, dsk, dependencies=[df]
        )
        df2 = new_dd_object(
            graph2, repartition_get_name, df._meta, [None] * (npartitions + 1)
        )
    else:
        df2 = df
        df2.divisions = (None,) * (npartitions_orig + 1)

    return df2
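# A minimal sketch of the staged-shuffle arithmetic described in the docstring
# above: with n input partitions and a branching factor of max_branch, the
# shuffle runs in ceil(log(n) / log(max_branch)) stages and splits every
# partition into k pieces per stage. staged_shuffle_plan is a made-up helper
# name; the formulas are copied from the function above.
import math

def staged_shuffle_plan(n, max_branch=32):
    stages = int(math.ceil(math.log(n) / math.log(max_branch)))
    k = int(math.ceil(n ** (1 / stages))) if stages > 1 else n
    return stages, k

print(staged_shuffle_plan(1000))   # (2, 32): two stages, 32 splits per input
print(staged_shuffle_plan(40))     # (2, 7)
# Note: when n <= max_branch, the function above never reaches this math;
# it takes the SimpleShuffleLayer branch instead.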
def _approximate_quantile(df, q): """Approximate quantiles of DataFrame or Series. [NOTE: Same logic as dask.dataframe Series quantile] """ # current implementation needs q to be sorted so # sort if array-like, otherwise leave it alone q_ndarray = np.array(q) if q_ndarray.ndim > 0: q_ndarray.sort(kind="mergesort") q = q_ndarray # Lets assume we are dealing with a DataFrame throughout if isinstance(df, (Series, Index)): df = df.to_frame() assert isinstance(df, DataFrame) final_type = df._meta._constructor # Create metadata meta = df._meta_nonempty.quantiles(q=q) # Define final action (create df with quantiles as index) def finalize_tsk(tsk): return (final_type, tsk) return_type = df.__class__ # pandas/cudf uses quantile in [0, 1] # numpy / cupy uses [0, 100] qs = np.asarray(q) token = tokenize(df, qs) if len(qs) == 0: name = "quantiles-" + token empty_index = gd.Index([], dtype=float) return Series( { (name, 0): final_type( {col: [] for col in df.columns}, name=df.name, index=empty_index, ) }, name, df._meta, [None, None], ) else: new_divisions = [np.min(q), np.max(q)] name = "quantiles-1-" + token val_dsk = {(name, i): (_quantile, key, qs) for i, key in enumerate(df.__dask_keys__())} name2 = "quantiles-2-" + token merge_dsk = { (name2, 0): finalize_tsk( (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk))) } dsk = toolz.merge(val_dsk, merge_dsk) graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) df = return_type(graph, name2, meta, new_divisions) def set_quantile_index(df): df.index = q return df df = df.map_partitions(set_quantile_index, meta=meta) return df
def _apply_offset(self, df: dd.DataFrame, offset: int, end: int) -> dd.DataFrame:
    """
    Limit the dataframe to the window [offset, end].
    That is unfortunately not so simple, as we do not know how many rows
    each partition holds. We therefore have no other choice than to
    calculate (!!!) the sizes of each partition (which means the dataframe
    is already computed here).
    After that, we can create a new dataframe from the old one by
    calculating, for each partition, whether and how much of it should be used.
    We do this by generating our own dask computation graph, because we need
    to pass the partition number to the selection function, which is not
    possible with a plain "map_partitions".
    """
    # As we need to calculate the partition size, we better persist
    # the df. I think...
    # TODO: check if this is the best thing to do
    df = df.persist()

    # First, we need to find out which partitions we want to use.
    # Therefore we count the total number of entries.
    partition_borders = df.map_partitions(lambda x: len(x)).compute()
    partition_borders = partition_borders.cumsum().to_dict()

    # Now we let each of the partitions figure out how much it needs to return,
    # using these partition borders.
    # For this, we generate our own dask computation graph, as the operation
    # does not really fit any of the already present methods.
    # (a) we define a method to be calculated on each partition.
    # This method returns the part of the partition which falls between [offset, end].
    def select_from_to(df, partition_index):
        this_partition_border_left = (
            partition_borders[partition_index - 1] if partition_index > 0 else 0
        )
        this_partition_border_right = partition_borders[partition_index]

        if (end and end < this_partition_border_left) or (
            offset and offset >= this_partition_border_right
        ):
            return df.iloc[0:0]

        from_index = max(offset - this_partition_border_left, 0) if offset else 0
        to_index = (
            min(end, this_partition_border_right)
            if end
            else this_partition_border_right
        ) - this_partition_border_left

        return df.iloc[from_index:to_index]

    # (b) Then we define a task graph. It should calculate the function above on
    # each of the partitions of df (specified by (df._name, i) for each partition i).
    # As an argument, we pass the partition_index.
    dask_graph_name = df._name + "-limit"
    dask_graph_dict = {}

    for partition_index in range(df.npartitions):
        dask_graph_dict[(dask_graph_name, partition_index)] = (
            select_from_to,
            (df._name, partition_index),
            partition_index,
        )

    # We replace df with our new graph
    graph = HighLevelGraph.from_collections(
        dask_graph_name, dask_graph_dict, dependencies=[df]
    )
    return new_dd_object(graph, dask_graph_name, df._meta, df.divisions)
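# A minimal sketch (plain integers, no dataframes) of the offset/end
# bookkeeping used in _apply_offset above: given cumulative partition borders,
# work out which slice of each partition survives LIMIT/OFFSET.
# partition_lengths, offset and end are made-up example values.
import itertools

partition_lengths = [4, 4, 4]                              # three partitions
borders = list(itertools.accumulate(partition_lengths))    # [4, 8, 12]
offset, end = 5, 10                                        # keep rows 5..9

for i, right in enumerate(borders):
    left = borders[i - 1] if i > 0 else 0
    if end <= left or offset >= right:
        print(i, "empty")                  # partition not needed at all
        continue
    from_index = max(offset - left, 0)
    to_index = min(end, right) - left
    print(i, slice(from_index, to_index))  # 0 empty, 1 slice(1, 4), 2 slice(0, 2)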
def map_blocks( func: Callable[..., T_DSorDA], obj: Union[DataArray, Dataset], args: Sequence[Any] = (), kwargs: Mapping[str, Any] = None, template: Union[DataArray, Dataset] = None, ) -> T_DSorDA: """Apply a function to each block of a DataArray or Dataset. .. warning:: This function is experimental and its signature may change. Parameters ---------- func : callable User-provided function that accepts a DataArray or Dataset as its first parameter ``obj``. The function will receive a subset or 'block' of ``obj`` (see below), corresponding to one chunk along each chunked dimension. ``func`` will be executed as ``func(subset_obj, *subset_args, **kwargs)``. This function must return either a single DataArray or a single Dataset. This function cannot add a new chunked dimension. obj : DataArray, Dataset Passed to the function as its first argument, one block at a time. args : sequence Passed to func after unpacking and subsetting any xarray objects by blocks. xarray objects in args must be aligned with obj, otherwise an error is raised. kwargs : mapping Passed verbatim to func after unpacking. xarray objects, if any, will not be subset to blocks. Passing dask collections in kwargs is not allowed. template : DataArray or Dataset, optional xarray object representing the final result after compute is called. If not provided, the function will be first run on mocked-up data, that looks like ``obj`` but has sizes 0, to determine properties of the returned object such as dtype, variable names, attributes, new dimensions and new indexes (if any). ``template`` must be provided if the function changes the size of existing dimensions. When provided, ``attrs`` on variables in `template` are copied over to the result. Any ``attrs`` set by ``func`` will be ignored. Returns ------- A single DataArray or Dataset with dask backend, reassembled from the outputs of the function. Notes ----- This function is designed for when ``func`` needs to manipulate a whole xarray object subset to each block. Each block is loaded into memory. In the more common case where ``func`` can work on numpy arrays, it is recommended to use ``apply_ufunc``. If none of the variables in ``obj`` is backed by dask arrays, calling this function is equivalent to calling ``func(obj, *args, **kwargs)``. See Also -------- dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks xarray.DataArray.map_blocks Examples -------- Calculate an anomaly from climatology using ``.groupby()``. Using ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``, its indices, and its methods like ``.groupby()``. >>> def calculate_anomaly(da, groupby_type="time.month"): ... gb = da.groupby(groupby_type) ... clim = gb.mean(dim="time") ... return gb - clim ... >>> time = xr.cftime_range("1990-01", "1992-01", freq="M") >>> month = xr.DataArray(time.month, coords={"time": time}, dims=["time"]) >>> np.random.seed(123) >>> array = xr.DataArray( ... np.random.rand(len(time)), ... dims=["time"], ... coords={"time": time, "month": month}, ... ).chunk() >>> array.map_blocks(calculate_anomaly, template=array).compute() <xarray.DataArray (time: 24)> array([ 0.12894847, 0.11323072, -0.0855964 , -0.09334032, 0.26848862, 0.12382735, 0.22460641, 0.07650108, -0.07673453, -0.22865714, -0.19063865, 0.0590131 , -0.12894847, -0.11323072, 0.0855964 , 0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108, 0.07673453, 0.22865714, 0.19063865, -0.0590131 ]) Coordinates: * time (time) object 1990-01-31 00:00:00 ... 
1991-12-31 00:00:00 month (time) int64 1 2 3 4 5 6 7 8 9 10 11 12 1 2 3 4 5 6 7 8 9 10 11 12 Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments to the function being applied in ``xr.map_blocks()``: >>> array.map_blocks( ... calculate_anomaly, ... kwargs={"groupby_type": "time.year"}, ... template=array, ... ) # doctest: +ELLIPSIS <xarray.DataArray (time: 24)> dask.array<<this-array>-calculate_anomaly, shape=(24,), dtype=float64, chunksize=(24,), chunktype=numpy.ndarray> Coordinates: * time (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00 month (time) int64 dask.array<chunksize=(24,), meta=np.ndarray> """ def _wrapper( func: Callable, args: List, kwargs: dict, arg_is_array: Iterable[bool], expected: dict, ): """ Wrapper function that receives datasets in args; converts to dataarrays when necessary; passes these to the user function `func` and checks returned objects for expected shapes/sizes/etc. """ converted_args = [ dataset_to_dataarray(arg) if is_array else arg for is_array, arg in zip(arg_is_array, args) ] result = func(*converted_args, **kwargs) # check all dims are present missing_dimensions = set(expected["shapes"]) - set(result.sizes) if missing_dimensions: raise ValueError( f"Dimensions {missing_dimensions} missing on returned object." ) # check that index lengths and values are as expected for name, index in result.xindexes.items(): if name in expected["shapes"]: if len(index) != expected["shapes"][name]: raise ValueError( f"Received dimension {name!r} of length {len(index)}. Expected length {expected['shapes'][name]}." ) if name in expected["indexes"]: expected_index = expected["indexes"][name] if not index.equals(expected_index): raise ValueError( f"Expected index {name!r} to be {expected_index!r}. Received {index!r} instead." ) # check that all expected variables were returned check_result_variables(result, expected, "coords") if isinstance(result, Dataset): check_result_variables(result, expected, "data_vars") return make_dict(result) if template is not None and not isinstance(template, (DataArray, Dataset)): raise TypeError( f"template must be a DataArray or Dataset. Received {type(template).__name__} instead." ) if not isinstance(args, Sequence): raise TypeError("args must be a sequence (for example, a list or tuple).") if kwargs is None: kwargs = {} elif not isinstance(kwargs, Mapping): raise TypeError("kwargs must be a mapping (for example, a dict)") for value in kwargs.values(): if dask.is_dask_collection(value): raise TypeError( "Cannot pass dask collections in kwargs yet. Please compute or " "load values before passing to map_blocks." ) if not dask.is_dask_collection(obj): return func(obj, *args, **kwargs) all_args = [obj] + list(args) is_xarray = [isinstance(arg, (Dataset, DataArray)) for arg in all_args] is_array = [isinstance(arg, DataArray) for arg in all_args] # there should be a better way to group this. partition? xarray_indices, xarray_objs = unzip( (index, arg) for index, arg in enumerate(all_args) if is_xarray[index] ) others = [ (index, arg) for index, arg in enumerate(all_args) if not is_xarray[index] ] # all xarray objects must be aligned. This is consistent with apply_ufunc. 
aligned = align(*xarray_objs, join="exact") xarray_objs = tuple( dataarray_to_dataset(arg) if is_da else arg for is_da, arg in zip(is_array, aligned) ) _, npargs = unzip( sorted(list(zip(xarray_indices, xarray_objs)) + others, key=lambda x: x[0]) ) # check that chunk sizes are compatible input_chunks = dict(npargs[0].chunks) input_indexes = dict(npargs[0].xindexes) for arg in xarray_objs[1:]: assert_chunks_compatible(npargs[0], arg) input_chunks.update(arg.chunks) input_indexes.update(arg.xindexes) if template is None: # infer template by providing zero-shaped arrays template = infer_template(func, aligned[0], *args, **kwargs) template_indexes = set(template.xindexes) preserved_indexes = template_indexes & set(input_indexes) new_indexes = template_indexes - set(input_indexes) indexes = {dim: input_indexes[dim] for dim in preserved_indexes} indexes.update({k: template.xindexes[k] for k in new_indexes}) output_chunks = { dim: input_chunks[dim] for dim in template.dims if dim in input_chunks } else: # template xarray object has been provided with proper sizes and chunk shapes indexes = dict(template.xindexes) if isinstance(template, DataArray): output_chunks = dict( zip(template.dims, template.chunks) # type: ignore[arg-type] ) else: output_chunks = dict(template.chunks) for dim in output_chunks: if dim in input_chunks and len(input_chunks[dim]) != len(output_chunks[dim]): raise ValueError( "map_blocks requires that one block of the input maps to one block of output. " f"Expected number of output chunks along dimension {dim!r} to be {len(input_chunks[dim])}. " f"Received {len(output_chunks[dim])} instead. Please provide template if not provided, or " "fix the provided template." ) if isinstance(template, DataArray): result_is_array = True template_name = template.name template = template._to_temp_dataset() elif isinstance(template, Dataset): result_is_array = False else: raise TypeError( f"func output must be DataArray or Dataset; got {type(template)}" ) # We're building a new HighLevelGraph hlg. We'll have one new layer # for each variable in the dataset, which is the result of the # func applied to the values. graph: Dict[Any, Any] = {} new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict) gname = "{}-{}".format( dask.utils.funcname(func), dask.base.tokenize(npargs[0], args, kwargs) ) # map dims to list of chunk indexes ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()} # mapping from chunk index to slice bounds input_chunk_bounds = { dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items() } output_chunk_bounds = { dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in output_chunks.items() } def subset_dataset_to_block( graph: dict, gname: str, dataset: Dataset, input_chunk_bounds, chunk_index ): """ Creates a task that subsets an xarray dataset to a block determined by chunk_index. Block extents are determined by input_chunk_bounds. Also subtasks that subset the constituent variables of a dataset. """ # this will become [[name1, variable1], # [name2, variable2], # ...] 
# which is passed to dict and then to Dataset data_vars = [] coords = [] chunk_tuple = tuple(chunk_index.values()) for name, variable in dataset.variables.items(): # make a task that creates tuple of (dims, chunk) if dask.is_dask_collection(variable.data): # recursively index into dask_keys nested list to get chunk chunk = variable.__dask_keys__() for dim in variable.dims: chunk = chunk[chunk_index[dim]] chunk_variable_task = (f"{name}-{gname}-{chunk[0]}",) + chunk_tuple graph[chunk_variable_task] = ( tuple, [variable.dims, chunk, variable.attrs], ) else: # non-dask array possibly with dimensions chunked on other variables # index into variable appropriately subsetter = { dim: _get_chunk_slicer(dim, chunk_index, input_chunk_bounds) for dim in variable.dims } subset = variable.isel(subsetter) chunk_variable_task = ( f"{name}-{gname}-{dask.base.tokenize(subset)}", ) + chunk_tuple graph[chunk_variable_task] = ( tuple, [subset.dims, subset, subset.attrs], ) # this task creates dict mapping variable name to above tuple if name in dataset._coord_names: coords.append([name, chunk_variable_task]) else: data_vars.append([name, chunk_variable_task]) return (Dataset, (dict, data_vars), (dict, coords), dataset.attrs) # iterate over all possible chunk combinations for chunk_tuple in itertools.product(*ichunk.values()): # mapping from dimension name to chunk index chunk_index = dict(zip(ichunk.keys(), chunk_tuple)) blocked_args = [ subset_dataset_to_block(graph, gname, arg, input_chunk_bounds, chunk_index) if isxr else arg for isxr, arg in zip(is_xarray, npargs) ] # expected["shapes", "coords", "data_vars", "indexes"] are used to # raise nice error messages in _wrapper expected = {} # input chunk 0 along a dimension maps to output chunk 0 along the same dimension # even if length of dimension is changed by the applied function expected["shapes"] = { k: output_chunks[k][v] for k, v in chunk_index.items() if k in output_chunks } expected["data_vars"] = set(template.data_vars.keys()) # type: ignore[assignment] expected["coords"] = set(template.coords.keys()) # type: ignore[assignment] # TODO: benbovy - flexible indexes: clean this up # for now assumes pandas index (thus can be indexed) but it won't be the case for # all indexes expected_indexes = {} for dim in indexes: idx = indexes[dim].to_pandas_index()[ _get_chunk_slicer(dim, chunk_index, output_chunk_bounds) ] expected_indexes[dim] = PandasIndex(idx) expected["indexes"] = expected_indexes from_wrapper = (gname,) + chunk_tuple graph[from_wrapper] = (_wrapper, func, blocked_args, kwargs, is_array, expected) # mapping from variable name to dask graph key var_key_map: Dict[Hashable, str] = {} for name, variable in template.variables.items(): if name in indexes: continue gname_l = f"{name}-{gname}" var_key_map[name] = gname_l key: Tuple[Any, ...] = (gname_l,) for dim in variable.dims: if dim in chunk_index: key += (chunk_index[dim],) else: # unchunked dimensions in the input have one chunk in the result # output can have new dimensions with exactly one chunk key += (0,) # We're adding multiple new layers to the graph: # The first new layer is the result of the computation on # the array. # Then we add one layer per variable, which extracts the # result for that variable, and depends on just the first new # layer. 
new_layers[gname_l][key] = (operator.getitem, from_wrapper, name) hlg = HighLevelGraph.from_collections( gname, graph, dependencies=[arg for arg in npargs if dask.is_dask_collection(arg)], ) # This adds in the getitems for each variable in the dataset. hlg = HighLevelGraph( {**hlg.layers, **new_layers}, dependencies={ **hlg.dependencies, **{name: {gname} for name in new_layers.keys()}, }, ) result = Dataset(coords=indexes, attrs=template.attrs) for index in result.xindexes: result[index].attrs = template[index].attrs result[index].encoding = template[index].encoding for name, gname_l in var_key_map.items(): dims = template[name].dims var_chunks = [] for dim in dims: if dim in output_chunks: var_chunks.append(output_chunks[dim]) elif dim in indexes: var_chunks.append((len(indexes[dim]),)) elif dim in template.dims: # new unindexed dimension var_chunks.append((template.sizes[dim],)) data = dask.array.Array( hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype ) result[name] = (dims, data, template[name].attrs) result[name].encoding = template[name].encoding result = result.set_coords(template._coord_names) if result_is_array: da = dataset_to_dataarray(result) da.name = template_name return da # type: ignore[return-value] return result # type: ignore[return-value]
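# A minimal, hedged sketch of the two-step graph construction used above:
# first wrap a hand-built layer with from_collections, then stack an extra
# getitem layer on top by rebuilding the HighLevelGraph with an explicit
# dependency on the first layer. The layer names ("pair", "pick-a") are
# invented for illustration.
import operator
import dask
from dask.highlevelgraph import HighLevelGraph

base_name = "pair"
dsk = {(base_name, 0): (dict, [["a", 1], ["b", 2]])}
hlg = HighLevelGraph.from_collections(base_name, dsk, dependencies=[])

pick_name = "pick-a"
pick_layer = {(pick_name, 0): (operator.getitem, (base_name, 0), "a")}

hlg = HighLevelGraph(
    {**hlg.layers, pick_name: pick_layer},
    dependencies={**hlg.dependencies, pick_name: {base_name}},
)

print(dask.get(hlg, (pick_name, 0)))   # 1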
def test_dicts_deprecated():
    a = {"x": 1, "y": (inc, "x")}
    hg = HighLevelGraph({"a": a}, {"a": set()})
    with pytest.warns(FutureWarning, match="HighLevelGraph.layers"):
        assert hg.dicts == hg.layers
def map_blocks( func: Callable[..., T_DSorDA], obj: Union[DataArray, Dataset], args: Sequence[Any] = (), kwargs: Mapping[str, Any] = None, ) -> T_DSorDA: """Apply a function to each chunk of a DataArray or Dataset. This function is experimental and its signature may change. Parameters ---------- func: callable User-provided function that accepts a DataArray or Dataset as its first parameter. The function will receive a subset of 'obj' (see below), corresponding to one chunk along each chunked dimension. ``func`` will be executed as ``func(obj_subset, *args, **kwargs)``. The function will be first run on mocked-up data, that looks like 'obj' but has sizes 0, to determine properties of the returned object such as dtype, variable names, new dimensions and new indexes (if any). This function must return either a single DataArray or a single Dataset. This function cannot change size of existing dimensions, or add new chunked dimensions. obj: DataArray, Dataset Passed to the function as its first argument, one dask chunk at a time. args: Sequence Passed verbatim to func after unpacking, after the sliced obj. xarray objects, if any, will not be split by chunks. Passing dask collections is not allowed. kwargs: Mapping Passed verbatim to func after unpacking. xarray objects, if any, will not be split by chunks. Passing dask collections is not allowed. Returns ------- A single DataArray or Dataset with dask backend, reassembled from the outputs of the function. Notes ----- This function is designed for when one needs to manipulate a whole xarray object within each chunk. In the more common case where one can work on numpy arrays, it is recommended to use apply_ufunc. If none of the variables in obj is backed by dask, calling this function is equivalent to calling ``func(obj, *args, **kwargs)``. See Also -------- dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks, xarray.DataArray.map_blocks Examples -------- Calculate an anomaly from climatology using ``.groupby()``. Using ``xr.map_blocks()`` allows for parallel operations with knowledge of ``xarray``, its indices, and its methods like ``.groupby()``. >>> def calculate_anomaly(da, groupby_type="time.month"): ... # Necessary workaround to xarray's check with zero dimensions ... # https://github.com/pydata/xarray/issues/3575 ... if sum(da.shape) == 0: ... return da ... gb = da.groupby(groupby_type) ... clim = gb.mean(dim="time") ... return gb - clim >>> time = xr.cftime_range("1990-01", "1992-01", freq="M") >>> np.random.seed(123) >>> array = xr.DataArray( ... np.random.rand(len(time)), dims="time", coords=[time] ... ).chunk() >>> xr.map_blocks(calculate_anomaly, array).compute() <xarray.DataArray (time: 24)> array([ 0.12894847, 0.11323072, -0.0855964 , -0.09334032, 0.26848862, 0.12382735, 0.22460641, 0.07650108, -0.07673453, -0.22865714, -0.19063865, 0.0590131 , -0.12894847, -0.11323072, 0.0855964 , 0.09334032, -0.26848862, -0.12382735, -0.22460641, -0.07650108, 0.07673453, 0.22865714, 0.19063865, -0.0590131 ]) Coordinates: * time (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00 Note that one must explicitly use ``args=[]`` and ``kwargs={}`` to pass arguments to the function being applied in ``xr.map_blocks()``: >>> xr.map_blocks( ... calculate_anomaly, array, kwargs={"groupby_type": "time.year"}, ... 
) <xarray.DataArray (time: 24)> array([ 0.15361741, -0.25671244, -0.31600032, 0.008463 , 0.1766172 , -0.11974531, 0.43791243, 0.14197797, -0.06191987, -0.15073425, -0.19967375, 0.18619794, -0.05100474, -0.42989909, -0.09153273, 0.24841842, -0.30708526, -0.31412523, 0.04197439, 0.0422506 , 0.14482397, 0.35985481, 0.23487834, 0.12144652]) Coordinates: * time (time) object 1990-01-31 00:00:00 ... 1991-12-31 00:00:00 """ def _wrapper(func, obj, to_array, args, kwargs): if to_array: obj = dataset_to_dataarray(obj) result = func(obj, *args, **kwargs) for name, index in result.indexes.items(): if name in obj.indexes: if len(index) != len(obj.indexes[name]): raise ValueError( "Length of the %r dimension has changed. This is not allowed." % name ) return make_dict(result) if not isinstance(args, Sequence): raise TypeError("args must be a sequence (for example, a list or tuple).") if kwargs is None: kwargs = {} elif not isinstance(kwargs, Mapping): raise TypeError("kwargs must be a mapping (for example, a dict)") for value in list(args) + list(kwargs.values()): if dask.is_dask_collection(value): raise TypeError( "Cannot pass dask collections in args or kwargs yet. Please compute or " "load values before passing to map_blocks." ) if not dask.is_dask_collection(obj): return func(obj, *args, **kwargs) if isinstance(obj, DataArray): # only using _to_temp_dataset would break # func = lambda x: x.to_dataset() # since that relies on preserving name. if obj.name is None: dataset = obj._to_temp_dataset() else: dataset = obj.to_dataset() input_is_array = True else: dataset = obj input_is_array = False input_chunks = dataset.chunks template: Union[DataArray, Dataset] = infer_template(func, obj, *args, **kwargs) if isinstance(template, DataArray): result_is_array = True template_name = template.name template = template._to_temp_dataset() elif isinstance(template, Dataset): result_is_array = False else: raise TypeError( f"func output must be DataArray or Dataset; got {type(template)}" ) template_indexes = set(template.indexes) dataset_indexes = set(dataset.indexes) preserved_indexes = template_indexes & dataset_indexes new_indexes = template_indexes - dataset_indexes indexes = {dim: dataset.indexes[dim] for dim in preserved_indexes} indexes.update({k: template.indexes[k] for k in new_indexes}) # We're building a new HighLevelGraph hlg. We'll have one new layer # for each variable in the dataset, which is the result of the # func applied to the values. graph: Dict[Any, Any] = {} new_layers: DefaultDict[str, Dict[Any, Any]] = collections.defaultdict(dict) gname = "{}-{}".format( dask.utils.funcname(func), dask.base.tokenize(dataset, args, kwargs) ) # map dims to list of chunk indexes ichunk = {dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items()} # mapping from chunk index to slice bounds chunk_index_bounds = { dim: np.cumsum((0,) + chunks_v) for dim, chunks_v in input_chunks.items() } # iterate over all possible chunk combinations for v in itertools.product(*ichunk.values()): chunk_index_dict = dict(zip(dataset.dims, v)) # this will become [[name1, variable1], # [name2, variable2], # ...] 
# which is passed to dict and then to Dataset data_vars = [] coords = [] for name, variable in dataset.variables.items(): # make a task that creates tuple of (dims, chunk) if dask.is_dask_collection(variable.data): # recursively index into dask_keys nested list to get chunk chunk = variable.__dask_keys__() for dim in variable.dims: chunk = chunk[chunk_index_dict[dim]] chunk_variable_task = (f"{gname}-{chunk[0]}",) + v graph[chunk_variable_task] = ( tuple, [variable.dims, chunk, variable.attrs], ) else: # non-dask array with possibly chunked dimensions # index into variable appropriately subsetter = {} for dim in variable.dims: if dim in chunk_index_dict: which_chunk = chunk_index_dict[dim] subsetter[dim] = slice( chunk_index_bounds[dim][which_chunk], chunk_index_bounds[dim][which_chunk + 1], ) subset = variable.isel(subsetter) chunk_variable_task = ( "{}-{}".format(gname, dask.base.tokenize(subset)), ) + v graph[chunk_variable_task] = ( tuple, [subset.dims, subset, subset.attrs], ) # this task creates dict mapping variable name to above tuple if name in dataset._coord_names: coords.append([name, chunk_variable_task]) else: data_vars.append([name, chunk_variable_task]) from_wrapper = (gname,) + v graph[from_wrapper] = ( _wrapper, func, (Dataset, (dict, data_vars), (dict, coords), dataset.attrs), input_is_array, args, kwargs, ) # mapping from variable name to dask graph key var_key_map: Dict[Hashable, str] = {} for name, variable in template.variables.items(): if name in indexes: continue gname_l = f"{gname}-{name}" var_key_map[name] = gname_l key: Tuple[Any, ...] = (gname_l,) for dim in variable.dims: if dim in chunk_index_dict: key += (chunk_index_dict[dim],) else: # unchunked dimensions in the input have one chunk in the result key += (0,) # We're adding multiple new layers to the graph: # The first new layer is the result of the computation on # the array. # Then we add one layer per variable, which extracts the # result for that variable, and depends on just the first new # layer. new_layers[gname_l][key] = (operator.getitem, from_wrapper, name) hlg = HighLevelGraph.from_collections(gname, graph, dependencies=[dataset]) for gname_l, layer in new_layers.items(): # This adds in the getitems for each variable in the dataset. hlg.dependencies[gname_l] = {gname} hlg.layers[gname_l] = layer result = Dataset(coords=indexes, attrs=template.attrs) for name, gname_l in var_key_map.items(): dims = template[name].dims var_chunks = [] for dim in dims: if dim in input_chunks: var_chunks.append(input_chunks[dim]) elif dim in indexes: var_chunks.append((len(indexes[dim]),)) elif dim in template.dims: # new unindexed dimension var_chunks.append((template.sizes[dim],)) data = dask.array.Array( hlg, name=gname_l, chunks=var_chunks, dtype=template[name].dtype ) result[name] = (dims, data, template[name].attrs) result = result.set_coords(template._coord_names) if result_is_array: da = dataset_to_dataarray(result) da.name = template_name return da # type: ignore return result # type: ignore
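# A minimal, hedged sketch of the final reassembly step that both map_blocks
# implementations end with: a dask.array.Array is built directly from a
# HighLevelGraph, a layer name and explicit chunks. The key/chunk layout below
# is invented for illustration.
import numpy as np
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

name = "assembled-example"
dsk = {
    (name, 0): (np.arange, 0, 3),
    (name, 1): (np.arange, 3, 6),
}
hlg = HighLevelGraph.from_collections(name, dsk, dependencies=[])

arr = da.Array(hlg, name, chunks=((3, 3),), dtype=np.int_)
print(arr.compute())   # [0 1 2 3 4 5]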
def _wrap(self, funcname, *args, size=None, chunks="auto", extra_chunks=(), **kwargs): """Wrap numpy random function to produce dask.array random function extra_chunks should be a chunks tuple to append to the end of chunks """ if size is not None and not isinstance(size, (tuple, list)): size = (size, ) shapes = list({ ar.shape for ar in chain(args, kwargs.values()) if isinstance(ar, (Array, np.ndarray)) }) if size is not None: shapes.append(size) # broadcast to the final size(shape) size = broadcast_shapes(*shapes) chunks = normalize_chunks( chunks, size, # ideally would use dtype here dtype=kwargs.get("dtype", np.float64), ) slices = slices_from_chunks(chunks) def _broadcast_any(ar, shape, chunks): if isinstance(ar, Array): return broadcast_to(ar, shape).rechunk(chunks) if isinstance(ar, np.ndarray): return np.ascontiguousarray(np.broadcast_to(ar, shape)) # Broadcast all arguments, get tiny versions as well # Start adding the relevant bits to the graph dsk = {} lookup = {} small_args = [] dependencies = [] for i, ar in enumerate(args): if isinstance(ar, (np.ndarray, Array)): res = _broadcast_any(ar, size, chunks) if isinstance(res, Array): dependencies.append(res) lookup[i] = res.name elif isinstance(res, np.ndarray): name = f"array-{tokenize(res)}" lookup[i] = name dsk[name] = res small_args.append(ar[tuple(0 for _ in ar.shape)]) else: small_args.append(ar) small_kwargs = {} for key, ar in kwargs.items(): if isinstance(ar, (np.ndarray, Array)): res = _broadcast_any(ar, size, chunks) if isinstance(res, Array): dependencies.append(res) lookup[key] = res.name elif isinstance(res, np.ndarray): name = f"array-{tokenize(res)}" lookup[key] = name dsk[name] = res small_kwargs[key] = ar[tuple(0 for _ in ar.shape)] else: small_kwargs[key] = ar sizes = list(product(*chunks)) seeds = random_state_data(len(sizes), self._numpy_state) token = tokenize(seeds, size, chunks, args, kwargs) name = f"{funcname}-{token}" keys = product([name], *([range(len(bd)) for bd in chunks] + [[0]] * len(extra_chunks))) blocks = product(*[range(len(bd)) for bd in chunks]) vals = [] for seed, size, slc, block in zip(seeds, sizes, slices, blocks): arg = [] for i, ar in enumerate(args): if i not in lookup: arg.append(ar) else: if isinstance(ar, Array): arg.append((lookup[i], ) + block) else: # np.ndarray arg.append((getitem, lookup[i], slc)) kwrg = {} for k, ar in kwargs.items(): if k not in lookup: kwrg[k] = ar else: if isinstance(ar, Array): kwrg[k] = (lookup[k], ) + block else: # np.ndarray kwrg[k] = (getitem, lookup[k], slc) vals.append((_apply_random, self._RandomState, funcname, seed, size, arg, kwrg)) meta = _apply_random( self._RandomState, funcname, seed, (0, ) * len(size), small_args, small_kwargs, ) dsk.update(dict(zip(keys, vals))) graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies) return Array(graph, name, chunks + extra_chunks, meta=meta)
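# A small, hedged usage example of the wrapped random API built above: each
# output block gets its own task and its own pre-generated seed, so the
# collection's graph is a HighLevelGraph with one key per block.
import dask.array as da

state = da.random.RandomState(42)
x = state.normal(10, 0.1, size=(6, 6), chunks=(3, 3))

print(x.numblocks)                          # (2, 2): one task (and one seed) per block
print(type(x.dask).__name__)                # HighLevelGraph
print(round(float(x.mean().compute()), 1))  # roughly 10.0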
def test_keyset_deprecated():
    a = {"x": 1, "y": (inc, "x")}
    hg = HighLevelGraph({"a": a}, {"a": set()})
    with pytest.warns(FutureWarning, match="HighLevelGraph.keys"):
        assert hg.keyset() == hg.keys()
def choice(self, a, size=None, replace=True, p=None, chunks="auto"): dependencies = [] # Normalize and validate `a` if isinstance(a, Integral): # On windows the output dtype differs if p is provided or # absent, see https://github.com/numpy/numpy/issues/9867 dummy_p = np.array([1]) if p is not None else p dtype = np.random.choice(1, size=(), p=dummy_p).dtype len_a = a if a < 0: raise ValueError("a must be greater than 0") else: a = asarray(a) a = a.rechunk(a.shape) dtype = a.dtype if a.ndim != 1: raise ValueError("a must be one dimensional") len_a = len(a) dependencies.append(a) a = a.__dask_keys__()[0] # Normalize and validate `p` if p is not None: if not isinstance(p, Array): # If p is not a dask array, first check the sum is close # to 1 before converting. p = np.asarray(p) if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0): raise ValueError("probabilities do not sum to 1") p = asarray(p) else: p = p.rechunk(p.shape) if p.ndim != 1: raise ValueError("p must be one dimensional") if len(p) != len_a: raise ValueError("a and p must have the same size") dependencies.append(p) p = p.__dask_keys__()[0] if size is None: size = () elif not isinstance(size, (tuple, list)): size = (size, ) chunks = normalize_chunks(chunks, size, dtype=np.float64) if not replace and len(chunks[0]) > 1: err_msg = ("replace=False is not currently supported for " "dask.array.choice with multi-chunk output " "arrays") raise NotImplementedError(err_msg) sizes = list(product(*chunks)) state_data = random_state_data(len(sizes), self._numpy_state) name = "da.random.choice-%s" % tokenize(state_data, size, chunks, a, replace, p) keys = product([name], *(range(len(bd)) for bd in chunks)) dsk = { k: (_choice, state, a, size, replace, p) for k, state, size in zip(keys, state_data, sizes) } graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies) return Array(graph, name, chunks, dtype=dtype)
def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None, **kwargs): """ Fit scikit learn model against dask arrays Model must support the ``partial_fit`` interface for online or batch learning. Ideally your rows are independent and identically distributed. By default, this function will step through chunks of the arrays in random order. Parameters ---------- model: sklearn model Any model supporting partial_fit interface x: dask Array Two dimensional array, likely tall and skinny y: dask Array One dimensional array with same chunks as x's rows compute : bool Whether to compute this result shuffle_blocks : bool Whether to shuffle the blocks with ``random_state`` or not random_state : int or numpy.random.RandomState Random state to use when shuffling blocks kwargs: options to pass to partial_fit Examples -------- >>> import dask.array as da >>> X = da.random.random((10, 3), chunks=(5, 3)) >>> y = da.random.randint(0, 2, 10, chunks=(5,)) >>> from sklearn.linear_model import SGDClassifier >>> sgd = SGDClassifier() >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0]) >>> sgd # doctest: +SKIP SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False) This passes all of X and y through the classifier sequentially. We can use the classifier as normal on in-memory data >>> import numpy as np >>> sgd.predict(np.random.random((4, 3))) # doctest: +SKIP array([1, 0, 0, 1]) Or predict on a larger dataset >>> z = da.random.random((400, 3), chunks=(100, 3)) >>> da.learn.predict(sgd, z) # doctest: +SKIP dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64> """ if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"): x = x.to_dask_array() assert x.ndim == 2 if y is not None: if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"): y = y.to_dask_array() assert y.ndim == 1 assert x.chunks[0] == y.chunks[0] assert hasattr(model, "partial_fit") if len(x.chunks[1]) > 1: x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1]))) nblocks = len(x.chunks[0]) order = list(range(nblocks)) if shuffle_blocks: rng = sklearn.utils.check_random_state(random_state) rng.shuffle(order) name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order) dsk = {(name, -1): model} dsk.update({(name, i): ( _partial_fit, (name, i - 1), (x.name, order[i], 0), (getattr(y, "name", ""), order[i]), kwargs, ) for i in range(nblocks)}) graphs = {x.name: x.__dask_graph__(), name: dsk} if hasattr(y, "__dask_graph__"): graphs[y.name] = y.__dask_graph__() try: from dask.highlevelgraph import HighLevelGraph new_dsk = HighLevelGraph.merge(*graphs.values()) except ImportError: from dask import sharedict new_dsk = sharedict.merge(*graphs.values()) value = Delayed((name, nblocks - 1), new_dsk) if compute: return value.compute() else: return value
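# A minimal, hedged sketch of the sequential-chain pattern used in fit()
# above, with a toy "model" (a running total) instead of a scikit-learn
# estimator. The names toy_partial_fit and "toy-fit" are invented; the
# graph/Delayed wiring mirrors the code above.
import dask.array as da
from dask.delayed import Delayed
from dask.highlevelgraph import HighLevelGraph

x = da.arange(8, chunks=2)          # 4 blocks

def toy_partial_fit(model, block):  # stands in for sklearn's partial_fit
    return model + int(block.sum())

name = "toy-fit"
dsk = {(name, -1): 0}               # the initial "model"
for i in range(x.numblocks[0]):
    dsk[(name, i)] = (toy_partial_fit, (name, i - 1), (x.name, i))

graph = HighLevelGraph.merge(x.__dask_graph__(), dsk)
result = Delayed((name, x.numblocks[0] - 1), graph)
print(result.compute())             # 0 + 1 + ... + 7 == 28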
def dask_reproject( src: da.Array, src_geobox: GeoBox, dst_geobox: GeoBox, resampling: str = "nearest", chunks: Optional[Tuple[int, int]] = None, src_nodata: Optional[NodataType] = None, dst_nodata: Optional[NodataType] = None, axis: int = 0, name: str = "reproject", ) -> da.Array: """ Reproject to GeoBox as dask operation :param src : Input src[(time,) y,x (, band)] :param src_geobox: GeoBox of the source array :param dst_geobox: GeoBox of the destination :param resampling: Resampling strategy as a string: nearest, bilinear, average, mode ... :param chunks : In Y,X dimensions only, default is to use same input chunk size :param axis : Index of Y axis (default is 0) :param src_nodata: nodata marker for source image :param dst_nodata: nodata marker for dst image :param name : Dask graph name, "reproject" is the default """ if chunks is None: chunks = src.chunksize[axis:axis + 2] if dst_nodata is None: dst_nodata = src_nodata assert src.shape[axis:axis + 2] == src_geobox.shape yx_shape = dst_geobox.shape yx_chunks = unpack_chunks(chunks, yx_shape) dst_chunks = src.chunks[:axis] + yx_chunks + src.chunks[axis + 2:] dst_shape = src.shape[:axis] + yx_shape + src.shape[axis + 2:] # tuple(*dims1, y, x, *dims2) -- complete shape in blocks dims1 = tuple(map(len, dst_chunks[:axis])) dims2 = tuple(map(len, dst_chunks[axis + 2:])) assert dims2 == () deps = [src] tile_shape = (yx_chunks[0][0], yx_chunks[1][0]) gbt = GeoboxTiles(dst_geobox, tile_shape) xy_chunks_with_data = list(gbt.tiles(src_geobox.extent)) name = randomize(name) dsk: Dict[Any, Any] = {} block_impl = (_reproject_block_bool_impl if src.dtype == "bool" else _reproject_block_impl) for idx in xy_chunks_with_data: _dst_geobox = gbt[idx] rr = compute_reproject_roi(src_geobox, _dst_geobox) _src = crop_2d_dense(src, rr.roi_src, axis=axis) _src_geobox = src_geobox[rr.roi_src] deps.append(_src) for ii1 in np.ndindex(dims1): # TODO: band dims dsk[(name, *ii1, *idx)] = ( block_impl, (_src.name, *ii1, 0, 0), _src_geobox, _dst_geobox, resampling, src_nodata, dst_nodata, axis, ) fill_value = 0 if dst_nodata is None else dst_nodata shape_in_blocks = tuple(map(len, dst_chunks)) mk_empty = empty_maker(fill_value, src.dtype, dsk) for idx in np.ndindex(shape_in_blocks): # TODO: other dims k = (name, *idx) if k not in dsk: bshape = tuple(ch[i] for ch, i in zip(dst_chunks, idx)) dsk[k] = mk_empty(bshape) dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps) return da.Array(dsk, name, chunks=dst_chunks, dtype=src.dtype, shape=dst_shape)
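# A minimal, hedged sketch of the "fill the gaps" pattern used above: only
# some output blocks get real tasks, and every remaining (name, i, j) key is
# back-filled with a constant block so the assembled array is dense. The
# names and shapes here are invented for illustration.
import numpy as np
import dask.array as da
from dask.highlevelgraph import HighLevelGraph

name = "sparse-blocks"
chunks = ((2, 2), (2, 2))                       # 2 x 2 grid of 2x2 blocks
dsk = {(name, 0, 0): (np.full, (2, 2), 7.0)}    # only one block has real data

for idx in np.ndindex(tuple(map(len, chunks))):
    key = (name, *idx)
    if key not in dsk:
        dsk[key] = (np.zeros, (2, 2))           # nodata filler, like mk_empty above

graph = HighLevelGraph.from_collections(name, dsk, dependencies=[])
arr = da.Array(graph, name, chunks=chunks, dtype=float)
print(arr.compute().sum())   # 4 * 7.0 == 28.0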
def convert(self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context") -> DataContainer: # Joining is a bit more complicated, so lets do it in steps: # 1. We now have two inputs (from left and right), so we fetch them both dc_lhs, dc_rhs = self.assert_inputs(rel, 2, context) cc_lhs = dc_lhs.column_container cc_rhs = dc_rhs.column_container # 2. dask's merge will do some smart things with columns, which have the same name # on lhs an rhs (which also includes reordering). # However, that will confuse our column numbering in SQL. # So we make our life easier by converting the column names into unique names # We will convert back in the end cc_lhs_renamed = cc_lhs.make_unique("lhs") cc_rhs_renamed = cc_rhs.make_unique("rhs") dc_lhs_renamed = DataContainer(dc_lhs.df, cc_lhs_renamed) dc_rhs_renamed = DataContainer(dc_rhs.df, cc_rhs_renamed) df_lhs_renamed = dc_lhs_renamed.assign() df_rhs_renamed = dc_rhs_renamed.assign() join_type = rel.getJoinType() join_type = self.JOIN_TYPE_MAPPING[str(join_type)] # 3. The join condition can have two forms, that we can understand # (a) a = b # (b) X AND Y AND a = b AND Z ... (can also be multiple a = b) # The first case is very simple and we do not need any additional filter # In the second case we do a merge on all the a = b, # and then apply a filter using the other expressions. # In all other cases, we need to do a full table cross join and filter afterwards. # As this is probably non-sense for large tables, but there is no other # known solution so far. join_condition = rel.getCondition() lhs_on, rhs_on, filter_condition = self._split_join_condition( join_condition) logger.debug( f"Joining with type {join_type} on columns {lhs_on}, {rhs_on}.") # lhs_on and rhs_on are the indices of the columns to merge on. # The given column indices are for the full, merged table which consists # of lhs and rhs put side-by-side (in this order) # We therefore need to normalize the rhs indices relative to the rhs table. rhs_on = [index - len(df_lhs_renamed.columns) for index in rhs_on] # 4. dask can only merge on the same column names. # We therefore create new columns on purpose, which have a distinct name. assert len(lhs_on) == len(rhs_on) if lhs_on: # 5. Now we can finally merge on these columns # The resulting dataframe will contain all (renamed) columns from the lhs and rhs # plus the added columns df = self._join_on_columns( df_lhs_renamed, df_rhs_renamed, lhs_on, rhs_on, join_type, ) else: # 5. We are in the complex join case # where we have no column to merge on # This means we have no other chance than to merge # everything with everything... # TODO: we should implement a shortcut # for filter conditions that are always false def merge_single_partitions(lhs_partition, rhs_partition): # Do a cross join with the two partitions # TODO: it would be nice to apply the filter already here # problem: this would mean we need to ship the rex to the # workers (as this is executed on the workers), # which is definitely not possible (java dependency, JVM start...) 
lhs_partition = lhs_partition.assign(common=1) rhs_partition = rhs_partition.assign(common=1) return lhs_partition.merge(rhs_partition, on="common").drop(columns="common") # Iterate nested over all partitions from lhs and rhs and merge them name = "cross-join-" + tokenize(df_lhs_renamed, df_rhs_renamed) dsk = {(name, i * df_rhs_renamed.npartitions + j): ( merge_single_partitions, (df_lhs_renamed._name, i), (df_rhs_renamed._name, j), ) for i in range(df_lhs_renamed.npartitions) for j in range(df_rhs_renamed.npartitions)} graph = HighLevelGraph.from_collections( name, dsk, dependencies=[df_lhs_renamed, df_rhs_renamed]) meta = dd.dispatch.concat( [df_lhs_renamed._meta_nonempty, df_rhs_renamed._meta_nonempty], axis=1) # TODO: Do we know the divisions in any way here? divisions = [None] * (len(dsk) + 1) df = dd.DataFrame(graph, name, meta=meta, divisions=divisions) warnings.warn( "Need to do a cross-join, which is typically very resource heavy", ResourceWarning, ) # 6. So the next step is to make sure # we have the correct column order (and to remove the temporary join columns) correct_column_order = list(df_lhs_renamed.columns) + list( df_rhs_renamed.columns) cc = ColumnContainer(df.columns).limit_to(correct_column_order) # and to rename them like the rel specifies row_type = rel.getRowType() field_specifications = [str(f) for f in row_type.getFieldNames()] cc = cc.rename({ from_col: to_col for from_col, to_col in zip(cc.columns, field_specifications) }) cc = self.fix_column_to_row_type(cc, row_type) dc = DataContainer(df, cc) # 7. Last but not least we apply any filters by and-chaining together the filters if filter_condition: # This line is a bit of code duplication with RexCallPlugin - but I guess it is worth to keep it separate filter_condition = reduce( operator.and_, [ RexConverter.convert(rex, dc, context=context) for rex in filter_condition ], ) logger.debug(f"Additionally applying filter {filter_condition}") df = filter_or_scalar(df, filter_condition) dc = DataContainer(df, cc) dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) return dc
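# The cross-join fallback above wires every (lhs, rhs) partition pair into the graph by
# hand. A minimal sketch of the same idea using only public dask.dataframe APIs (constant
# join key, then drop it); this is an illustration, not dask_sql's internal code path.
import pandas as pd
import dask.dataframe as dd

lhs = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
rhs = dd.from_pandas(pd.DataFrame({"b": ["x", "y"]}), npartitions=1)
cross = (lhs.assign(common=1)
            .merge(rhs.assign(common=1), on="common")
            .drop(columns="common"))
assert len(cross.compute()) == 6  # 3 lhs rows x 2 rhs rows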
def test_highlevelgraph_dicts_deprecation(): with pytest.warns(FutureWarning): layers = {"a": BasicLayer({"x": 1, "y": (inc, "x")})} hg = HighLevelGraph(layers, {"a": set()}) assert hg.dicts == layers
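# A small sketch of the non-deprecated spelling exercised by the test above:
# HighLevelGraph.layers is the supported accessor that replaces the .dicts alias.
# The layer name used here is illustrative only.
from dask.highlevelgraph import HighLevelGraph

hg = HighLevelGraph({"a": {"x": 1, "y": 2}}, {"a": set()})
assert set(hg.layers) == {"a"}        # .layers instead of .dicts
assert dict(hg) == {"x": 1, "y": 2}   # HighLevelGraph is still a Mapping of tasks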
def groupby_agg( ddf, gb_cols, aggs_in, split_every=None, split_out=None, dropna=True, sep="___", sort=False, as_index=True, ): """ Optimized groupby aggregation for Dask-CuDF. This aggregation algorithm only supports the following options: {"count", "mean", "std", "var", "sum", "min", "max"} This "optimized" approach is more performant than the algorithm in `dask.dataframe`, because it allows the cudf backend to perform multiple aggregations at once. """ # Deal with default split_out and split_every params if split_every is False: split_every = ddf.npartitions split_every = split_every or 8 split_out = split_out or 1 # Standardize `gb_cols` and `columns` lists aggs = aggs_in.copy() if isinstance(gb_cols, str): gb_cols = [gb_cols] columns = [c for c in ddf.columns if c not in gb_cols] str_cols_out = False if isinstance(aggs, dict): # Use `str_cols_out` to specify if the output columns # will have str (rather than MultiIndex/tuple) names. # This happens when all values in the `aggs` dict are # strings (no lists) str_cols_out = True for col in aggs: if isinstance(aggs[col], str): aggs[col] = [aggs[col]] else: str_cols_out = False if col in gb_cols: columns.append(col) # Assert that aggregations are supported _supported = {"count", "mean", "std", "var", "sum", "min", "max"} if not _is_supported(aggs, _supported): raise ValueError( f"Supported aggs include {_supported} for groupby_agg API. " f"Aggregations must be specified with dict or list syntax.") # Always convert aggs to dict for consistency if isinstance(aggs, list): aggs = {col: aggs for col in columns} # Begin graph construction dsk = {} token = tokenize(ddf, gb_cols, aggs) partition_agg_name = "groupby_partition_agg-" + token tree_reduce_name = "groupby_tree_reduce-" + token gb_agg_name = "groupby_agg-" + token for p in range(ddf.npartitions): # Perform groupby aggregation on each partition. # Split each result into `split_out` chunks (by hashing `gb_cols`) dsk[(partition_agg_name, p)] = ( _groupby_partition_agg, (ddf._name, p), gb_cols, aggs, columns, split_out, dropna, sort, sep, ) # Pick out each chunk using `getitem` for s in range(split_out): dsk[(tree_reduce_name, p, s, 0)] = ( getitem, (partition_agg_name, p), s, ) # Build reduction tree parts = ddf.npartitions widths = [parts] while parts > 1: parts = math.ceil(parts / split_every) widths.append(parts) height = len(widths) for s in range(split_out): for depth in range(1, height): for group in range(widths[depth]): p_max = widths[depth - 1] lstart = split_every * group lstop = min(lstart + split_every, p_max) node_list = [(tree_reduce_name, p, s, depth - 1) for p in range(lstart, lstop)] dsk[(tree_reduce_name, group, s, depth)] = ( _tree_node_agg, node_list, gb_cols, split_out, dropna, sort, sep, ) # Final output partitions. _aggs = aggs.copy() if str_cols_out: # Metadata should use `str` for dict values if that is # what the user originally specified (column names will # be str, rather than tuples). for col in aggs: _aggs[col] = _aggs[col][0] _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) for s in range(split_out): dsk[(gb_agg_name, s)] = ( _finalize_gb_agg, (tree_reduce_name, 0, s, height - 1), gb_cols, aggs, columns, _meta.columns, as_index, sort, sep, str_cols_out, ) divisions = [None] * (split_out + 1) graph = HighLevelGraph.from_collections(gb_agg_name, dsk, dependencies=[ddf]) return new_dd_object(graph, gb_agg_name, _meta, divisions)
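# Sketch of the tree-reduction bookkeeping used by groupby_agg above: widths[d] is the
# number of _tree_node_agg tasks at reduction depth d when `parts` partitions are
# combined `split_every` at a time (pure python, no dask required).
import math

def tree_widths(parts, split_every=8):
    widths = [parts]
    while parts > 1:
        parts = math.ceil(parts / split_every)
        widths.append(parts)
    return widths

assert tree_widths(100) == [100, 13, 2, 1]  # three reduction levels
assert tree_widths(6) == [6, 1]             # one reduction level
assert tree_widths(1) == [1]                # nothing to reduce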
def warp(self, dem=None, proj="EPSG:4326", **kwargs): """Delayed warp across an entire AOI or Image Creates a new dask image by deferring calls to the warp_geometry on chunks Args: dem (ndarray): optional. A DEM for warping to specific elevation planes proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612") Returns: daskarray: a warped image as deferred image array """ try: img_md = self.rda.metadata["image"] x_size = img_md["tileXSize"] y_size = img_md["tileYSize"] except (AttributeError, KeyError): x_size = kwargs.get("chunk_size", 256) y_size = kwargs.get("chunk_size", 256) # Create an affine transform to convert between real-world and pixels if self.proj is None: from_proj = "EPSG:4326" else: from_proj = self.proj try: # NOTE: this only works on images that have rda rpcs metadata center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid g = box(*center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds) tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj)) gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5) current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds except (AttributeError, KeyError, TypeError): tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj)) gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5) current_bounds = self.bounds tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj)) itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj)) output_bounds = ops.transform(tfm, box(*current_bounds)).bounds gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd) ll = ~gtf * (output_bounds[:2]) ur = ~gtf * (output_bounds[2:]) x_chunks = int((ur[0] - ll[0]) / x_size) + 1 y_chunks = int((ll[1] - ur[1]) / y_size) + 1 num_bands = self.shape[0] try: dtype = img_md["dataType"] except: dtype = 'uint8' daskmeta = { "dask": {}, "chunks": (num_bands, y_size, x_size), "dtype": dtype, "name": "warp-{}".format(self.name), "shape": (num_bands, y_chunks * y_size, x_chunks * x_size) } def px_to_geom(xmin, ymin): xmax = int(xmin + x_size) ymax = int(ymin + y_size) bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin))) return box(*bounds) full_bounds = box(*output_bounds) dasks = [] if isinstance(dem, GeoDaskImage): if dem.proj != proj: dem = dem.warp(proj=proj, dem=dem) dasks.append(dem.dask) for y in range(y_chunks): for x in range(x_chunks): xmin = x * x_size ymin = y * y_size geometry = px_to_geom(xmin, ymin) daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5) daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks), list(daskmeta["dask"].keys())) gi = mapping(full_bounds) gt = AffineTransform(gtf, proj) image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt) return image[box(*output_bounds)]
def add_row_order_factory(table_proxy, datasets): """ Generate arrays which add the appropriate rows for each array row chunk of a dataset, as well as returning the appropriate row ordering for that chunk Each array chunk (and by implication dataset) is linked by a call to :func:`daskms.writes.add_row_orders` to a previous chunk, either in the same array or the previous array. This establishes an order on how: 1. Rows are added to the table. 2. Column writes are performed. Returns ------- list of :class:`dask.array.Array` row orderings for each dataset """ prev_key = None prev_deps = [] row_add_ops = [] for di, ds in enumerate(datasets): data_vars = ds.data_vars found = False for k, v in data_vars.items(): dims = v.dims array = v.data # Need something with a row dimension if not dims[0] == 'row': continue found = True token = dask.base.tokenize(array) name = '-'.join(('add-rows', str(di), token)) layers = {} for b in range(array.numblocks[0]): key = (name, b) array_key = (array.name, b) + (0, ) * (array.ndim - 1) layers[key] = (add_row_orders, array_key, table_proxy, prev_key) prev_key = key graph = HighLevelGraph.from_collections(name, layers, prev_deps + [array]) chunks = (array.chunks[0], ) row_adds = da.Array(graph, name, chunks, dtype=np.object) row_add_ops.append(row_adds) prev_deps = [row_adds] break if not found: raise ValueError("Couldn't find an array with " "which to establish a row ordering " "in dataset %d" % di) return row_add_ops
def rearrange_by_column_p2p( df: DataFrame, column: str, npartitions: int | None = None, ): from dask.dataframe import DataFrame npartitions = npartitions or df.npartitions token = tokenize(df, column, npartitions) setup = delayed(shuffle_setup, pure=True)(NewShuffleMetadata( ShuffleId(token), df._meta, column, npartitions, )) transferred = df.map_partitions( shuffle_transfer, token, setup, meta=df, enforce_metadata=False, transform_divisions=False, ) barrier_key = "shuffle-barrier-" + token barrier_dsk = { barrier_key: (shuffle_barrier, token, transferred.__dask_keys__()) } barrier = Delayed( barrier_key, HighLevelGraph.from_collections(barrier_key, barrier_dsk, dependencies=[transferred]), ) name = "shuffle-unpack-" + token dsk = {(name, i): (shuffle_unpack, token, i, barrier_key) for i in range(npartitions)} # TODO: update to use blockwise. # Changes task names, so breaks setting worker restrictions at the moment. # Also maybe would be nice if the `DataFrameIOLayer` interface supported this? # dsk = blockwise( # shuffle_unpack, # name, # "i", # token, # None, # BlockwiseDepDict({(i,): i for i in range(npartitions)}), # "i", # barrier_key, # None, # numblocks={}, # ) return DataFrame( HighLevelGraph.from_collections(name, dsk, [barrier]), name, df._meta, [None] * (npartitions + 1), )
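# Generic sketch of the barrier pattern built above: a single barrier task depends on
# every transfer task, and each unpack task depends only on the barrier. The task names
# and callables here are illustrative stand-ins, not the distributed shuffle-service API.
from dask.core import get

nparts = 3
dsk = {("transfer", i): (str, i) for i in range(nparts)}
dsk["barrier"] = (lambda *parts: None,) + tuple(("transfer", i) for i in range(nparts))
for i in range(nparts):
    dsk[("unpack", i)] = (lambda _, i=i: f"part-{i}", "barrier")
assert [get(dsk, ("unpack", i)) for i in range(nparts)] == ["part-0", "part-1", "part-2"]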
def sjoin(left, right, how="inner", predicate="intersects", **kwargs): """ Spatial join of two GeoDataFrames. Parameters ---------- left, right : geopandas or dask_geopandas GeoDataFrames If a geopandas.GeoDataFrame is passed, it is considered as a dask_geopandas.GeoDataFrame with 1 partition (without spatial partitioning information). how : string, default 'inner' The type of join. Currently only 'inner' is supported. predicate : string, default 'intersects' Binary predicate how to match corresponding rows of the left and right GeoDataFrame. Possible values: 'contains', 'contains_properly', 'covered_by', 'covers', 'crosses', 'intersects', 'overlaps', 'touches', 'within'. Returns ------- dask_geopandas.GeoDataFrame Notes ----- If both the left and right GeoDataFrame have spatial partitioning information available (the ``spatial_partitions`` attribute is set), the output partitions are determined based on intersection of the spatial partitions. In all other cases, the output partitions are all combinations (cartesian/cross product) of all input partition of the left and right GeoDataFrame. """ if "op" in kwargs: predicate = kwargs.pop("op") deprecation_message = ( "The `op` parameter is deprecated and will be removed" " in a future release. Please use the `predicate` parameter" " instead." ) warnings.warn(deprecation_message, FutureWarning, stacklevel=2) if how != "inner": raise NotImplementedError("Only how='inner' is supported right now") if isinstance(left, geopandas.GeoDataFrame): left = from_geopandas(left, npartitions=1) if isinstance(right, geopandas.GeoDataFrame): right = from_geopandas(right, npartitions=1) name = "sjoin-" + tokenize(left, right, how, predicate) meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate) if left.spatial_partitions is not None and right.spatial_partitions is not None: # Spatial partitions are known -> use them to trim down the list of # partitions that need to be joined parts = geopandas.sjoin( left.spatial_partitions.to_frame("geometry"), right.spatial_partitions.to_frame("geometry"), how="inner", predicate="intersects", ) parts_left = np.asarray(parts.index) parts_right = np.asarray(parts["index_right"].values) using_spatial_partitions = True else: # Unknown spatial partitions -> full cartesian (cross) product of all # combinations of the partitions of the left and right dataframe n_left = left.npartitions n_right = right.npartitions parts_left = np.repeat(np.arange(n_left), n_right) parts_right = np.tile(np.arange(n_right), n_left) using_spatial_partitions = False dsk = {} new_spatial_partitions = [] for i, (l, r) in enumerate(zip(parts_left, parts_right)): dsk[(name, i)] = ( geopandas.sjoin, (left._name, l), (right._name, r), how, predicate, ) # TODO preserve spatial partitions of the output if only left has spatial # partitions if using_spatial_partitions: lr = left.spatial_partitions.iloc[l] rr = right.spatial_partitions.iloc[r] # extent = lr.intersection(rr).buffer(buffer).intersection(lr.union(rr)) extent = lr.intersection(rr) new_spatial_partitions.append(extent) divisions = [None] * (len(dsk) + 1) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[left, right]) if not using_spatial_partitions: new_spatial_partitions = None return GeoDataFrame(graph, name, meta, divisions, new_spatial_partitions)
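# Sketch of the cartesian fallback above: np.repeat/np.tile enumerate every
# (left, right) partition pair when no spatial partitioning information is available.
import numpy as np

n_left, n_right = 2, 3
parts_left = np.repeat(np.arange(n_left), n_right)   # [0 0 0 1 1 1]
parts_right = np.tile(np.arange(n_right), n_left)    # [0 1 2 0 1 2]
assert list(zip(parts_left, parts_right)) == [
    (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2),
]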
def to_csv(x: xarray.DataArray, path: str, *, nogil: bool = True, **kwargs): """Print DataArray to CSV. When x has numpy backend, this function is functionally equivalent to (but much faster than):: x.to_pandas().to_csv(path_or_buf, **kwargs) When x has dask backend, this function returns a dask delayed object which will write to the disk only when its .compute() method is invoked. Formatting and optional compression are parallelised across all available CPUs, using one dask task per chunk on the first dimension. Chunks on other dimensions will be merged ahead of computation. :param x: :class:`~xarray.DataArray` with one or two dimensions :param str path: Output file path :param bool nogil: If True, use accelerated C implementation. Several kwargs won't be processed correctly (see limitations below). If False, use pandas to_csv method (slow, and does not release the GIL). nogil=True exclusively supports float and integer value dtypes (but the coords can be anything). In case of incompatible dtype, nogil is automatically switched to False. :param kwargs: Passed verbatim to :meth:`pandas.DataFrame.to_csv` or :meth:`pandas.Series.to_csv` **Limitations** - Fancy URIs are not (yet) supported. - compression='zip' is not supported. All other compression methods (gzip, bz2, xz) are supported. - When running with nogil=True, the following parameters are ignored: columns, quoting, quotechar, doublequote, escapechar, chunksize, decimal **Distributed computing** This function supports `dask distributed`_, with the caveat that all workers must write to the same shared mountpoint and that the shared filesystem must strictly guarantee **close-open coherency**, meaning that one must be able to call write() and then close() on a file descriptor from one host and then immediately afterwards open() from another host and see the output from the first host. Note that, for performance reasons, most network filesystems do not enable this feature by default. Alternatively, one may write to local mountpoints and then manually collect and concatenate the partial outputs. 
""" if not isinstance(x, xarray.DataArray): raise ValueError("first argument must be a DataArray") # Health checks if not isinstance(path, str): raise ValueError("path_or_buf must be a file path") if x.ndim not in (1, 2): raise ValueError("cannot convert arrays with %d dimensions into " "pandas objects" % x.ndim) if nogil and x.dtype.kind not in "if": nogil = False # Extract row and columns indices indices = [x.get_index(dim) for dim in x.dims] if x.ndim == 2: index, columns = indices else: index = indices[0] columns = None compression = kwargs.pop("compression", "infer") compress = _compress_func(path, compression) mode = kwargs.pop("mode", "w") if mode not in "wa": raise ValueError('mode: expected w or a; got "%s"' % mode) # Fast exit for numpy backend if not x.chunks: bdata = kernels.to_csv(x.values, index, columns, True, nogil, kwargs) if compress: bdata = compress(bdata) with open(path, mode + "b") as fh: fh.write(bdata) return None # Merge chunks on all dimensions beyond the first x = x.chunk((x.chunks[0], ) + tuple((s, ) for s in x.shape[1:])) # Manually define the dask graph tok = tokenize(x.data, index, columns, compression, path, kwargs) name1 = "to_csv_encode-" + tok name2 = "to_csv_compress-" + tok name3 = "to_csv_write-" + tok name4 = "to_csv-" + tok dsk: dict[str | tuple, tuple] = {} assert x.chunks assert x.chunks[0] offset = 0 for i, size in enumerate(x.chunks[0]): # Slice index index_i = index[offset:offset + size] offset += size x_i = (x.data.name, i) + (0, ) * (x.ndim - 1) # Step 1: convert to CSV and encode to binary blob if i == 0: # First chunk: print header dsk[name1, i] = (kernels.to_csv, x_i, index_i, columns, True, nogil, kwargs) else: kwargs_i = kwargs.copy() kwargs_i["header"] = False dsk[name1, i] = (kernels.to_csv, x_i, index_i, None, False, nogil, kwargs_i) # Step 2 (optional): compress if compress: prevname = name2 dsk[name2, i] = compress, (name1, i) else: prevname = name1 # Step 3: write to file if i == 0: # First chunk: overwrite file if it already exists dsk[name3, i] = kernels.to_file, path, mode + "b", (prevname, i) else: # Next chunks: wait for previous chunk to complete and append dsk[name3, i] = (kernels.to_file, path, "ab", (prevname, i), (name3, i - 1)) # Rename final key dsk[name4] = dsk.pop((name3, i)) hlg = HighLevelGraph.from_collections(name4, dsk, (x, )) return Delayed(name4, hlg)
def warp(self, dem=None, proj="EPSG:4326", **kwargs): """Delayed warp across an entire AOI or Image Creates a new dask image by deferring calls to the warp_geometry on chunks Args: dem (ndarray): optional. A DEM for warping to specific elevation planes proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612") Returns: daskarray: a warped image as deferred image array """ try: img_md = self.rda.metadata["image"] x_size = img_md["tileXSize"] y_size = img_md["tileYSize"] except (AttributeError, KeyError): x_size = kwargs.get("chunk_size", 256) y_size = kwargs.get("chunk_size", 256) # Create an affine transform to convert between real-world and pixels if self.proj is None: from_proj = "EPSG:4326" else: from_proj = self.proj try: # NOTE: this only works on images that have rda rpcs metadata center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds)) tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj)) gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5) current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds except (AttributeError, KeyError, TypeError): tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj)) gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5 ) current_bounds = self.bounds tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj)) itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj)) output_bounds = ops.transform(tfm, box(*current_bounds)).bounds gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd) ll = ~gtf * (output_bounds[:2]) ur = ~gtf * (output_bounds[2:]) x_chunks = int((ur[0] - ll[0]) / x_size) + 1 y_chunks = int((ll[1] - ur[1]) / y_size) + 1 num_bands = self.shape[0] try: dtype = RDA_TO_DTYPE[img_md["dataType"]] except: dtype = 'uint8' daskmeta = { "dask": {}, "chunks": (num_bands, y_size, x_size), "dtype": dtype, "name": "warp-{}".format(self.name), "shape": (num_bands, y_chunks * y_size, x_chunks * x_size) } def px_to_geom(xmin, ymin): xmax = int(xmin + x_size) ymax = int(ymin + y_size) bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin))) return box(*bounds) full_bounds = box(*output_bounds) dasks = [] if isinstance(dem, GeoDaskImage): if dem.proj != proj: dem = dem.warp(proj=proj, dem=dem) dasks.append(dem.dask) for y in xrange(y_chunks): for x in xrange(x_chunks): xmin = x * x_size ymin = y * y_size geometry = px_to_geom(xmin, ymin) daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5) daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks), list(daskmeta["dask"].keys())) gi = mapping(full_bounds) gt = AffineTransform(gtf, proj) image = GeoDaskImage(daskmeta, __geo_interface__ = gi, __geo_transform__ = gt) return image[box(*output_bounds)]
def reduction( args, chunk=None, aggregate=None, combine=None, meta=None, token=None, chunk_kwargs=None, aggregate_kwargs=None, combine_kwargs=None, split_every=None, **kwargs, ): """Generic tree reduction operation. Parameters ---------- args : Positional arguments for the `chunk` function. All `dask.dataframe` objects should be partitioned and indexed equivalently. chunk : function [block-per-arg] -> block Function to operate on each block of data aggregate : function list-of-blocks -> block Function to operate on the list of results of chunk combine : function list-of-blocks -> block, optional Function to operate on intermediate lists of results of chunk in a tree-reduction. If not provided, defaults to aggregate. $META token : str, optional The name to use for the output keys. chunk_kwargs : dict, optional Keywords for the chunk function only. aggregate_kwargs : dict, optional Keywords for the aggregate function only. combine_kwargs : dict, optional Keywords for the combine function only. split_every : int, optional Group partitions into groups of this size while performing a tree-reduction. If set to False, no tree-reduction will be used, and all intermediates will be concatenated and passed to ``aggregate``. Default is 8. kwargs : All remaining keywords will be passed to ``chunk``, ``aggregate``, and ``combine``. """ if chunk_kwargs is None: chunk_kwargs = dict() if aggregate_kwargs is None: aggregate_kwargs = dict() chunk_kwargs.update(kwargs) aggregate_kwargs.update(kwargs) if combine is None: if combine_kwargs: raise ValueError("`combine_kwargs` provided with no `combine`") combine = aggregate combine_kwargs = aggregate_kwargs else: if combine_kwargs is None: combine_kwargs = dict() combine_kwargs.update(kwargs) if not isinstance(args, (tuple, list)): args = [args] npartitions = set( arg.npartitions for arg in args if isinstance(arg, _Frame) ) if len(npartitions) > 1: raise ValueError("All arguments must have same number of partitions") npartitions = npartitions.pop() if split_every is None: split_every = 8 elif split_every is False: split_every = npartitions elif split_every < 2 or not isinstance(split_every, int): raise ValueError("split_every must be an integer >= 2") token_key = tokenize( token or (chunk, aggregate), meta, args, chunk_kwargs, aggregate_kwargs, combine_kwargs, split_every, ) # Chunk a = "{0}-chunk-{1}".format(token or funcname(chunk), token_key) if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: dsk = { (a, 0, i): (chunk, key) for i, key in enumerate(args[0].__dask_keys__()) } else: dsk = { (a, 0, i): ( apply, chunk, [(x._name, i) if isinstance(x, _Frame) else x for x in args], chunk_kwargs, ) for i in range(args[0].npartitions) } # Combine b = "{0}-combine-{1}".format(token or funcname(combine), token_key) k = npartitions depth = 0 while k > split_every: for part_i, inds in enumerate(partition_all(split_every, range(k))): conc = (list, [(a, depth, i) for i in inds]) dsk[(b, depth + 1, part_i)] = ( (apply, combine, [conc], combine_kwargs) if combine_kwargs else (combine, conc) ) k = part_i + 1 a = b depth += 1 # Aggregate b = "{0}-agg-{1}".format(token or funcname(aggregate), token_key) conc = (list, [(a, depth, i) for i in range(k)]) if aggregate_kwargs: dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) else: dsk[(b, 0)] = (aggregate, conc) if meta is None: meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) meta = dd.core.make_meta(meta) graph = 
HighLevelGraph.from_collections(b, dsk, dependencies=args) return dd.core.new_dd_object(graph, b, meta, (None, None))
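# Sketch of how the combine loop in reduction() groups intermediate keys: partition_all
# buckets `split_every` chunk indices per combine task, level by level, until few enough
# results remain for the final aggregate task (pure toolz/python, no dask required).
from toolz import partition_all

split_every, k, depth, levels = 4, 10, 0, []
while k > split_every:
    groups = [list(g) for g in partition_all(split_every, range(k))]
    levels.append(groups)
    k, depth = len(groups), depth + 1

assert levels == [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]]
assert (k, depth) == (3, 1)  # 3 combine outputs feed the single aggregate task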
def reshape_yxbt( xx: xr.Dataset, name: str = "reshape_yxbt", yx_chunks: Union[int, Tuple[int, int]] = -1, ) -> xr.DataArray: """ Reshape Dask-backed ``xr.Dataset[Time,Y,X]`` into ``xr.DataArray[Y,X,Band,Time]``. On the output DataArray there is exactly one chunk along both Time and Band dimensions. :param xx: Dataset with 3 dimensional bands, dimension order (time, y, x) :param name: Dask name of the output operation :param yx_chunks: If supplied subdivide YX chunks of input into smaller sections, note that this can only make yx chunks smaller not bigger. Every output chunk depends on one input chunk only, so output chunks might not be regular, for example if input chunk sizes are 10, and yx_chunks=3, you'll get chunks sized 3,3,3,1,3,3,3,1... (example only, never use chunks that small) .. note: Chunks along first dimension ought to be of size 1 exactly (default for time dimension when using dc.load). """ if isinstance(yx_chunks, int): yx_chunks = (yx_chunks, yx_chunks) if not is_dask_collection(xx): raise ValueError("Currently this code works only on Dask inputs") if not all(dv.data.numblocks[0] == dv.data.shape[0] for dv in xx.data_vars.values()): raise ValueError( "All input bands should have chunk=1 for the first dimension") name0 = name name = randomize(name) blocks, _ = _get_chunks_for_all_bands(xx) b0, *_ = xx.data_vars.values() attrs = dict(b0.attrs) nb = len(xx.data_vars.values()) nt, ny, nx = b0.shape deps = [dv.data for dv in xx.data_vars.values()] shape = (ny, nx, nb, nt) dtype = b0.dtype dims = b0.dims[1:] + ("band", b0.dims[0]) maxy, maxx = yx_chunks ychunks, xchunks = b0.data.chunks[1:3] _yy = list(_split_chunks(ychunks, maxy)) _xx = list(_split_chunks(xchunks, maxx)) ychunks = tuple(roi.stop - roi.start for _, _, roi in _yy) xchunks = tuple(roi.stop - roi.start for _, _, roi in _xx) chunks = [ychunks, xchunks, (nb, ), (nt, )] dsk = {} for iy, iy_src, y_roi in _yy: for ix, ix_src, x_roi in _xx: crop_yx = (y_roi, x_roi) _blocks = blocks[:, :, iy_src, ix_src].tolist() dsk[(name, iy, ix, 0, 0)] = ( functools.partial(_reshape_yxbt_impl, crop_yx=crop_yx), _blocks, ) dsk = HighLevelGraph.from_collections(name, dsk, dependencies=deps) data = da.Array(dsk, name, chunks=chunks, dtype=dtype, shape=shape) coords: Dict[Hashable, Any] = {k: c for k, c in xx.coords.items()} coords["band"] = list(xx.data_vars) return xr.DataArray(data=data, dims=dims, coords=coords, name=name0, attrs=attrs)
def map_blocks( func: Callable[..., T_DSorDA], obj: Union[DataArray, Dataset], args: Sequence[Any] = (), kwargs: Mapping[str, Any] = None, ) -> T_DSorDA: """Apply a function to each chunk of a DataArray or Dataset. This function is experimental and its signature may change. Parameters ---------- func: callable User-provided function that accepts a DataArray or Dataset as its first parameter. The function will receive a subset of 'obj' (see below), corresponding to one chunk along each chunked dimension. ``func`` will be executed as ``func(obj_subset, *args, **kwargs)``. The function will be first run on mocked-up data, that looks like 'obj' but has sizes 0, to determine properties of the returned object such as dtype, variable names, new dimensions and new indexes (if any). This function must return either a single DataArray or a single Dataset. This function cannot change size of existing dimensions, or add new chunked dimensions. obj: DataArray, Dataset Passed to the function as its first argument, one dask chunk at a time. args: Sequence Passed verbatim to func after unpacking, after the sliced obj. xarray objects, if any, will not be split by chunks. Passing dask collections is not allowed. kwargs: Mapping Passed verbatim to func after unpacking. xarray objects, if any, will not be split by chunks. Passing dask collections is not allowed. Returns ------- A single DataArray or Dataset with dask backend, reassembled from the outputs of the function. Notes ----- This function is designed for when one needs to manipulate a whole xarray object within each chunk. In the more common case where one can work on numpy arrays, it is recommended to use apply_ufunc. If none of the variables in obj is backed by dask, calling this function is equivalent to calling ``func(obj, *args, **kwargs)``. See Also -------- dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks, xarray.DataArray.map_blocks """ def _wrapper(func, obj, to_array, args, kwargs): if to_array: obj = dataset_to_dataarray(obj) result = func(obj, *args, **kwargs) for name, index in result.indexes.items(): if name in obj.indexes: if len(index) != len(obj.indexes[name]): raise ValueError( "Length of the %r dimension has changed. This is not allowed." % name) return make_dict(result) if not isinstance(args, Sequence): raise TypeError( "args must be a sequence (for example, a list or tuple).") if kwargs is None: kwargs = {} elif not isinstance(kwargs, Mapping): raise TypeError("kwargs must be a mapping (for example, a dict)") for value in list(args) + list(kwargs.values()): if dask.is_dask_collection(value): raise TypeError( "Cannot pass dask collections in args or kwargs yet. Please compute or " "load values before passing to map_blocks.") if not dask.is_dask_collection(obj): return func(obj, *args, **kwargs) if isinstance(obj, DataArray): # only using _to_temp_dataset would break # func = lambda x: x.to_dataset() # since that relies on preserving name. 
if obj.name is None: dataset = obj._to_temp_dataset() else: dataset = obj.to_dataset() input_is_array = True else: dataset = obj input_is_array = False input_chunks = dataset.chunks template: Union[DataArray, Dataset] = infer_template(func, obj, *args, **kwargs) if isinstance(template, DataArray): result_is_array = True template_name = template.name template = template._to_temp_dataset() elif isinstance(template, Dataset): result_is_array = False else: raise TypeError( f"func output must be DataArray or Dataset; got {type(template)}") template_indexes = set(template.indexes) dataset_indexes = set(dataset.indexes) preserved_indexes = template_indexes & dataset_indexes new_indexes = template_indexes - dataset_indexes indexes = {dim: dataset.indexes[dim] for dim in preserved_indexes} indexes.update({k: template.indexes[k] for k in new_indexes}) graph: Dict[Any, Any] = {} gname = "{}-{}".format(dask.utils.funcname(func), dask.base.tokenize(dataset, args, kwargs)) # map dims to list of chunk indexes ichunk = { dim: range(len(chunks_v)) for dim, chunks_v in input_chunks.items() } # mapping from chunk index to slice bounds chunk_index_bounds = { dim: np.cumsum((0, ) + chunks_v) for dim, chunks_v in input_chunks.items() } # iterate over all possible chunk combinations for v in itertools.product(*ichunk.values()): chunk_index_dict = dict(zip(dataset.dims, v)) # this will become [[name1, variable1], # [name2, variable2], # ...] # which is passed to dict and then to Dataset data_vars = [] coords = [] for name, variable in dataset.variables.items(): # make a task that creates tuple of (dims, chunk) if dask.is_dask_collection(variable.data): # recursively index into dask_keys nested list to get chunk chunk = variable.__dask_keys__() for dim in variable.dims: chunk = chunk[chunk_index_dict[dim]] chunk_variable_task = (f"{gname}-{chunk[0]}", ) + v graph[chunk_variable_task] = ( tuple, [variable.dims, chunk, variable.attrs], ) else: # non-dask array with possibly chunked dimensions # index into variable appropriately subsetter = {} for dim in variable.dims: if dim in chunk_index_dict: which_chunk = chunk_index_dict[dim] subsetter[dim] = slice( chunk_index_bounds[dim][which_chunk], chunk_index_bounds[dim][which_chunk + 1], ) subset = variable.isel(subsetter) chunk_variable_task = ("{}-{}".format( gname, dask.base.tokenize(subset)), ) + v graph[chunk_variable_task] = ( tuple, [subset.dims, subset, subset.attrs], ) # this task creates dict mapping variable name to above tuple if name in dataset._coord_names: coords.append([name, chunk_variable_task]) else: data_vars.append([name, chunk_variable_task]) from_wrapper = (gname, ) + v graph[from_wrapper] = ( _wrapper, func, (Dataset, (dict, data_vars), (dict, coords), dataset.attrs), input_is_array, args, kwargs, ) # mapping from variable name to dask graph key var_key_map: Dict[Hashable, str] = {} for name, variable in template.variables.items(): if name in indexes: continue gname_l = f"{gname}-{name}" var_key_map[name] = gname_l key: Tuple[Any, ...] 
= (gname_l, ) for dim in variable.dims: if dim in chunk_index_dict: key += (chunk_index_dict[dim], ) else: # unchunked dimensions in the input have one chunk in the result key += (0, ) graph[key] = (operator.getitem, from_wrapper, name) graph = HighLevelGraph.from_collections(gname, graph, dependencies=[dataset]) result = Dataset(coords=indexes, attrs=template.attrs) for name, gname_l in var_key_map.items(): dims = template[name].dims var_chunks = [] for dim in dims: if dim in input_chunks: var_chunks.append(input_chunks[dim]) elif dim in indexes: var_chunks.append((len(indexes[dim]), )) data = dask.array.Array(graph, name=gname_l, chunks=var_chunks, dtype=template[name].dtype) result[name] = (dims, data, template[name].attrs) result = result.set_coords(template._coord_names) if result_is_array: da = dataset_to_dataarray(result) da.name = template_name return da # type: ignore return result # type: ignore
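# Minimal usage sketch of the public xarray.map_blocks wrapper defined above (assumes a
# dask-backed DataArray): the callable receives one chunk at a time and the results are
# reassembled into a lazy DataArray of the same shape.
import numpy as np
import xarray as xr

arr = xr.DataArray(np.arange(12.0).reshape(3, 4), dims=("x", "y")).chunk({"x": 1})
doubled = xr.map_blocks(lambda block: block * 2, arr)
np.testing.assert_array_equal(doubled.compute().values, np.arange(12.0).reshape(3, 4) * 2)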
def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: n = x.count(skipna=skipna) avg = x.mean(skipna=skipna) else: # Not skipping nulls, so might as well # avoid the full `count` operation n = len(x) avg = x.sum(skipna=skipna) / n m2 = ((x - avg) ** 2).sum(skipna=skipna) return n, avg, m2 def _aggregate_var(parts): n, avg, m2 = parts[0] for i in range(1, len(parts)): n_a, avg_a, m2_a = n, avg, m2 n_b, avg_b, m2_b = parts[i] n = n_a + n_b avg = (n_a * avg_a + n_b * avg_b) / n delta = avg_b - avg_a m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n return n, avg, m2 def _finalize_var(vals): n, _, m2 = vals return m2 / (n - 1) # Build graph nparts = ddf.npartitions if not split_every: split_every = nparts name = "var-" + tokenize(skipna, split_every, out) local_name = "local-" + name num = ddf._get_numeric_data() dsk = { (local_name, n, 0): (_local_var, (num._name, n), skipna) for n in range(nparts) } # Use reduction tree widths = [nparts] while nparts > 1: nparts = math.ceil(nparts / split_every) widths.append(nparts) height = len(widths) for depth in range(1, height): for group in range(widths[depth]): p_max = widths[depth - 1] lstart = split_every * group lstop = min(lstart + split_every, p_max) node_list = [ (local_name, p, depth - 1) for p in range(lstart, lstop) ] dsk[(local_name, group, depth)] = (_aggregate_var, node_list) if height == 1: group = depth = 0 dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) result = dd.core.new_dd_object(graph, name, meta, (None, None)) if isinstance(ddf, DataFrame): result.divisions = (min(ddf.columns), max(ddf.columns)) return handle_out(out, result)
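# Numeric sanity check of the pairwise merge performed by _aggregate_var above: combining
# per-partition (n, mean, M2) triples reproduces the ddof=1 variance numpy computes on
# the concatenated data (Chan-style update of the second central moment).
import numpy as np

def merge(a, b):
    (n_a, avg_a, m2_a), (n_b, avg_b, m2_b) = a, b
    n = n_a + n_b
    avg = (n_a * avg_a + n_b * avg_b) / n
    delta = avg_b - avg_a
    return n, avg, m2_a + m2_b + delta ** 2 * n_a * n_b / n

parts = [np.array([1.0, 2.0, 4.0]), np.array([8.0, 16.0]), np.array([32.0])]
stats = [(len(p), p.mean(), ((p - p.mean()) ** 2).sum()) for p in parts]
n, _, m2 = merge(merge(stats[0], stats[1]), stats[2])
assert np.isclose(m2 / (n - 1), np.concatenate(parts).var(ddof=1))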
def grid(vis, uvw, flags, weights, frequencies, grid_config, wmin=-1e30, wmax=1e30, streams=None): """ Grids the supplied visibilities in parallel. Note that a grid is created for each visibility chunk. Parameters ---------- vis : :class:`dask.array.Array` visibilities of shape :code:`(row, chan, corr)` uvw : :class:`dask.array.Array` uvw coordinates of shape :code:`(row, 3)` flags : :class:`dask.array.Array` flags of shape :code:`(row, chan, corr)` weights : :class:`dask.array.Array` weights of shape :code:`(row, chan, corr)`. frequencies : :class:`dask.array.Array` frequencies of shape :code:`(chan,)` grid_config : :class:`GridderConfigWrapper` Gridding Configuration wmin : float Minimum W coordinate to grid. Defaults to -1e30. wmax : float Maximum W coordinate to grid. Defaults to 1e30. streams : int, optional Number of parallel gridding operations. Defaults to None, in which case as many grids as visibility chunks will be created. Returns ------- grid : :class:`dask.array.Array` grid of shape :code:`(ny, nx, corr)` """ if len(frequencies.chunks[0]) != 1: raise ValueError("Chunking in channel currently unsupported") # Create a baseline object per row chunk baselines = da.blockwise(_nifty_baselines, ("row", ), uvw, ("row", "uvw"), frequencies, ("chan", ), dtype=object) gc = grid_config.object grids = [] for corr in range(vis.shape[2]): corr_flags = flags[:, :, corr] corr_vis = vis[:, :, corr] corr_weights = weights[:, :, corr] indices = da.blockwise( _nifty_indices, ("row", ), baselines, ("row", ), gc, None, corr_flags, ("row", "chan"), -1, None, # channel begin -1, None, # channel end wmin, None, wmax, None, dtype=np.int32) if streams is None: # Standard parallel reduction, possibly memory hungry # if many threads (and thus grids) are gridding # in parallel grid = da.blockwise(_nifty_grid, ("row", "nu", "nv"), baselines, ("row", ), gc, None, indices, ("row", ), corr_vis, ("row", "chan"), corr_weights, ("row", "chan"), new_axes={ "nu": gc.Nu(), "nv": gc.Nv() }, adjust_chunks={"row": 1}, dtype=np.complex128) grids.append(grid.sum(axis=0)) else: # Stream reduction layers = GridStreamReduction(baselines, indices, gc, corr_vis, corr_weights, corr, streams) deps = [baselines, indices, corr_vis, corr_weights] graph = HighLevelGraph.from_collections(layers.name, layers, deps) chunks = corr_vis.chunks grid_stream_red = da.Array(graph, layers.name, chunks, vis.dtype) layers = FinalGridReduction(layers) deps = [grid_stream_red] graph = HighLevelGraph.from_collections(layers.name, layers, deps) chunks = ((gc.Nu(), ), (gc.Nv(), )) corr_grid = da.Array(graph, layers.name, chunks, vis.dtype) grids.append(corr_grid) return da.stack(grids, axis=2)
def affine_transform(image, matrix, offset=0.0, output_shape=None, order=1, output_chunks=None, **kwargs): """Apply an affine transform using Dask. For every output chunk, only the slice containing the relevant part of the image is processed. Chunkwise processing is performed either using `ndimage.affine_transform` or `cupyx.scipy.ndimage.affine_transform`, depending on the input type. Notes ----- Differences to `ndimage.affine_transformation`: - currently, prefiltering is not supported (affecting the output in case of interpolation `order > 1`) - default order is 1 - modes 'reflect', 'mirror' and 'wrap' are not supported Arguments equal to `ndimage.affine_transformation`, except for `output_chunks`. Parameters ---------- image : array_like (Numpy Array, Cupy Array, Dask Array...) The image array. matrix : array (ndim,), (ndim, ndim), (ndim, ndim+1) or (ndim+1, ndim+1) Transformation matrix. offset : float or sequence, optional The offset into the array where the transform is applied. If a float, `offset` is the same for each axis. If a sequence, `offset` should contain one value for each axis. output_shape : tuple of ints, optional The shape of the array to be returned. order : int, optional The order of the spline interpolation. Note that for order>1 scipy's affine_transform applies prefiltering, which is not yet supported and skipped in this implementation. output_chunks : tuple of ints, optional The shape of the chunks of the output Dask Array. Returns ------- affine_transform : Dask Array A dask array representing the transformed output """ if not type(image) == da.core.Array: image = da.from_array(image) if output_shape is None: output_shape = image.shape if output_chunks is None: output_chunks = image.shape # Perform test run to ensure parameter validity. ndimage_affine_transform(np.zeros([0] * image.ndim), matrix, offset) # Make sure parameters contained in matrix and offset # are not overlapping, i.e. that the offset is valid as # it needs to be modified for each chunk. # Further parameter checks are performed directly by # `ndimage.affine_transform`. matrix = np.asarray(matrix) offset = np.asarray(offset).squeeze() # these lines were copied and adapted from `ndimage.affine_transform` if (matrix.ndim == 2 and matrix.shape[1] == image.ndim + 1 and (matrix.shape[0] in [image.ndim, image.ndim + 1])): # assume input is homogeneous coordinate transformation matrix offset = matrix[:image.ndim, image.ndim] matrix = matrix[:image.ndim, :image.ndim] # process kwargs # prefilter is not yet supported if 'prefilter' in kwargs: if kwargs['prefilter'] and order > 1: warnings.warn( 'Currently, `dask_image.ndinterp.affine_transform` ' 'doesn\'t support `prefilter=True`. Proceeding with' ' `prefilter=False`, which if order > 1 can lead ' 'to the output containing more blur than with ' 'prefiltering.', UserWarning) del kwargs['prefilter'] if 'mode' in kwargs: if kwargs['mode'] in ['wrap', 'reflect', 'mirror']: raise (NotImplementedError("Mode %s is not currently supported." 
% kwargs['mode'])) n = image.ndim image_shape = image.shape # calculate output array properties normalized_chunks = da.core.normalize_chunks(output_chunks, tuple(output_shape)) block_indices = product(*(range(len(bds)) for bds in normalized_chunks)) block_offsets = [np.cumsum((0, ) + bds[:-1]) for bds in normalized_chunks] # use dispatching mechanism to determine backend affine_transform_method = dispatch_affine_transform(image) asarray_method = dispatch_asarray(image) # construct dask graph for output array # using unique and deterministic identifier output_name = 'affine_transform-' + tokenize( image, matrix, offset, output_shape, output_chunks, kwargs) output_layer = {} rel_images = [] for ib, block_ind in enumerate(block_indices): out_chunk_shape = [ normalized_chunks[dim][block_ind[dim]] for dim in range(n) ] out_chunk_offset = [ block_offsets[dim][block_ind[dim]] for dim in range(n) ] out_chunk_edges = np.array([i for i in np.ndindex(tuple([2] * n))])\ * np.array(out_chunk_shape) + np.array(out_chunk_offset) # map output chunk edges onto input image coordinates # to define the input region relevant for the current chunk if matrix.ndim == 1 and len(matrix) == image.ndim: rel_image_edges = matrix * out_chunk_edges + offset else: rel_image_edges = np.dot(matrix, out_chunk_edges.T).T + offset rel_image_i = np.min(rel_image_edges, 0) rel_image_f = np.max(rel_image_edges, 0) # Calculate edge coordinates required for the footprint of the # spline kernel according to # https://github.com/scipy/scipy/blob/9c0d08d7d11fc33311a96d2ac3ad73c8f6e3df00/scipy/ndimage/src/ni_interpolation.c#L412-L419 # noqa: E501 # Also see this discussion: # https://github.com/dask/dask-image/issues/24#issuecomment-706165593 # noqa: E501 for dim in range(n): if order % 2 == 0: rel_image_i[dim] += 0.5 rel_image_f[dim] += 0.5 rel_image_i[dim] = np.floor(rel_image_i[dim]) - order // 2 rel_image_f[dim] = np.floor(rel_image_f[dim]) - order // 2 + order if order == 0: # required for consistency with scipy.ndimage rel_image_i[dim] -= 1 # clip image coordinates to image extent for dim, s in zip(range(n), image_shape): rel_image_i[dim] = np.clip(rel_image_i[dim], 0, s - 1) rel_image_f[dim] = np.clip(rel_image_f[dim], 0, s - 1) rel_image_slice = tuple([ slice(int(rel_image_i[dim]), int(rel_image_f[dim]) + 2) for dim in range(n) ]) rel_image = image[rel_image_slice] """Block comment for future developers explaining how `offset` is transformed into `offset_prime` for each output chunk. Modify offset to point into cropped image. y = Mx + o Coordinate substitution: y' = y - y0(min_coord_px) x' = x - x0(chunk_offset) Then: y' = Mx' + o + Mx0 - y0 M' = M o' = o + Mx0 - y0 """ offset_prime = offset + np.dot(matrix, out_chunk_offset) - rel_image_i output_layer[(output_name, ) + block_ind] = ( affine_transform_method, (da.core.concatenate3, rel_image.__dask_keys__()), asarray_method(matrix), offset_prime, tuple(out_chunk_shape), # output_shape None, # out order, 'constant' if 'mode' not in kwargs else kwargs['mode'], 0. if 'cval' not in kwargs else kwargs['cval'], False # prefilter ) rel_images.append(rel_image) graph = HighLevelGraph.from_collections(output_name, output_layer, dependencies=[image] + rel_images) meta = dispatch_asarray(image)([0]).astype(image.dtype) transformed = da.Array( graph, output_name, shape=tuple(output_shape), # chunks=output_chunks, chunks=normalized_chunks, meta=meta) return transformed
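# Numeric check of the offset substitution derived in the block comment above: with
# y' = y - y0 and x' = x - x0, the per-chunk parameters (M, o' = o + M.x0 - y0) map
# cropped output coordinates onto the cropped input region. All values are hypothetical.
import numpy as np

M = np.array([[0.5, 0.1], [0.0, 2.0]])
o = np.array([3.0, -1.0])
x0 = np.array([4.0, 6.0])   # output-chunk offset
y0 = np.array([2.0, 7.0])   # cropped-image origin (rel_image_i)

o_prime = o + M.dot(x0) - y0
x = np.array([5.0, 9.0])             # some output coordinate
y = M.dot(x) + o                     # mapped input coordinate, full frames
y_prime = M.dot(x - x0) + o_prime    # same point, expressed in the cropped frames
np.testing.assert_allclose(y_prime, y - y0)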
def regenerate_dataset( cls, dataset, output_path, columns=None, file_size=None, part_size=None, cats=None, conts=None, labels=None, storage_options=None, ): """Regenerate an NVTabular Dataset for efficient processing. Example Usage:: dataset = Dataset("/path/to/data_pq", engine="parquet") dataset.regenerate_dataset( out_path, part_size="1MiB", file_size="10MiB" ) Parameters ----------- dataset : Dataset Input `Dataset` object (to be regenerated). output_path : string Root directory path to use for the new (regenerated) dataset. columns : list[string], optional Subset of columns to include in the regenerated dataset. file_size : int or string, optional Desired size of each output file. part_size : int or string, optional Desired partition size to use within regeneration algorithm. Note that this is effectively the size of each contiguous write operation in cudf. cats : list[string], optional Categorical column list. conts : list[string], optional Continuous column list. labels : list[string], optional Label column list. storage_options : dict, optional Storage-option kwargs to pass through to the `fsspec` file-system interface. Returns ------- result : int or Delayed If `compute=True` (default), the return value will be an integer corresponding to the number of generated data files. If `False`, the returned value will be a `Delayed` object. """ # Specify ideal file size and partition size row_group_size = 128_000_000 file_size = parse_bytes(file_size) or row_group_size * 100 part_size = parse_bytes(part_size) or row_group_size * 10 part_size = min(part_size, file_size) fs, _, _ = get_fs_token_paths(output_path, mode="wb", storage_options=storage_options) # Start by converting the original dataset to a Dask-Dataframe # object in CPU memory. We avoid GPU memory in case the original # dataset is prone to OOM errors. _ddf = dataset.engine.to_ddf(columns=columns, cpu=True) # Prepare general metadata (gmd) gmd = {} cats = cats or [] conts = conts or [] labels = labels or [] if not len(cats + conts + labels): warnings.warn( "General-metadata information not detected! 
" "Please pass lists for `cats`, `conts`, and `labels` as" "arguments to `regenerate_dataset` to ensure a complete " "and correct _metadata.json file.") col_idx = {str(name): i for i, name in enumerate(_ddf.columns)} gmd["cats"] = [{"col_name": c, "index": col_idx[c]} for c in cats] gmd["conts"] = [{"col_name": c, "index": col_idx[c]} for c in conts] gmd["labels"] = [{"col_name": c, "index": col_idx[c]} for c in labels] # Get list of partition lengths token = tokenize( dataset, output_path, columns, part_size, file_size, cats, conts, labels, storage_options, ) getlen_name = "getlen-" + token name = "all-" + getlen_name dsk = {(getlen_name, i): (len, (_ddf._name, i)) for i in range(_ddf.npartitions)} dsk[name] = [(getlen_name, i) for i in range(_ddf.npartitions)] graph = HighLevelGraph.from_collections(name, dsk, dependencies=[_ddf]) size_list = Delayed(name, graph).compute() # Get memory usage per row using first partition p0_mem_size = _ddf.partitions[0].memory_usage( deep=True, index=True).sum().compute() mem_per_row = int(float(p0_mem_size) / float(size_list[0])) # Determine the number of rows to assign to each output partition # and the number of output partitions to assign to each output file rows_per_part = int(part_size / mem_per_row) parts_per_file = int(file_size / part_size) # Construct re-partition graph dsk2 = {} repartition_name = "repartition-" + token split_name = "split-" + repartition_name getitem_name = "getitem-" + repartition_name gets = defaultdict(list) out_parts = 0 remaining_out_part_rows = rows_per_part for i, in_part_size in enumerate(size_list): # The `split` dictionary will be passed to this input # partition to dictate how that partition will be split # into different output partitions/files. The "key" of # this dict is the output partition, and the value is a # tuple specifying the (start, end) row range. 
split = {} last = 0 while in_part_size >= remaining_out_part_rows: gets[out_parts].append(i) split[out_parts] = (last, last + remaining_out_part_rows) last += remaining_out_part_rows in_part_size = in_part_size - remaining_out_part_rows remaining_out_part_rows = rows_per_part out_parts += 1 if in_part_size: gets[out_parts].append(i) split[out_parts] = (last, last + in_part_size) remaining_out_part_rows -= in_part_size if remaining_out_part_rows == 0: remaining_out_part_rows = rows_per_part out_parts += 1 dsk2[(split_name, i)] = (_split_part, (_ddf._name, i), split) npartitions = max(gets) + 1 for k, v_list in gets.items(): last = None _concat_list = [] for v in v_list: key = (getitem_name, v, k) _concat_list.append(key) dsk2[key] = (operator.getitem, (split_name, v), k) ignore_index = True dsk2[(repartition_name, k)] = (_concat, _concat_list, ignore_index) graph2 = HighLevelGraph.from_collections(repartition_name, dsk2, dependencies=[_ddf]) divisions = [None] * (npartitions + 1) _ddf2 = new_dd_object(graph2, repartition_name, _ddf._meta, divisions) # Make sure the root directory exists fs.mkdirs(output_path, exist_ok=True) # Construct rewrite graph dsk3 = {} rewrite_name = "rewrite-" + token write_data_name = "write-data-" + rewrite_name write_metadata_name = "write-metadata-" + rewrite_name inputs = [] final_inputs = [] for i in range(_ddf2.npartitions): index = i // parts_per_file nex_index = (i + 1) // parts_per_file package_task = (index != nex_index) or (i == (_ddf2.npartitions - 1)) fn = f"part.{index}.parquet" inputs.append((repartition_name, i)) if package_task: final_inputs.append((write_data_name, i)) dsk3[(write_data_name, i)] = ( _write_data, inputs, output_path, fs, fn, ) inputs = [] # Final task collects and writes all metadata dsk3[write_metadata_name] = ( _write_metadata_file, final_inputs, fs, output_path, gmd, ) graph3 = HighLevelGraph.from_collections(write_metadata_name, dsk3, dependencies=[_ddf2]) return Delayed(write_metadata_name, graph3)
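# Pure-python sketch of the repartition bookkeeping in the loop above: walk the input
# partition sizes and carve them into output partitions of `rows_per_part` rows each,
# recording a (start, stop) row range per output partition (plan_splits is illustrative).
from collections import defaultdict

def plan_splits(size_list, rows_per_part):
    splits, gets = {}, defaultdict(list)
    out_parts, remaining = 0, rows_per_part
    for i, in_part_size in enumerate(size_list):
        split, last = {}, 0
        while in_part_size >= remaining:
            gets[out_parts].append(i)
            split[out_parts] = (last, last + remaining)
            last += remaining
            in_part_size -= remaining
            remaining = rows_per_part
            out_parts += 1
        if in_part_size:
            gets[out_parts].append(i)
            split[out_parts] = (last, last + in_part_size)
            remaining -= in_part_size
        splits[i] = split
    return splits, dict(gets)

splits, gets = plan_splits([5, 5], rows_per_part=4)
assert splits == {0: {0: (0, 4), 1: (4, 5)}, 1: {1: (0, 3), 2: (3, 5)}}
assert gets == {0: [0], 1: [0, 1], 2: [1]}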
def _ddf_to_dataset( ddf, fs, output_path, shuffle, file_partition_map, out_files_per_proc, cat_names, cont_names, label_names, output_format, client, num_threads, cpu, suffix="", partition_on=None, ): # Construct graph for Dask-based dataset write token = tokenize(ddf, shuffle, out_files_per_proc, cat_names, cont_names, label_names, suffix, partition_on) name = "write-processed-" + token write_name = name + "-partition" + token # Check that the data is in the correct place assert isinstance(ddf._meta, pd.DataFrame) is cpu dsk = {} task_list = [] if partition_on: # Use hive partitioning to write the data cached_writers = False for idx in range(ddf.npartitions): task_list.append((write_name, idx)) dsk[task_list[-1]] = ( _write_partitioned, (ddf._name, idx), f"part.{idx}{suffix}", output_path, partition_on, shuffle, fs, cat_names, cont_names, label_names, output_format, num_threads, cpu, ) dsk[name] = ( _write_metadata_files, task_list, output_path, output_format, cpu, ) elif file_partition_map is not None: # Use specified mapping of data to output files cached_writers = False full_graph = ddf.dask for fn, parts in file_partition_map.items(): # Isolate subgraph for this output file subgraph = DaskSubgraph(full_graph, ddf._name, parts) task_list.append((write_name, fn)) dsk[task_list[-1]] = ( _write_subgraph, subgraph, fn, output_path, shuffle, fs, cat_names, cont_names, label_names, output_format, num_threads, cpu, suffix, ) dsk[name] = ( _write_metadata_files, task_list, output_path, output_format, cpu, ) else: cached_writers = True for idx in range(ddf.npartitions): key = (write_name, idx) dsk[key] = ( _write_output_partition, (ddf._name, idx), output_path, shuffle, out_files_per_proc, fs, cat_names, cont_names, label_names, output_format, num_threads, cpu, suffix, ) task_list.append(key) dsk[name] = (lambda x: x, task_list) graph = _ensure_optimize_dataframe_graph( dsk=HighLevelGraph.from_collections(name, dsk, dependencies=[ddf]), keys=[name], ) out = Delayed(name, graph) # Trigger write execution if client: out = client.compute(out).result() else: out = dask.compute(out, scheduler="synchronous")[0] if cached_writers: # Follow-up Shuffling and _metadata creation _finish_dataset(client, ddf, output_path, fs, output_format, cpu)
def test_bind(layers): dsk1 = {("a-1", h1): 1, ("a-1", h2): 2} dsk2 = {"b-1": (add, ("a-1", h1), ("a-1", h2))} dsk3 = {"c-1": "b-1"} cnt = NodeCounter() dsk4 = {("d-1", h1): (cnt.f, 1), ("d-1", h2): (cnt.f, 2)} dsk4b = {"e": (cnt.f, 3)} if layers: dsk1 = HighLevelGraph({"a-1": dsk1}, {"a-1": set()}) dsk2 = HighLevelGraph({ "a-1": dsk1, "b-1": dsk2 }, { "a-1": set(), "b-1": {"a-1"} }) dsk3 = HighLevelGraph( { "a-1": dsk1, "b-1": dsk2, "c-1": dsk3 }, { "a-1": set(), "b-1": {"a-1"}, "c-1": {"b-1"} }, ) dsk4 = HighLevelGraph({ "d-1": dsk4, "e": dsk4b }, { "d-1": set(), "e": set() }) else: dsk2.update(dsk1) dsk3.update(dsk2) dsk4.update(dsk4b) # t1 = Tuple(dsk1, [("a", h1), ("a", h2)]) t2 = Tuple(dsk2, ["b-1"]) t3 = Tuple(dsk3, ["c-1"]) t4 = Tuple(dsk4, [("d-1", h1), ("d-1", h2), "e"]) # Multiple names bound1 = bind(t3, t4, seed=1, assume_layers=layers) cloned_a_name = clone_key("a-1", seed=1) assert bound1.__dask_graph__()[cloned_a_name, h1][0] is chunks.bind assert bound1.__dask_graph__()[cloned_a_name, h2][0] is chunks.bind assert bound1.compute() == (3, ) assert cnt.n == 3 bound2 = bind(t3, t4, omit=t2, seed=1, assume_layers=layers) cloned_c_name = clone_key("c-1", seed=1) assert bound2.__dask_graph__()[cloned_c_name][0] is chunks.bind assert bound2.compute() == (3, ) assert cnt.n == 6 bound3 = bind(t4, t3, seed=1, assume_layers=layers) cloned_d_name = clone_key("d-1", seed=1) cloned_e_name = clone_key("e", seed=1) assert bound3.__dask_graph__()[cloned_d_name, h1][0] is chunks.bind assert bound3.__dask_graph__()[cloned_d_name, h2][0] is chunks.bind assert bound3.__dask_graph__()[cloned_e_name][0] is chunks.bind assert bound3.compute() == (1, 2, 3) assert cnt.n == 9