def dask_hist2d(x: da.Array, y: da.Array, bins: int, range, density=False):
    if x.shape != y.shape:
        raise ValueError(
            f"Mismatch in argument shapes: x.shape == {x.shape}; y.shape == {y.shape}"
        )

    token = tokenize(x, y, bins, range, density)
    name = "histogram2d-sum-" + token

    x_keys = flatten(x.__dask_keys__())
    y_keys = flatten(y.__dask_keys__())

    dsk = {
        (name, i, 0, 0): (_block_fast_hist2d, xi, yi, bins, range)
        for i, (xi, yi) in enumerate(zip(x_keys, y_keys))
    }
    dtype = np.histogram2d([], [])[0].dtype

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=(x, y))

    # turn graph into a 3D array of shape (nchunks, nbins, nbins)
    nchunks = len(list(flatten(x.__dask_keys__())))
    chunks = ((1,) * nchunks, (bins,), (bins,))
    mapped = Array(graph, name, chunks, dtype=dtype)
    n = mapped.sum(axis=0)
    return n
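
The helper `_block_fast_hist2d` is referenced above but not shown. A minimal sketch of what it might look like (hypothetical; the real helper may use a faster binning routine): each task bins one pair of chunks and returns the counts with a leading length-1 axis, so the per-chunk results stack into the (nchunks, nbins, nbins) array that is summed over axis 0.

import numpy as np

def _block_fast_hist2d(x, y, bins, range):
    # Bin a single pair of chunks; `range` follows the np.histogram2d convention.
    counts, _, _ = np.histogram2d(x.ravel(), y.ravel(), bins=bins, range=range)
    return counts[np.newaxis, ...]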
Example #2
    def persist(self, collections):
        """ Persist dask collections on cluster

        Starts computation of the collection on the cluster in the background.
        Provides a new dask collection that is semantically identical to the
        previous one, but now based off of futures currently in execution.

        Parameters
        ----------
        collections: sequence or single dask object
            Collections like dask.array or dataframe or dask.value objects

        Returns
        -------
        List of collections, or single collection, depending on type of input.

        Examples
        --------
        >>> xx = executor.persist(x)  # doctest: +SKIP
        >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

        See Also
        --------
        Executor.compute
        """
        if isinstance(collections, (tuple, list, set, frozenset)):
            singleton = False
        else:
            singleton = True
            collections = [collections]

        assert all(isinstance(c, Base) for c in collections)

        groups = groupby(lambda x: x._optimize, collections)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val])
                    for opt, val in groups.items()])

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk2.items():
            dependencies[k] |= set(_deps(dsk, v))

        names = list({k for c in collections for k in flatten(c._keys())})

        self._send_to_scheduler({'op': 'update-graph',
                                 'tasks': valmap(dumps_task, dsk2),
                                 'dependencies': dependencies,
                                 'keys': names,
                                 'client': self.id})
        result = [redict_collection(c, {k: Future(k, self)
                                        for k in flatten(c._keys())})
                for c in collections]
        if singleton:
            return first(result)
        else:
            return result
Example #3
def test_inlined_array():
    A = da.ones((10, 10), chunks=(2, 2), dtype=np.float64)
    B = da.full((10, 10), np.float64(2), chunks=(2, 2))
    C = A + B
    E = C + 1

    D = inlined_array(C)
    assert len(C.__dask_graph__().layers) == 3
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    D = inlined_array(C, [A, B])
    assert len(D.__dask_graph__().layers) == 1
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten(D.__dask_keys__()))
    assert_array_equal(D, C)

    D = inlined_array(C, [A])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B]]))
    assert_array_equal(D, C)

    D = inlined_array(C, [B])
    assert len(D.__dask_graph__().layers) == 2
    assert D.name == C.name
    assert D.name in D.__dask_graph__().layers
    assert A.name in D.__dask_graph__().layers
    assert B.name not in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, A]]))
    assert_array_equal(D, C)

    D = inlined_array(E, [A])
    assert len(D.__dask_graph__().layers) == 3
    assert D.name == E.name
    assert D.name in D.__dask_graph__().layers
    assert B.name in D.__dask_graph__().layers
    assert A.name not in D.__dask_graph__().layers
    assert C.name in D.__dask_graph__().layers
    graph_keys = set(flatten(D.__dask_graph__().keys()))
    assert graph_keys == set(flatten([a.__dask_keys__() for a in [D, B, C]]))
    assert_array_equal(D, E)
Example #4
    def fit(self, columns: ColumnNames, ddf: dd.DataFrame):
        # User passed in a list of column groups. We need to figure out
        # if this list contains any multi-column groups, and if there
        # are any (obvious) problems with these groups
        columns_uniq = list(set(flatten(columns, container=tuple)))
        columns_all = list(flatten(columns, container=tuple))
        if sorted(columns_all) != sorted(
                columns_uniq) and self.encode_type == "joint":
            # If we are doing "joint" encoding, there must be unique mapping
            # between input column names and column groups.  Otherwise, more
            # than one unique-value table could be used to encode the same
            # column.
            raise ValueError("Same column name included in multiple groups.")

        for group in columns:
            if isinstance(group, tuple) and len(group) > 1:
                # For multi-column groups, we concatenate column names
                # to get the "group" name.
                name = _make_name(*group, sep=self.name_sep)
                for col in group:
                    self.storage_name[col] = name

        # Check metadata type to reset on_host and cat_cache if the
        # underlying ddf is already a pandas-backed collection
        if isinstance(ddf._meta, pd.DataFrame):
            self.on_host = False
            # Cannot use "device" caching if the data is pandas-backed
            self.cat_cache = "host" if self.cat_cache == "device" else self.cat_cache
            if self.search_sorted:
                # Pandas' search_sorted only works with Series.
                # For now, it is safest to disallow this option.
                self.search_sorted = False
                warnings.warn(
                    "Cannot use `search_sorted=True` for pandas-backed data.")

        # convert tuples to lists
        columns = [list(c) if isinstance(c, tuple) else c for c in columns]
        dsk, key = _category_stats(
            ddf,
            columns,
            [],
            [],
            self.out_path,
            self.freq_threshold,
            self.tree_width,
            self.on_host,
            concat_groups=self.encode_type == "joint",
            name_sep=self.name_sep,
            max_size=self.max_size,
            num_buckets=self.num_buckets,
        )
        # TODO: we can't check the dtypes on the ddf here since they are incorrect
        # for cudf's list type. So, we're checking the partitions. fix.
        return Delayed(key,
                       dsk), ddf.map_partitions(lambda df: _is_list_dtype(df))
Example #5
    def __init__(
        self,
        cont_names=None,
        stats=None,
        columns=None,
        fold_groups=None,
        tree_width=None,
        out_path=None,
        on_host=True,
        freq_threshold=None,
        stat_name=None,
        concat_groups=False,
        name_sep="_",
        fold_name="__fold__",
        fold_seed=42,
        kfold=None,
    ):
        # Set column_groups if the user has passed in a list of columns
        self.column_groups = None
        if isinstance(columns, str):
            columns = [columns]
        if isinstance(columns, list):
            self.column_groups = columns
            columns = list(set(flatten(columns, container=list)))

        # Add fold_groups to columns
        if fold_groups and kfold > 1:
            fold_groups = [fold_groups] if isinstance(fold_groups,
                                                      str) else fold_groups
            columns = columns or []
            self.column_groups = self.column_groups or []
            for col in list(set(flatten(fold_groups, container=list))):
                if col not in columns:
                    columns.append(col)

        super(GroupbyStatistics, self).__init__(columns)
        self.cont_names = cont_names or []
        self.stats = stats or []
        self.categories = {}
        self.tree_width = tree_width or 8
        self.on_host = on_host
        self.freq_threshold = freq_threshold
        self.out_path = out_path or "./"
        self.stat_name = stat_name or "categories"
        self.op_name = "GroupbyStatistics-" + self.stat_name
        self.concat_groups = concat_groups
        self.name_sep = name_sep
        self.kfold = kfold or 3
        self.fold_name = fold_name
        self.fold_seed = fold_seed
        self.fold_groups = fold_groups
Example #6
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    layers = agraph.layers.copy()
    deps = agraph.dependencies.copy()
    inline_keys = set()
    dsk = dict(layers[a.name])

    # Inline specified arrays
    for array in inline_arrays:
        # Remove array from layers and dependencies
        try:
            dsk.update(layers.pop(array.name))
            del deps[array.name]
        except KeyError:
            raise ValueError("%s is not a valid dependency of a"
                             % array.name)

        # Record keys to inline
        inline_keys.update(flatten(array.__dask_keys__()))

    dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, akeys)

    layers[a.name] = dsk3
    graph = HighLevelGraph(layers, deps)

    return da.Array(graph, a.name, a.chunks, a.dtype)
Example #7
    def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
        new_df = df.copy(deep=False)
        if isinstance(self.freq_threshold, dict):
            assert all(x in self.freq_threshold for x in columns)

        if self.encode_type == "combo":
            # Case (3) - We want to track multi- and single-column groups separately
            #            when we are NOT performing a joint encoding. This is because
            #            there is not a 1-to-1 mapping for columns in multi-col groups.
            #            We use `multi_col_group` to preserve the list format of
            #            multi-column groups only, and use `cat_names` to store the
            #            string representation of both single- and multi-column groups.
            #
            cat_names, multi_col_group = _get_multicolumn_names(columns, df.columns, self.name_sep)
        else:
            # Case (1) & (2) - Simple 1-to-1 mapping
            multi_col_group = {}
            cat_names = list(flatten(columns, container=tuple))

        # Encode each column-group separately
        for name in cat_names:
            try:
                # Use the column-group `list` directly (not the string name)
                use_name = multi_col_group.get(name, name)

                # Storage name may be different than group for case (2)
                # Only use the "aliased" `storage_name` if we are dealing with
                # a multi-column group, or if we are doing joint encoding

                if use_name != name or self.encode_type == "joint":
                    storage_name = self.storage_name.get(name, name)
                else:
                    storage_name = name

                if isinstance(use_name, tuple):
                    use_name = list(use_name)

                path = self.categories[storage_name]
                new_df[name] = _encode(
                    use_name,
                    storage_name,
                    path,
                    df,
                    self.cat_cache,
                    na_sentinel=self.na_sentinel,
                    freq_threshold=self.freq_threshold[name]
                    if isinstance(self.freq_threshold, dict)
                    else self.freq_threshold,
                    search_sorted=self.search_sorted,
                    buckets=self.num_buckets,
                    encode_type=self.encode_type,
                    cat_names=cat_names,
                    max_size=self.max_size,
                )
                if self.dtype:
                    new_df[name] = new_df[name].astype(self.dtype, copy=False)
            except Exception as e:
                raise RuntimeError(f"Failed to categorical encode column {name}") from e

        return new_df
Example #8
File: delayed.py  Project: jakirkham/dask
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    dsk = dsk.cull(set(flatten(keys)))
    return dsk
Example #9
def report(report_queue, scheduler_queue, who_has, dsk, result):
    """ Report to outside world

    For a normal get function this coroutine is almost non-essential.
    It just starts and stops the scheduler coroutine.
    """
    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = set([result])
    out_keys = set(result_flat)

    scheduler_queue.put_nowait({
        'op': 'update-graph',
        'dsk': dsk,
        'keys': out_keys
    })

    finished_results = {k for k in out_keys if k in who_has}

    while finished_results != out_keys:
        msg = yield report_queue.get()
        if msg['op'] == 'task-finished':
            if msg['key'] in out_keys:
                finished_results.add(msg['key'])
        if msg['op'] == 'lost-data':
            if msg['key'] in finished_results:
                finished_results.remove(msg['key'])
        if msg['op'] == 'task-erred':
            scheduler_queue.put_nowait({'op': 'close'})
            raise msg['exception']
    scheduler_queue.put_nowait({'op': 'close'})

    raise Return(out_keys)
Example #10
def cached_array(array):
    """
    Return a new array that functionally has the same values as array,
    but flattens the underlying graph and introduces a cache lookup
    when the individual array chunks are accessed.

    Useful for caching data that can fit in-memory for the duration
    of the graph's execution.
    """
    dsk = dict(array.__dask_graph__())
    keys = set(flatten(array.__dask_keys__()))

    # Inline + cull everything except the current array
    inline_keys = set(dsk.keys() - keys)
    dsk2 = inline(dsk, inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, keys)

    # Create a cache used to store array values
    cache = ArrayCache(uuid.uuid4().hex)

    for k in keys:
        dsk3[k] = (cache_entry, cache, Key(k), dsk3.pop(k))

    graph = HighLevelGraph.from_collections(array.name, dsk3, [])

    return da.Array(graph, array.name, array.chunks, array.dtype)
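
The `ArrayCache`, `cache_entry` and `Key` objects used above are not shown. A minimal sketch under the assumption that the cache is a simple in-process store keyed by chunk key, and that `Key` merely wraps the key so dask treats it as a literal argument rather than a task reference (the real implementation may add locking and eviction):

class Key:
    def __init__(self, key):
        self.key = key

class ArrayCache:
    def __init__(self, token):
        self.token = token
        self.store = {}

def cache_entry(cache, key, value):
    # Remember the first value computed for this chunk and return it on
    # later lookups of the same key.
    return cache.store.setdefault(key.key, value)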
Example #11
def get_futures(self, arr):
    """
    Return the list of futures of a Dask array associated with a cartesian communicator.

    Parameters
    ----------
    arr: Dask Array
        A dask array distributed over the cartesian communicator. Note: the array can
        have more axes than the communicator as long as they are not distributed.
    """
    if not isinstance(arr, Array):
        raise TypeError(f"Expected a Dask Array; got {type(arr)}.")

    self.check_dims(tuple(len(chunks) for chunks in arr.chunks))

    idxs, _ = zip(*self.normalize_dims())
    coords = self.normalize_coords()
    keys = tuple(flatten(arr.__dask_keys__()))
    key_idx = {}
    for key in keys:
        coord = tuple(key[_i + 1] for _i in idxs)
        key_idx[key] = coords.index(coord)

    keys = sorted(keys, key=key_idx.__getitem__)
    restrictions = {
        KeyPatch(key): worker
        for key, worker in zip(keys, self.workers)
    }

    arr = arr.persist(workers=restrictions)
    assert len(self) == len(arr.dask.values())

    return list(arr.dask[key] for key in keys)
Example #12
File: core.py  Project: tym1062/dask-cudf
def optimize(dsk, keys, **kwargs):
    flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys]
    dsk, dependencies = cull(dsk, flatkeys)
    dsk, dependencies = fuse(dsk, keys, dependencies=dependencies,
                             ave_width=_globals.get('fuse_ave_width', 1))
    dsk, _ = cull(dsk, keys)
    return dsk
Example #13
def _transform_ddf(ddf, column_groups, meta=None):
    if isinstance(column_groups, ColumnGroup):
        column_groups = [column_groups]

    columns = list(flatten(cg.flattened_columns for cg in column_groups))

    # Check if we are only selecting columns (no transforms).
    # If so, we should perform column selection at the ddf level.
    # Otherwise, Dask will not push the column selection into the
    # IO function.
    if all((c.op is None and not c.parents) for c in column_groups):
        return ddf[_get_unique(columns)]

    if isinstance(meta, dict) and isinstance(ddf._meta, pd.DataFrame):
        dtypes = meta
        meta = type(ddf._meta)({k: [] for k in columns})
        for column, dtype in dtypes.items():
            meta[column] = meta[column].astype(dtype)

    elif not meta:
        # TODO: constructing meta like this loses dtype information on the ddf
        # and sets it all to 'float64'. We should propagate dtype information along
        # with column names in the columngroup graph. This currently only
        # happens during intermediate 'fit' transforms, so as long as statoperators
        # don't require dtype information on the DDF this doesn't matter all that much
        meta = type(ddf._meta)({k: [] for k in columns})

    return ddf.map_partitions(
        _transform_partition,
        column_groups,
        meta=meta,
    )
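
`_get_unique` is used above but not defined in this snippet. A minimal sketch (hypothetical): deduplicate the selected column names while preserving their order.

def _get_unique(cols):
    # dict.fromkeys keeps the first occurrence of each name and preserves order
    return list(dict.fromkeys(cols))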
Example #14
    def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
        flatkeys = list(flatten([keys]))
        futures = {key: Future(key, self) for key in flatkeys}

        d = {k: unpack_remotedata(v) for k, v in dsk.items()}
        dsk2 = {k: v[0] for k, v in d.items()}
        dsk3 = {k: v for k, v in dsk2.items() if (k == v) is not True}

        dependencies = {k: v[1] for k, v in d.items()}

        for k, v in dsk3.items():
            dependencies[k] |= set(_deps(dsk, v))

        self._send_to_scheduler({
            'op': 'update-graph',
            'tasks': valmap(dumps_task, dsk3),
            'dependencies': dependencies,
            'keys': flatkeys,
            'restrictions': restrictions or {},
            'client': self.id
        })

        packed = pack_data(keys, futures)
        if raise_on_error:
            result = yield self._gather(packed)
        else:
            try:
                result = yield self._gather(packed)
                result = 'OK', result
            except Exception as e:
                result = 'error', e
        raise gen.Return(result)
Example #15
    def __init__(
        self,
        cont_names=None,
        stats=["count"],
        columns=None,
        tree_width=None,
        cat_cache="host",
        out_path=None,
        on_host=True,
        name_sep="_",
        stat_name=None,
    ):
        self.column_groups = None
        self.storage_name = {}
        self.name_sep = name_sep
        if isinstance(columns, str):
            columns = [columns]
        if isinstance(columns, list):
            self.column_groups = columns
            columns = list(set(flatten(columns, container=list)))
            for group in self.column_groups:
                if isinstance(group, list) and len(group) > 1:
                    name = nvt_cat._make_name(*group, sep=self.name_sep)
                    for col in group:
                        self.storage_name[col] = name

        super().__init__(columns=columns, replace=False)
        self.cont_names = cont_names
        self.stats = stats
        self.tree_width = tree_width
        self.out_path = out_path
        self.on_host = on_host
        self.cat_cache = cat_cache
        self.stat_name = stat_name or "gb_categories"
Example #16
    def __init__(self, columns=None, num_buckets=None, freq_limit=0, encode_type="joint"):
        if isinstance(columns, list):
            columns = list(set(flatten(columns, container=list)))
        super().__init__(columns=columns)
        self.num_buckets = num_buckets
        self.freq_limit = freq_limit
        self.encode_type = encode_type
Example #19
def inline_pattern(dsk: dict, pat_ls: List[str], inline_constants: bool) -> dict:
    """
    Inline tasks whose keys match certain patterns.

    Parameters
    ----------
    dsk : dict
        Input dask graph.
    pat_ls : List[str]
        List of patterns to check.
    inline_constants : bool
        Whether to inline constants.

    Returns
    -------
    dsk : dict
        Dask graph with keys inlined.

    See Also
    --------
    dask.optimization.inline
    """
    keys = [k for k in dsk.keys() if check_pat(k, pat_ls)]
    if keys:
        dsk = inline(dsk, keys, inline_constants=inline_constants)
        for k in keys:
            del dsk[k]
        if inline_constants:
            dsk, dep = cull(dsk, set(list(flatten(keys))))
    return dsk
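
`check_pat` is not shown above. A minimal sketch, assuming it tests whether the task name embedded in a key matches any of the given patterns (hypothetical; the real helper may anchor or pre-compile the patterns):

import re

def check_pat(key, pat_ls):
    # Dask keys are strings or tuples whose first element is the task name.
    name = key[0] if isinstance(key, tuple) else key
    return any(re.search(pat, str(name)) for pat in pat_ls)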
Example #20
def downscale_dask(
    array: Any,
    reduction: Callable[[NDArray[Any], Tuple[int, ...]], NDArray[Any]],
    scale_factors: Union[int, Sequence[int], Dict[int, int]],
    **kwargs: Any,
) -> Any:

    if not np.all((np.array(array.shape) % np.array(scale_factors)) == 0):
        raise ValueError(
            f"Coarsening factors {scale_factors} do not align with array shape {array.shape}."
        )

    array = align_chunks(array, scale_factors)
    name = "downscale-" + tokenize(reduction, array, scale_factors)
    dsk = {
        (name,) + key[1:]: (apply, reduction, [key, scale_factors], kwargs)
        for key in flatten(array.__dask_keys__())
    }
    chunks = tuple(
        tuple(int(size // scale_factors[axis]) for size in sizes)
        for axis, sizes in enumerate(array.chunks)
    )

    meta = reduction(
        np.empty(scale_factors, dtype=array.dtype), scale_factors, **kwargs
    )
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[array])
    return Array(graph, name, chunks, meta=meta)
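
`align_chunks` is referenced above but not shown. A rough sketch, assuming `scale_factors` has already been normalized to one factor per axis: rechunk any axis whose chunk sizes are not multiples of its factor up to a uniform multiple, so that every block can be reduced independently. Because the array shape has already been validated to be divisible by the factors, the trailing chunk then also stays a multiple of the factor.

def align_chunks(array, scale_factors):
    new_chunks = {}
    for axis, (sizes, factor) in enumerate(zip(array.chunks, scale_factors)):
        if any(size % factor for size in sizes):
            # round the largest existing chunk size up to a multiple of the factor
            new_chunks[axis] = -(-max(sizes) // factor) * factor
    return array.rechunk(new_chunks) if new_chunks else array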
Example #21
File: chunk.py  Project: m-rossi/dask
def argtopk(a_plus_idx, k, axis, keepdims):
    """Chunk and combine function of argtopk

    Extract the indices of the k largest elements from a on the given axis.
    If k is negative, extract the indices of the -k smallest elements instead.
    Note that, unlike in the parent function, the returned elements
    are not sorted internally.
    """
    assert keepdims is True
    axis = axis[0]

    if isinstance(a_plus_idx, list):
        a_plus_idx = list(flatten(a_plus_idx))
        a = np.concatenate([ai for ai, _ in a_plus_idx], axis)
        idx = np.concatenate(
            [np.broadcast_to(idxi, ai.shape) for ai, idxi in a_plus_idx], axis)
    else:
        a, idx = a_plus_idx

    if abs(k) >= a.shape[axis]:
        return a_plus_idx

    idx2 = np.argpartition(a, -k, axis=axis)
    k_slice = slice(-k, None) if k > 0 else slice(-k)
    idx2 = idx2[tuple(k_slice if i == axis else slice(None)
                      for i in range(a.ndim))]
    return np.take_along_axis(a, idx2,
                              axis), np.take_along_axis(idx, idx2, axis)
Example #22
def write_blocks(source, target, region: Optional[Tuple[slice, ...]]) -> da.Array:
    """
    Return a dask array where each chunk contains the result of writing
    the corresponding chunk of `source` to `target`.
    """

    slices = slices_from_chunks(source.chunks)
    if region:
        slices = [fuse_slice(region, slc) for slc in slices]

    source_name = 'store-source-' + tokenize(source)
    store_name = 'store-' + tokenize(source)
    
    layers = {source_name: source.__dask_graph__()}
    deps = {source_name: set()}
    
    dsk = {}
    chunks = tuple((1,) * s for s in source.blocks.shape)
    
    for slice, key in zip(slices, flatten(source.__dask_keys__())):
        dsk[(store_name,) + key[1:]] = (ndwrapper, store_chunk, source.ndim, key, target, slice)
    
    layers[store_name] = dsk
    deps[store_name] = {source_name}
    store_dsk = HighLevelGraph(layers, deps)
    
    return da.Array(store_dsk,
                    store_name,
                    shape=source.blocks.shape,
                    chunks=chunks,
                    dtype=int)
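
`ndwrapper` and `store_chunk` are referenced above but not shown. A minimal sketch of the pair (hypothetical): write one chunk of data into the target at the given region, then wrap the scalar result in enough length-1 axes to match the (1, 1, ...) chunks declared for the output array.

import numpy as np

def store_chunk(data, target, region):
    # Write a single chunk and return a marker value.
    target[region] = data
    return 0

def ndwrapper(func, ndim, *args, **kwargs):
    # Give the scalar result one singleton axis per source dimension.
    return np.expand_dims(func(*args, **kwargs), tuple(range(ndim)))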
Example #23
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk
Example #24
def test_TaskGraph_complex(c, s, a, b):
    da = pytest.importorskip("dask.array")
    gp = TaskGraph(s)
    x = da.random.random((2000, 2000), chunks=(1000, 1000))
    y = ((x + x.T) - x.mean(axis=0)).persist()
    yield wait(y)
    gp.update()
    assert len(gp.layout.index) == len(gp.node_source.data["x"])
    assert len(gp.layout.index) == len(s.tasks)
    z = (x - y).sum().persist()
    yield wait(z)
    gp.update()
    assert len(gp.layout.index) == len(gp.node_source.data["x"])
    assert len(gp.layout.index) == len(s.tasks)
    del z
    yield gen.sleep(0.2)
    gp.update()
    assert len(gp.layout.index) == sum(v == "True"
                                       for v in gp.node_source.data["visible"])
    assert len(gp.layout.index) == len(s.tasks)
    assert max(gp.layout.index.values()) < len(gp.node_source.data["visible"])
    assert gp.layout.next_index == len(gp.node_source.data["visible"])
    gp.update()
    assert set(gp.layout.index.values()) == set(range(len(gp.layout.index)))
    visible = gp.node_source.data["visible"]
    keys = list(map(tokey, flatten(y.__dask_keys__())))
    assert all(visible[gp.layout.index[key]] == "True" for key in keys)
Example #25
def _transform_ddf(ddf, workflow_nodes, meta=None, additional_columns=None):
    # Check if we are only selecting columns (no transforms).
    # If so, we should perform column selection at the ddf level.
    # Otherwise, Dask will not push the column selection into the
    # IO function.
    if not workflow_nodes:
        return ddf[_get_unique(additional_columns)] if additional_columns else ddf

    if isinstance(workflow_nodes, WorkflowNode):
        workflow_nodes = [workflow_nodes]

    columns = list(flatten(wfn.output_columns.names for wfn in workflow_nodes))
    columns += additional_columns if additional_columns else []

    if isinstance(meta, dict) and isinstance(ddf._meta, pd.DataFrame):
        dtypes = meta
        meta = type(ddf._meta)({k: [] for k in columns})
        for column, dtype in dtypes.items():
            meta[column] = meta[column].astype(dtype)

    elif not meta:
        # TODO: constructing meta like this loses dtype information on the ddf
        # and sets it all to 'float64'. We should propagate dtype information along
        # with column names in the columngroup graph. This currently only
        # happens during intermediate 'fit' transforms, so as long as statoperators
        # don't require dtype information on the DDF this doesn't matter all that much
        meta = type(ddf._meta)({k: [] for k in columns})

    return ddf.map_partitions(
        _transform_partition,
        workflow_nodes,
        additional_columns=additional_columns,
        meta=meta,
        enforce_metadata=False,
    )
Example #26
def _build_map_layer(
        func: Callable,
        prev_name: str,
        new_name: str,
        collection,
        dependencies: tuple[Delayed, ...] = (),
) -> Layer:
    """Apply func to all keys of collection. Create a Blockwise layer whenever possible;
    fall back to MaterializedLayer otherwise.

    Parameters
    ----------
    func
        Callable to be invoked on the graph node
    prev_name : str
        name of the layer to map from; in case of dask base collections, this is the
        collection name. Note how third-party collections, e.g. xarray.Dataset, can
        have multiple names.
    new_name : str
        name of the layer to map to
    collection
        Arbitrary dask collection
    dependencies
        Zero or more Delayed objects, which will be passed as arbitrary variadic args to
        func after the collection's chunk
    """
    if _can_apply_blockwise(collection):
        # Use a Blockwise layer
        try:
            numblocks = collection.numblocks
        except AttributeError:
            numblocks = (collection.npartitions, )
        indices = tuple(i for i, _ in enumerate(numblocks))
        kwargs = {
            "_deps": [d.key for d in dependencies]
        } if dependencies else {}

        return blockwise(
            func,
            new_name,
            indices,
            prev_name,
            indices,
            numblocks={prev_name: numblocks},
            dependencies=dependencies,
            **kwargs,
        )
    else:
        # Delayed, bag.Item, dataframe.core.Scalar, or third-party collection;
        # fall back to MaterializedLayer
        dep_keys = tuple(d.key for d in dependencies)
        return MaterializedLayer({
            replace_name_in_key(k, {prev_name: new_name}): (func, k) + dep_keys
            for k in flatten(collection.__dask_keys__())
            if get_name_from_key(k) == prev_name
        })
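
`_can_apply_blockwise` is referenced above but not shown. A rough sketch of the check it presumably performs (hypothetical): Blockwise mapping only makes sense for collections whose keys follow the regular (name, i, j, ...) block structure, such as dask arrays, bags and dataframes, so anything else falls through to the MaterializedLayer branch.

def _can_apply_blockwise(collection):
    import dask.array as da
    import dask.bag as db
    import dask.dataframe as dd

    return isinstance(collection, (da.Array, db.Bag, dd.DataFrame, dd.Series))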
Example #27
def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize dask for array computation

    1.  Cull tasks not necessary to evaluate keys
    2.  Remove full slicing, e.g. x[:]
    3.  Inline fast functions like getitem and np.transpose
    """
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    dsk = optimize_blockwise(dsk, keys=keys)
    dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Perform low-level fusion unless the user has
    # specified False explicitly.
    if config.get("optimization.fuse.active") is False:
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    # Low level task optimizations
    if fast_functions is not None:
        inline_functions_fast_functions = fast_functions

    hold = hold_keys(dsk, dependencies)

    dsk, dependencies = fuse(
        dsk,
        hold + keys + (fuse_keys or []),
        dependencies,
        rename_keys=rename_fused_keys,
    )
    if inline_functions_fast_functions:
        dsk = inline_functions(
            dsk,
            keys,
            dependencies=dependencies,
            fast_functions=inline_functions_fast_functions,
        )

    return optimize_slices(dsk)
Example #28
def reproject_band(band, geobox, resampling, dims, dask_chunks=None):
    """ Reproject a single measurement to the geobox. """
    if not hasattr(band.data, 'dask') or dask_chunks is None:
        data = reproject_array(band.data, band.nodata, band.geobox, geobox,
                               resampling)
        return wrap_in_dataarray(data, band, geobox, dims)

    dask_name = 'warp_{name}-{token}'.format(name=band.name,
                                             token=uuid.uuid4().hex)
    dependencies = [band.data]

    spatial_chunks = tuple(
        dask_chunks.get(k, geobox.shape[i]) for i, k in enumerate(geobox.dims))

    gt = GeoboxTiles(geobox, spatial_chunks)
    new_layer = {}

    for tile_index in numpy.ndindex(gt.shape):
        sub_geobox = gt[tile_index]
        # find the input array slice from the output geobox
        reproject_roi = compute_reproject_roi(band.geobox,
                                              sub_geobox,
                                              padding=1)

        # find the chunk from the input array with the slice index
        subset_band = band[(..., ) + reproject_roi.roi_src].chunk(-1)

        if min(subset_band.shape) == 0:
            # pad the empty chunk
            new_layer[(dask_name, ) + tile_index] = (numpy.full,
                                                     sub_geobox.shape,
                                                     band.nodata, band.dtype)
        else:
            # next 3 lines to generate the new graph
            dependencies.append(subset_band.data)
            # get the input dask array for the function `reproject_array`
            band_key = list(flatten(subset_band.data.__dask_keys__()))[0]
            # generate a new layer of dask graph with reproject
            new_layer[(dask_name, ) + tile_index] = (reproject_array, band_key,
                                                     band.nodata,
                                                     subset_band.geobox,
                                                     sub_geobox, resampling)

    # create a new graph with the additional layer and pack the graph into dask.array
    # since only regular chunking is allowed at the higher level dask.array interface,
    # to manipulate the graph seems to be the easiest way to obtain a dask.array with irregular chunks after reproject
    data = dask.array.Array(band.data.dask.from_collections(
        dask_name, new_layer, dependencies=dependencies),
                            dask_name,
                            chunks=spatial_chunks,
                            dtype=band.dtype,
                            shape=gt.base.shape)

    return wrap_in_dataarray(data, band, geobox, dims)
Example #29
def modf(x):
    # Not actually object dtype, just need to specify something
    tmp = elemwise(np.modf, x, dtype=object)
    left = "modf1-" + tmp.name
    right = "modf2-" + tmp.name
    ldsk = {(left, ) + key[1:]: (getitem, key, 0)
            for key in core.flatten(tmp.__dask_keys__())}
    rdsk = {(right, ) + key[1:]: (getitem, key, 1)
            for key in core.flatten(tmp.__dask_keys__())}

    a = np.empty_like(getattr(x, "_meta", x),
                      shape=(1, ) * x.ndim,
                      dtype=x.dtype)
    l, r = np.modf(a)

    graph = HighLevelGraph.from_collections(left, ldsk, dependencies=[tmp])
    L = Array(graph, left, chunks=tmp.chunks, meta=l)
    graph = HighLevelGraph.from_collections(right, rdsk, dependencies=[tmp])
    R = Array(graph, right, chunks=tmp.chunks, meta=r)
    return L, R
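
A quick usage sketch of the pattern above, assuming dask's public da.modf wraps this implementation: split an array into fractional and integral parts chunk by chunk, then compare against NumPy.

import numpy as np
import dask.array as da

x = da.linspace(0.0, 3.0, 7, chunks=4)
frac, integ = da.modf(x)
np.testing.assert_allclose(frac.compute(), np.modf(x.compute())[0])
np.testing.assert_allclose(integ.compute(), np.modf(x.compute())[1])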
Example #30
    def dataframe_optimize(dsk, keys, **kwargs):
        if not isinstance(keys, (list, set)):
            keys = [keys]
        keys = list(core.flatten(keys))

        if not isinstance(dsk, HighLevelGraph):
            dsk = HighLevelGraph.from_collections(id(dsk),
                                                  dsk,
                                                  dependencies=())

        dsk = rewrite_simple_shuffle_layer(dsk, keys=keys)
        return optimize(dsk, keys, **kwargs)
Example #31
def test_persist(c, s, a, b):
    da = pytest.importorskip('dask.array')
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)
    yield wait(high)
    assert all(s.processing.values())
    assert all(s.tasks[tokey(k)].state in ('processing', 'waiting')
               for k in flatten(low.__dask_keys__()))
Example #33
    def fit(self, columns: ColumnNames, ddf: dask_cudf.DataFrame):
        # User passed in a list of column groups. We need to figure out
        # if this list contains any multi-column groups, and if there
        # are any (obvious) problems with these groups
        columns_uniq = list(set(flatten(columns, container=tuple)))
        columns_all = list(flatten(columns, container=tuple))
        if sorted(columns_all) != sorted(
                columns_uniq) and self.encode_type == "joint":
            # If we are doing "joint" encoding, there must be unique mapping
            # between input column names and column groups.  Otherwise, more
            # than one unique-value table could be used to encode the same
            # column.
            raise ValueError("Same column name included in multiple groups.")

        for group in columns:
            if isinstance(group, tuple) and len(group) > 1:
                # For multi-column groups, we concatenate column names
                # to get the "group" name.
                name = _make_name(*group, sep=self.name_sep)
                for col in group:
                    self.storage_name[col] = name

        # convert tuples to lists
        columns = [list(c) if isinstance(c, tuple) else c for c in columns]
        dsk, key = _category_stats(
            ddf,
            columns,
            [],
            [],
            self.out_path,
            self.freq_threshold,
            self.tree_width,
            self.on_host,
            concat_groups=self.encode_type == "joint",
            name_sep=self.name_sep,
        )
        # TODO: we can't use the dtypes on the ddf here since they are incorrect
        # so we're loading from the partitions. fix.
        return Delayed(key, dsk), ddf.map_partitions(lambda gdf: gdf.dtypes)
Example #34
async def test_persist(c, s):
    da = pytest.importorskip("dask.array")
    x = da.random.random((10, 10), chunks=(5, 5))
    y = da.random.random((10, 10), chunks=(5, 5))

    low = x.persist(priority=-1)
    futures = c.map(slowinc, range(10), delay=0.1)
    high = y.persist(priority=1)
    async with Worker(s.address, nthreads=1):
        await wait(high)
        assert all(s.processing.values())
        assert all(s.tasks[stringify(k)].state in ("processing", "waiting")
                   for k in flatten(low.__dask_keys__()))
Example #35
    def _get(self, dsk, keys, restrictions=None):
        flatkeys = list(flatten(keys))
        for key in flatkeys:
            if key not in self.futures:
                self.futures[key] = {'event': Event(), 'status': None}
        futures = {key: Future(key, self) for key in flatkeys}

        self.scheduler_queue.put_nowait({'op': 'update-graph',
                                         'dsk': dsk,
                                         'keys': flatkeys,
                                         'restrictions': restrictions or {}})

        packed = pack_data(keys, futures)
        result = yield self._gather(packed)
        raise gen.Return(result)
Example #36
    def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
        flatkeys = list(flatten([keys]))
        futures = {key: Future(key, self) for key in flatkeys}

        self.loop.add_callback(
            self.scheduler_queue.put_nowait,
            {"op": "update-graph", "dsk": dsk, "keys": flatkeys, "restrictions": restrictions or {}},
        )

        packed = pack_data(keys, futures)
        if raise_on_error:
            result = yield self._gather(packed)
        else:
            try:
                result = yield self._gather(packed)
                result = "OK", result
            except Exception as e:
                result = "error", e
        raise gen.Return(result)
Example #37
def progress(*futures, **kwargs):
    """ Track progress of futures

    This operates differently in the notebook and the console

    *  Notebook:  This returns immediately, leaving an IPython widget on screen
    *  Console:  This blocks until the computation completes

    Parameters
    ----------
    futures: Futures
        A list of futures or keys to track
    notebook: bool (optional)
        Running in the notebook or not (defaults to guess)
    multi: bool (optional)
        Track different functions independently (defaults to True)
    complete: bool (optional)
        Track all keys (True) or only keys that have not yet run (False)
        (defaults to True)

    Examples
    --------
    >>> progress(futures)  # doctest: +SKIP
    [########################################] | 100% Completed |  1.7s
    """
    notebook = kwargs.pop('notebook', None)
    multi = kwargs.pop('multi', True)
    complete = kwargs.pop('complete', True)
    assert not kwargs

    futures = list(flatten(list(futures)))
    if not isinstance(futures, (set, list)):
        futures = [futures]
    if notebook is None:
        notebook = is_kernel()  # often but not always correct assumption
    if notebook:
        if multi:
            bar = MultiProgressWidget(futures, complete=complete)
        else:
            bar = ProgressWidget(futures, complete=complete)
        return bar
    else:
        TextProgressBar(futures, complete=complete)
Example #38
    def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
        flatkeys = list(flatten([keys]))
        futures = {key: Future(key, self) for key in flatkeys}

        self.send_to_scheduler({'op': 'update-graph',
                                'dsk': dsk,
                                'keys': flatkeys,
                                'restrictions': restrictions or {}})

        packed = pack_data(keys, futures)
        if raise_on_error:
            result = yield self._gather(packed)
        else:
            try:
                result = yield self._gather(packed)
                result = 'OK', result
            except Exception as e:
                result = 'error', e
        raise gen.Return(result)
Example #39
    def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
        flatkeys = list(flatten([keys]))
        futures = {key: Future(key, self) for key in flatkeys}
        dsk2 = {k: unpack_remotedata(v)[0] for k, v in dsk.items()}
        dsk3 = {k: v for k, v in dsk2.items() if (k == v) is not True}

        self._send_to_scheduler({'op': 'update-graph',
                                'dsk': dsk3,
                                'keys': flatkeys,
                                'restrictions': restrictions or {}})

        packed = pack_data(keys, futures)
        if raise_on_error:
            result = yield self._gather(packed)
        else:
            try:
                result = yield self._gather(packed)
                result = 'OK', result
            except Exception as e:
                result = 'error', e
        raise gen.Return(result)
Example #40
def test_flatten():
    assert list(flatten(())) == []
    assert list(flatten('foo')) == ['foo']
Example #41
File: test_core.py  Project: OspreyX/dask
def test_flatten():
    assert list(flatten(())) == []