Example #1
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other words, this function is like ``np.vectorize``, but for
    blocks of dask arrays. If the function itself should also
    be vectorized, pass ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature : string
        Specifies which core dimensions are consumed and produced by ``func``,
        according to the specification of numpy's generalized ufunc signature [2]_.
    *args : numeric
        Input arrays or scalars to the callable function.
    axes : list of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis : int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a shortcut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims : bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with
        ``axes`` and ``axis``.
    output_dtypes : dtype or list of dtypes, optional, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize : bool, optional, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk : bool, optional, keyword only
        Allows rechunking; otherwise chunk sizes need to match and each core
        dimension must consist of a single chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to ``func``.

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10, 20, 30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(20, 30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1, 40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError("`signature` has to be of type string")
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: None for a function with a single direct return; an int for tuple returns
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(tempfunc, args, kwargs,
                                          "apply_gufunc", "output_dtypes",
                                          nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(
                    ("Must specify single dtype or list of one dtype "
                     "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError(
                "Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs"
            )
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims,
                                                       input_coredimss,
                                                       output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %d were given"
            % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape), 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d
              for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes,
                                       input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning:
    for dim, sizes in dimsizess.items():
        #### Check that arrays have matching lengths along shared dimensions (or length 1)
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError(
                "Dimension `'{}'` has different lengths across arrays".format(
                    dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk "
                    "into a single chunk along this dimension or set `allow_rechunk=True`, "
                    "but beware that this may increase memory usage significantly.".format(dim))
            #### Check that loop dimensions have matching chunk sizes where their size is > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    "Dimension `'{}'` has inconsistent chunk sizes across arrays".format(
                        dim))

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    try:
        tmp = blockwise(  # First try to compute meta
            func,
            loop_output_dims,
            *arginds,
            concatenate=True,
            **kwargs)
    except ValueError:
        # If computing meta doesn't work, provide it explicitly based on
        # provided dtypes
        sample = arginds[0]._meta
        if isinstance(output_dtypes, tuple):
            meta = tuple(
                meta_from_array(sample, dtype=odt)
                for ocd, odt in zip(output_coredimss, output_dtypes))
        else:
            meta = tuple(
                meta_from_array(sample, dtype=odt)
                for ocd, odt in zip((output_coredimss, ), (output_dtypes, )))
        tmp = blockwise(func,
                        loop_output_dims,
                        *arginds,
                        concatenate=True,
                        meta=meta,
                        **kwargs)

    if isinstance(tmp._meta, tuple):
        metas = tmp._meta
    else:
        metas = (tmp._meta, )

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split("-")

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, (ocd, oax,
            meta) in enumerate(zip(output_coredimss, output_axes, metas)):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0, )
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name, ) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name,
                                                leaf_dsk,
                                                dependencies=[tmp])
        meta = meta_from_array(meta, len(output_shape))
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         meta=meta)

        ### Axes:
        if keepdims:
            slices = len(
                leaf_arr.shape) * (slice(None), ) + len(oax) * (np.newaxis, )
            leaf_arr = leaf_arr[slices]

        tidcs = [None] * len(leaf_arr.shape)
        for i, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = i
        j = 0
        for i in range(len(tidcs)):
            if tidcs[i] is None:
                tidcs[i] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return (*leaf_arrs, ) if nout else leaf_arrs[0]  # Undo *) from above
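
A minimal usage sketch (my own, not from the docstring above) of the ``keepdims`` behaviour: the reduced core dimension is kept as size one so the result broadcasts against the input. ``da.apply_gufunc`` is the public entry point; shapes are illustrative.

import dask.array as da
import numpy as np

a = da.random.normal(size=(10, 20, 30), chunks=(5, 10, 30))

# "(i)->()" consumes one core dimension; keepdims=True re-inserts it as size 1.
mean = da.apply_gufunc(lambda x: np.mean(x, axis=-1), "(i)->()", a,
                       keepdims=True)
assert mean.compute().shape == (10, 20, 1)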
Example #2
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> import dask
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    (a, b)
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr, )

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized, )

    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        args, collections = unzip((unpack_collections(e) for e in expr), 2)
        args = list(args)
        collections = tuple(unique(concat(collections), key=id))
        # Ensure output type matches input type
        if typ is not list:
            args = (typ, args)
        return args, collections

    if typ is dict:
        args, collections = unpack_collections([[k, v]
                                                for k, v in expr.items()])
        return (dict, args), collections

    if typ is slice:
        args, collections = unpack_collections(
            [expr.start, expr.stop, expr.step])
        return (slice, ) + tuple(args), collections

    if is_dataclass(expr):
        args, collections = unpack_collections(
            [[f.name, getattr(expr, f.name)] for f in dataclass_fields(expr)])

        return (apply, typ, (), (dict, args)), collections

    return expr, ()
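
A hedged usage sketch of the container branches above; the task reprs in the comments are schematic, mirroring the docstring's doctests.

from dask import delayed
from dask.delayed import unpack_collections

a = delayed(1, 'a')
b = delayed(2, 'b')

task, collections = unpack_collections({"x": a, "y": [b, 3]})
# task is a nested task expression, roughly
#     (dict, [['x', 'a'], ['y', ['b', 3]]])
# and collections is (a, b): computing the collections and substituting
# their results into `task` reconstructs the original dict.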
Example #3
def __iter__(self):
    # Iterate over the unique task keys across all layers, in first-seen order.
    return toolz.unique(toolz.concat(self.layers.values()))
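
A standalone illustration of the ``toolz.unique(toolz.concat(...))`` idiom used here: chain the per-layer key iterables, then drop duplicates while preserving first-seen order. The layer contents below are made up.

import toolz

layers = {
    "layer-1": {("x", 0): 1, ("x", 1): 2},
    "layer-2": {("x", 1): 2, ("y", 0): 3},
}
# Iterating a dict yields its keys, so this walks every task key once:
keys = list(toolz.unique(toolz.concat(layers.values())))
assert keys == [("x", 0), ("x", 1), ("y", 0)]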
Example #4
def test_distinct_with_key():
    seq = [{"a": i} for i in [0, 1, 2, 1, 2, 3, 2, 3, 4, 5]]
    bag = db.from_sequence(seq, npartitions=3)
    expected = list(unique(seq, key=lambda x: x["a"]))
    assert_eq(bag.distinct(key="a"), expected)
    assert_eq(bag.distinct(key=lambda x: x["a"]), expected)
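
The ``toolz.unique`` semantics the expected value relies on: with ``key``, the first element seen for each key value is kept, in input order.

from toolz import unique

seq = [{"a": i} for i in [0, 1, 2, 1, 2, 3]]
assert list(unique(seq, key=lambda x: x["a"])) == [
    {"a": 0}, {"a": 1}, {"a": 2}, {"a": 3},
]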
Example #5
def _paths_to_cats(paths, file_scheme):
    """
    Extract categorical fields and labels from hive- or drill-style paths.
    FixMe: This has been pasted from https://github.com/dask/fastparquet/pull/471
    Use fastparquet.api.paths_to_cats from fastparquet>0.3.2 instead.

    Parameters
    ----------
    paths : Iterable[str]
        File paths relative to root.
    file_scheme : str
        Partitioning scheme; ``"simple"``, ``"flat"`` and ``"other"`` yield no
        categories, ``"hive"`` expects ``key=value`` path segments, and anything
        else is treated as drill-style (positional ``dir0``, ``dir1``, ...).

    Returns
    -------
    cats : OrderedDict[str, List[Any]]
        Mapping from field names to their values.
    """
    if file_scheme in ["simple", "flat", "other"]:
        return {}

    cats = OrderedDict()
    raw_cats = OrderedDict()
    s = ex_from_sep("/")
    paths = toolz.unique(paths)
    if file_scheme == "hive":
        partitions = toolz.unique(
            (k, v) for path in paths for k, v in s.findall(path))
        for key, val in partitions:
            cats.setdefault(key, set()).add(val_to_num(val))
            raw_cats.setdefault(key, set()).add(val)
    else:
        i_val = toolz.unique((i, val) for path in paths
                             for i, val in enumerate(path.split("/")[:-1]))
        for i, val in i_val:
            key = "dir%i" % i
            cats.setdefault(key, set()).add(val_to_num(val))
            raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw_cats[key]:
                conflicts_by_value.setdefault(val_to_num(raw_val),
                                              set()).add(raw_val)
            conflicts = [
                c for k in conflicts_by_value.values() if len(k) > 1 for c in k
            ]
            raise ValueError("Partition names map to the same value: %s" %
                             conflicts)
        vals_by_type = groupby_types(v)

        # Check that all partition names map to the same type after transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different types, e.g. %s"
                % examples)

    cats = OrderedDict([(key, list(v)) for key, v in cats.items()])
    return cats
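
A rough sketch of the hive branch above, with a plain regular expression standing in for fastparquet's ``ex_from_sep("/")`` (an assumption, not the real helper):

import re
import toolz

hive_pair = re.compile(r"([^/=]+)=([^/]+)")  # stand-in for ex_from_sep("/")
paths = [
    "year=2020/month=1/part-0.parquet",
    "year=2020/month=2/part-0.parquet",
]
partitions = list(toolz.unique(
    (k, v) for path in paths for k, v in hive_pair.findall(path)
))
assert partitions == [("year", "2020"), ("month", "1"), ("month", "2")]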
Example #6
def _find_unique(series):
    # Closure over `self`: drop missing values and, for columns whose cells
    # hold iterables, flatten them before deduplicating.
    values = series.dropna()
    if series.name in self.columns_with_iterables:
        values = tlz.concat(values)
    return list(tlz.unique(values))
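
The same flatten-then-deduplicate step in isolation; the pandas column of list-valued cells is illustrative.

import pandas as pd
import toolz as tlz

tags = pd.Series([["a", "b"], ["b", "c"], None], name="tags").dropna()
assert list(tlz.unique(tlz.concat(tags))) == ["a", "b", "c"]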
Example #7
def plot_cache(
    results, dsk, start_time, metric_name, palette="Viridis", label_size=60, **kwargs
):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of CacheProfiler.results
    dsk : dict
        The dask graph being profiled.
    start_time : float
        Start time of the profile.
    metric_name : string
        Metric used to measure cache size
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size : int, optional
        Maximum size of output labels in plot, defaults to 60.
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,wheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update(**kwargs)

    if results:
        starts, ends = list(zip(*results))[3:]
        tics = sorted(unique(starts + ends))
        groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results)
        data = {}
        for k, vals in groups.items():
            cnts = dict.fromkeys(tics, 0)
            for v in vals:
                cnts[v.cache_time] += v.metric
                cnts[v.free_time] -= v.metric
            data[k] = [0] + list(accumulate(add, pluck(1, sorted(cnts.items()))))

        tics = [0] + [i - start_time for i in tics]
        p = bp.figure(x_range=[0, max(tics)], **defaults)

        for (key, val), color in zip(data.items(), get_colors(palette, data.keys())):
            p.line(
                "x",
                "y",
                line_color=color,
                line_width=3,
                source=bp.ColumnDataSource(
                    {"x": tics, "y": val, "label": [key for i in val]}
                ),
            )

    else:
        p = bp.figure(y_range=[0, 10], x_range=[0, 10], **defaults)
    p.yaxis.axis_label = "Cache Size ({0})".format(metric_name)
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span>
    </div>
    """
    return p
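
A hedged end-to-end sketch: in practice the ``results`` sequence comes from dask's ``CacheProfiler``, whose ``visualize()`` method renders through this helper, so calling ``plot_cache`` by hand is rarely needed.

import dask.array as da
from dask.diagnostics import CacheProfiler

x = da.ones((1000, 1000), chunks=(100, 100))
with CacheProfiler() as cprof:
    x.sum().compute()

# cprof.results is the `results` sequence documented above;
# cprof.visualize() plots it with the bokeh machinery shown here.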
Example #8
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`.
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions,
                                                     upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of
        # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA.
        # If this happens, we hint the user about eliminating nulls beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column", f"`.dropna(subset=['{partition_col.name}'])`")
                if any(partition_col._name == df[c]._name for c in df)
                else ("series", "`.loc[series[~series.isna()]]`")
            )
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}.") from e
        # For numeric types there shouldn't be problems with nulls, so we re-raise this particular TypeError as-is
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes
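
A small numeric illustration (inputs assumed) of the ``np.interp`` resampling step above: it thins ``n`` quantile boundaries down to ``npartitions + 1`` evenly spaced ones.

import numpy as np

divisions = [0, 10, 20, 30, 40]  # n = 5 quantile boundaries
npartitions = 2
n = len(divisions)
resampled = np.interp(
    x=np.linspace(0, n - 1, npartitions + 1),
    xp=np.linspace(0, n - 1, n),
    fp=divisions,
).tolist()
assert resampled == [0.0, 20.0, 40.0]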