def task_label(task):
    """Label for a task on a dot graph.

    Examples
    --------
    >>> from operator import add
    >>> task_label((add, 1, 2))
    'add'
    >>> task_label((add, (add, 1, 2), 3))
    'add(...)'
    """
    func = task[0]
    if func is apply:
        func = task[1]
    if hasattr(func, "funcs"):
        if len(func.funcs) > 1:
            return f"{funcname(func.funcs[0])}(...)"
        else:
            head = funcname(func.funcs[0])
    else:
        head = funcname(func)
    if any(has_sub_tasks(i) for i in task[1:]):
        return f"{head}(...)"
    else:
        return head
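# A quick sketch of the unwrapping above: `apply`-wrapped tasks report the
# name of the wrapped callable. Assumptions: `task_label` lives in `dask.dot`
# (importing that module needs the optional graphviz dependency) and `apply`
# is importable from `dask.utils` (older releases exposed it elsewhere).
from operator import add

from dask.dot import task_label
from dask.utils import apply

# The (apply, func, args, kwargs) form is unwrapped before naming.
assert task_label((apply, add, (1, 2), {})) == "add"
# Nested tasks are elided, as in the doctest above.
assert task_label((add, (add, 1, 2), 3)) == "add(...)"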
def test_funcname_toolz():
    @curry
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(foo(1)) == "foo"
def test_funcname_toolz(): toolz = pytest.importorskip("toolz") @toolz.curry def foo(a, b, c): pass assert funcname(foo) == "foo" assert funcname(foo(1)) == "foo"
def test_funcname_multipledispatch(): md = pytest.importorskip("multipledispatch") @md.dispatch(int, int, int) def foo(a, b, c): pass assert funcname(foo) == "foo" assert funcname(functools.partial(foo, a=1)) == "foo"
def test_funcname_numpy_vectorize():
    np = pytest.importorskip("numpy")

    vfunc = np.vectorize(int)
    assert funcname(vfunc) == "vectorize_int"

    # Regression test for https://github.com/pydata/xarray/issues/3303
    # Partial functions don't have a __name__ attribute
    func = functools.partial(np.add, out=None)
    vfunc = np.vectorize(func)
    assert funcname(vfunc) == "vectorize_add"
def _funcname(x):
    try:
        if isinstance(x, BatchedCalls):
            x = x.items[0][0]
    except Exception:
        pass
    return funcname(x)
def _get_worker_plugin_name(plugin) -> str:
    """Returns the worker plugin name. If plugin has no name attribute
    a random name is used."""
    if hasattr(plugin, "name"):
        return plugin.name
    else:
        return funcname(type(plugin)) + "-" + str(uuid.uuid4())
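# A minimal sketch of both branches of the helper above; the plugin classes
# here are hypothetical stand-ins, and `funcname`/`uuid` are assumed to be in
# scope exactly as the helper requires.
import uuid

from dask.utils import funcname


class NamedPlugin:
    name = "my-plugin"  # plugin that provides its own name


class AnonymousPlugin:  # plugin with no `name` attribute
    pass


assert _get_worker_plugin_name(NamedPlugin()) == "my-plugin"
# Anonymous plugins get "<ClassName>-<uuid4>" so registrations don't collide.
assert _get_worker_plugin_name(AnonymousPlugin()).startswith("AnonymousPlugin-")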
def __init__(self, func, nin, nout):
    self._ufunc = np.frompyfunc(func, nin, nout)
    self._func = func
    self.nin = nin
    self.nout = nout
    self._name = funcname(func)
    self.__name__ = "frompyfunc-%s" % self._name
def _parse_wrap_args(func, args, kwargs, shape):
    if isinstance(shape, np.ndarray):
        shape = shape.tolist()

    if not isinstance(shape, (tuple, list)):
        shape = (shape,)

    name = kwargs.pop("name", None)
    chunks = kwargs.pop("chunks", "auto")

    dtype = kwargs.pop("dtype", None)
    if dtype is None:
        dtype = func(shape, *args, **kwargs).dtype
    dtype = np.dtype(dtype)

    chunks = normalize_chunks(chunks, shape, dtype=dtype)

    name = name or funcname(func) + "-" + tokenize(
        func, shape, chunks, dtype, args, kwargs
    )

    return {
        "shape": shape,
        "dtype": dtype,
        "kwargs": kwargs,
        "chunks": chunks,
        "name": name,
    }
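# Illustrative check of the naming scheme built above. The exact prefix is an
# assumption: it depends on the dask version and on which wrap path a given
# creation routine takes (newer releases route through `*_like` wrappers).
import dask.array as da

x = da.ones((4, 4), chunks=2)
# The collection name embeds funcname(...) plus a deterministic token,
# e.g. something like 'ones-<token>'.
print(x.name)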
def __init__(
    self,
    func,
    output_indices,
    indices,
    numblocks,
    feed_index=0,
    axis=None,
):
    self.func = func
    self.output_indices = tuple(output_indices)
    self.indices = tuple(
        (name, tuple(ind) if ind is not None else ind) for name, ind in indices
    )
    self.numblocks = numblocks
    if axis is None:
        raise ValueError("axis not set")
    if axis in self.output_indices:
        raise ValueError("axis in output_indices")
    self.feed_index = feed_index
    self.axis = axis

    token = tokenize(
        self.func,
        self.output_indices,
        self.indices,
        self.numblocks,
        self.feed_index,
        self.axis,
    )
    self.func_name = funcname(self.func)
    self.name = "-".join((self.func_name, token))
def _funcname(x):
    try:
        if isinstance(x, list):
            x = x[0][0]
    except Exception:
        pass
    return funcname(x)
def elemwise(op, *args, **kwargs):
    # See also da.core.elemwise. Note: dask seems to be able to convert Python
    # and numpy objects in this function, thus supporting operations between
    # dask objects and others. This would be useful for us as well.

    # Do not support mismatching chunking for now.
    n_chunk = None
    slice_dim = None
    for arg in args:
        if isinstance(arg, DatasetCollection):
            if n_chunk is not None:
                assert n_chunk == arg.n_chunk
                assert slice_dim == arg.slice_dim
            else:
                n_chunk = arg.n_chunk
                slice_dim = arg.slice_dim

    out = '{}-{}'.format(funcname(op), tokenize(op, *args))
    out_ind = (0,)  # Handling only 1D chunking here, so everything is (0,).
    arginds = list(
        (a, (0,) if isinstance(a, DatasetCollection) else None) for a in args
    )
    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}
    argindsstr = list(
        concat([(a if ind is None else a.name, ind) for a, ind in arginds])
    )
    dsk = top(op, out, out_ind, *argindsstr, numblocks=numblocks, **kwargs)
    dsks = [a.dask for a, ind in arginds if ind is not None]
    return DatasetCollection(
        sharedict.merge((out, dsk), *dsks), out, n_chunk, slice_dim
    )
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop("dask_key_name", None)
    pure = kwargs.pop("pure", pure)

    if dask_key_name is None:
        name = "{}-{}".format(
            funcname(func),
            tokenize(func_token, *args, pure=pure, **kwargs),
        )
    else:
        name = dask_key_name

    args2, collections = unzip(map(unpack_collections, args), 2)
    collections = list(concat(collections))

    if kwargs:
        dask_kwargs, collections2 = unpack_collections(kwargs)
        collections.extend(collections2)
        task = (apply, func, list(args2), dask_kwargs)
    else:
        task = (func,) + args2

    graph = HighLevelGraph.from_collections(
        name, {name: task}, dependencies=collections
    )
    nout = nout if nout is not None else None
    return Delayed(name, graph, length=nout)
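# The naming scheme above is observable on any `dask.delayed` call; a small
# sketch (the token suffix is deterministic but version-dependent).
import dask


@dask.delayed
def inc(x):
    return x + 1


# Keys are '<funcname>-<token>' unless dask_key_name overrides them.
assert inc(1).key.startswith("inc-")
assert inc(1, dask_key_name="my-key").key == "my-key"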
def test_dont_concatenate_single_chunks(shape, chunks):
    x = da.ones(shape, chunks=shape)
    y = x.rechunk(chunks)
    dsk = dict(y.dask)
    assert not any(
        funcname(task[0]).startswith("concat")
        for task in dsk.values()
        if dask.istask(task)
    )
def _emulate(func, *args, **kwargs):
    """
    Apply a function using args / kwargs. If arguments contain dd.DataFrame /
    dd.Series, the internal cache (``_meta``) is used for the calculation.
    """
    with raise_on_meta_error(funcname(func)):
        return func(*_extract_meta(args), **_extract_meta(kwargs))
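# The emulation trick above is what lets `map_partitions` infer output
# metadata without computing anything; a small sketch of the visible effect.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=2)

# The lambda runs once against the empty `_meta` frame, so the new column's
# dtype is known eagerly, before any real partition is computed.
out = ddf.map_partitions(lambda df: df.assign(y=df.x * 2.0))
print(out.dtypes)  # x: int64, y: float64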
def test_funcname_long():
    def a_long_function_name_11111111111111111111111111111111111111111111111():
        pass

    result = funcname(
        a_long_function_name_11111111111111111111111111111111111111111111111
    )
    assert "a_long_function_name" in result
    assert len(result) < 60
def map_partitions(self, func, *args, **kwargs):
    dtype = kwargs.pop('dtype', None)
    shape = kwargs.pop('shape', None)

    token = tokenize(self, func, args, kwargs, dtype, shape)
    new = '{0}-{1}'.format(funcname(func), token)
    old = self.name

    args = list(args)
    dsk = dict(
        ((new, i), (apply, func, [(old, i)] + args, kwargs))
        for i in range(self.npartitions)
    )
    dsk.update(self.dask)

    return Matrix(dsk, new, self.npartitions, dtype, shape)
def map_partitions(self, func, *args, **kwargs):
    example = func(self._example, *args, **kwargs)
    name = funcname(func) + '-' + tokenize(self, func, *args, **kwargs)
    if not args and not kwargs:
        dsk = {
            (name, i): (func, key)
            for i, key in enumerate(self.__dask_keys__())
        }
    else:
        dsk = {
            (name, i): (apply, func, list((key,) + args), kwargs)
            for i, key in enumerate(self.__dask_keys__())
        }
    if isinstance(example, gpd.base.GeoPandasBase):
        regions = self._regions
    else:
        regions = [None] * (self.npartitions + 1)
    return typeof(example)(merge(dsk, self.dask), name, example, regions)
def map_partitions(func, *args, **kwargs):
    """Apply Python function on each DataFrame partition.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    args, kwargs :
        Arguments and keywords to pass to the function. At least one of the
        args should be a dask_gdf object.
    """
    meta = kwargs.pop('meta', None)
    if meta is not None:
        meta = make_meta(meta)

    if 'token' in kwargs:
        name = kwargs.pop('token')
        token = tokenize(meta, *args, **kwargs)
    else:
        name = funcname(func)
        token = tokenize(func, meta, *args, **kwargs)
    name = '{0}-{1}'.format(name, token)

    args = align_partitions(args)

    if meta is None:
        meta = _emulate(func, *args, **kwargs)
        meta = make_meta(meta)

    if all(isinstance(arg, Scalar) for arg in args):
        dask = {
            (name, 0): (apply, func, (tuple, [(x._name, 0) for x in args]), kwargs)
        }
        return Scalar(merge(dask, *[x.dask for x in args]), name, meta)

    dfs = [df for df in args if isinstance(df, _Frame)]
    dsk = {}
    for i in range(dfs[0].npartitions):
        values = [
            (x._name, i if isinstance(x, _Frame) else 0)
            if isinstance(x, (_Frame, Scalar))
            else x
            for x in args
        ]
        dsk[(name, i)] = (apply, func, values, kwargs)

    dasks = [arg.dask for arg in args if isinstance(arg, (_Frame, Scalar))]
    return new_dd_object(merge(dsk, *dasks), name, meta, args[0].divisions)
def test_funcname():
    def foo(a, b, c):
        pass

    assert funcname(foo) == 'foo'
    assert funcname(functools.partial(foo, a=1)) == 'foo'
    assert funcname(M.sum) == 'sum'
    assert funcname(lambda: 1) == 'lambda'

    class Foo(object):
        pass

    assert funcname(Foo) == 'Foo'
    assert 'Foo' in funcname(Foo())
def test_funcname(): def foo(a, b, c): pass assert funcname(foo) == "foo" assert funcname(functools.partial(foo, a=1)) == "foo" assert funcname(M.sum) == "sum" assert funcname(lambda: 1) == "lambda" class Foo: pass assert funcname(Foo) == "Foo" assert "Foo" in funcname(Foo())
def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to
        ``aggregate``. Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``,
        and ``combine``.
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(
        arg.npartitions for arg in args if isinstance(arg, _Frame)
    )
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {
            (a, 0, i): (chunk, key)
            for i, key in enumerate(args[0].__dask_keys__())
        }
    else:
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(args[0].npartitions)
        }

    # Combine
    b = '{0}-combine-{1}'.format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key)
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = make_meta(meta)

    for arg in args:
        if isinstance(arg, _Frame):
            dsk.update(arg.dask)

    return new_dd_object(dsk, b, meta, (None, None))
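# For reference, the public entry point to this machinery is
# `Series.reduction` / `DataFrame.reduction`; a quick sketch of a
# tree-reduced count (split_every=4 forces intermediate combine layers).
import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(range(100)), npartitions=10)

# chunk runs per partition; aggregate sees the concatenated chunk results.
result = s.reduction(
    chunk=lambda part: part.count(),
    aggregate=lambda parts: parts.sum(),
    split_every=4,
)
assert result.compute() == 100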
def __repr__(self):
    return '%s(%s)' % (
        funcname(self.func),
        ', '.join(map(repr, self.children))
        + (
            ', ' + ', '.join('%s=%s' % (k, v) for k, v in self.kwargs.items())
            if self.kwargs
            else ''
        ),
    )
def test_name():
    assert funcname(sizeof) == 'sizeof'
def map_overlap(
    func,
    df,
    before,
    after,
    *args,
    meta=no_default,
    enforce_metadata=True,
    transform_divisions=True,
    align_dataframes=True,
    **kwargs,
):
    """Apply a function to each partition, sharing rows with adjacent partitions.

    Parameters
    ----------
    func : function
        The function applied to each partition. If this function accepts
        the special ``partition_info`` keyword argument, it will receive
        information on the partition's relative location within the
        dataframe.
    df: dd.DataFrame, dd.Series
    args, kwargs :
        Positional and keyword arguments to pass to the function.
        Positional arguments are computed on a per-partition basis, while
        keyword arguments are shared across all partitions. The partition
        itself will be the first positional argument, with all other
        arguments passed *after*. Arguments can be ``Scalar``, ``Delayed``,
        or regular Python objects. DataFrame-like args (both dask and
        pandas) will be repartitioned to align (if necessary) before
        applying the function; see ``align_dataframes`` to control this
        behavior.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition, and will
        raise an error if this doesn't work or types don't match.
    before : int or timedelta
        The rows to prepend to partition ``i`` from the end of
        partition ``i - 1``.
    after : int or timedelta
        The rows to append to partition ``i`` from the beginning
        of partition ``i + 1``.
    transform_divisions : bool, default True
        Whether to apply the function onto the divisions and apply those
        transformed divisions to the output.
    align_dataframes : bool, default True
        Whether to repartition DataFrame- or Series-like args
        (both dask and pandas) so their divisions align before applying
        the function. This requires all inputs to have known divisions.
        Single-partition inputs will be split into multiple partitions.

        If False, all inputs must have either the same number of partitions
        or a single partition. Single-partition inputs will be broadcast to
        every partition of multi-partition inputs.
    $META

    See Also
    --------
    dd.DataFrame.map_overlap
    """
    args = (df,) + args

    dfs = [df for df in args if isinstance(df, _Frame)]

    if isinstance(before, datetime.timedelta) or isinstance(after, datetime.timedelta):
        if not is_datetime64_any_dtype(dfs[0].index._meta_nonempty.inferred_type):
            raise TypeError(
                "Must have a `DatetimeIndex` when using string offset "
                "for `before` and `after`"
            )
    else:
        if not (
            isinstance(before, Integral)
            and before >= 0
            and isinstance(after, Integral)
            and after >= 0
        ):
            raise ValueError("before and after must be positive integers")

    name = kwargs.pop("token", None)
    parent_meta = kwargs.pop("parent_meta", None)

    assert callable(func)
    if name is not None:
        token = tokenize(meta, before, after, *args, **kwargs)
    else:
        name = "overlap-" + funcname(func)
        token = tokenize(func, meta, before, after, *args, **kwargs)
    name = f"{name}-{token}"

    if align_dataframes:
        args = _maybe_from_pandas(args)
        try:
            args = _maybe_align_partitions(args)
        except ValueError as e:
            raise ValueError(
                f"{e}. If you don't want the partitions to be aligned, and are "
                "calling `map_overlap` directly, pass `align_dataframes=False`."
            ) from e

    meta = _get_meta_map_partitions(args, dfs, func, kwargs, meta, parent_meta)

    if all(isinstance(arg, Scalar) for arg in args):
        layer = {
            (name, 0): (
                apply,
                func,
                (tuple, [(arg._name, 0) for arg in args]),
                kwargs,
            )
        }
        graph = HighLevelGraph.from_collections(name, layer, dependencies=args)
        return Scalar(graph, name, meta)

    args2 = []
    dependencies = []

    divisions = _get_divisions_map_partitions(
        align_dataframes, transform_divisions, dfs, func, args, kwargs
    )

    def _handle_frame_argument(arg):
        dsk = {}

        prevs_parts_dsk, prevs = _get_previous_partitions(arg, before)
        dsk.update(prevs_parts_dsk)

        nexts_parts_dsk, nexts = _get_nexts_partitions(arg, after)
        dsk.update(nexts_parts_dsk)

        name_a = "overlap-concat-" + tokenize(arg)
        for i, (prev, current, next) in enumerate(
            zip(prevs, arg.__dask_keys__(), nexts)
        ):
            key = (name_a, i)
            dsk[key] = (_combined_parts, prev, current, next, before, after)

        graph = HighLevelGraph.from_collections(name_a, dsk, dependencies=[arg])
        return new_dd_object(graph, name_a, meta, divisions)

    for arg in args:
        if isinstance(arg, _Frame):
            arg = _handle_frame_argument(arg)
            args2.append(arg)
            dependencies.append(arg)
            continue
        arg = normalize_arg(arg)
        arg2, collections = unpack_collections(arg)
        if collections:
            args2.append(arg2)
            dependencies.extend(collections)
        else:
            args2.append(arg)

    kwargs3 = {}
    simple = True
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs3[k] = v
        if collections:
            simple = False

    if has_keyword(func, "partition_info"):
        partition_info = {
            (i,): {"number": i, "division": division}
            for i, division in enumerate(divisions[:-1])
        }

        args2.insert(0, BlockwiseDepDict(partition_info))
        orig_func = func

        def func(partition_info, *args, **kwargs):
            return orig_func(*args, **kwargs, partition_info=partition_info)

    if enforce_metadata:
        dsk = partitionwise_graph(
            apply_and_enforce,
            name,
            func,
            before,
            after,
            *args2,
            dependencies=dependencies,
            _func=overlap_chunk,
            _meta=meta,
            **kwargs3,
        )
    else:
        kwargs4 = kwargs if simple else kwargs3
        dsk = partitionwise_graph(
            overlap_chunk,
            name,
            func,
            before,
            after,
            *args2,
            **kwargs4,
            dependencies=dependencies,
        )

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=dependencies)
    return new_dd_object(graph, name, meta, divisions)
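# A minimal usage sketch via the collection method, which routes through the
# function above: sharing one trailing row with each partition's successor
# lets a 2-wide rolling window see across partition boundaries.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)

out = ddf.map_overlap(lambda d: d.rolling(2).sum(), before=1, after=0)
print(out.compute())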
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables must
        be the same length. This length determines the number of partitions
        in the output collection (only one element of each iterable will be
        passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For the string 'sorted', the delayed values will be computed to find
        the index values. Assumes that the indexes are mutually sorted.
        If None, index information will not be used.
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work or types don't match.
    **kwargs:
        Key-word arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                        name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function to any
    number of iterable objects, it can be a very convenient means of
    implementing functionality that may be missing from other
    DataFrame-creation methods. For example, if you happen to have a priori
    knowledge about the number of rows in each of the files in a dataset,
    you can generate a DataFrame collection with a global RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset + len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """

    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True"
            )
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(
        func, meta, inputs, args, divisions, enforce_metadata, **kwargs
    )
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = func.columns if isinstance(func, DataFrameIOFunction) else None

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing the "
                "(name, dtype) of the columns."
            )
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
def test_funcname_numpy_vectorize(): np = pytest.importorskip("numpy") func = np.vectorize(int) assert funcname(func) == "vectorize_int"
def blockwise(
    func,
    out_ind,
    *args,
    name=None,
    token=None,
    dtype=None,
    adjust_chunks=None,
    new_axes=None,
    align_arrays=True,
    concatenate=None,
    meta=None,
    **kwargs,
):
    """Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation. The ``blockwise`` function applies an
    in-memory function across multiple blocks of multiple inputs in a variety
    of ways. Many dask.array operations are special cases of blockwise
    including elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths
    align_arrays: bool
        Whether or not to align chunks along equally sized dimensions when
        multiple arrays are provided. This allows for larger chunks in some
        arrays to be broken into smaller ones that match chunk sizes in other
        arrays such that they are compatible for block function mapping. If
        this is false, then an error will be thrown if arrays do not already
        have the same number of blocks in each dimension.

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> import operator, numpy as np, dask.array as da
    >>> x = da.from_array([[1, 2],
    ...                    [3, 4]], chunks=(1, 2))
    >>> y = da.from_array([[10, 20],
    ...                    [0, 0]])
    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')
    >>> z.compute()
    array([[11, 22],
           [ 3,  4]])

    Outer product multiplying a by b, two 1-d vectors

    >>> a = da.from_array([0, 1, 2], chunks=1)
    >>> b = da.from_array([10, 50, 100], chunks=1)
    >>> z = blockwise(np.outer, 'ij', a, 'i', b, 'j', dtype='f8')
    >>> z.compute()
    array([[  0,   0,   0],
           [ 10,  50, 100],
           [ 20, 100, 200]])

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)
    >>> z.compute()
    array([[1, 3],
           [2, 4]])

    The transpose case above is illustrative because it does transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')
    >>> z.compute()
    array([[11,  2],
           [23,  4]])

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.) In the case of a contraction the
    passed function should expect an iterable of blocks on any array that
    holds that index. To receive arrays concatenated along contracted
    dimensions instead pass ``concatenate=True``.

    Inner product multiplying a by b, two 1-d vectors

    >>> def sequence_dot(a_blocks, b_blocks):
    ...     result = 0
    ...     for a, b in zip(a_blocks, b_blocks):
    ...         result += a.dot(b)
    ...     return result

    >>> z = blockwise(sequence_dot, '', a, 'i', b, 'i', dtype='f8')
    >>> z.compute()
    250

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension. New dimensions will always be in a
    single chunk.

    >>> def f(a):
    ...     return a[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': 5}, dtype=a.dtype)

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes. This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', a, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)
    >>> z.chunks
    ((1, 1, 1), (5, 5))

    If the applied function changes the size of each chunk you can specify
    this with a ``adjust_chunks={...}`` dictionary holding a function for each
    index that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)
    >>> y.chunks
    ((2, 2), (2,))

    Include literals by indexing with None

    >>> z = blockwise(operator.add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)
    >>> z.compute()
    array([[1235, 1236],
           [1237, 1238]])
    """
    out = name
    new_axes = new_axes or {}

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError(
            "Repeated elements not allowed in output index",
            [k for k, v in toolz.frequencies(out_ind).items() if v > 1],
        )
    new = (
        set(out_ind)
        - {a for arg in args[1::2] if arg is not None for a in arg}
        - set(new_axes or ())
    )
    if new:
        raise ValueError("Unknown dimension", new)

    from dask.array.core import normalize_arg, unify_chunks

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        arginds = [(a, i) for (a, i) in toolz.partition(2, args) if i is not None]
        chunkss = {}
        # For each dimension, use the input chunking that has the most blocks;
        # this will ensure that broadcasting works as expected, and in
        # particular the number of blocks should be correct if the inputs are
        # consistent.
        for arg, ind in arginds:
            for c, i in zip(arg.chunks, ind):
                if i not in chunkss or len(c) > len(chunkss[i]):
                    chunkss[i] = c
        arrays = args[::2]

    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v,)
        chunkss[k] = v

    arginds = zip(arrays, args[1::2])
    numblocks = {}

    dependencies = []
    arrays = []

    # Normalize arguments
    argindsstr = []

    for arg, ind in arginds:
        if ind is None:
            arg = normalize_arg(arg)
            arg, collections = unpack_collections(arg)
            dependencies.extend(collections)
        else:
            if (
                hasattr(arg, "ndim")
                and hasattr(ind, "__len__")
                and arg.ndim != len(ind)
            ):
                raise ValueError(
                    "Index string %s does not match array dimension %d"
                    % (ind, arg.ndim)
                )
            numblocks[arg.name] = arg.numblocks
            arrays.append(arg)
            arg = arg.name
        argindsstr.extend((arg, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = "{}-{}".format(
            token or utils.funcname(func).strip("_"),
            base.tokenize(func, out_ind, argindsstr, dtype, **kwargs),
        )

    graph = core_blockwise(
        func,
        out,
        out_ind,
        *argindsstr,
        numblocks=numblocks,
        dependencies=dependencies,
        new_axes=new_axes,
        concatenate=concatenate,
        **kwargs2,
    )
    graph = HighLevelGraph.from_collections(
        out, graph, dependencies=arrays + dependencies
    )

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    if len(adjust_chunks[ind]) != len(chunks[i]):
                        raise ValueError(
                            f"Dimension {i} has {len(chunks[i])} blocks, adjust_chunks "
                            f"specified with {len(adjust_chunks[ind])} blocks"
                        )
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple"
                    )
    chunks = tuple(chunks)

    if meta is None:
        from dask.array.utils import compute_meta

        meta = compute_meta(func, dtype, *args[::2], **kwargs)
    return new_da_object(graph, out, chunks, meta=meta, dtype=dtype)
def map_overlap(func, df, before, after, *args, **kwargs):
    """Apply a function to each partition, sharing rows with adjacent partitions.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    df : dd.DataFrame, dd.Series
    before : int or timedelta
        The rows to prepend to partition ``i`` from the end of
        partition ``i - 1``.
    after : int or timedelta
        The rows to append to partition ``i`` from the beginning
        of partition ``i + 1``.
    args, kwargs :
        Arguments and keywords to pass to the function. The partition will
        be the first argument, and these will be passed *after*.

    See Also
    --------
    dd.DataFrame.map_overlap
    """
    if isinstance(before, datetime.timedelta) or isinstance(after, datetime.timedelta):
        if not is_datetime64_any_dtype(df.index._meta_nonempty.inferred_type):
            raise TypeError(
                "Must have a `DatetimeIndex` when using string offset "
                "for `before` and `after`"
            )
    else:
        if not (
            isinstance(before, Integral)
            and before >= 0
            and isinstance(after, Integral)
            and after >= 0
        ):
            raise ValueError("before and after must be positive integers")

    if "token" in kwargs:
        func_name = kwargs.pop("token")
        token = tokenize(df, before, after, *args, **kwargs)
    else:
        func_name = "overlap-" + funcname(func)
        token = tokenize(func, df, before, after, *args, **kwargs)

    if "meta" in kwargs:
        meta = kwargs.pop("meta")
    else:
        meta = _emulate(func, df, *args, **kwargs)
    meta = make_meta(meta, index=df._meta.index, parent_meta=df._meta)

    name = f"{func_name}-{token}"
    name_a = "overlap-prepend-" + tokenize(df, before)
    name_b = "overlap-append-" + tokenize(df, after)
    df_name = df._name

    dsk = {}

    timedelta_partition_message = (
        "Partition size is less than specified window. "
        "Try using ``df.repartition`` to increase the partition size"
    )

    if before and isinstance(before, Integral):
        prevs = [None]
        for i in range(df.npartitions - 1):
            key = (name_a, i)
            dsk[key] = (M.tail, (df_name, i), before)
            prevs.append(key)
    elif isinstance(before, datetime.timedelta):
        # Assumes monotonic (increasing?) index
        divs = pd.Series(df.divisions)
        deltas = divs.diff().iloc[1:-1]

        # In the first case window-size is larger than at least one partition,
        # thus it is necessary to calculate how many partitions must be used
        # for each rolling task. Otherwise, these calculations can be skipped
        # (faster)
        if (before > deltas).any():
            pt_z = divs[0]
            prevs = [None]
            for i in range(df.npartitions - 1):
                # Select all indexes of relevant partitions between the
                # current partition and the partition with the highest
                # division outside the rolling window (before)
                pt_i = divs[i + 1]

                # lower-bound the search to the first division
                lb = max(pt_i - before, pt_z)

                first, j = divs[i], i
                while first > lb and j > 0:
                    first = first - deltas[j]
                    j = j - 1

                key = (name_a, i)
                dsk[key] = (
                    _tail_timedelta,
                    [(df_name, k) for k in range(j, i + 1)],
                    (df_name, i + 1),
                    before,
                )
                prevs.append(key)
        else:
            prevs = [None]
            for i in range(df.npartitions - 1):
                key = (name_a, i)
                dsk[key] = (
                    _tail_timedelta,
                    [(df_name, i)],
                    (df_name, i + 1),
                    before,
                )
                prevs.append(key)
    else:
        prevs = [None] * df.npartitions

    if after and isinstance(after, Integral):
        nexts = []
        for i in range(1, df.npartitions):
            key = (name_b, i)
            dsk[key] = (M.head, (df_name, i), after)
            nexts.append(key)
        nexts.append(None)
    elif isinstance(after, datetime.timedelta):
        # TODO: Do we have a use-case for this? Pandas doesn't allow negative
        # rolling windows
        deltas = pd.Series(df.divisions).diff().iloc[1:-1]
        if (after > deltas).any():
            raise ValueError(timedelta_partition_message)

        nexts = []
        for i in range(1, df.npartitions):
            key = (name_b, i)
            dsk[key] = (_head_timedelta, (df_name, i - 0), (df_name, i), after)
            nexts.append(key)
        nexts.append(None)
    else:
        nexts = [None] * df.npartitions

    for i, (prev, current, next) in enumerate(zip(prevs, df.__dask_keys__(), nexts)):
        dsk[(name, i)] = (
            overlap_chunk,
            func,
            prev,
            current,
            next,
            before,
            after,
            args,
            kwargs,
        )

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    return df._constructor(graph, name, meta, df.divisions)