def meth(self, other, *args, **kwargs):
    meta = _emulate(op, self, other)
    return map_partitions(
        op, self, other, meta=meta, enforce_metadata=False, *args, **kwargs
    )
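# Illustrative sketch (an assumption, not part of the snippet above): `meth` closes over
# an operator `op` supplied by an outer factory, so `meta` is inferred once via
# `_emulate` on empty metadata and the real work is deferred to `map_partitions`.
# The factory and class names below are hypothetical.
import operator


def _make_binary_method(op):
    def meth(self, other, *args, **kwargs):
        meta = _emulate(op, self, other)
        return map_partitions(
            op, self, other, meta=meta, enforce_metadata=False, *args, **kwargs
        )

    meth.__name__ = op.__name__
    return meth


# e.g. SomeCollection.__add__ = _make_binary_method(operator.add)  # hypothetical class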
def transform(self, dataset):
    if isinstance(dataset, pd.DataFrame):
        dataset = self.partial_transform(dataset)
        return dataset
    elif isinstance(dataset, dd.DataFrame):
        temp_dir = tempfile.mkdtemp()
        # Get meta beforehand
        meta = _emulate(self.partial_transform, dataset, udf=True)
        self.reset_transform()  # Reset transform before applying to full dataset
        # Apply to full dataset
        (
            dataset.map_partitions(self.partial_transform, meta=meta)
            .to_parquet(temp_dir, compute=False, engine='fastparquet')
            .compute(scheduler='single-threaded')
        )
        return temp_dir
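# Hedged usage sketch (the transformer object and helper name are assumptions): for a
# pandas DataFrame, `transform` returns the transformed frame directly; for a dask
# DataFrame it writes transformed partitions to a temporary parquet directory and
# returns that path, which can be read back lazily.
import pandas as pd
import dask.dataframe as dd


def example_transform(transformer, pdf: pd.DataFrame):
    # In-memory path: the transformed pandas frame comes back as-is.
    transformed_pdf = transformer.transform(pdf)

    # Out-of-core path: a parquet directory path is returned instead.
    ddf = dd.from_pandas(pdf, npartitions=2)
    parquet_dir = transformer.transform(ddf)
    transformed_ddf = dd.read_parquet(parquet_dir, engine="fastparquet")
    return transformed_pdf, transformed_ddf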
def concat_indexed_dataframes(dfs, axis=0, join="outer"):
    """ Concatenate indexed dataframes together along the index """
    if join not in ("inner", "outer"):
        raise ValueError("'join' must be 'inner' or 'outer'")

    from dask.dataframe.core import _emulate

    dummy = _emulate(pd.concat, dfs, axis=axis, join=join)

    dfs = _maybe_from_pandas(dfs)
    dfs2, divisions, parts = align_partitions(*dfs)

    empties = [df._pd for df in dfs]
    parts2 = [
        [df if df is not None else empty for df, empty in zip(part, empties)]
        for part in parts
    ]

    name = "concat-indexed-" + tokenize(join, *dfs)
    dsk = dict(
        ((name, i), (_pdconcat, part, axis, join)) for i, part in enumerate(parts2)
    )

    return _Frame(toolz.merge(dsk, *[df.dask for df in dfs2]), name, dummy, divisions)
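# Minimal usage sketch (an assumption about the surrounding module): both inputs should
# carry known divisions so that `align_partitions` can line their partitions up before
# the per-partition `pd.concat` calls are emitted into the graph.
import pandas as pd
import dask.dataframe as dd


def example_concat():
    left = dd.from_pandas(pd.DataFrame({"a": range(6)}), npartitions=2)
    right = dd.from_pandas(pd.DataFrame({"b": range(6)}), npartitions=3)
    return concat_indexed_dataframes([left, right], axis=1, join="outer")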
def map_overlap(func, df, before, after, *args, **kwargs):
    """Apply a function to each partition, sharing rows with adjacent partitions.

    Parameters
    ----------
    func : function
        Function applied to each partition.
    df : dd.DataFrame, dd.Series
    before : int or timedelta
        The rows to prepend to partition ``i`` from the end of
        partition ``i - 1``.
    after : int or timedelta
        The rows to append to partition ``i`` from the beginning
        of partition ``i + 1``.
    args, kwargs :
        Arguments and keywords to pass to the function. The partition will
        be the first argument, and these will be passed *after*.

    See Also
    --------
    dd.DataFrame.map_overlap
    """
    if isinstance(before, datetime.timedelta) or isinstance(after, datetime.timedelta):
        if not is_datetime64_any_dtype(df.index._meta_nonempty.inferred_type):
            raise TypeError(
                "Must have a `DatetimeIndex` when using string offset "
                "for `before` and `after`"
            )
    else:
        if not (
            isinstance(before, Integral)
            and before >= 0
            and isinstance(after, Integral)
            and after >= 0
        ):
            raise ValueError("before and after must be positive integers")

    if "token" in kwargs:
        func_name = kwargs.pop("token")
        token = tokenize(df, before, after, *args, **kwargs)
    else:
        func_name = "overlap-" + funcname(func)
        token = tokenize(func, df, before, after, *args, **kwargs)

    if "meta" in kwargs:
        meta = kwargs.pop("meta")
    else:
        meta = _emulate(func, df, *args, **kwargs)
    meta = make_meta(meta, index=df._meta.index, parent_meta=df._meta)

    name = f"{func_name}-{token}"
    name_a = "overlap-prepend-" + tokenize(df, before)
    name_b = "overlap-append-" + tokenize(df, after)
    df_name = df._name

    dsk = {}

    timedelta_partition_message = (
        "Partition size is less than specified window. "
        "Try using ``df.repartition`` to increase the partition size"
    )

    if before and isinstance(before, Integral):
        prevs = [None]
        for i in range(df.npartitions - 1):
            key = (name_a, i)
            dsk[key] = (M.tail, (df_name, i), before)
            prevs.append(key)
    elif isinstance(before, datetime.timedelta):
        # Assumes monotonic (increasing?) index
        divs = pd.Series(df.divisions)
        deltas = divs.diff().iloc[1:-1]

        # In the first case window-size is larger than at least one partition, thus it is
        # necessary to calculate how many partitions must be used for each rolling task.
        # Otherwise, these calculations can be skipped (faster)
        if (before > deltas).any():
            pt_z = divs[0]
            prevs = [None]
            for i in range(df.npartitions - 1):
                # Select all indexes of relevant partitions between the current partition and
                # the partition with the highest division outside the rolling window (before)
                pt_i = divs[i + 1]

                # lower-bound the search to the first division
                lb = max(pt_i - before, pt_z)

                first, j = divs[i], i
                while first > lb and j > 0:
                    first = first - deltas[j]
                    j = j - 1

                key = (name_a, i)
                dsk[key] = (
                    _tail_timedelta,
                    [(df_name, k) for k in range(j, i + 1)],
                    (df_name, i + 1),
                    before,
                )
                prevs.append(key)
        else:
            prevs = [None]
            for i in range(df.npartitions - 1):
                key = (name_a, i)
                dsk[key] = (
                    _tail_timedelta,
                    [(df_name, i)],
                    (df_name, i + 1),
                    before,
                )
                prevs.append(key)
    else:
        prevs = [None] * df.npartitions

    if after and isinstance(after, Integral):
        nexts = []
        for i in range(1, df.npartitions):
            key = (name_b, i)
            dsk[key] = (M.head, (df_name, i), after)
            nexts.append(key)
        nexts.append(None)
    elif isinstance(after, datetime.timedelta):
        # TODO: Do we have a use-case for this? Pandas doesn't allow negative rolling windows
        deltas = pd.Series(df.divisions).diff().iloc[1:-1]

        if (after > deltas).any():
            raise ValueError(timedelta_partition_message)

        nexts = []
        for i in range(1, df.npartitions):
            key = (name_b, i)
            dsk[key] = (_head_timedelta, (df_name, i - 0), (df_name, i), after)
            nexts.append(key)
        nexts.append(None)
    else:
        nexts = [None] * df.npartitions

    for i, (prev, current, next) in enumerate(zip(prevs, df.__dask_keys__(), nexts)):
        dsk[(name, i)] = (
            overlap_chunk,
            func,
            prev,
            current,
            next,
            before,
            after,
            args,
            kwargs,
        )

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    return df._constructor(graph, name, meta, df.divisions)
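# Hedged usage sketch (assumption): with a DatetimeIndex, `before` may be given as a
# timedelta so that each partition also sees the trailing rows of its predecessor,
# which is what a time-based rolling computation needs near partition boundaries.
import datetime

import dask.dataframe as dd
import pandas as pd


def example_overlap():
    pdf = pd.DataFrame(
        {"x": range(48)},
        index=pd.date_range("2000-01-01", periods=48, freq="h"),
    )
    ddf = dd.from_pandas(pdf, npartitions=4)

    def rolling_mean(part):
        # Plain pandas rolling on each overlap-extended partition.
        return part.rolling("2h").mean()

    return map_overlap(rolling_mean, ddf, before=datetime.timedelta(hours=2), after=0)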
def apply_concat_apply(args, chunk=None, aggregate=None, combine=None,
                       meta=no_default, token=None, chunk_kwargs=None,
                       aggregate_kwargs=None, combine_kwargs=None,
                       split_every=None, split_out=None, split_out_setup=None,
                       split_out_setup_kwargs=None, **kwargs):
    """Apply a function to blocks, then concat, then apply again

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function concatenated-block -> block
        Function to operate on the concatenated result of chunk
    combine : function concatenated-block -> block, optional
        Function to operate on intermediate concatenated results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to
        ``aggregate``. Default is 8.
    split_out : int, optional
        Number of output partitions. Split occurs after first chunk
        reduction.
    split_out_setup : callable, optional
        If provided, this function is called on each chunk before performing
        the hash-split. It should return a pandas object, where each row
        (excluding the index) is hashed. If not provided, the chunk is hashed
        as is.
    split_out_setup_kwargs : dict, optional
        Keywords for the `split_out_setup` function only.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``,
        and ``combine``.

    Examples
    --------
    >>> def chunk(a_block, b_block):
    ...     pass

    >>> def agg(df):
    ...     pass

    >>> apply_concat_apply([a, b], chunk=chunk, aggregate=agg)  # doctest: +SKIP
    """
    if chunk_kwargs is None:
        chunk_kwargs = dict()
    if aggregate_kwargs is None:
        aggregate_kwargs = dict()
    chunk_kwargs.update(kwargs)
    aggregate_kwargs.update(kwargs)

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        if combine_kwargs is None:
            combine_kwargs = dict()
        combine_kwargs.update(kwargs)

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = set(arg.npartitions for arg in args
                      if isinstance(arg, SparseFrame))
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif split_every < 2 or not isinstance(split_every, int):
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(token or (chunk, aggregate), meta, args, chunk_kwargs,
                         aggregate_kwargs, combine_kwargs, split_every,
                         split_out, split_out_setup, split_out_setup_kwargs)

    # Chunk
    a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key)
    if len(args) == 1 and isinstance(args[0], SparseFrame) and not chunk_kwargs:
        dsk = {(a, 0, i, 0): (chunk, key)
               for i, key in enumerate(args[0].__dask_keys__())}
    else:
        dsk = {(a, 0, i, 0): (apply, chunk,
                              [(x._name, i) if isinstance(x, SparseFrame) else x
                               for x in args],
                              chunk_kwargs)
               for i in range(args[0].npartitions)}

    # Split
    # this splits the blocks (usually) by their index and
    # basically performs a task sort such that the next tree
    # aggregation will result in the desired number of partitions
    # given by the split_out parameter
    if split_out and split_out > 1:
        split_prefix = 'split-%s' % token_key
        shard_prefix = 'shard-%s' % token_key
        for i in range(args[0].npartitions):
            # For now we assume that split_out_setup selects the index
            # as we will only support index groupbys for now. So we can
            # use the function provided by dask.
            dsk[(split_prefix, i)] = (hash_shard, (a, 0, i, 0), split_out,
                                      split_out_setup, split_out_setup_kwargs)
            # At this point we have dictionaries of dataframes. The dictionary keys
            # correspond to the hashed index value. Such that rows with the same index
            # have the same dictionary key.
            # The next line unpacks this dictionaries into pure dataframes again
            # now with the correct dask key for their partition. So at this point
            # we might have shards of a single row in the next step they are combined again.
            for j in range(split_out):
                dsk[(shard_prefix, 0, i, j)] = (getitem, (split_prefix, i), j)
        a = shard_prefix
    else:
        split_out = 1

    # Combine
    b = '{0}-combine-{1}'.format(token or funcname(combine), token_key)
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            for j in range(split_out):
                conc = (sp.SparseFrame.vstack,
                        [(a, depth, i, j) for i in inds])
                # Finally we apply the combine function on the concatenated
                # results. This is usually the same as the aggregate
                # function.
                if combine_kwargs:
                    dsk[(b, depth + 1, part_i, j)] = (apply, combine, [conc],
                                                      combine_kwargs)
                else:
                    dsk[(b, depth + 1, part_i, j)] = (combine, conc)
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    for j in range(split_out):
        b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key)
        conc = (sp.SparseFrame.vstack, [(a, depth, i, j) for i in range(k)])
        if aggregate_kwargs:
            dsk[(b, j)] = (apply, aggregate, [conc], aggregate_kwargs)
        else:
            dsk[(b, j)] = (aggregate, conc)

    if meta is no_default:
        meta_chunk = _emulate(chunk, *args, **chunk_kwargs)
        meta = _emulate(aggregate, sp.SparseFrame.vstack([meta_chunk]),
                        **aggregate_kwargs)

    for arg in args:
        if isinstance(arg, SparseFrame):
            dsk.update(arg.dask)

    divisions = [None] * (split_out + 1)

    return SparseFrame(dsk, b, meta, divisions)
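# Hedged sketch of the chunk/aggregate contract (the helper names are assumptions): a
# reduction is expressed as a per-partition `chunk` step plus an `aggregate` step over
# the vstacked chunk results; `combine` is only needed for intermediate levels of the
# tree-reduction (when npartitions > split_every).
def _chunk_sum(sf):
    # Partial result computed independently on each SparseFrame partition.
    return sf.sum()


def _agg_sum(stacked):
    # Reduce the vstacked partial results to the final answer.
    return stacked.sum()


# total = apply_concat_apply([some_sparse_frame], chunk=_chunk_sum, aggregate=_agg_sum)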
def elemwise(op, *args, **kwargs):
    """Elementwise operation for dask.Sparseframes

    Parameters
    ----------
    op: function
        Function that takes as first parameter the underlying df
    args:
        Contains Dataframes
    kwargs:
        Contains meta.
    """
    meta = kwargs.pop('meta', no_default)

    _name = funcname(op) + '-' + tokenize(op, kwargs, *args)

    # if pd.Series or pd.DataFrame change to dd.DataFrame
    args = _maybe_from_pandas(args)

    # Align DataFrame blocks if divisions are different.
    from .multi import _maybe_align_partitions  # to avoid cyclical import
    args = _maybe_align_partitions(args)

    # extract all dask instances
    dasks = [arg for arg in args
             if isinstance(arg, (SparseFrame, _Frame, Scalar))]
    # extract all dask frames
    dfs = [df for df in dasks if isinstance(df, (_Frame, SparseFrame))]

    # We take divisions from the first dask frame
    divisions = dfs[0].divisions

    _is_broadcastable = partial(is_broadcastable, dfs)
    dfs = list(remove(_is_broadcastable, dfs))

    n = len(divisions) - 1

    other = [(i, arg) for i, arg in enumerate(args)
             if not isinstance(arg, (_Frame, Scalar, SparseFrame))]

    # Get dsks graph tuple keys and adjust the key length of Scalar
    keys = [d.__dask_keys__() * n
            if isinstance(d, Scalar) or _is_broadcastable(d)
            else d.__dask_keys__()
            for d in dasks]

    if other:
        dsk = {(_name, i): (apply, partial_by_order, list(frs),
                            {'function': op, 'other': other})
               for i, frs in enumerate(zip(*keys))}
    else:
        dsk = {(_name, i): (op,) + frs for i, frs in enumerate(zip(*keys))}

    dsk = merge(dsk, *[d.dask for d in dasks])

    if meta is no_default:
        if len(dfs) >= 2 and len(dasks) != len(dfs):
            # should not occur in current funcs
            msg = 'elemwise with 2 or more DataFrames and Scalar is not supported'
            raise NotImplementedError(msg)
        meta = _emulate(op, *args, **kwargs)

    return SparseFrame(dsk, _name, meta, divisions)
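# Hedged usage sketch (assumption): `elemwise` pairs up the blocks of its SparseFrame
# arguments (realigning partitions when divisions differ) and applies `op` to each pair;
# non-dask arguments are forwarded via `partial_by_order`. The wrapper below is hypothetical.
import operator


def add_sparse_frames(sf_left, sf_right):
    # Block-wise addition; meta is inferred via `_emulate` unless passed explicitly.
    return elemwise(operator.add, sf_left, sf_right)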
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables must
        be the same length. This length determines the number of partitions
        in the output collection (only one element of each iterable will be
        passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values. Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work or types don't match.
    **kwargs:
        Key-word arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                           name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function to any
    number of iterable objects, it can be a very convenient means of
    implementing functionality that may be missing from other
    DataFrame-creation methods. For example, if you happen to have a priori
    knowledge about the number of rows in each of the files in a dataset,
    you can generate a DataFrame collection with a global RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset+len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """

    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True"
            )
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(
        func, meta, inputs, args, divisions, enforce_metadata, **kwargs
    )
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = func.columns if isinstance(func, DataFrameIOFunction) else None

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing the "
                "(name, dtype) of the columns."
            )
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
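# Hedged sketch (assumption; the class name is hypothetical and the import path may vary
# across dask versions): an IO callable can opt in to column projection by satisfying the
# `DataFrameIOFunction` protocol, i.e. exposing `columns`, `project_columns`, and
# `__call__`, which lets the graph optimizer prune unused columns from each partition read.
import pandas as pd

from dask.dataframe.io.utils import DataFrameIOFunction


class ParquetReader(DataFrameIOFunction):
    def __init__(self, columns=None):
        self._columns = columns

    @property
    def columns(self):
        # Columns currently selected for projection (None means "all").
        return self._columns

    def project_columns(self, columns):
        # Return a new reader restricted to `columns`.
        return ParquetReader(columns=columns)

    def __call__(self, path):
        return pd.read_parquet(path, columns=self._columns)


# ddf = from_map(ParquetReader(), ["0.parquet", "1.parquet"])  # doctest: +SKIP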