@pytest.mark.parametrize("method", ["disk", "tasks"])
def test_shuffle_empty_partitions(method):
    df = pd.DataFrame({'x': [1, 2, 3] * 10})
    ddf = dd.from_pandas(df, npartitions=3)
    s = shuffle(ddf, ddf.x, npartitions=6, shuffle=method)
    parts = compute_as_if_collection(dd.DataFrame, s.dask, s.__dask_keys__())
    for p in parts:
        assert s.columns == p.columns
def check_and_return(ddfs, dfs, join):
    # Note: `divisions`, `known` and `cat_index` are expected to be defined in
    # the enclosing scope (this helper is nested inside a test).
    sol = concat(dfs, join=join)
    res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
    assert_eq(res, sol)
    if known:
        parts = compute_as_if_collection(
            dd.DataFrame, res.dask, res.__dask_keys__()
        )
        for p in [i.iloc[:0] for i in parts]:
            res._meta == p  # will error if schemas don't align
    assert not cat_index or has_known_categories(res.index) == known
    return res
def rearrange_by_column_disk(df, column, npartitions=None, compute=False):
    """Shuffle using local disk

    See Also
    --------
    rearrange_by_column_tasks:
        Same function, but using tasks rather than partd
        Has a more informative docstring
    """
    if npartitions is None:
        npartitions = df.npartitions

    token = tokenize(df, column, npartitions)
    always_new_token = uuid.uuid1().hex

    p = ("zpartd-" + always_new_token,)
    dsk1 = {p: (maybe_buffered_partd(),)}

    # Partition data on disk
    name = "shuffle-partition-" + always_new_token
    dsk2 = {
        (name, i): (shuffle_group_3, key, column, npartitions, p)
        for i, key in enumerate(df.__dask_keys__())
    }

    dependencies = []
    if compute:
        graph = HighLevelGraph.merge(df.dask, dsk1, dsk2)
        graph = HighLevelGraph.from_collections(name, graph, dependencies=[df])
        keys = [p, sorted(dsk2)]
        pp, values = compute_as_if_collection(DataFrame, graph, keys)
        dsk1 = {p: pp}
        dsk2 = dict(zip(sorted(dsk2), values))
    else:
        dependencies.append(df)

    # Barrier
    barrier_token = "barrier-" + always_new_token
    dsk3 = {barrier_token: (barrier, list(dsk2))}

    # Collect groups
    name = "shuffle-collect-" + token
    dsk4 = {
        (name, i): (collect, p, i, df._meta, barrier_token)
        for i in range(npartitions)
    }

    divisions = (None,) * (npartitions + 1)

    layer = toolz.merge(dsk1, dsk2, dsk3, dsk4)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=dependencies)
    return new_dd_object(graph, name, df._meta, divisions)
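# --- Usage sketch (not from the original source) ----------------------------
# A minimal, hedged example of calling rearrange_by_column_disk directly.
# It assumes the target column already holds the output-partition id
# (0..npartitions-1) for every row, which is what shuffle_group_3 expects;
# in Dask this column is normally assigned upstream by the shuffle machinery.
# The data and column names below are illustrative only.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": range(12)})
pdf["_partitions"] = pdf["x"] % 4        # precomputed output-partition ids
ddf = dd.from_pandas(pdf, npartitions=3)
shuffled = rearrange_by_column_disk(ddf, "_partitions", npartitions=4)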
def test_read_csv_index():
    with filetext(csv_text) as fn:
        f = dd.read_csv(fn, blocksize=20).set_index('amount')
        result = f.compute(scheduler='sync')
        assert result.index.name == 'amount'

        blocks = compute_as_if_collection(
            dd.DataFrame, f.dask, f.__dask_keys__(), scheduler='sync'
        )
        for i, block in enumerate(blocks):
            if i < len(f.divisions) - 2:
                assert (block.index < f.divisions[i + 1]).all()
            if i > 0:
                assert (block.index >= f.divisions[i]).all()

        expected = pd.read_csv(fn).set_index('amount')
        assert_eq(result, expected)
def test_compute_as_if_collection_low_level_task_graph():
    # See https://github.com/dask/dask/pull/7969
    da = pytest.importorskip("dask.array")
    x = da.arange(10)

    # Boolean flag to ensure MyDaskArray.__dask_optimize__ is called
    optimized = False

    class MyDaskArray(da.Array):
        """Dask Array subclass with validation logic in __dask_optimize__"""

        @classmethod
        def __dask_optimize__(cls, dsk, keys, **kwargs):
            # Ensure `compute_as_if_collection` doesn't convert to a low-level task graph
            assert type(dsk) is HighLevelGraph
            nonlocal optimized
            optimized = True
            return super().__dask_optimize__(dsk, keys, **kwargs)

    result = compute_as_if_collection(
        MyDaskArray, x.__dask_graph__(), x.__dask_keys__()
    )[0]
    assert optimized
    da.utils.assert_eq(x, result)
def categorize(df, columns=None, index=None, split_every=None, **kwargs):
    """Convert columns of the DataFrame to category dtype.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.
    """
    meta = df._meta
    if columns is None:
        columns = list(meta.select_dtypes(["object", "category"]).columns)
    elif is_scalar(columns):
        columns = [columns]

    # Filter out known categorical columns
    columns = [
        c
        for c in columns
        if not (is_categorical_dtype(meta[c]) and has_known_categories(meta[c]))
    ]

    if index is not False:
        if is_categorical_dtype(meta.index):
            index = not has_known_categories(meta.index)
        elif index is None:
            index = meta.index.dtype == object

    # Nothing to do
    if not len(columns) and index is False:
        return df

    if split_every is None:
        split_every = 16
    elif split_every is False:
        split_every = df.npartitions
    elif not isinstance(split_every, Integral) or split_every < 2:
        raise ValueError("split_every must be an integer >= 2")

    token = tokenize(df, columns, index, split_every)
    a = "get-categories-chunk-" + token
    dsk = {
        (a, i): (_get_categories, key, columns, index)
        for (i, key) in enumerate(df.__dask_keys__())
    }

    prefix = "get-categories-agg-" + token
    k = df.npartitions
    depth = 0
    while k > split_every:
        b = prefix + str(depth)
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            dsk[(b, part_i)] = (_get_categories_agg, [(a, i) for i in inds])
        k = part_i + 1
        a = b
        depth += 1

    dsk[(prefix, 0)] = (_get_categories_agg, [(a, i) for i in range(k)])
    dsk.update(df.dask)

    # Compute the categories
    categories, index = compute_as_if_collection(
        df.__class__, dsk, (prefix, 0), **kwargs
    )

    # some operations like get_dummies() rely on the order of categories
    categories = {k: v.sort_values() for k, v in categories.items()}

    # Categorize each partition
    return df.map_partitions(_categorize_block, categories, index)
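# --- Usage sketch (not from the original source) ----------------------------
# Hedged example of categorize(): convert the object-dtype column "a" to a
# categorical with known categories, forwarding scheduler="sync" to the
# internal compute call. Data and column names are illustrative only.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"a": list("xyzxyz"), "b": range(6)}), npartitions=2
)
ddf_cat = categorize(ddf, columns=["a"], scheduler="sync")
# Column "a" of ddf_cat should now carry a CategoricalDtype with known categories.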
def shuffle(
    df: DataFrame,
    column_names: List[str],
    npartitions: Optional[int] = None,
    ignore_index: bool = False,
) -> DataFrame:
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle using explicit-comms. It requires a full
    dataset read, serialization and shuffle. This is expensive. If possible
    you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is
    not deterministic if done in parallel.

    Requires an active client.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
        Dataframe to shuffle
    column_names: list of strings
        List of column names on which we want to split.
    npartitions: int or None
        The desired number of output partitions. If None, the number of
        output partitions equals `df.npartitions`
    ignore_index: bool
        Ignore index during shuffle. If True, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    df: dask.dataframe.DataFrame
        Shuffled dataframe

    Developer Notes
    ---------------
    The implementation consists of three steps:
      (a) Extend the dask graph of `df` with a call to `shuffle_group()` for
          each dataframe partition and submit the graph.
      (b) Submit a task on each worker that shuffles (all-to-all communicates)
          the groups from (a) and returns a list of dataframe-partitions.
      (c) Submit a dask graph that extracts (using `getitem()`) individual
          dataframe-partitions from (b).
    """
    c = comms.default_comms()

    # By default we preserve the number of partitions
    if npartitions is None:
        npartitions = df.npartitions

    # Step (a): partition/group each dataframe-partition
    name = (
        "explicit-comms-shuffle-group-"
        f"{tokenize(df, column_names, npartitions, ignore_index)}"
    )
    df = df.persist()  # Make sure optimizations are applied to the existing graph
    dsk = dict(df.__dask_graph__())
    output_keys = []
    for input_key in df.__dask_keys__():
        output_key = (name, input_key[1])
        dsk[output_key] = (
            shuffle_group,
            input_key,
            column_names,
            0,
            npartitions,
            npartitions,
            ignore_index,
            npartitions,
        )
        output_keys.append(output_key)

    # Compute `df_groups`, which is a list of futures, one future per partition
    # in `df`. Each future points to a dict of length `df.npartitions` that
    # maps each partition-id to a DataFrame.
    df_groups = compute_as_if_collection(type(df), dsk, output_keys, sync=False)
    wait(df_groups)
    for f in df_groups:  # Check for errors
        if f.status == "error":
            f.result()  # raise exception

    # Step (b): find out which workers have which parts of `df_groups`,
    # find the number of output partitions each worker should have,
    # and submit `local_shuffle()` on each worker.
    key_to_part = {str(part.key): part for part in df_groups}
    in_parts = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in c.client.who_has(df_groups).items():
        # Note, if multiple workers have the part, we pick the first worker
        in_parts[first(workers)].append(key_to_part[key])

    # Create a dict that specifies the number of partitions each worker has
    in_nparts = {}
    workers = set()  # All ranks that have a partition of `df`
    for rank, worker in enumerate(c.worker_addresses):
        nparts = len(in_parts.get(worker, ()))
        if nparts > 0:
            in_nparts[rank] = nparts
            workers.add(rank)
    workers_sorted = sorted(workers)

    # Find the output partitions for each worker
    div = npartitions // len(workers)
    rank_to_out_part_ids = {}  # rank -> [list of partition ids]
    for i, rank in enumerate(workers_sorted):
        rank_to_out_part_ids[rank] = list(range(div * i, div * (i + 1)))
    for rank, i in zip(workers_sorted, range(div * len(workers), npartitions)):
        rank_to_out_part_ids[rank].append(i)

    # Run `local_shuffle()` on each worker
    result_futures = {}
    for rank, worker in enumerate(c.worker_addresses):
        if rank in workers:
            result_futures[rank] = c.submit(
                worker,
                local_shuffle,
                in_nparts,
                in_parts[worker],
                rank_to_out_part_ids,
                ignore_index,
            )
    distributed.wait(list(result_futures.values()))
    del df_groups

    # Step (c): extract individual dataframe-partitions
    name = f"explicit-comms-shuffle-getitem-{tokenize(name)}"
    dsk = {}
    meta = None
    for rank, parts in rank_to_out_part_ids.items():
        for i, part_id in enumerate(parts):
            dsk[(name, part_id)] = (getitem, result_futures[rank], i)
            if meta is None:
                # Get the meta from the first output partition
                meta = delayed(make_meta)(
                    delayed(getitem)(result_futures[rank], i)
                ).compute()
    assert meta is not None

    divs = [None] * (len(dsk) + 1)
    return new_dd_object(dsk, name, meta, divs).persist()
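# --- Usage sketch (not from the original source) ----------------------------
# Hedged example of the explicit-comms shuffle above. It assumes a running
# distributed cluster/client set up for explicit-comms (an active client is
# required); the input data and column name are illustrative only.
import pandas as pd
import dask.dataframe as dd
from distributed import Client

client = Client()  # an active client is required
ddf = dd.from_pandas(pd.DataFrame({"id": range(100), "v": 1.0}), npartitions=4)
shuffled = shuffle(ddf, column_names=["id"], npartitions=8, ignore_index=True)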
def to_hdf(
    df,
    path,
    key,
    mode="a",
    append=False,
    scheduler=None,
    name_function=None,
    compute=True,
    lock=None,
    dask_kwargs={},
    **kwargs,
):
    """Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name.
    Please see the Pandas docstring for more detailed information about
    shared keyword arguments.

    This function differs from the Pandas version by saving the many
    partitions of a Dask DataFrame in parallel, either to many files, or to
    many datasets within the same file. You may specify this parallelism with
    an asterisk ``*`` within the filename or datapath, and an optional
    ``name_function``. The asterisk will be replaced with an increasing
    sequence of integers starting from ``0`` or with the result of calling
    ``name_function`` on each of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string, pathlib.Path
        Path to a target filename. Supports strings, ``pathlib.Path``, or any
        object implementing the ``__fspath__`` protocol. May contain a ``*``
        to denote many filenames.
    key : string
        Datapath within the files. May contain a ``*`` to denote many
        locations
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return
        a string. (see examples below)
    compute : bool
        Whether or not to execute immediately. If False then this returns a
        ``dask.Delayed`` value.
    lock : bool, Lock, optional
        Lock to use to prevent concurrency issues. By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save data to a single file

    >>> df.to_hdf('output.hdf', '/data')            # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')          # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')          # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes') # doctest: +SKIP

    Specify custom naming scheme.  This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc..

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return base + timedelta(days=i)

    >>> df.to_hdf('*.hdf', '/data', name_function=name_function) # doctest: +SKIP

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each
        partition is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute
        ``to_hdf`` when computed.
    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = "to-hdf-" + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, "to_hdf")

    single_file = True
    single_node = True

    path = stringify_path(path)

    # if path is string, format using i_name
    if isinstance(path, str):
        if path.count("*") + key.count("*") > 1:
            raise ValueError(
                "A maximum of one asterisk is accepted in file path and dataset key"
            )

        fmt_obj = lambda path, i_name: path.replace("*", i_name)

        if "*" in path:
            single_file = False
    else:
        if key.count("*") > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        fmt_obj = lambda path, _: path

    if "*" in key:
        single_node = False

    if "format" in kwargs and kwargs["format"] not in ["t", "table"]:
        raise ValueError("Dask only supports the 'table' format in hdf files.")

    if mode not in ("a", "w", "r+"):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it is saved and read
    # so we enforce name_function to maintain the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn(
                "To preserve order between partitions name_function "
                "must preserve the order of its input"
            )

    # If user did not specify scheduler and write is sequential default to the
    # sequential scheduler. otherwise let the _get method choose the scheduler
    if (
        scheduler is None
        and not config.get("scheduler", None)
        and single_node
        and single_file
    ):
        scheduler = "single-threaded"

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not MP_GET:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    if lock:
        lock = get_scheduler_lock(df, scheduler=scheduler)

    kwargs.update({"format": "table", "mode": mode, "append": append})

    dsk = dict()

    i_name = name_function(0)
    dsk[(name, 0)] = (
        _pd_to_hdf,
        pd_to_hdf,
        lock,
        [(df._name, 0), fmt_obj(path, i_name), key.replace("*", i_name)],
        kwargs,
    )

    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2["mode"] = "a"
    if single_node:
        kwargs2["append"] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (
            _pd_to_hdf,
            pd_to_hdf,
            lock,
            [(df._name, i), fmt_obj(path, i_name), key.replace("*", i_name)],
            kwargs2,
        )
        if single_file:
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(
            DataFrame, dsk, keys, scheduler=scheduler, **dask_kwargs
        )
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])
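# --- Usage sketch (not from the original source) ----------------------------
# Hedged example of deferring the HDF write with compute=False and executing
# it later; the file and dataset names below are illustrative only.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
deferred = to_hdf(ddf, "output-*.hdf", "/data", compute=False)
deferred.compute(scheduler="threads")   # performs the actual write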
def to_feather(
    df,
    path,
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Store Dask.dataframe to Feather files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask_geopandas.GeoDataFrame
    path : string or pathlib.Path
        Destination directory for data. Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any
        (inferred from the path, such as "s3://...").
        Please see ``fsspec`` for more details.
    compute : bool, default True
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method

    See Also
    --------
    dask_geopandas.read_feather: Read Feather data to dask.dataframe
    """
    # based on the to_orc function from dask

    # Get engine
    engine = FeatherDatasetEngine

    # Process file path
    storage_options = storage_options or {}
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if not write_index:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    # Use df.npartitions to define file-name list
    fs.mkdirs(path, exist_ok=True)
    filenames = [f"part.{i}.feather" for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-feather-" + tokenize(df, fs, path, write_index, storage_options)
    part_tasks = []
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
            ],
        )
        part_tasks.append((name, d))

    dsk[name] = (lambda x: None, part_tasks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])

    # Compute or return future
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        from dask_geopandas import GeoDataFrame

        return compute_as_if_collection(
            GeoDataFrame, graph, part_tasks, **compute_kwargs
        )
    return Scalar(graph, name, "")
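# --- Usage sketch (not from the original source) ----------------------------
# Hedged example of the dask_geopandas to_feather writer above; the input
# shapefile path and output directory are illustrative only.
import dask_geopandas

gdf = dask_geopandas.read_file("countries.shp", npartitions=4)
to_feather(gdf, "countries_feather", write_index=False)
# expected to write part.0.feather ... part.3.feather into countries_feather/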
def to_orc(
    df,
    path,
    engine="pyarrow",
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Store Dask.dataframe to ORC files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data. Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : 'pyarrow' or ORCEngine
        ORC library to use.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    compute : bool, default True
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> df.to_orc('/path/to/output/', ...)  # doctest: +SKIP

    See Also
    --------
    read_orc: Read ORC data to dask.dataframe
    """
    # Get engine
    engine = _get_engine(engine, write=True)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if not write_index:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    # Use df.npartitions to define file-name list
    fs.mkdirs(path, exist_ok=True)
    filenames = [f"part.{i}.orc" for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-orc-" + tokenize(
        df,
        fs,
        path,
        engine,
        write_index,
        storage_options,
    )
    final_name = name + "-final"
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
            ],
        )
    part_tasks = list(dsk.keys())
    dsk[(final_name, 0)] = (lambda x: None, part_tasks)
    graph = HighLevelGraph.from_collections((final_name, 0), dsk, dependencies=[df])

    # Compute or return future
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        return compute_as_if_collection(DataFrame, graph, part_tasks, **compute_kwargs)
    return Scalar(graph, final_name, "")
def to_parquet_binned(
    df,
    path,
    nbins,
    engine="auto",
    compression="default",
    write_index=True,
    append=False,
    overwrite=False,
    ignore_divisions=False,
    partition_on=None,
    storage_options=None,
    custom_metadata=None,
    write_metadata_file=True,
    compute=True,
    compute_kwargs=None,
    schema=None,
    **kwargs,
):
    compute_kwargs = compute_kwargs or {}

    if compression == "default":
        if snappy is not None:
            compression = "snappy"
        else:
            compression = None

    partition_on = partition_on or []
    if isinstance(partition_on, str):
        partition_on = [partition_on]

    if set(partition_on) - set(df.columns):
        raise ValueError(
            "Partitioning on non-existent column. "
            "partition_on=%s ."
            "columns=%s" % (str(partition_on), str(list(df.columns)))
        )

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path, mode="wb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if overwrite:
        if isinstance(fs, LocalFileSystem):
            working_dir = fs.expand_path(".")[0]
            if path.rstrip("/") == working_dir.rstrip("/"):
                raise ValueError(
                    "Cannot clear the contents of the current working directory!"
                )
        if append:
            raise ValueError("Cannot use both `overwrite=True` and `append=True`!")
        if fs.exists(path) and fs.isdir(path):
            # Only remove path contents if
            # (1) The path exists
            # (2) The path is a directory
            # (3) The path is not the current working directory
            fs.rm(path, recursive=True)

    # Save divisions and corresponding index name. This is necessary,
    # because we may be resetting the index to write the file
    division_info = {"divisions": df.divisions, "name": df.index.name}
    if division_info["name"] is None:
        # As of 0.24.2, pandas will rename an index with name=None
        # when df.reset_index() is called. The default name is "index",
        # but dask will always change the name to the NONE_LABEL constant
        if NONE_LABEL not in df.columns:
            division_info["name"] = NONE_LABEL
        elif write_index:
            raise ValueError(
                "Index must have a name if __null_dask_index__ is a column."
            )
        else:
            warnings.warn(
                "If read back by Dask, column named __null_dask_index__ "
                "will be set to the index (and renamed to None)."
            )

    # There are some "reserved" names that may be used as the default column
    # name after resetting the index. However, we don't want to treat it as
    # a "special" name if the string is already used as a "real" column name.
    reserved_names = []
    for name in ["index", "level_0"]:
        if name not in df.columns:
            reserved_names.append(name)

    # If write_index==True (default), reset the index and record the
    # name of the original index in `index_cols` (we will set the name
    # to the NONE_LABEL constant if it is originally `None`).
    # `fastparquet` will use `index_cols` to specify the index column(s)
    # in the metadata. `pyarrow` will revert the `reset_index` call
    # below if `index_cols` is populated (because pyarrow will want to handle
    # index preservation itself).
    # For both engines, the column index will be written to
    # "pandas metadata" if write_index=True
    index_cols = []
    if write_index:
        real_cols = set(df.columns)
        none_index = list(df._meta.index.names) == [None]
        df = df.reset_index()
        if none_index:
            df.columns = [
                c if c not in reserved_names else NONE_LABEL for c in df.columns
            ]
        index_cols = [c for c in set(df.columns) - real_cols]
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    _to_parquet_kwargs = {
        "engine",
        "compression",
        "write_index",
        "append",
        "ignore_divisions",
        "partition_on",
        "storage_options",
        "write_metadata_file",
        "compute",
    }
    kwargs_pass = {k: v for k, v in kwargs.items() if k not in _to_parquet_kwargs}

    # Engine-specific initialization steps to write the dataset.
    # Possibly create parquet metadata, and load existing stuff if appending
    meta, schema, i_offset = engine.initialize_write(
        df,
        fs,
        path,
        append=append,
        ignore_divisions=ignore_divisions,
        partition_on=partition_on,
        division_info=division_info,
        index_cols=index_cols,
        schema=schema,
        **kwargs_pass,
    )

    # Use i_offset and df.npartitions to define file-name list
    filenames = ["part.%i.parquet" % (i + i_offset) for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-parquet-binned" + tokenize(
        df,
        fs,
        path,
        append,
        ignore_divisions,
        partition_on,
        division_info,
        index_cols,
        schema,
    )
    part_tasks = []
    kwargs_pass["fmd"] = meta
    kwargs_pass["compression"] = compression
    kwargs_pass["index_cols"] = index_cols
    kwargs_pass["schema"] = schema
    if custom_metadata:
        if b"pandas" in custom_metadata.keys():
            raise ValueError(
                "User-defined key/value metadata (custom_metadata) can not "
                "contain a b'pandas' key. This key is reserved by Pandas, "
                "and overwriting the corresponding value can render the "
                "entire dataset unreadable."
            )
        kwargs_pass["custom_metadata"] = custom_metadata
    # Override write_partition to write binned parquet files
    engine.write_partition = write_partition_binned
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                engine,
                (df._name, d),
                path,
                fs,
                filename,
                partition_on,
                write_metadata_file,
                nbins,
            ],
            toolz.merge(kwargs_pass, {"head": True}) if d == 0 else kwargs_pass,
        )
        part_tasks.append((name, d))

    final_name = "metadata-" + name
    # Collect metadata and write _metadata
    if write_metadata_file:
        dsk[(final_name, 0)] = (
            apply,
            engine.write_metadata,
            [
                part_tasks,
                meta,
                fs,
                path,
            ],
            {"append": append, "compression": compression},
        )
    else:
        dsk[(final_name, 0)] = (lambda x: None, part_tasks)

    graph = HighLevelGraph.from_collections(final_name, dsk, dependencies=[df])
    out = Delayed(name, graph)

    if compute:
        return compute_as_if_collection(
            Scalar, graph, [(final_name, 0)], **compute_kwargs
        )
    else:
        return Scalar(graph, final_name, "")
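# --- Usage sketch (not from the original source) ----------------------------
# Hedged example of to_parquet_binned above. It mirrors dask's to_parquet but
# forwards `nbins` to the binned write_partition override; the input data,
# output directory and nbins value below are illustrative only.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(100), "y": 1.0}), npartitions=4)
to_parquet_binned(ddf, "out_parquet_binned", nbins=8, engine="pyarrow")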