Example #1
def test_shuffle_empty_partitions(method):
    df = pd.DataFrame({'x': [1, 2, 3] * 10})
    ddf = dd.from_pandas(df, npartitions=3)
    s = shuffle(ddf, ddf.x, npartitions=6, shuffle=method)
    parts = compute_as_if_collection(dd.DataFrame, s.dask, s.__dask_keys__())
    for p in parts:
        assert s.columns == p.columns
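
For orientation, a minimal self-contained sketch of the call pattern this test exercises: computing each partition of a Dask DataFrame as a separate pandas object (toy data, assuming only pandas and dask[dataframe] are installed):

import pandas as pd
import dask.dataframe as dd
from dask.base import compute_as_if_collection

ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3] * 10}), npartitions=3)
parts = compute_as_if_collection(
    dd.DataFrame, ddf.__dask_graph__(), ddf.__dask_keys__()
)
print(len(parts))  # one pandas DataFrame per partition -> 3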
Example #3
def check_and_return(ddfs, dfs, join):
    sol = concat(dfs, join=join)
    res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
    assert_eq(res, sol)
    if known:
        parts = compute_as_if_collection(dd.DataFrame, res.dask,
                                         res.__dask_keys__())
        for p in [i.iloc[:0] for i in parts]:
            res._meta == p  # will error if schemas don't align
    assert not cat_index or has_known_categories(res.index) == known
    return res
Example #4
def rearrange_by_column_disk(df, column, npartitions=None, compute=False):
    """Shuffle using local disk

    See Also
    --------
    rearrange_by_column_tasks:
        Same function, but using tasks rather than partd
        Has a more informative docstring
    """
    if npartitions is None:
        npartitions = df.npartitions

    token = tokenize(df, column, npartitions)
    always_new_token = uuid.uuid1().hex

    p = ("zpartd-" + always_new_token, )
    dsk1 = {p: (maybe_buffered_partd(), )}

    # Partition data on disk
    name = "shuffle-partition-" + always_new_token
    dsk2 = {(name, i): (shuffle_group_3, key, column, npartitions, p)
            for i, key in enumerate(df.__dask_keys__())}

    dependencies = []
    if compute:
        graph = HighLevelGraph.merge(df.dask, dsk1, dsk2)
        graph = HighLevelGraph.from_collections(name, graph, dependencies=[df])
        keys = [p, sorted(dsk2)]
        pp, values = compute_as_if_collection(DataFrame, graph, keys)
        dsk1 = {p: pp}
        dsk2 = dict(zip(sorted(dsk2), values))
    else:
        dependencies.append(df)

    # Barrier
    barrier_token = "barrier-" + always_new_token
    dsk3 = {barrier_token: (barrier, list(dsk2))}

    # Collect groups
    name = "shuffle-collect-" + token
    dsk4 = {(name, i): (collect, p, i, df._meta, barrier_token)
            for i in range(npartitions)}

    divisions = (None, ) * (npartitions + 1)

    layer = toolz.merge(dsk1, dsk2, dsk3, dsk4)
    graph = HighLevelGraph.from_collections(name,
                                            layer,
                                            dependencies=dependencies)
    return new_dd_object(graph, name, df._meta, divisions)
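
To make the graph shape above easier to see, here is a minimal sketch of the partition -> barrier -> collect topology using hypothetical stand-ins for the partd-backed helpers (shuffle_group_3, barrier, collect); it only needs the threaded scheduler:

from dask.threaded import get

def fake_partition(x):          # stand-in for shuffle_group_3 (writes groups to partd)
    return x

def fake_barrier(args):         # like barrier(): forces all partition writes to finish first
    return 0

def fake_collect(i, _barrier):  # stand-in for collect(): read group i back
    return f"group-{i}"

dsk = {
    ("shuffle-partition", 0): (fake_partition, 10),
    ("shuffle-partition", 1): (fake_partition, 20),
    "barrier": (fake_barrier, [("shuffle-partition", 0), ("shuffle-partition", 1)]),
    ("shuffle-collect", 0): (fake_collect, 0, "barrier"),
    ("shuffle-collect", 1): (fake_collect, 1, "barrier"),
}
print(get(dsk, [("shuffle-collect", 0), ("shuffle-collect", 1)]))  # ('group-0', 'group-1')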
Example #5
def test_read_csv_index():
    with filetext(csv_text) as fn:
        f = dd.read_csv(fn, blocksize=20).set_index('amount')
        result = f.compute(scheduler='sync')
        assert result.index.name == 'amount'

        blocks = compute_as_if_collection(dd.DataFrame, f.dask,
                                          f.__dask_keys__(),
                                          scheduler='sync')
        for i, block in enumerate(blocks):
            if i < len(f.divisions) - 2:
                assert (block.index < f.divisions[i + 1]).all()
            if i > 0:
                assert (block.index >= f.divisions[i]).all()

        expected = pd.read_csv(fn).set_index('amount')
        assert_eq(result, expected)
Example #7
def test_compute_as_if_collection_low_level_task_graph():
    # See https://github.com/dask/dask/pull/7969
    da = pytest.importorskip("dask.array")
    x = da.arange(10)

    # Boolean flag to ensure MyDaskArray.__dask_optimize__ is called
    optimized = False

    class MyDaskArray(da.Array):
        """Dask Array subclass with validation logic in __dask_optimize__"""
        @classmethod
        def __dask_optimize__(cls, dsk, keys, **kwargs):
            # Ensure `compute_as_if_collection` doesn't convert to a low-level task graph
            assert type(dsk) is HighLevelGraph
            nonlocal optimized
            optimized = True
            return super().__dask_optimize__(dsk, keys, **kwargs)

    result = compute_as_if_collection(MyDaskArray, x.__dask_graph__(),
                                      x.__dask_keys__())[0]
    assert optimized
    da.utils.assert_eq(x, result)
Example #8
def categorize(df, columns=None, index=None, split_every=None, **kwargs):
    """Convert columns of the DataFrame to category dtype.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.
    """
    meta = df._meta
    if columns is None:
        columns = list(meta.select_dtypes(["object", "category"]).columns)
    elif is_scalar(columns):
        columns = [columns]

    # Filter out known categorical columns
    columns = [
        c for c in columns if not (
            is_categorical_dtype(meta[c]) and has_known_categories(meta[c]))
    ]

    if index is not False:
        if is_categorical_dtype(meta.index):
            index = not has_known_categories(meta.index)
        elif index is None:
            index = meta.index.dtype == object

    # Nothing to do
    if not len(columns) and index is False:
        return df

    if split_every is None:
        split_every = 16
    elif split_every is False:
        split_every = df.npartitions
    elif not isinstance(split_every, Integral) or split_every < 2:
        raise ValueError("split_every must be an integer >= 2")

    token = tokenize(df, columns, index, split_every)
    a = "get-categories-chunk-" + token
    dsk = {(a, i): (_get_categories, key, columns, index)
           for (i, key) in enumerate(df.__dask_keys__())}

    prefix = "get-categories-agg-" + token
    k = df.npartitions
    depth = 0
    while k > split_every:
        b = prefix + str(depth)
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            dsk[(b, part_i)] = (_get_categories_agg, [(a, i) for i in inds])
        k = part_i + 1
        a = b
        depth += 1

    dsk[(prefix, 0)] = (_get_categories_agg, [(a, i) for i in range(k)])
    dsk.update(df.dask)

    # Compute the categories
    categories, index = compute_as_if_collection(df.__class__, dsk,
                                                 (prefix, 0), **kwargs)

    # some operations like get_dummies() rely on the order of categories
    categories = {k: v.sort_values() for k, v in categories.items()}

    # Categorize each partition
    return df.map_partitions(_categorize_block, categories, index)
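
A minimal usage sketch of the same functionality through the public DataFrame.categorize method (the column names and data below are made up):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"fruit": ["apple", "banana", "apple", "pear"], "n": [1, 2, 3, 4]})
ddf = dd.from_pandas(pdf, npartitions=2)
ddf = ddf.categorize(columns=["fruit"])  # object column becomes a known categorical
print(ddf.fruit.dtype)                   # category, with the categories computed eagerly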
Example #9
def shuffle(
    df: DataFrame,
    column_names: List[str],
    npartitions: Optional[int] = None,
    ignore_index: bool = False,
) -> DataFrame:
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle using explicit-comms. It requires a full
    dataset read, serialization and shuffle. This is expensive. If possible
    you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is not
    deterministic if done in parallel.

    Requires an active client.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
        Dataframe to shuffle
    column_names: list of strings
        List of column names on which we want to split.
    npartitions: int or None
        The desired number of output partitions. If None, the number of output
        partitions equals `df.npartitions`
    ignore_index: bool
        Ignore index during shuffle.  If True, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    df: dask.dataframe.DataFrame
        Shuffled dataframe

    Developer Notes
    ---------------
    The implementation consists of three steps:
      (a) Extend the dask graph of `df` with a call to `shuffle_group()` for each
          dataframe partition and submit the graph.
      (b) Submit a task on each worker that shuffles (all-to-all communicates)
          the groups from (a) and returns a list of dataframe-partitions.
      (c) Submit a dask graph that extracts (using `getitem()`) individual
          dataframe-partitions from (b).
    """
    c = comms.default_comms()

    # By default we preserve the number of partitions
    if npartitions is None:
        npartitions = df.npartitions

    # Step (a): partition/group each dataframe-partition
    name = ("explicit-comms-shuffle-group-"
            f"{tokenize(df, column_names, npartitions, ignore_index)}")
    df = df.persist()  # Make sure optimizations are applied to the existing graph
    dsk = dict(df.__dask_graph__())
    output_keys = []
    for input_key in df.__dask_keys__():
        output_key = (name, input_key[1])
        dsk[output_key] = (
            shuffle_group,
            input_key,
            column_names,
            0,
            npartitions,
            npartitions,
            ignore_index,
            npartitions,
        )
        output_keys.append(output_key)

    # Compute `df_groups`, which is a list of futures, one future per partition in `df`.
    # Each future points to a dict of length `df.npartitions` that maps each
    # partition-id to a DataFrame.
    df_groups = compute_as_if_collection(type(df),
                                         dsk,
                                         output_keys,
                                         sync=False)
    wait(df_groups)
    for f in df_groups:  # Check for errors
        if f.status == "error":
            f.result()  # raise exception

    # Step (b): find out which workers have which parts of `df_groups`,
    #           find the number of outputs each worker should have,
    #           and submit `local_shuffle()` on each worker.
    key_to_part = {str(part.key): part for part in df_groups}
    in_parts = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in c.client.who_has(df_groups).items():
        # Note, if multiple workers have the part, we pick the first worker
        in_parts[first(workers)].append(key_to_part[key])

    # Let's create a dict that specifies the number of partitions each worker has
    in_nparts = {}
    workers = set()  # All ranks that have a partition of `df`
    for rank, worker in enumerate(c.worker_addresses):
        nparts = len(in_parts.get(worker, ()))
        if nparts > 0:
            in_nparts[rank] = nparts
            workers.add(rank)
    workers_sorted = sorted(workers)

    # Find the output partitions for each worker
    div = npartitions // len(workers)
    rank_to_out_part_ids = {}  # rank -> [list of partition id]
    for i, rank in enumerate(workers_sorted):
        rank_to_out_part_ids[rank] = list(range(div * i, div * (i + 1)))
    for rank, i in zip(workers_sorted, range(div * len(workers), npartitions)):
        rank_to_out_part_ids[rank].append(i)

    # Run `local_shuffle()` on each worker
    result_futures = {}
    for rank, worker in enumerate(c.worker_addresses):
        if rank in workers:
            result_futures[rank] = c.submit(
                worker,
                local_shuffle,
                in_nparts,
                in_parts[worker],
                rank_to_out_part_ids,
                ignore_index,
            )
    distributed.wait(list(result_futures.values()))
    del df_groups

    # Step (c): extract individual dataframe-partitions
    name = f"explicit-comms-shuffle-getitem-{tokenize(name)}"
    dsk = {}
    meta = None
    for rank, parts in rank_to_out_part_ids.items():
        for i, part_id in enumerate(parts):
            dsk[(name, part_id)] = (getitem, result_futures[rank], i)
            if meta is None:
                # Get the meta from the first output partition
                meta = delayed(make_meta)(delayed(getitem)(
                    result_futures[rank], i)).compute()
    assert meta is not None

    divs = [None] * (len(dsk) + 1)
    return new_dd_object(dsk, name, meta, divs).persist()
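
The function above requires dask_cuda's explicit-comms machinery and an active distributed client. For comparison, a minimal sketch of the equivalent operation through the plain dask.dataframe API (a different, task-based implementation, not the explicit-comms path):

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"key": [1, 2, 3] * 4, "val": range(12)}), npartitions=3
)
shuffled = ddf.shuffle("key", npartitions=6)  # rows with equal "key" land in the same partition
print(shuffled.npartitions)                   # 6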
Example #10
def to_hdf(
    df,
    path,
    key,
    mode="a",
    append=False,
    scheduler=None,
    name_function=None,
    compute=True,
    lock=None,
    dask_kwargs={},
    **kwargs,
):
    """Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name.  Please
    see the Pandas docstring for more detailed information about shared keyword
    arguments.

    This function differs from the Pandas version by saving the many partitions
    of a Dask DataFrame in parallel, either to many files, or to many datasets
    within the same file.  You may specify this parallelism with an asterisk
    ``*`` within the filename or datapath, and an optional ``name_function``.
    The asterisk will be replaced with an increasing sequence of integers
    starting from ``0`` or with the result of calling ``name_function`` on each
    of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string, pathlib.Path
        Path to a target filename. Supports strings, ``pathlib.Path``, or any
        object implementing the ``__fspath__`` protocol. May contain a ``*`` to
        denote many filenames.
    key : string
        Datapath within the files.  May contain a ``*`` to denote many locations
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return a
        string. (see examples below)
    compute : bool
        Whether or not to execute immediately.  If False then this returns a
        ``dask.Delayed`` value.
    lock : bool, Lock, optional
        Lock to use to prevent concurrency issues.  By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save Data to a single file

    >>> df.to_hdf('output.hdf', '/data')            # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')          # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')          # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes') # doctest: +SKIP

    Specify custom naming scheme.  This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc..

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return base + timedelta(days=i)

    >>> df.to_hdf('*.hdf', '/data', name_function=name_function) # doctest: +SKIP

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each partition
        is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute ``to_hdf``
        when computed.

    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = "to-hdf-" + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, "to_hdf")

    single_file = True
    single_node = True

    path = stringify_path(path)

    # if path is string, format using i_name
    if isinstance(path, str):
        if path.count("*") + key.count("*") > 1:
            raise ValueError(
                "A maximum of one asterisk is accepted in file path and dataset key"
            )

        fmt_obj = lambda path, i_name: path.replace("*", i_name)

        if "*" in path:
            single_file = False
    else:
        if key.count("*") > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        fmt_obj = lambda path, _: path

    if "*" in key:
        single_node = False

    if "format" in kwargs and kwargs["format"] not in ["t", "table"]:
        raise ValueError("Dask only support 'table' format in hdf files.")

    if mode not in ("a", "w", "r+"):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it is saved and read,
    # so we require name_function to maintain the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn(
                "To preserve order between partitions name_function "
                "must preserve the order of its input"
            )

    # If the user did not specify a scheduler and the write is sequential,
    # default to the single-threaded scheduler; otherwise let the _get method
    # choose the scheduler
    if (
        scheduler is None
        and not config.get("scheduler", None)
        and single_node
        and single_file
    ):
        scheduler = "single-threaded"

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not MP_GET:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    if lock:
        lock = get_scheduler_lock(df, scheduler=scheduler)

    kwargs.update({"format": "table", "mode": mode, "append": append})

    dsk = dict()

    i_name = name_function(0)
    dsk[(name, 0)] = (
        _pd_to_hdf,
        pd_to_hdf,
        lock,
        [(df._name, 0), fmt_obj(path, i_name), key.replace("*", i_name)],
        kwargs,
    )

    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2["mode"] = "a"
    if single_node:
        kwargs2["append"] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (
            _pd_to_hdf,
            pd_to_hdf,
            lock,
            [(df._name, i), fmt_obj(path, i_name), key.replace("*", i_name)],
            kwargs2,
        )
        if single_file:
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(
            DataFrame, dsk, keys, scheduler=scheduler, **dask_kwargs
        )
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])
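
Note that the docstring's name_function example returns a datetime.date, while fmt_obj calls path.replace("*", i_name) and therefore needs a string. A hedged, runnable variant (made-up file names; writing HDF requires the optional PyTables dependency):

import pandas as pd
import dask.dataframe as dd
from datetime import date, timedelta

ddf = dd.from_pandas(pd.DataFrame({"x": range(9)}), npartitions=3)

def name_function(i):
    # partition number 0..n-1 -> '2000-01-01', '2000-01-02', ...
    return str(date(2000, 1, 1) + timedelta(days=i))

ddf.to_hdf("data-*.hdf", "/data", name_function=name_function)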
Example #11
def to_feather(
    df,
    path,
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Store Dask.dataframe to Feather files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask_geopandas.GeoDataFrame
    path : string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any
        (inferred from the path, such as "s3://...").
        Please see ``fsspec`` for more details.
    compute : bool, default True
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method

    See Also
    --------
    dask_geopandas.read_feather: Read Feather data to dask.dataframe
    """
    # based on the to_orc function from dask

    # Get engine
    engine = FeatherDatasetEngine

    # Process file path
    storage_options = storage_options or {}
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if not write_index:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    # Use df.npartitions to define file-name list
    fs.mkdirs(path, exist_ok=True)
    filenames = [f"part.{i}.feather" for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-feather-" + tokenize(df, fs, path, write_index, storage_options)
    part_tasks = []
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
            ],
        )
        part_tasks.append((name, d))
    dsk[name] = (lambda x: None, part_tasks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])

    # Compute or return future
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        from dask_geopandas import GeoDataFrame

        return compute_as_if_collection(GeoDataFrame, graph, part_tasks,
                                        **compute_kwargs)
    return Scalar(graph, name, "")
Example #12
def to_orc(
    df,
    path,
    engine="pyarrow",
    write_index=True,
    storage_options=None,
    compute=True,
    compute_kwargs=None,
):
    """Store Dask.dataframe to ORC files

    Notes
    -----
    Each partition will be written to a separate file.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
    path : string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    engine : 'pyarrow' or ORCEngine
        Backend ORC engine to use for writing. Defaults to 'pyarrow'; a custom
        ORCEngine implementation may also be passed.
    write_index : boolean, default True
        Whether or not to write the index. Defaults to True.
    storage_options : dict, default None
        Key/value pairs to be passed on to the file-system backend, if any.
    compute : bool, default True
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    compute_kwargs : dict, default None
        Options to be passed in to the compute method

    Examples
    --------
    >>> df = dd.read_csv(...)  # doctest: +SKIP
    >>> df.to_orc('/path/to/output/', ...)  # doctest: +SKIP

    See Also
    --------
    read_orc: Read ORC data to dask.dataframe
    """

    # Get engine
    engine = _get_engine(engine, write=True)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if not write_index:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    # Use df.npartitions to define file-name list
    fs.mkdirs(path, exist_ok=True)
    filenames = [f"part.{i}.orc" for i in range(df.npartitions)]

    # Construct IO graph
    dsk = {}
    name = "to-orc-" + tokenize(
        df,
        fs,
        path,
        engine,
        write_index,
        storage_options,
    )
    final_name = name + "-final"
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                (df._name, d),
                path,
                fs,
                filename,
            ],
        )
    part_tasks = list(dsk.keys())
    dsk[(final_name, 0)] = (lambda x: None, part_tasks)
    graph = HighLevelGraph.from_collections((final_name, 0),
                                            dsk,
                                            dependencies=[df])

    # Compute or return future
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        return compute_as_if_collection(DataFrame, graph, part_tasks,
                                        **compute_kwargs)
    return Scalar(graph, final_name, "")
Example #13
def to_parquet_binned(
    df,
    path,
    nbins,
    engine="auto",
    compression="default",
    write_index=True,
    append=False,
    overwrite=False,
    ignore_divisions=False,
    partition_on=None,
    storage_options=None,
    custom_metadata=None,
    write_metadata_file=True,
    compute=True,
    compute_kwargs=None,
    schema=None,
    **kwargs,
):
    compute_kwargs = compute_kwargs or {}

    if compression == "default":
        if snappy is not None:
            compression = "snappy"
        else:
            compression = None

    partition_on = partition_on or []
    if isinstance(partition_on, str):
        partition_on = [partition_on]

    if set(partition_on) - set(df.columns):
        raise ValueError("Partitioning on non-existent column. "
                         "partition_on=%s ."
                         "columns=%s" %
                         (str(partition_on), str(list(df.columns))))

    if isinstance(engine, str):
        engine = get_engine(engine)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if overwrite:
        if isinstance(fs, LocalFileSystem):
            working_dir = fs.expand_path(".")[0]
            if path.rstrip("/") == working_dir.rstrip("/"):
                raise ValueError(
                    "Cannot clear the contents of the current working directory!"
                )
        if append:
            raise ValueError(
                "Cannot use both `overwrite=True` and `append=True`!")
        if fs.exists(path) and fs.isdir(path):
            # Only remove path contents if
            # (1) The path exists
            # (2) The path is a directory
            # (3) The path is not the current working directory
            fs.rm(path, recursive=True)

    # Save divisions and corresponding index name. This is necessary,
    # because we may be resetting the index to write the file
    division_info = {"divisions": df.divisions, "name": df.index.name}
    if division_info["name"] is None:
        # As of 0.24.2, pandas will rename an index with name=None
        # when df.reset_index() is called.  The default name is "index",
        # but dask will always change the name to the NONE_LABEL constant
        if NONE_LABEL not in df.columns:
            division_info["name"] = NONE_LABEL
        elif write_index:
            raise ValueError(
                "Index must have a name if __null_dask_index__ is a column.")
        else:
            warnings.warn(
                "If read back by Dask, column named __null_dask_index__ "
                "will be set to the index (and renamed to None).")

    # There are some "resrved" names that may be used as the default column
    # name after resetting the index. However, we don't want to treat it as
    # a "special" name if the string is already used as a "real" column name.
    reserved_names = []
    for name in ["index", "level_0"]:
        if name not in df.columns:
            reserved_names.append(name)

    # If write_index==True (default), reset the index and record the
    # name of the original index in `index_cols` (we will set the name
    # to the NONE_LABEL constant if it is originally `None`).
    # `fastparquet` will use `index_cols` to specify the index column(s)
    # in the metadata.  `pyarrow` will revert the `reset_index` call
    # below if `index_cols` is populated (because pyarrow will want to handle
    # index preservation itself).  For both engines, the column index
    # will be written to "pandas metadata" if write_index=True
    index_cols = []
    if write_index:
        real_cols = set(df.columns)
        none_index = list(df._meta.index.names) == [None]
        df = df.reset_index()
        if none_index:
            df.columns = [
                c if c not in reserved_names else NONE_LABEL
                for c in df.columns
            ]
        index_cols = [c for c in set(df.columns) - real_cols]
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    _to_parquet_kwargs = {
        "engine",
        "compression",
        "write_index",
        "append",
        "ignore_divisions",
        "partition_on",
        "storage_options",
        "write_metadata_file",
        "compute",
    }
    kwargs_pass = {
        k: v
        for k, v in kwargs.items() if k not in _to_parquet_kwargs
    }

    # Engine-specific initialization steps to write the dataset.
    # Possibly create parquet metadata, and load existing stuff if appending
    meta, schema, i_offset = engine.initialize_write(
        df,
        fs,
        path,
        append=append,
        ignore_divisions=ignore_divisions,
        partition_on=partition_on,
        division_info=division_info,
        index_cols=index_cols,
        schema=schema,
        **kwargs_pass,
    )

    # Use i_offset and df.npartitions to define file-name list
    filenames = [
        "part.%i.parquet" % (i + i_offset) for i in range(df.npartitions)
    ]

    # Construct IO graph
    dsk = {}
    name = "to-parquet-binned" + tokenize(
        df,
        fs,
        path,
        append,
        ignore_divisions,
        partition_on,
        division_info,
        index_cols,
        schema,
    )
    part_tasks = []
    kwargs_pass["fmd"] = meta
    kwargs_pass["compression"] = compression
    kwargs_pass["index_cols"] = index_cols
    kwargs_pass["schema"] = schema
    if custom_metadata:
        if b"pandas" in custom_metadata.keys():
            raise ValueError(
                "User-defined key/value metadata (custom_metadata) can not "
                "contain a b'pandas' key.  This key is reserved by Pandas, "
                "and overwriting the corresponding value can render the "
                "entire dataset unreadable.")
        kwargs_pass["custom_metadata"] = custom_metadata
    # Override write_partition to write binned parquet files
    engine.write_partition = write_partition_binned
    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (
            apply,
            engine.write_partition,
            [
                engine,
                (df._name, d),
                path,
                fs,
                filename,
                partition_on,
                write_metadata_file,
                nbins,
            ],
            toolz.merge(kwargs_pass, {"head": True})
            if d == 0 else kwargs_pass,
        )
        part_tasks.append((name, d))

    final_name = "metadata-" + name
    # Collect metadata and write _metadata

    if write_metadata_file:
        dsk[(final_name, 0)] = (
            apply,
            engine.write_metadata,
            [
                part_tasks,
                meta,
                fs,
                path,
            ],
            {
                "append": append,
                "compression": compression
            },
        )
    else:
        dsk[(final_name, 0)] = (lambda x: None, part_tasks)

    graph = HighLevelGraph.from_collections(final_name, dsk, dependencies=[df])
    out = Delayed(name, graph)

    if compute:
        return compute_as_if_collection(Scalar, graph, [(final_name, 0)],
                                        **compute_kwargs)
    else:
        return Scalar(graph, final_name, "")