Example #1
def filter_quantile_union(
        df: DataFrame,
        quantile: int,
        quantile_range: str = 'upper') -> Tuple[DataFrame, Series]:
    """
    Get part of the data above or below a determined quantile

     Parameters
    ----------
    df : DataFrame
        The dataframe from which the distribution should be estimated
    
    quantile : integer
        The distribution section to be used as threshold

    quantile_range : string
        default: upper
        The region which should be filtered, where upper indicates values
        above the quantile and lower otherwise

    Returns
    -------
    Two-Element Tuple
        A tuple where the first item is the resulting filtered dataframe
        and the second is a series with the quantile threshold value
    """

    if quantile == 1:
        q = df.quantile(q=0.25)
    elif quantile == 2:
        q = df.quantile(q=0.5)
    elif quantile == 3:
        q = df.quantile(q=0.75)
    else:
        raise ValueError("quantile must be 1, 2 or 3")
    # Materialize the per-column quantile thresholds as a pandas Series
    q = q.compute()

    if quantile_range == 'upper':
        data = df.loc[(df.AnswerCount > q.AnswerCount) |
                      (df.ViewCount > q.ViewCount) |
                      (df.CommentCount > q.CommentCount) |
                      (df.FavoriteCount > q.FavoriteCount) |
                      (df.Score > q.Score)]
    elif quantile_range == 'lower':
        data = df.loc[(df.AnswerCount < q.AnswerCount) |
                      (df.ViewCount < q.ViewCount) |
                      (df.CommentCount < q.CommentCount) |
                      (df.FavoriteCount < q.FavoriteCount) |
                      (df.Score < q.Score)]
    else:
        raise ValueError("quantile_range must be 'upper' or 'lower'")

    return data, q
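
# A minimal usage sketch (not part of the original example). It assumes a Dask
# DataFrame containing the numeric columns the function hard-codes
# (AnswerCount, ViewCount, CommentCount, FavoriteCount, Score); the sample data
# below is made up for illustration.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({
    "AnswerCount": [1, 2, 3, 4],
    "ViewCount": [10, 20, 30, 40],
    "CommentCount": [0, 1, 2, 3],
    "FavoriteCount": [0, 0, 1, 2],
    "Score": [5, 6, 7, 8],
})
ddf = dd.from_pandas(pdf, npartitions=2)

# Keep rows that exceed the 75th-percentile threshold in at least one column
filtered, thresholds = filter_quantile_union(ddf, quantile=3, quantile_range='upper')
print(thresholds)          # per-column threshold values (pandas Series)
print(filtered.compute())  # rows above the threshold in any column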
Example #2
def compute_and_set_divisions(df: DataFrame, **kwargs) -> DataFrame:
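    """Compute per-partition index statistics (min, max, length) for ``df.index``
    and use them to set or repair the frame's divisions."""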
    mins, maxes, lens = _compute_partition_stats(df.index,
                                                 allow_overlap=True,
                                                 **kwargs)
    if len(mins) == len(df.divisions) - 1:
        df._divisions = tuple(mins) + (maxes[-1], )
        if not any(mins[i] >= maxes[i - 1] for i in range(1, len(mins))):
            return df

    return fix_overlap(df, mins, maxes, lens)
Example #3
File: resample.py  Project: m-rossi/dask
    def _agg(self,
             how,
             meta=None,
             fill_value=np.nan,
             how_args=(),
             how_kwargs={}):
        """Aggregate using one or more operations

        Parameters
        ----------
        how : str
            Name of aggregation operation
        fill_value : scalar, optional
            Value to use for missing values, applied during upsampling.
            Default is NaN.
        how_args : optional
            Positional arguments for aggregation operation.
        how_kwargs : optional
            Keyword arguments for aggregation operation.

        Returns
        -------
        Dask DataFrame or Series
        """
        rule = self._rule
        kwargs = self._kwargs
        name = "resample-" + tokenize(self.obj, rule, kwargs, how, *how_args,
                                      **how_kwargs)

        # Create a grouper to determine closed and label conventions
        newdivs, outdivs = _resample_bin_and_out_divs(self.obj.divisions, rule,
                                                      **kwargs)

        # Repartition divs into bins. These won't match labels after mapping
        partitioned = self.obj.repartition(newdivs, force=True)

        keys = partitioned.__dask_keys__()
        dsk = {}

        args = zip(keys, outdivs, outdivs[1:],
                   ["left"] * (len(keys) - 1) + [None])
        for i, (k, s, e, c) in enumerate(args):
            dsk[(name, i)] = (
                _resample_series,
                k,
                s,
                e,
                c,
                rule,
                kwargs,
                how,
                fill_value,
                list(how_args),
                how_kwargs,
            )

        # Infer output metadata
        meta_r = self.obj._meta_nonempty.resample(self._rule, **self._kwargs)
        meta = getattr(meta_r, how)(*how_args, **how_kwargs)

        graph = HighLevelGraph.from_collections(name,
                                                dsk,
                                                dependencies=[partitioned])
        if isinstance(meta, pd.DataFrame):
            return DataFrame(graph, name, meta, outdivs)
        return Series(graph, name, meta, outdivs)
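
# For context, a small sketch of the public call path that ends in _agg:
# Resampler.mean() dispatches to _agg("mean") internally. The sample series
# below is made up; this only uses public Dask APIs.
import pandas as pd
import dask.dataframe as dd

idx = pd.date_range("2021-01-01", periods=120, freq="1min")
ds = dd.from_pandas(pd.Series(range(120), index=idx), npartitions=4)
print(ds.resample("30min").mean().compute())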
Example #4
def _calculate_divisions(
    df: DataFrame,
    partition_col: Series,
    repartition: bool,
    npartitions: int,
    upsample: float = 1.0,
    partition_size: float = 128e6,
) -> Tuple[List, List, List]:
    """
    Utility function to calculate divisions for calls to `map_partitions`
    """
    sizes = df.map_partitions(sizeof) if repartition else []
    divisions = partition_col._repartition_quantiles(npartitions,
                                                     upsample=upsample)
    mins = partition_col.map_partitions(M.min)
    maxes = partition_col.map_partitions(M.max)

    try:
        divisions, sizes, mins, maxes = compute(divisions, sizes, mins, maxes)
    except TypeError as e:
        # When there are nulls and a column is non-numeric, a TypeError is sometimes raised as a result of
        # 1) computing mins/maxes above, 2) every null being switched to NaN, and 3) NaN being a float.
        # Also, Pandas ExtensionDtypes may cause TypeErrors when dealing with special nulls such as pd.NaT or pd.NA.
        # If this happens, we hint the user about eliminating nulls beforehand.
        if not is_numeric_dtype(partition_col.dtype):
            obj, suggested_method = (
                ("column",
                 f"`.dropna(subset=['{partition_col.name}'])`") if any(
                     partition_col._name == df[c]._name for c in df) else
                ("series", "`.loc[series[~series.isna()]]`"))
            raise NotImplementedError(
                f"Divisions calculation failed for non-numeric {obj} '{partition_col.name}'.\n"
                f"This is probably due to the presence of nulls, which Dask does not entirely support in the index.\n"
                f"We suggest you try with {suggested_method}.") from e
        # For numeric types there shouldn't be problems with nulls, so we re-raise this particular TypeError as-is
        else:
            raise e

    divisions = methods.tolist(divisions)
    if type(sizes) is not list:
        sizes = methods.tolist(sizes)
    mins = methods.tolist(mins)
    maxes = methods.tolist(maxes)

    empty_dataframe_detected = pd.isna(divisions).all()
    if repartition or empty_dataframe_detected:
        total = sum(sizes)
        npartitions = max(math.ceil(total / partition_size), 1)
        npartitions = min(npartitions, df.npartitions)
        n = len(divisions)
        try:
            divisions = np.interp(
                x=np.linspace(0, n - 1, npartitions + 1),
                xp=np.linspace(0, n - 1, n),
                fp=divisions,
            ).tolist()
        except (TypeError, ValueError):  # str type
            indexes = np.linspace(0, n - 1, npartitions + 1).astype(int)
            divisions = [divisions[i] for i in indexes]
    else:
        # Drop duplicate divisions returned by partition quantiles
        divisions = list(toolz.unique(divisions[:-1])) + [divisions[-1]]

    mins = remove_nans(mins)
    maxes = remove_nans(maxes)
    if pd.api.types.is_categorical_dtype(partition_col.dtype):
        dtype = partition_col.dtype
        mins = pd.Categorical(mins, dtype=dtype).codes.tolist()
        maxes = pd.Categorical(maxes, dtype=dtype).codes.tolist()

    return divisions, mins, maxes
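
# A hedged sketch of exercising this internal helper directly, assuming the
# definition above and its module-level imports (sizeof, M, compute, methods,
# toolz, remove_nans) are in scope; the column name "a" is made up.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [5, 3, 1, 4, 2, 0]}), npartitions=3)
divisions, mins, maxes = _calculate_divisions(
    ddf, ddf["a"], repartition=False, npartitions=3
)
print(divisions, mins, maxes)  # boundaries plus per-partition min/max of "a"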
Example #5
def set_partition(
    df: DataFrame,
    index: Union[str, Series],
    divisions: Sequence,
    max_branch: int = 32,
    drop: bool = True,
    shuffle: Optional[str] = None,
    compute: Optional[bool] = None,
) -> DataFrame:
    """Group DataFrame by index

    Sets a new index and partitions data along that index according to
    divisions.  Divisions are often found by computing approximate quantiles.
    The function ``set_index`` will do both of these steps.

    Parameters
    ----------
    df: DataFrame/Series
        Data that we want to re-partition
    index: string or Series
        Column to become the new index
    divisions: list
        Values to form new divisions between partitions
    drop: bool, default True
        Whether to delete columns to be used as the new index
    shuffle: str (optional)
        Either 'disk' for an on-disk shuffle or 'tasks' to use the task
        scheduling framework.  Use 'disk' if you are on a single machine
        and 'tasks' if you are on a distributed cluster.
    max_branch: int (optional)
        If using the task-based shuffle, the amount of splitting each
        partition undergoes.  Increase this for fewer copies but more
        scheduler overhead.

    See Also
    --------
    set_index
    shuffle
    partd
    """
    meta = df._meta._constructor_sliced([0])
    if isinstance(divisions, tuple):
        # pd.isna considers tuples to be scalars. Convert to a list.
        divisions = list(divisions)

    if not isinstance(index, Series):
        dtype = df[index].dtype
    else:
        dtype = index.dtype

    if pd.isna(divisions).any() and pd.api.types.is_integer_dtype(dtype):
        # Can't construct a Series[int64] when any / all of the divisions are NaN.
        divisions = df._meta._constructor_sliced(divisions)
    elif (pd.api.types.is_categorical_dtype(dtype)
          and UNKNOWN_CATEGORIES in dtype.categories):
        # If categories are unknown, leave as a string dtype instead.
        divisions = df._meta._constructor_sliced(divisions)
    else:
        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)

    if not isinstance(index, Series):
        partitions = df[index].map_partitions(set_partitions_pre,
                                              divisions=divisions,
                                              meta=meta)
        df2 = df.assign(_partitions=partitions)
    else:
        partitions = index.map_partitions(set_partitions_pre,
                                          divisions=divisions,
                                          meta=meta)
        df2 = df.assign(_partitions=partitions, _index=index)

    df3 = rearrange_by_column(
        df2,
        "_partitions",
        max_branch=max_branch,
        npartitions=len(divisions) - 1,
        shuffle=shuffle,
        compute=compute,
        ignore_index=True,
    )

    if not isinstance(index, Series):
        df4 = df3.map_partitions(
            set_index_post_scalar,
            index_name=index,
            drop=drop,
            column_dtype=df.columns.dtype,
        )
    else:
        df4 = df3.map_partitions(
            set_index_post_series,
            index_name=index.name,
            drop=drop,
            column_dtype=df.columns.dtype,
        )

    df4.divisions = tuple(methods.tolist(divisions))

    return df4.map_partitions(M.sort_index)
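
# A hedged usage sketch of this internal function, assuming the definition
# above and its helpers (set_partitions_pre, rearrange_by_column, ...) are in
# scope. set_index normally computes the divisions first and then calls this;
# here the divisions and the columns "x" and "y" are made up.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"x": [4, 1, 3, 2], "y": list("abcd")}), npartitions=2
)
out = set_partition(ddf, "x", divisions=[1, 3, 4], shuffle="tasks")
print(out.compute())  # frame re-partitioned and indexed by the sorted "x"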
Example #6
def sort_values(
    df: DataFrame,
    by: Union[str, List[str]],
    npartitions: Optional[Union[int, Literal["auto"]]] = None,
    ascending: Union[bool, List[bool]] = True,
    na_position: Union[Literal["first"], Literal["last"]] = "last",
    upsample: float = 1.0,
    partition_size: float = 128e6,
    sort_function: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
    sort_function_kwargs: Optional[Mapping[str, Any]] = None,
    **kwargs,
) -> DataFrame:
    """See DataFrame.sort_values for docstring"""
    if na_position not in ("first", "last"):
        raise ValueError("na_position must be either 'first' or 'last'")
    if not isinstance(by, list):
        by = [by]
    if len(by) > 1 and df.npartitions > 1 or any(not isinstance(b, str)
                                                 for b in by):
        raise NotImplementedError(
            "Dataframes only support sorting by named columns which must be passed as a "
            "string or a list of strings; multi-partition dataframes only support sorting "
            "by a single column.\n"
            "You passed %s" % str(by))

    sort_kwargs = {
        "by": by,
        "ascending": ascending,
        "na_position": na_position,
    }
    if sort_function is None:
        sort_function = M.sort_values
    if sort_function_kwargs is not None:
        sort_kwargs.update(sort_function_kwargs)

    if df.npartitions == 1:
        return df.map_partitions(sort_function, **sort_kwargs)

    if npartitions == "auto":
        repartition = True
        npartitions = max(100, df.npartitions)
    else:
        if npartitions is None:
            npartitions = df.npartitions
        repartition = False

    sort_by_col = df[by[0]]

    divisions, mins, maxes = _calculate_divisions(df, sort_by_col, repartition,
                                                  npartitions, upsample,
                                                  partition_size)

    if len(divisions) == 2:
        return df.repartition(npartitions=1).map_partitions(
            sort_function, **sort_kwargs)

    if not isinstance(ascending, bool):
        # support [True] as input
        if (isinstance(ascending, list) and len(ascending) == 1
                and isinstance(ascending[0], bool)):
            ascending = ascending[0]
        else:
            raise NotImplementedError(
                f"Dask currently only supports a single boolean for ascending. You passed {str(ascending)}"
            )

    if (all(not pd.isna(x)
            for x in divisions) and mins == sorted(mins, reverse=not ascending)
            and maxes == sorted(maxes, reverse=not ascending)
            and all(mx < mn for mx, mn in zip(
                maxes[:-1] if ascending else maxes[1:],
                mins[1:] if ascending else mins[:-1],
            )) and npartitions == df.npartitions):
        # divisions are in the right place
        return df.map_partitions(sort_function, **sort_kwargs)

    df = rearrange_by_divisions(
        df,
        by,
        divisions,
        ascending=ascending,
        na_position=na_position,
        duplicates=False,
    )
    df = df.map_partitions(sort_function, **sort_kwargs)
    return df
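
# A short sketch of the function above, assuming the surrounding module context
# (e.g. _calculate_divisions, rearrange_by_divisions) is available; the data
# and column name "a" are made up. Recent Dask versions expose the same
# behaviour as DataFrame.sort_values.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [3, 1, 2, 5, 4, 0]}), npartitions=3)
print(sort_values(ddf, by="a").compute())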
Example #7
def shuffle(
    df: DataFrame,
    column_names: List[str],
    npartitions: Optional[int] = None,
    ignore_index: bool = False,
) -> DataFrame:
    """Order divisions of DataFrame so that all values within column(s) align

    This enacts a task-based shuffle using explicit-comms. It requires a full
    dataset read, serialization and shuffle. This is expensive. If possible
    you should avoid shuffles.

    This does not preserve a meaningful index/partitioning scheme. This is not
    deterministic if done in parallel.

    Requires an active client.

    Parameters
    ----------
    df: dask.dataframe.DataFrame
        Dataframe to shuffle
    column_names: list of strings
        List of column names on which we want to split.
    npartitions: int or None
        The desired number of output partitions. If None, the number of output
        partitions equals `df.npartitions`
    ignore_index: bool
        Ignore index during shuffle.  If True, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    df: dask.dataframe.DataFrame
        Shuffled dataframe

    Developer Notes
    ---------------
    The implementation consists of three steps:
      (a) Extend the dask graph of `df` with a call to `shuffle_group()` for each
          dataframe partition and submit the graph.
      (b) Submit a task on each worker that shuffles (all-to-all communicates)
          the groups from (a) and returns a list of dataframe-partitions.
      (c) Submit a dask graph that extracts (using `getitem()`) individual
          dataframe-partitions from (b).
    """
    c = comms.default_comms()

    # By default we preserve the number of partitions
    if npartitions is None:
        npartitions = df.npartitions

    # Step (a): partition/group each dataframe-partition
    name = ("explicit-comms-shuffle-group-"
            f"{tokenize(df, column_names, npartitions, ignore_index)}")
    df = df.persist()  # Make sure optimizations are applied to the existing graph
    dsk = dict(df.__dask_graph__())
    output_keys = []
    for input_key in df.__dask_keys__():
        output_key = (name, input_key[1])
        dsk[output_key] = (
            shuffle_group,
            input_key,
            column_names,
            0,
            npartitions,
            npartitions,
            ignore_index,
            npartitions,
        )
        output_keys.append(output_key)

    # Compute `df_groups`, which is a list of futures, one future per partition in `df`.
    # Each future points to a dict of length `df.npartitions` that maps each
    # partition-id to a DataFrame.
    df_groups = compute_as_if_collection(type(df),
                                         dsk,
                                         output_keys,
                                         sync=False)
    wait(df_groups)
    for f in df_groups:  # Check for errors
        if f.status == "error":
            f.result()  # raise exception

    # Step (b): find out which workers have which parts of `df_groups`,
    #           find the number of outputs each worker should have,
    #           and submit `local_shuffle()` on each worker.
    key_to_part = {str(part.key): part for part in df_groups}
    in_parts = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in c.client.who_has(df_groups).items():
        # Note, if multiple workers have the part, we pick the first worker
        in_parts[first(workers)].append(key_to_part[key])

    # Let's create a dict that specifies the number of partitions each worker has
    in_nparts = {}
    workers = set()  # All ranks that have a partition of `df`
    for rank, worker in enumerate(c.worker_addresses):
        nparts = len(in_parts.get(worker, ()))
        if nparts > 0:
            in_nparts[rank] = nparts
            workers.add(rank)
    workers_sorted = sorted(workers)

    # Find the output partitions for each worker
    div = npartitions // len(workers)
    rank_to_out_part_ids = {}  # rank -> [list of partition id]
    for i, rank in enumerate(workers_sorted):
        rank_to_out_part_ids[rank] = list(range(div * i, div * (i + 1)))
    for rank, i in zip(workers_sorted, range(div * len(workers), npartitions)):
        rank_to_out_part_ids[rank].append(i)

    # Run `local_shuffle()` on each worker
    result_futures = {}
    for rank, worker in enumerate(c.worker_addresses):
        if rank in workers:
            result_futures[rank] = c.submit(
                worker,
                local_shuffle,
                in_nparts,
                in_parts[worker],
                rank_to_out_part_ids,
                ignore_index,
            )
    distributed.wait(list(result_futures.values()))
    del df_groups

    # Step (c): extract individual dataframe-partitions
    name = f"explicit-comms-shuffle-getitem-{tokenize(name)}"
    dsk = {}
    meta = None
    for rank, parts in rank_to_out_part_ids.items():
        for i, part_id in enumerate(parts):
            dsk[(name, part_id)] = (getitem, result_futures[rank], i)
            if meta is None:
                # Get the meta from the first output partition
                meta = delayed(make_meta)(delayed(getitem)(
                    result_futures[rank], i)).compute()
    assert meta is not None

    divs = [None] * (len(dsk) + 1)
    return new_dd_object(dsk, name, meta, divs).persist()
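
# Step (a) illustrated in isolation (a hedged sketch, not from the original
# file): shuffle_group splits one pandas partition into a dict that maps
# output-partition ids to sub-frames. The import path matches the Dask
# versions this snippet targets and may differ in newer releases; the sample
# data and the "key" column are made up.
import pandas as pd
from dask.dataframe.shuffle import shuffle_group

part = pd.DataFrame({"key": [0, 1, 2, 3, 4, 5], "v": range(6)})
# Argument order mirrors the call above: cols, stage, k, npartitions,
# ignore_index, nfinal
groups = shuffle_group(part, ["key"], 0, 3, 3, False, 3)
for part_id, sub in groups.items():
    print(part_id, len(sub))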
Example #8
def from_bcolz(x,
               chunksize=None,
               categorize=True,
               index=None,
               lock=lock,
               **kwargs):
    """Read BColz CTable into a Dask Dataframe

    BColz is a fast on-disk compressed column store with careful attention
    given to compression.  https://bcolz.readthedocs.io/en/latest/

    Parameters
    ----------
    x : bcolz.ctable
    chunksize : int, optional
        The size (in rows) of the blocks to pull out of the ctable.
    categorize : bool, defaults to True
        Automatically categorize all string dtypes
    index : string, optional
        Column to make the index
    lock : bool or Lock
        Lock to use when reading, or False for no lock (not thread-safe)

    See Also
    --------
    from_array: more generic function not optimized for bcolz
    """
    if lock is True:
        lock = Lock()

    import bcolz

    import dask.array as da

    if isinstance(x, str):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_)
                    or np.issubdtype(x.dtype[name], np.unicode_)
                    or np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names), ))
                categories[name] = da.unique(a).compute()

    columns = tuple(x.dtype.names)
    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1, )
    if x.rootdir:
        token = tokenize(
            (x.rootdir, os.path.getmtime(x.rootdir)),
            chunksize,
            categorize,
            index,
            kwargs,
        )
    else:
        token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize,
                         index, kwargs)
    new_name = "from_bcolz-" + token

    dsk = {(new_name, i): (
        dataframe_from_ctable,
        x,
        (slice(i * chunksize, (i + 1) * chunksize), ),
        columns,
        categories,
        lock,
    )
           for i in range(0, int(ceil(len(x) / chunksize)))}

    meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock)
    result = DataFrame(dsk, new_name, meta, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names), ))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = tuple(da.percentile(a, q).compute())
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
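
# A hedged sketch only: bcolz is an optional, now-unmaintained dependency, and
# this assumes the from_bcolz definition above (plus helpers such as
# dataframe_from_ctable) is in scope. The ctable below is made up.
import bcolz
import numpy as np

ct = bcolz.ctable(columns=[np.arange(8), np.arange(8) * 0.5], names=["a", "b"])
ddf = from_bcolz(ct, chunksize=4)
print(ddf.npartitions)
print(ddf.compute())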
Example #9
import csv
import pandas as pd

# word_dict (word -> frequency) is assumed to be defined earlier in the script.
try:
    with open('ws1.csv', 'w', encoding="utf-8", newline="") as fp:
        csv.writer(fp).writerows(word_dict.items())
    with open('ws2.csv', 'w', encoding="utf-8", newline="") as fp:
        csv.writer(fp).writerows(word_dict.items())
    print(pd.read_csv('ws1.csv'))
    print(pd.read_csv('ws2.csv'))

except Exception as e:
    print(e)

from pandas import Series, DataFrame

li_data2 = Series(word_dict)
print(li_data2)
print(li_data2.value_counts()[:5])

import matplotlib.pyplot as plt

plt.rc("font", family="malgun gothic")  # Korean-capable font so the Hangul labels render
plt.plot(li_data2.value_counts()[:5])
plt.xlabel("횟수 종류")
plt.ylabel("종류별 발생수")
plt.legend(["횟수"])
plt.show()

print('!!!!!!!!!!!!!!!!!!!!!!!!!!')
df = DataFrame(wordlist)  # wordlist is assumed to be defined earlier in the script
print(df)