Example No. 1
def calc_word_freq(
    df: dd.DataFrame,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, and then
    compute the frequency distribution of words and the total
    number of words.

    Parameters
    ----------
    df
        Groupby-count on the categorical column as a dataframe
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """
    col = df.columns[0]
    if stopword:
        # use a regex to replace stop words and non-alphanumeric characters with empty string
        df[col] = df[col].str.replace(fr"\b(?:{'|'.join(ess)})\b|[^\w+ ]", "")
    else:
        df[col] = df[col].str.replace(r"[^\w+ ]", "")
    # convert to lowercase and split
    df[col] = df[col].str.lower().str.split()
    # "explode()" to "stack" all the words in a list into a new column
    df = df.explode(col)

    # lemmatize and stem
    if lemmatize or stem:
        df = df.dropna(subset=[col])  # drop null rows so NaN is never passed to the lemmatizer/stemmer
    if lemmatize:
        lem = WordNetLemmatizer()
        df[col] = df[col].apply(lem.lemmatize, meta="object")
    if stem:
        porter = PorterStemmer()
        df[col] = df[col].apply(porter.stem, meta="object")

    # counts of words, excludes null values
    word_cnts = df.groupby(col)[df.columns[1]].sum()
    # total number of words
    nwords = word_cnts.sum()
    # total uniq words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequency
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {
        "word_cnts": fnl_word_cnts,
        "nwords": nwords,
        "nuniq_words": nuniq_words
    }
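A minimal usage sketch (an assumption-laden illustration, not part of the original): it presumes calc_word_freq is importable together with the module-level names it relies on (the stop-word collection `ess` and the NLTK lemmatizer/stemmer), and that the input is the groupby-count frame described in the docstring.

import dask.dataframe as dd
import pandas as pd

reviews = pd.DataFrame({"review": ["great product great price", "bad product", "great value"]})
ddf = dd.from_pandas(reviews, npartitions=1)

# Build the expected input: the text column plus its group counts.
grouped = ddf.groupby("review").size().reset_index()

result = calc_word_freq(grouped, top_words=10)
print(result["word_cnts"].compute())  # highest-frequency words
print(result["nwords"].compute())     # total number of words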
Example No. 2
def reset_index_dask(ddf: dd.DataFrame) -> dd.DataFrame:
    return ddf.assign(idx=1)\
              .assign(idx=lambda df: df.idx.cumsum() - 1)\
              .set_index('idx', sorted=True)\
              .map_partitions(lambda df: df.rename(index = {'idx': None}))
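A small hedged check of what this helper produces, namely a 0..n-1 index across partitions (assuming reset_index_dask is in scope and the existing partition order is the order you want to number by):

import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": [10, 20, 30, 40]}), npartitions=2)
print(reset_index_dask(ddf).compute().index.tolist())  # [0, 1, 2, 3]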
Example No. 3
def missing_impact_1v1(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    y: str,
    bins: int,
    ndist_sample: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    # pylint: disable=too-many-arguments
    """
    Calculate the distribution change on another column y when
    the missing values in x are dropped.
    """

    df0 = df[[x, y]]
    df1 = df.dropna(subset=[x])

    srs0, srs1 = df0[y], df1[y]
    minimum, maximum = srs0.min(), srs0.max()

    hists = [
        histogram(srs, dtype=dtype, bins=bins, return_edges=True)
        for srs in [srs0, srs1]
    ]
    hists = da.compute(*hists)

    meta = ColumnsMetadata()
    meta["y", "dtype"] = detect_dtype(df[y], dtype)

    if is_dtype(detect_dtype(df[y], dtype), Continuous()):
        dists = [rv_histogram((hist[0], hist[2])) for hist in hists]  # type: ignore
        xs = np.linspace(minimum, maximum, ndist_sample)

        pdfs = [dist.pdf(xs) for dist in dists]
        cdfs = [dist.cdf(xs) for dist in dists]

        distdf = pd.DataFrame(
            {
                "x": np.tile(xs, 2),
                "pdf": np.concatenate(pdfs),
                "cdf": np.concatenate(cdfs),
                "label": np.repeat(LABELS, ndist_sample),
            }
        )

        counts, xs, edges = zip(*hists)

        lower_bounds: List[float] = []
        upper_bounds: List[float] = []

        for edge in edges:
            lower_bounds.extend(edge[:-1])
            upper_bounds.extend(edge[1:])

        histdf = pd.DataFrame(
            {
                "x": np.concatenate(xs),
                "count": np.concatenate(counts),
                "label": np.repeat(LABELS, [len(count) for count in counts]),
                "lower_bound": lower_bounds,
                "upper_bound": upper_bounds,
            }
        )

        quantiles = [
            [srs.quantile(q) for q in [0, 0.25, 0.5, 0.75, 1]] for srs in [srs0, srs1]
        ]
        quantiles = dd.compute(*quantiles)

        boxdf = pd.DataFrame(quantiles)
        boxdf.columns = ["min", "q1", "q2", "q3", "max"]

        iqr = boxdf["q3"] - boxdf["q1"]
        boxdf["upper"] = np.minimum(boxdf["q3"] + 1.5 * iqr, boxdf["max"])
        boxdf["lower"] = np.maximum(boxdf["q3"] - 1.5 * iqr, boxdf["min"])
        boxdf["label"] = LABELS

        itmdt = Intermediate(
            dist=distdf,
            hist=histdf,
            box=boxdf,
            meta=meta["y"],
            x=x,
            y=y,
            visual_type="missing_impact_1v1",
        )
        return itmdt
    else:

        counts, xs = zip(*hists)

        df = pd.DataFrame(
            {
                "x": np.concatenate(xs, axis=0),
                "count": np.concatenate(counts, axis=0),
                "label": np.repeat(LABELS, [len(count) for count in counts]),
            }
        )

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins:
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            partial = (bins, len(counts[0]))
        else:
            partial = (len(counts[0]), len(counts[0]))

        meta["y", "partial"] = partial

        itmdt = Intermediate(
            hist=df, x=x, y=y, meta=meta["y"], visual_type="missing_impact_1v1",
        )
        return itmdt
Example No. 4
def m_o(engine: DaskExecutionEngine, df: dd.DataFrame) -> None:
    assert 1 == df.compute().shape[0]
Example No. 5
def feature_engineering(data: dd.DataFrame) -> dd.DataFrame:
    # data = data.repartition(npartitions=1)
    data = data.persist()
Example No. 6
def _to_parquet(ddf: dd.DataFrame, savepath: Path):

    return ddf.to_parquet(savepath)
Example No. 7
    def lemmatize_tweets(
        self, tweet_dataframe: dask_dataframe
    ) -> Union[dask_dataframe, pd.DataFrame]:
        tweet_dataframe['text'] = tweet_dataframe.apply(
            lambda x: self._lemmatize(x['text']), axis=1, meta=str)
        return tweet_dataframe
Example No. 8
def build_timedelta_features(ddf: dd.DataFrame,
                             config: RunConfig) -> dd.DataFrame:
    """Builds features for time differences between records or from present."""
    return (ddf.pipe(calculate_timedeltas,
                     config).pipe(build_timedelta_disqualifiers,
                                  config).pipe(convert_timedeltas_to_days))
Example No. 9
def remove_unneeded_columns(ddf: dd.DataFrame) -> dd.DataFrame:
    unneeded_columns = [
        'date_if_conviction', 'date_if_felony_conviction', 'is_class_1_or_2',
        'is_class_3_or_4'
    ]
    return ddf.drop(unneeded_columns, axis=1)
Example No. 10
def shuffle_store_dask_partitions(
    ddf: dd.DataFrame,
    table: str,
    secondary_indices: Optional[InferredIndices],
    metadata_version: int,
    partition_on: List[str],
    store_factory: StoreFactory,
    df_serializer: Optional[DataFrameSerializer],
    dataset_uuid: str,
    num_buckets: int,
    sort_partitions_by: List[str],
    bucket_by: Sequence[str],
) -> da.Array:
    """
    Perform a dataset update with dask reshuffling to control partitioning.

    The shuffle operation will perform the following steps

    1. Pack payload data

       Payload data is serialized and compressed into a single byte value using
       ``distributed.protocol.serialize_bytes``, see also ``pack_payload``.

    2. Apply bucketing

       Hash the column subset ``bucket_by`` and distribute the hashes in
       ``num_buckets`` bins/buckets. Internally every bucket is identified by an
       integer and we will create one physical file for every bucket ID. The
       bucket ID is not exposed to the user and is dropped after the shuffle,
       before storing. This is done because we do not, at the moment, want to
       guarantee that the hash function remains stable.

    3. Perform shuffle (dask.DataFrame.groupby.apply)

        The groupby key will be the combination of ``partition_on`` fields and the
        hash bucket ID. This will create a physical file for every unique tuple
        in ``partition_on + bucket_ID``. The function which is applied to the
        dataframe will perform all necessary subtask for storage of the dataset
        (partition_on, index calc, etc.).

    4. Unpack data (within the apply-function)

        After the shuffle, the first step is to unpack the payload data since
        the follow-up tasks will require the full dataframe.

    5. Pre storage processing and parquet serialization

        We apply important pre-storage processing like sorting the data and
        applying the final partitioning (at this time there should be only one
        group in the payload data, but using ``MetaPartition.partition_on``
        guarantees that the data structures kartothek expects are created).
        After the preprocessing is done, the data is serialized and stored as
        parquet. The applied function will return an (empty) MetaPartition with
        indices and metadata which will then be used to commit the dataset.

    Returns
    -------

    A dask.Array holding relevant MetaPartition objects as values

    """
    if ddf.npartitions == 0:
        return ddf

    group_cols = partition_on.copy()

    if num_buckets is None:
        raise ValueError(
            "``num_buckets`` must not be None when shuffling data.")

    meta = ddf._meta
    meta[_KTK_HASH_BUCKET] = np.uint64(0)
    ddf = ddf.map_partitions(_hash_bucket, bucket_by, num_buckets, meta=meta)
    group_cols.append(_KTK_HASH_BUCKET)

    unpacked_meta = ddf._meta

    ddf = pack_payload(ddf, group_key=group_cols)
    ddf_grouped = ddf.groupby(by=group_cols)

    unpack = partial(
        _unpack_store_partition,
        secondary_indices=secondary_indices,
        sort_partitions_by=sort_partitions_by,
        table=table,
        dataset_uuid=dataset_uuid,
        partition_on=partition_on,
        store_factory=store_factory,
        df_serializer=df_serializer,
        metadata_version=metadata_version,
        unpacked_meta=unpacked_meta,
    )
    return cast(
        da.Array,  # Output type depends on meta but mypy cannot infer this easily.
        ddf_grouped.apply(unpack, meta=("MetaPartition", "object")),
    )
Example No. 11
def pack_payload(df: dd.DataFrame, group_key: Union[List[str],
                                                    str]) -> dd.DataFrame:
    """
    Pack all payload columns (everything except the group_key) into a single
    column. This column will contain a single byte string containing the
    serialized and compressed payload data. The payload data is just dead weight
    when reshuffling. By compressing it once before the shuffle starts, this
    saves a lot of memory and network/disk IO.

    Example::

        >>> import pandas as pd
        ... import dask.dataframe as dd
        ... from dask.dataframe.shuffle import pack_payload
        ...
        ... df = pd.DataFrame({"A": [1, 1] * 2 + [2, 2] * 2 + [3, 3] * 2, "B": range(12)})
        ... ddf = dd.from_pandas(df, npartitions=2)

        >>> ddf.partitions[0].compute()

        A  B
        0  1  0
        1  1  1
        2  1  2
        3  1  3
        4  2  4
        5  2  5

        >>> pack_payload(ddf, "A").partitions[0].compute()

        A                               __dask_payload_bytes
        0  1  b'\x03\x00\x00\x00\x00\x00\x00\x00)\x00\x00\x03...
        1  2  b'\x03\x00\x00\x00\x00\x00\x00\x00)\x00\x00\x03...


    See also https://github.com/dask/dask/pull/6259

    """

    if (
            # https://github.com/pandas-dev/pandas/issues/34455
            isinstance(df._meta.index, pd.Float64Index)
            # TODO: Try to find out what's going on and file a bug report
            # For datetime indices the apply seems to be corrupt
            # s.t. apply(lambda x:x) returns different values
            or isinstance(df._meta.index, pd.DatetimeIndex)):
        return df
    if not HAS_DISTRIBUTED:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return df

    if not isinstance(group_key, list):
        group_key = [group_key]

    packed_meta = df._meta[group_key]
    packed_meta[_PAYLOAD_COL] = b""

    _pack_payload = partial(pack_payload_pandas, group_key=group_key)

    return df.map_partitions(_pack_payload, meta=packed_meta)
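For orientation, a hedged sketch of what the pandas-level packer could look like. The real helper is `pack_payload_pandas` and uses `distributed.protocol.serialize_bytes`; this illustration substitutes `pickle` and assumes the payload column name.

import pickle
import pandas as pd
from typing import List

_PAYLOAD_COL = "__dask_payload_bytes"  # assumed constant

def pack_payload_pandas_sketch(partition: pd.DataFrame, group_key: List[str]) -> pd.DataFrame:
    payload_cols = [c for c in partition.columns if c not in group_key]
    # One row per group: the group key columns plus a single serialized payload blob.
    packed = (
        partition.groupby(group_key, sort=False)
        .apply(lambda grp: pickle.dumps(grp[payload_cols]))
        .rename(_PAYLOAD_COL)
        .reset_index()
    )
    return packed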
Example No. 12
def flatten_aggregated_columns(dd: DataFrame):
    """
    API to flatten aggregated columns that have MultiIndex-style names

    Args:
        dd: target dask dataframe
    Returns:
        result dask dataframe
    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> pd = pandas.DataFrame({'a': [1, 10, 100, 1, 1, 100], 'b': range(0, 600, 100), 'key': [0, 1, 2, 0, 1, 2]})
        >>> print(pd)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b  key
        0    1    0    0
        1   10  100    1
        2  100  200    2
        3    1  300    0
        4    1  400    1
        5  100  500    2
        >>> dd = dask.dataframe.from_pandas(pd, npartitions=2)
        >>> groupby = dd.groupby(['key'])
        >>> groupby_result = groupby.agg({'a': ['sum', 'min'], 'b': ['mean', 'sum', 'max']})
        >>> print(groupby_result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
               a           b
             sum  min   mean  sum  max
        key
        0      2    1  150.0  300  300
        1     11    1  250.0  500  400
        2    200  100  350.0  700  500
        >>> flatten = flatten_aggregated_columns(groupby_result)
        >>> print(flatten.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
             a_sum  a_min  b_mean  b_sum  b_max
        key
        0        2      1   150.0    300    300
        1       11      1   250.0    500    400
        2      200    100   350.0    700    500
        >>> print(flatten_aggregated_columns(dd).compute())
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b  key
        0    1    0    0
        1   10  100    1
        2  100  200    2
        3    1  300    0
        4    1  400    1
        5  100  500    2
    """
    if not isinstance(dd.columns, MultiIndex) or dd.columns.nlevels != 2:
        return dd

    result = dd.copy()

    columns = []
    for l1, l2 in zip(dd.columns.get_level_values(0), dd.columns.get_level_values(1)):
        if l2 == '':
            columns.append(l1)
        else:
            columns.append('_'.join((l1, l2)))
    result.columns = columns

    return result
Example No. 13
def normalize_column_names(df: dd.DataFrame, enabled) -> dd.DataFrame:
    if enabled:
        df.columns = normalize_names(df.columns)

    return df
Example No. 14
    def _apply_function_over(
        self,
        df: dd.DataFrame,
        f: Callable,
        operands: List[dd.Series],
        window: org.apache.calcite.rex.RexWindow,
        group_columns: List[str],
        sort_columns: List[str],
        sort_ascending: List[bool],
        sort_null_first: List[bool],
    ) -> Tuple[dd.DataFrame, str]:
        """Apply the given function over the dataframe, possibly grouped and sorted per group"""
        temporary_operand_columns = {
            new_temporary_column(df): operand for operand in operands
        }
        df = df.assign(**temporary_operand_columns)
        # Important: move as few bytes as possible to the pickled function,
        # which is evaluated on the workers
        temporary_operand_columns = temporary_operand_columns.keys()

        # Extract the window definition
        lower_bound = to_bound_description(window.getLowerBound())
        upper_bound = to_bound_description(window.getUpperBound())

        new_column_name = new_temporary_column(df)

        @make_pickable_without_dask_sql
        def map_on_each_group(partitioned_group):
            # Apply sorting
            if sort_columns:
                partitioned_group = sort_partition_func(
                    partitioned_group, sort_columns, sort_ascending, sort_null_first
                )

            if f is None:
                # This is the row_number operator.
                # We do not need to do any windowing
                column_result = range(1, len(partitioned_group) + 1)
            else:
                # In all other cases, apply the windowing operation
                if lower_bound.is_unbounded and (
                    upper_bound.is_current_row or upper_bound.offset == 0
                ):
                    windowed_group = partitioned_group.expanding(min_periods=0)
                elif lower_bound.is_preceding and (
                    upper_bound.is_current_row or upper_bound.offset == 0
                ):
                    windowed_group = partitioned_group.rolling(
                        window=lower_bound.offset + 1, min_periods=0,
                    )
                else:
                    lower_offset = (
                        lower_bound.offset if not lower_bound.is_current_row else 0
                    )
                    if lower_bound.is_preceding and lower_offset is not None:
                        lower_offset *= -1
                    upper_offset = (
                        upper_bound.offset if not upper_bound.is_current_row else 0
                    )
                    if upper_bound.is_preceding and upper_offset is not None:
                        upper_offset *= -1

                    indexer = Indexer(lower_offset, upper_offset)
                    windowed_group = partitioned_group.rolling(
                        window=indexer, min_periods=0
                    )

                column_result = f(windowed_group, *temporary_operand_columns)

            partitioned_group = partitioned_group.assign(
                **{new_column_name: column_result}
            )

            return partitioned_group

        # Currently, pandas will always return a float for windowing operations
        meta = df._meta_nonempty.assign(**{new_column_name: 0.0})

        df = df.groupby(group_columns).apply(map_on_each_group, meta=meta)

        return df, new_column_name
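The `Indexer` used above presumably subclasses `pandas.api.indexers.BaseIndexer`. Below is a hedged, self-contained sketch of such an indexer; the class name and exact semantics are assumptions, not dask-sql's implementation.

import numpy as np
from pandas.api.indexers import BaseIndexer

class OffsetWindowIndexer(BaseIndexer):
    """Variable window covering rows [i + lower_offset, i + upper_offset] for each row i."""

    def __init__(self, lower_offset: int, upper_offset: int):
        super().__init__()
        self.lower_offset = lower_offset
        self.upper_offset = upper_offset

    def get_window_bounds(self, num_values=0, min_periods=None, center=None, closed=None, step=None):
        idx = np.arange(num_values, dtype=np.int64)
        start = np.clip(idx + self.lower_offset, 0, num_values)
        end = np.clip(idx + self.upper_offset + 1, 0, num_values)
        return start, end

# usage: pd.Series(range(5)).rolling(OffsetWindowIndexer(-1, 1), min_periods=0).sum()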
Example No. 15
def compute_final_dataframe(df: dd.DataFrame) -> pd.DataFrame:
    """Execute dask task graph and compute final results"""
    return (df.compute().reset_index().pivot(index='drive_time',
                                             columns='trip_distance',
                                             values='avg_amount').fillna(0))
Example No. 16
def _15(obj: dd.DataFrame) -> KmerAlignFormat:
    ff = KmerAlignFormat()
    obj.to_csv(str(ff), sep='\t', index=False, single_file=True)
    return ff
Example No. 17
def _slice_timeid_column(ddf: dd.DataFrame) -> dd.DataFrame:

    ddf["day"] = ddf["timeid"].str.slice(0, 3).astype("int16")
    ddf["halfhourly_id"] = ddf["timeid"].str.slice(3, 5).astype("int8")

    return ddf.drop(columns=["timeid"])
Example No. 18
def _process_table_identifiers(
        pdf: DataFrame,
        dimension_combinations: Optional[List[List[str]]] = None,
        max_combination_length: int = 5) -> List[List[str]]:
    """
    Dask wrapper around extracting identifiers from a single sampled table (pdf).

    This method submits multiple sub-tasks to identify possible identifier combinations, waits for them to complete
    and returns one or more dimension combinations.

    Note that the `worker_client` call forces the task to secede from the Worker's thread pool; therefore it does not
    block any other computations and cannot cause a deadlock while waiting for sub-tasks to finish.
    """
    with timed_block('[idparser] Computing number of rows took {:.3f} seconds',
                     logger, logging.DEBUG):
        num_rows = len(pdf)

    with timed_block('[idparser] Pruning columns took {:.3f} seconds', logger,
                     logging.DEBUG):
        # drop columns whose share of null values is too high - null values can't be part of the primary key
        columns = [
            col for col, count in pdf.count().compute().items()
            if count / num_rows >= NON_NULL_VALUES_RATIO
        ]

    with worker_client(separate_thread=True) as client:  # type: Client
        with timed_block(
                '[idparser] Generating combinations took {:.3f} seconds',
                logger, logging.DEBUG):
            # explore all possible dimension combinations if none are provided
            if dimension_combinations is None:
                all_possible_combinations = itertools.chain.from_iterable(
                    itertools.combinations(columns, i) for i in range(
                        1,
                        min(max_combination_length, len(columns)) + 1))
                generated_combinations: List[List[str]] = [
                    sorted(combination)
                    for combination in all_possible_combinations
                ]
            else:
                generated_combinations = dimension_combinations

        with timed_block(
                '[idparser] Waiting for all combination tasks took {:.3f} seconds',
                logger, logging.DEBUG):
            with timed_block(
                    '[idparser] Submitting all combination tasks took {:.3f} seconds',
                    logger, logging.DEBUG):
                # submit "per dimension combination" tasks
                futures = client.map(
                    lambda combination:
                    _process_possible_identifier_combination(pdf, combination),
                    generated_combinations,
                    key=[
                        f'comb_{combination}_{str(uuid4())}'
                        for combination in generated_combinations
                    ],
                    # priority=100,
                    # batch_size=32,
                    retries=2,
                )
            results = client.gather(futures)

    return [
        dimensions for dimensions, num_duplicates in results
        if num_duplicates == 0
    ]
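For orientation, a hedged pandas-level sketch of what the per-combination task presumably checks (the real _process_possible_identifier_combination is defined elsewhere): how many rows are duplicated when the sampled table is projected onto the candidate columns, since zero duplicates means the combination can serve as an identifier.

import pandas as pd
from typing import List, Tuple

def count_combination_duplicates(sample: pd.DataFrame, combination: List[str]) -> Tuple[List[str], int]:
    # A candidate identifier must be unique: any duplicate disqualifies the combination.
    num_duplicates = int(sample[list(combination)].duplicated().sum())
    return list(combination), num_duplicates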
Example No. 19
    def filter_stopwords(
        self, tweet_dataframe: dask_dataframe
    ) -> Union[dask_dataframe, pd.DataFrame]:
        tweet_dataframe['text'] = tweet_dataframe.apply(
            lambda x: self._remove_stopwords(x['text']), axis=1, meta=str)
        return tweet_dataframe
Example No. 20
def map_partitions_as_meta(dd: DataFrame, func: Callable[...,
                                                         pandas.DataFrame],
                           meta: pandas.DataFrame, **kwargs):
    """
    API to run map_partitions and reformat the result using meta. This can avoid errors caused by the map_partitions result and meta not matching.

    Args:
            dd (DataFrame): dask dataframe to do map_partitions.
            func (Callable[[pandas.DataFrame, ...], pandas.DataFrame]): function for map_partitions.
            meta (pandas.DataFrame): expected schema of map_partitions result.
            kwargs: additional arguments for func.
        Returns:
            result dask dataframe
        Examples:
            >>> import dask.dataframe
            >>> import pandas
            >>> pd = pandas.DataFrame({'a1': ['1,2,3', '2,3,4', '3,4'], 'a2': ['a,b,c', 'b,c,d', 'c,d'],
            ...                        'b': [1, 2, 3], 'idx': [0, 1, 2]}).set_index('idx')
            >>> print(pd)
            ... # doctest: +NORMALIZE_WHITESPACE
                    a1     a2  b
            idx
            0    1,2,3  a,b,c  1
            1    2,3,4  b,c,d  2
            2      3,4    c,d  3
            >>> transformer = lambda pd: pd[['b', 'a1', 'a2']]
            >>> dd = dask.dataframe.from_pandas(pd, npartitions=2)
            >>> result1 = dd.map_partitions(transformer)
            >>> print(result1.compute())
            ... # doctest: +NORMALIZE_WHITESPACE
                 b     a1     a2
            idx
            0    1  1,2,3  a,b,c
            1    2  2,3,4  b,c,d
            2    3    3,4    c,d
            >>> meta = make_meta(('idx', 'int'), [('a1', 'object'), ('a2', 'object'), ('b', 'int')])
            >>> result2 = map_partitions_as_meta(dd, transformer, meta)
            >>> print(result2.compute())
            ... # doctest: +NORMALIZE_WHITESPACE
                    a1     a2  b
            idx
            0    1,2,3  a,b,c  1
            1    2,3,4  b,c,d  2
            2      3,4    c,d  3
            >>> transformer2 = lambda pd, v: pd[['b']] * v
            >>> result3 = map_partitions_as_meta(dd, transformer2, make_meta(('idx', 'int'), [('b', 'int')]),
            ...                                  v=100)
            >>> print(result3.compute())
            ... # doctest: +NORMALIZE_WHITESPACE
                   b
            idx
            0    100
            1    200
            2    300
    """
    def apply_meta(pd: pandas.DataFrame):
        result = func(pd, **kwargs)
        meta_column_names = [c for c in meta.columns]
        return result[meta_column_names]

    return dd.map_partitions(apply_meta)
Example No. 21
def calc_box(
    df: dd.DataFrame,
    bins: int,
    ngroups: int = 10,
    largest: bool = True
) -> Tuple[pd.DataFrame, List[str], List[float], Optional[Dict[str, int]]]:
    """
    Compute a box plot over either
        1) the values in one column
        2) the values corresponding to groups in another column
        3) the values corresponding to binning another column

    Parameters
    ----------
    df : dd.DataFrame
        dask dataframe with one or two columns
    bins : int
        number of bins to use if df has two numerical columns
    ngroups : int
        number of groups to show if df has a categorical and numerical column
    largest: bool
        when calculating a box plot per group, select the largest or smallest groups

    Returns
    -------
    Tuple[pd.DataFrame, List[str], List[float], Dict[str, int]]
        The box plot statistics in a dataframe, a list of the outlier
        groups and another list of the outlier values, a dictionary
        logging the sampled group output
    """
    # pylint: disable=too-many-locals
    grp_cnt_stats = None  # to inform the user of sampled output

    x = df.columns[0]
    if len(df.columns) == 1:
        df = _calc_box_stats(df[x], x)
    else:
        y = df.columns[1]
        if is_numerical(df[x].dtype) and is_numerical(df[y].dtype):
            minv, maxv, cnt = dask.compute(df[x].min(), df[x].max(),
                                           df[x].nunique())
            if cnt < bins:
                bins = cnt - 1
            endpts = np.linspace(minv, maxv, num=bins + 1)
            # calculate a box plot over each bin
            df = dd.concat(
                [
                    _calc_box_stats(
                        df[(df[x] >= endpts[i]) & (df[x] < endpts[i + 1])][y],
                        f"[{endpts[i]},{endpts[i+1]})",
                    ) if i != len(endpts) - 2 else _calc_box_stats(
                        df[(df[x] >= endpts[i]) & (df[x] <= endpts[i + 1])][y],
                        f"[{endpts[i]},{endpts[i+1]}]",
                    ) for i in range(len(endpts) - 1)
                ],
                axis=1,
            ).compute()
        else:
            df, grp_cnt_stats, largest_grps = _calc_groups(
                df, ngroups, largest)
            # calculate a box plot over each group
            df = dd.concat(
                [
                    _calc_box_stats(df[df[x] == grp][y], grp)
                    for grp in largest_grps
                ],
                axis=1,
            ).compute()

    df = df.append(
        pd.Series(
            {c: i + 1
             for i, c in enumerate(df.columns)},
            name="x",
        )).T
    df.index.name = "grp"
    df = df.reset_index()
    df["x0"], df[
        "x1"] = df["x"] - 0.8, df["x"] - 0.2  # width of whiskers for plotting

    outx: List[str] = []  # list for the outlier groups
    outy: List[float] = []  # list for the outlier values
    for ind in df.index:
        otlrs = df.loc[ind]["otlrs"]
        outx = outx + [df.loc[ind]["grp"]] * len(otlrs)
        outy = outy + otlrs

    return df, outx, outy, grp_cnt_stats
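A hedged sketch of the per-group statistics that _calc_box_stats presumably produces (the real helper lives elsewhere in the module): quartiles, whiskers clipped at 1.5 * IQR, and the outliers beyond them.

import pandas as pd

def box_stats_sketch(srs: pd.Series, grp: str) -> pd.DataFrame:
    q1, q2, q3 = srs.quantile([0.25, 0.5, 0.75])
    iqr = q3 - q1
    lw = srs[srs >= q1 - 1.5 * iqr].min()  # lower whisker
    uw = srs[srs <= q3 + 1.5 * iqr].max()  # upper whisker
    otlrs = srs[(srs < lw) | (srs > uw)].tolist()
    return pd.DataFrame(
        {grp: [q1, q2, q3, lw, uw, otlrs]},
        index=["q1", "q2", "q3", "lw", "uw", "otlrs"],
    )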
Example No. 22
def missing_impact_1vn(  # pylint: disable=too-many-locals
        df: dd.DataFrame, x: str, bins: int) -> Intermediate:
    """
    Calculate the distribution change on other columns when
    the missing values in x are dropped.
    """
    df0 = df
    df1 = df.dropna(subset=[x])
    cols = [col for col in df.columns if col != x]

    hists = {}

    for col in cols:
        range = None  # pylint: disable=redefined-builtin
        if is_numerical(df0[col].dtype):
            range = (df0[col].min(axis=0), df0[col].max(axis=0))

        hists[col] = [
            histogram(df[col], bins=bins, return_edges=True, range=range)
            for df in [df0, df1]
        ]
    (hists, ) = dd.compute(hists)

    dfs = {}

    meta = ColumnsMetadata()

    for col, hists_ in hists.items():
        counts, xs, *edges = zip(*hists_)

        labels = np.repeat(LABELS, [len(x) for x in xs])

        data = {
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": labels,
        }

        if edges:
            lower_bound: List[float] = []
            upper_bound: List[float] = []

            for edge in edges[0]:
                lower_bound.extend(edge[:-1])
                upper_bound.extend(edge[1:])

            data["lower_bound"] = lower_bound
            data["upper_bound"] = upper_bound

        df = pd.DataFrame(data)

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins and is_categorical(df0[col].dtype):
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            meta[col, "partial"] = (bins, len(counts[0]))
        else:
            meta[col, "partial"] = (len(counts[0]), len(counts[0]))

        meta[col, "dtype"] = df0[col].dtype
        dfs[col] = df

    return Intermediate(data=dfs,
                        x=x,
                        meta=meta,
                        visual_type="missing_impact_1vn")
Example No. 23
def write_to_S3(data: dd.DataFrame, bucket_name: str, folder_name: str,
                table_name: str) -> None:
    data.to_parquet(path=f"s3://{bucket_name}/{folder_name}/{table_name}",
                    compression="gzip",
                    engine="pyarrow",
                    overwrite=True)
Example No. 24
    def _apply_offset(self, df: dd.DataFrame, offset: int, end: int) -> dd.DataFrame:
        """
        Limit the dataframe to the window [offset, end].
        That is, unfortunately, not so simple, as we do not know how many
        items we have in each partition. We therefore have no other way than to
        calculate (!!!) the sizes of each partition.

        After that, we can create a new dataframe from the old
        dataframe by calculating for each partition if and how much
        it should be used.
        We do this via generating our own dask computation graph as
        we need to pass the partition number to the selection
        function, which is not possible with normal "map_partitions".
        """
        if not offset:
            # We do a (hopefully) very quick check: if the first partition
            # is already enough, we will just use it
            first_partition_length = len(df.partitions[0])
            if first_partition_length >= end:
                return df.head(end, compute=False)

        # First, we need to find out which partitions we want to use.
        # Therefore we count the total number of entries
        partition_borders = df.map_partitions(lambda x: len(x))

        # Now we let each of the partitions figure out how much it needs to return
        # using these partition borders.
        # For this, we generate our own dask computation graph (as it does not really
        # fit well with any of the already present methods).

        # (a) we define a method to be calculated on each partition
        # This method returns the part of the partition, which falls between [offset, fetch]
        # Please note that the dask object "partition_borders", will be turned into
        # its pandas representation at this point and we can calculate the cumsum
        # (which is not possible on the dask object). Recalculating it should not cost
        # us much, as we assume the number of partitions is rather small.
        @dask.delayed
        def select_from_to(df, partition_index, partition_borders):
            partition_borders = partition_borders.cumsum().to_dict()
            this_partition_border_left = (
                partition_borders[partition_index - 1] if partition_index > 0 else 0
            )
            this_partition_border_right = partition_borders[partition_index]

            if (end and end < this_partition_border_left) or (
                offset and offset >= this_partition_border_right
            ):
                return df.iloc[0:0]

            from_index = max(offset - this_partition_border_left, 0) if offset else 0
            to_index = (
                min(end, this_partition_border_right)
                if end
                else this_partition_border_right
            ) - this_partition_border_left

            return df.iloc[from_index:to_index]

        # (b) Now we just need to apply the function on every partition
        # We do this via the delayed interface, which seems the easiest one.
        return dd.from_delayed(
            [
                select_from_to(partition, partition_number, partition_borders)
                for partition_number, partition in enumerate(df.partitions)
            ]
        )
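A small worked illustration of the border arithmetic in select_from_to, using hypothetical partition lengths [3, 4, 3] with offset=4 and end=8:

partition_lengths = [3, 4, 3]
offset, end = 4, 8

# cumulative borders: [0, 3, 7, 10]
borders = [0]
for length in partition_lengths:
    borders.append(borders[-1] + length)

for i, (left, right) in enumerate(zip(borders, borders[1:])):
    if (end and end < left) or (offset and offset >= right):
        print(f"partition {i}: skipped")
        continue
    from_index = max(offset - left, 0) if offset else 0
    to_index = (min(end, right) if end else right) - left
    print(f"partition {i}: iloc[{from_index}:{to_index}]")

# partition 0: skipped
# partition 1: iloc[1:4]
# partition 2: iloc[0:1]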
Example No. 25
    def _save(self, data: dd.DataFrame) -> None:
        data.to_parquet(self._filepath,
                        storage_options=self.fs_args,
                        **self._save_args)
Example No. 26
def slowly_create_increasing_index(ddf: dd.DataFrame) -> dd.DataFrame:
    ddf['cs'] = 1
    ddf['cs'] = ddf.cs.cumsum()
    return ddf.set_index('cs')
Example No. 27
def missing_impact_1vn(  # pylint: disable=too-many-locals
    df: dd.DataFrame, x: str, bins: int, dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    Calculate the distribution change on other columns when
    the missing values in x are dropped.
    """
    df0 = df
    df1 = df.dropna(subset=[x])
    cols = [col for col in df.columns if col != x]

    hists = {}
    hists_restore_dtype = {}

    for col in cols:
        range = None  # pylint: disable=redefined-builtin
        if is_dtype(detect_dtype(df0[col], dtype), Continuous()):
            range = (df0[col].min(axis=0), df0[col].max(axis=0))

        hists[col] = [
            histogram(df[col], dtype=dtype, bins=bins, return_edges=True, range=range)
            for df in [df0, df1]
        ]

        # In some cases (Issue #98), dd.compute() can change the feature dtypes and cause errors,
        # so we need to restore the feature dtypes after dd.compute().
        centers_dtypes = (hists[col][0][1].dtype, hists[col][1][1].dtype)
        (hists,) = dd.compute(hists)
        dict_value = []

        # Here we do not reassign to the "hists" variable, as
        # dd.compute() can change the variable's type and cause an error in the mypy check on CircleCI.
        # Instead, we assign to a new variable, hists_restore_dtype.
        for i in [0, 1]:
            intermediate = list(hists[col][i])
            intermediate[1] = intermediate[1].astype(centers_dtypes[i])
            dict_value.append(tuple(intermediate))
        hists_restore_dtype[col] = dict_value

    dfs = {}

    meta = ColumnsMetadata()

    for col, hists_ in hists_restore_dtype.items():
        counts, xs, *edges = zip(*hists_)

        labels = np.repeat(LABELS, [len(x) for x in xs])

        data = {
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": labels,
        }

        if edges:
            lower_bound: List[float] = []
            upper_bound: List[float] = []

            for edge in edges[0]:
                lower_bound.extend(edge[:-1])
                upper_bound.extend(edge[1:])

            data["lower_bound"] = lower_bound
            data["upper_bound"] = upper_bound

        df = pd.DataFrame(data)

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins and is_dtype(detect_dtype(df0[col], dtype), Nominal()):
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            meta[col, "partial"] = (bins, len(counts[0]))
        else:
            meta[col, "partial"] = (len(counts[0]), len(counts[0]))
        meta[col, "dtype"] = detect_dtype(df0[col], dtype)
        dfs[col] = df

    return Intermediate(data=dfs, x=x, meta=meta, visual_type="missing_impact_1vn")
Example No. 28
def compute_bivariate(
    df: dd.DataFrame,
    x: str,
    y: str,
    bins: int,
    ngroups: int,
    largest: bool,
    nsubgroups: int,
    timeunit: str,
    agg: str,
    sample_size: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """Compute functions for plot(df, x, y).

    Parameters
    ----------
    df
        Dataframe from which plots are to be generated
    x
        A valid column name from the dataframe
    y
        A valid column name from the dataframe
    bins
        For a histogram or box plot with numerical x axis, it defines
        the number of equal-width bins to use when grouping.
    ngroups
        When grouping over a categorical column, it defines the
        number of groups to show in the plot. I.e., the number of
        bars to show in a bar chart.
    largest
        If true, when grouping over a categorical column, the groups
        with the largest count will be output. If false, the groups
        with the smallest count will be output.
    nsubgroups
        If x and y are categorical columns, ngroups refers to
        how many groups to show from column x, and nsubgroups refers to
        how many subgroups to show from column y in each group in column x.
    timeunit
        Defines the time unit to group values over for a datetime column.
        It can be "year", "quarter", "month", "week", "day", "hour",
        "minute", "second". With default value "auto", it will use the
        time unit such that the resulting number of groups is closest to 15.
    agg
        Specify the aggregate to use when aggregating over a numeric column
    sample_size
        Sample size for the scatter plot
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous"
    """
    # pylint: disable=too-many-arguments,too-many-locals

    xtype = detect_dtype(df[x], dtype)
    ytype = detect_dtype(df[y], dtype)
    if (is_dtype(xtype, Nominal()) and is_dtype(ytype, Continuous())
            or is_dtype(xtype, Continuous()) and is_dtype(ytype, Nominal())):
        x, y = (x, y) if is_dtype(xtype, Nominal()) else (y, x)
        df = df[[x, y]]
        first_rows = df.head()
        try:
            first_rows[x].apply(hash)
        except TypeError:
            df[x] = df[x].astype(str)

        (comps, ) = dask.compute(
            nom_cont_comps(df.dropna(), bins, ngroups, largest))

        return Intermediate(x=x,
                            y=y,
                            data=comps,
                            ngroups=ngroups,
                            visual_type="cat_and_num_cols")
    elif (is_dtype(xtype, DateTime()) and is_dtype(ytype, Continuous())
          or is_dtype(xtype, Continuous()) and is_dtype(ytype, DateTime())):
        x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
        df = df[[x, y]].dropna()
        dtnum: List[Any] = []
        # line chart
        dtnum.append(dask.delayed(_calc_line_dt)(df, timeunit, agg))
        # box plot
        dtnum.append(dask.delayed(calc_box_dt)(df, timeunit))
        dtnum = dask.compute(*dtnum)
        return Intermediate(
            x=x,
            y=y,
            linedata=dtnum[0],
            boxdata=dtnum[1],
            visual_type="dt_and_num_cols",
        )
    elif (is_dtype(xtype, DateTime()) and is_dtype(ytype, Nominal())
          or is_dtype(xtype, Nominal()) and is_dtype(ytype, DateTime())):
        x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
        df = df[[x, y]].dropna()
        df[y] = df[y].apply(str, meta=(y, str))
        dtcat: List[Any] = []
        # line chart
        dtcat.append(
            dask.delayed(_calc_line_dt)(df,
                                        timeunit,
                                        ngroups=ngroups,
                                        largest=largest))
        # stacked bar chart
        dtcat.append(
            dask.delayed(calc_stacked_dt)(df, timeunit, ngroups, largest))
        dtcat = dask.compute(*dtcat)
        return Intermediate(
            x=x,
            y=y,
            linedata=dtcat[0],
            stackdata=dtcat[1],
            visual_type="dt_and_cat_cols",
        )
    elif is_dtype(xtype, Nominal()) and is_dtype(ytype, Nominal()):
        df = df[[x, y]]
        first_rows = df.head()
        try:
            first_rows[x].apply(hash)
        except TypeError:
            df[x] = df[x].astype(str)
        try:
            first_rows[y].apply(hash)
        except TypeError:
            df[y] = df[y].astype(str)

        (comps, ) = dask.compute(df.dropna().groupby([x, y]).size())

        return Intermediate(
            x=x,
            y=y,
            data=comps,
            ngroups=ngroups,
            nsubgroups=nsubgroups,
            visual_type="two_cat_cols",
        )
    elif is_dtype(xtype, Continuous()) and is_dtype(ytype, Continuous()):
        df = df[[x, y]].dropna()

        data: Dict[str, Any] = {}
        # scatter plot data
        data["scat"] = df.map_partitions(
            lambda x: x.sample(min(100, x.shape[0])), meta=df)
        # hexbin plot data
        data["hex"] = df
        # box plot
        data["box"] = calc_box_num(df, bins)

        (data, ) = dask.compute(data)

        return Intermediate(
            x=x,
            y=y,
            data=data,
            spl_sz=sample_size,
            visual_type="two_num_cols",
        )
    else:
        raise UnreachableError
Example No. 29
def calc_line_dt(
    df: dd.DataFrame,
    unit: str,
    agg: Optional[str] = None,
    ngroups: Optional[int] = None,
    largest: Optional[bool] = None,
) -> Union[Tuple[pd.DataFrame, Dict[str, int], str], Tuple[
        pd.DataFrame, str, float], Tuple[pd.DataFrame, str], ]:
    """
    Calculate a line or multiline chart with date on the x axis. If df contains
    one datetime column, it will make a line chart of the frequency of values. If
    df contains a datetime and categorical column, it will compute the frequency
    of each categorical value in each time group. If df contains a datetime and
    numerical column, it will compute the aggregate of the numerical column grouped
    by the time groups. If df contains a datetime, categorical, and numerical column,
    it will compute the aggregate of the numerical column for values in the categorical
    column grouped by time.

    Parameters
    ----------
    df
        A dataframe
    unit
        The unit of time over which to group the values
    agg
        Aggregate to use for the numerical column
    ngroups
        Number of groups for the categorical column
    largest
        Use the largest or smallest groups in the categorical column
    """
    # pylint: disable=too-many-locals

    x = df.columns[0]  # time column
    unit = _get_timeunit(df[x].min(), df[x].max(),
                         100) if unit == "auto" else unit
    if unit not in DTMAP.keys():
        raise ValueError
    grouper = pd.Grouper(key=x,
                         freq=DTMAP[unit][0])  # for grouping the time values

    # multiline charts
    if ngroups and largest:
        hist_dict: Dict[str, Tuple[np.ndarray, np.ndarray, List[str]]] = dict()
        hist_lst: List[Tuple[np.ndarray, np.ndarray, List[str]]] = list()
        agg = ("freq" if agg is None else agg
               )  # default agg if unspecified for notational concision

        # categorical column for grouping over, each resulting group is a line in the chart
        grpby_col = df.columns[1] if len(df.columns) == 2 else df.columns[2]
        df, grp_cnt_stats, largest_grps = _calc_groups(df, grpby_col, ngroups,
                                                       largest)
        groups = df.groupby([grpby_col])

        for grp in largest_grps:
            srs = groups.get_group(grp)
            # calculate the frequencies or aggregate value in each time group
            if len(df.columns) == 3:
                dfr = srs.groupby(grouper)[df.columns[1]].agg(
                    agg).reset_index()
            else:
                dfr = srs[x].to_frame().groupby(grouper).size().reset_index()
            dfr.columns = [x, agg]
            # if grouping by week, make the label for the week the beginning Sunday
            dfr[x] = dfr[x] - pd.to_timedelta(
                6, unit="d") if unit == "week" else dfr[x]
            # format the label
            dfr["lbl"] = dfr[x].dt.to_period("S").dt.strftime(DTMAP[unit][1])
            hist_lst.append((list(dfr[agg]), list(dfr[x]), list(dfr["lbl"])))
        hist_lst = dask.compute(*hist_lst)
        for elem in zip(largest_grps, hist_lst):
            hist_dict[elem[0]] = elem[1]
        return hist_dict, grp_cnt_stats, DTMAP[unit][3]

    # single line charts
    if agg is None:  # frequency of datetime column
        miss_pct = round(df[x].isna().sum() / len(df) * 100, 1)
        dfr = df.dropna().groupby(grouper).size().reset_index()
        dfr.columns = [x, "freq"]
        dfr["pct"] = dfr["freq"] / len(df) * 100
    else:  # aggregate over a second column
        dfr = df.groupby(grouper)[df.columns[1]].agg(agg).reset_index()
        dfr.columns = [x, agg]
    dfr[x] = dfr[x] - pd.to_timedelta(6,
                                      unit="d") if unit == "week" else dfr[x]
    dfr["lbl"] = dfr[x].dt.to_period("S").dt.strftime(DTMAP[unit][1])

    return (dfr, DTMAP[unit][3], miss_pct) if agg is None else (dfr,
                                                                DTMAP[unit][3])
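To illustrate the time grouping used above, a hedged sketch of the single-line "frequency of a datetime column" case (DTMAP in the original maps a unit name to a pandas frequency string such as "W" plus label metadata):

import pandas as pd

df = pd.DataFrame({"ts": pd.date_range("2021-01-01", periods=10, freq="D")})
grouper = pd.Grouper(key="ts", freq="W")  # one bucket per week
dfr = df.groupby(grouper).size().reset_index(name="freq")
dfr["pct"] = dfr["freq"] / len(df) * 100  # share of rows per bucket
print(dfr)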
Example No. 30
def dump_dask_to_intake(dd: DataFrame, data_name: str, data_dir: Union[str, Path], catalog_file: Union[Path, str],
                        **kwargs):
    """
    API to dump dask dataframe as parquet format and add it to intake catalog

    Args:
        dd: dask dataframe to dump.
        data_name: name used as a name of intake data source.
        data_dir: directory where dask dataframe will be stored.
        catalog_file: file where the data source is to be added. If the file doesn't exist, it will be created.
        kwargs: Any options available for dask.dataframe.to_parquet. see https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet for detail.
    Returns:
        created parquet data source and dask to_parquet job (if you put compute=False in kwargs.)
    Examples:
        >>> import os
        >>> import shutil
        >>> import yaml
        >>> from intake.source.csv import CSVSource
        >>> import dask.dataframe
        >>> import pandas
        >>> pd = pandas.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5], 'c': [5, 6, 7, 8],
        ...                        'label': ['a', 'b', 'c', 'd']},
        ...                        index=[100, 200, 300, 400])
        >>> print(pd)
             a  b  c label
        100  1  2  5     a
        200  2  3  6     b
        300  3  4  7     c
        400  4  5  8     d
        >>> dd = dask.dataframe.from_pandas(pd, npartitions=2)
        >>> print(dd)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                           a      b      c   label
        npartitions=2
        100            int64  int64  int64  object
        300              ...    ...    ...     ...
        400              ...    ...    ...     ...
        Dask Name: from_pandas, 2 tasks
        >>> cfile = 'test/temp/test-catalog.yaml'
        >>> ddir = 'test/temp/data-dir'
        >>> # DUMP WITH COMPUTATION
        >>> psource1, job = dump_dask_to_intake(dd, 'test-dd1', ddir, cfile)
        >>> print(psource1.name)
        test-dd1
        >>> print(psource1.read())
        ... # doctest: +NORMALIZE_WHITESPACE
               a  b  c label
        100    1  2  5     a
        200    2  3  6     b
        300    3  4  7     c
        400    4  5  8     d
        >>> print(job is None)
        True
        >>> print(yaml.safe_load(Path(cfile).open().read()))
        ... # doctest: +NORMALIZE_WHITESPACE
        {'metadata': {}, 'sources': {'test-dd1': {'args': {'urlpath': 'test/temp/data-dir/test-dd1'}, 'description': '', 'driver': 'intake_parquet.source.ParquetSource', 'metadata': {}}}}
        >>> # DUMP WITHOUT COMPUTATION
        >>> psource2, job = dump_dask_to_intake(dd, 'test-dd2', ddir, cfile, compute=False)
        >>> print(job is None)
        False
        >>> # do computation lazily
        >>> job.compute()
        >>> print(psource2.read())
        ... # doctest: +NORMALIZE_WHITESPACE
               a  b  c label
        100    1  2  5     a
        200    2  3  6     b
        300    3  4  7     c
        400    4  5  8     d
        >>> os.remove(cfile)
        >>> shutil.rmtree(ddir)
    """
    data_dir = local_or_s3_path(data_dir)

    parquet_dir = data_dir / data_name

    parquet_kwargs = {}
    if 'engine' in kwargs:
        parquet_kwargs['engine'] = kwargs['engine']

    psource = ParquetSource(str(parquet_dir), **parquet_kwargs)
    psource.name = data_name
    add_source_to_catalog(psource, catalog_file)
    to_parquet_result = dd.to_parquet(str(parquet_dir), **kwargs)

    return psource, to_parquet_result