Example #1
# Imports inferred from usage (this snippet is an excerpt; drop_null,
# chisquare, calc_cat_stats and calc_word_freq come from its surrounding
# module and are not shown here):
from typing import Any, Dict

import dask
import dask.dataframe as dd
def nom_comps(
    srs: dd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for plot(df, Nominal())

    Parameters
    ----------
    srs
        one categorical column
    ngroups
        Number of groups to return
    largest
        If True, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """  # pylint: disable=too-many-arguments

    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]
    # cast the column as string type if it contains a mutable type
    first_rows = srs.head()  # dd.Series.head() triggers a (small) data read
    try:
        first_rows.apply(hash)
    except TypeError:
        srs = srs.astype(str)
    # drop null values
    srs = drop_null(srs)

    # persist the cleaned series so the work above is computed once and
    # shared by every downstream aggregation
    (srs,) = dask.persist(srs)

    ## if cfg.bar_enable or cfg.pie_enable
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(
        ngroups)
    ##     if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ##     else
    ##     data["pie"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ##     if cfg.insights.evenness_enable
    data["chisq"] = chisquare(grps.values)

    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        # srs must be a string series to compute the value lengths
        srs = srs.astype(str)
    ## if cfg.stats_enable
    data.update(calc_cat_stats(srs, bins, data["nrows"], data["nuniq"]))
    ## if cfg.word_freq_enable
    data.update(calc_word_freq(srs, top_words, stopword, lemmatize, stem))

    return data
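A minimal usage sketch (not from the original source): nom_comps comes from dataprep-style EDA code, so it relies on the module-level helpers noted in the imports above; the column name and parameter values here are illustrative only.

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"city": ["Tokyo", "Osaka", "Tokyo", "Kyoto", None]})
srs = dd.from_pandas(pdf["city"], npartitions=2)

stats = nom_comps(
    srs,
    ngroups=10,     # keep the 10 most frequent categories
    largest=True,   # largest counts rather than smallest
    bins=20,        # bins for the category-length histogram
    top_words=30,   # words shown in the word cloud / frequency bar chart
    stopword=True,
    lemmatize=False,
    stem=False,
)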
Example #2
from typing import Any, Dict, Optional, Union

import pandas
from dask.dataframe import DataFrame, Series
def dataframe_from_series_of_pandas(series_of_pandas_dataframes: Series,
                                    schema: Optional[Union[pandas.DataFrame, DataFrame, Dict[Any, Any]]] = None):
    """
    Convert a dask.dataframe.Series whose rows each hold a pandas.DataFrame
    into a dask.dataframe.DataFrame.
    If schema is given, it is used as the schema of the result; if it is None,
    the dataframe held in the first row of the first partition is used instead.
    The index of each dataframe inside the series is ignored and overwritten
    with the index of the original series.

    Args:
        series_of_pandas_dataframes: dask series whose rows each hold a pandas dataframe.
        schema: schema of the result dataframe
    Returns: dask dataframe
    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> df = pandas.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> print(df)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b
        0    0  100
        1    1  101
        2    2  102
        3    3  103
        4    4  104
        ..  ..  ...
        95  95  195
        96  96  196
        97  97  197
        98  98  198
        99  99  199
        [100 rows x 2 columns]
        >>> ddf = dask.dataframe.from_pandas(df, npartitions=4)
        >>> def build_df_from_row(row):
        ...     pdf = pandas.DataFrame({'values': [row.a, row.a + 1, row.a + 2], 'support': [row.b, row.b, row.b]})
        ...     return pdf
        >>> df_ds = ddf.apply(build_df_from_row, axis=1)
        >>> print(df_ds)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask Series Structure:
        npartitions=4
        0     object
        25       ...
        50       ...
        75       ...
        99       ...
        dtype: object
        Dask Name: apply, 8 tasks
        >>> result = dataframe_from_series_of_pandas(df_ds)
        >>> print(result)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                      values support
        npartitions=4
        0              int64   int64
        25               ...     ...
        50               ...     ...
        75               ...     ...
        99               ...     ...
        Dask Name: create_pandas_dataframe_in_partition, 12 tasks
        >>> print(result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
            values  support
        0        0      100
        0        1      100
        0        2      100
        1        1      101
        1        2      101
        ..     ...      ...
        98      99      198
        98     100      198
        99      99      199
        99     100      199
        99     101      199
        [300 rows x 2 columns]
    """
    if schema is None:
        schema = series_of_pandas_dataframes.head(1).iloc[0]

    def create_pandas_dataframe_in_partition(series_chunk: pandas.Series):
        # Replace each inner dataframe's index with the index value of the
        # outer series row it came from, then stack the frames vertically.
        # (Series.iteritems and set_axis(..., inplace=True) were removed in
        # pandas 2.0, so build re-indexed copies instead of mutating.)
        frames = [v.set_axis([i] * len(v.index), axis=0) for i, v in series_chunk.items()]
        return pandas.concat(frames, axis=0)

    ddf = series_of_pandas_dataframes.map_partitions(create_pandas_dataframe_in_partition, meta=schema)
    return ddf
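A short follow-up sketch (not in the original): reusing df_ds from the doctest above, an explicit schema can be passed as an empty pandas DataFrame carrying only the expected dtypes, which spares the head(1) read that the None branch performs for inference.

import pandas

explicit_schema = pandas.DataFrame({
    'values': pandas.Series(dtype='int64'),
    'support': pandas.Series(dtype='int64'),
})
result = dataframe_from_series_of_pandas(df_ds, schema=explicit_schema)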