def nom_comps(
    srs: dd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    Aggregate all of the computations required for plot(df, Nominal()).

    Parameters
    ----------
    srs
        One categorical column
    ngroups
        Number of groups to return
    largest
        If True, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        Number of bins for the category length frequency histogram
    top_words
        Number of highest frequency words to show in the wordcloud
        and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing the word frequencies,
        else don't
    stem
        If True, extract the stem of the words before computing the word
        frequencies, else don't
    """
    # pylint: disable=too-many-arguments
    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]

    # cast the column as string type if it contains a mutable type
    first_rows = srs.head()  # dd.Series.head() triggers a (small) data read
    try:
        first_rows.apply(hash)
    except TypeError:
        srs = srs.astype(str)

    # drop null values
    srs = drop_null(srs)
    (srs,) = dask.persist(srs)

    ## if cfg.bar_enable or cfg.pie_enable
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ## if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ## else
    ## data["pie"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)

    ## if cfg.insights.evenness_enable
    data["chisq"] = chisquare(grps.values)

    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths

    ## if cfg.stats_enable
    data.update(calc_cat_stats(srs, bins, data["nrows"], data["nuniq"]))
    ## if cfg.word_freq_enable
    data.update(calc_word_freq(srs, top_words, stopword, lemmatize, stem))

    return data
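# A minimal usage sketch for nom_comps (illustrative only). It assumes the surrounding
# module's imports (dask, dd, drop_null, chisquare, calc_cat_stats, calc_word_freq) are
# in scope; the returned dict holds lazy dask objects that still need dask.compute().
#
#   import dask
#   import dask.dataframe as dd
#   import pandas as pd
#
#   pdf = pd.DataFrame({"fruit": ["apple", "banana", "apple", None, "cherry"]})
#   srs = dd.from_pandas(pdf, npartitions=2)["fruit"]
#   comps = nom_comps(
#       srs, ngroups=10, largest=True, bins=20,
#       top_words=30, stopword=True, lemmatize=False, stem=False,
#   )
#   (comps,) = dask.compute(comps)  # materializes nrows, nuniq, bar, pie, chisq, stats, word freqs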
def dataframe_from_series_of_pandas(
    series_of_pandas_dataframes: Series,
    schema: Optional[Union[pandas.DataFrame, DataFrame, Dict[Any, Any]]] = None,
):
    """
    Convert a dask.dataframe.Series whose rows each hold a pandas.DataFrame into a
    dask.dataframe.DataFrame.

    If schema is given, it is used as the schema of the result; if it is None, the
    dataframe held in the first row of the first partition is used instead.
    The index of each dataframe inside the series is ignored and overwritten with the
    index of the original series.

    Args:
        series_of_pandas_dataframes: dask Series whose rows are pandas DataFrames.
        schema: schema of the result dataframe.

    Returns:
        dask dataframe

    Examples:
        >>> import dask.dataframe
        >>> import pandas
        >>> df = pandas.DataFrame({'a': range(100), 'b': range(100, 200)})
        >>> print(df)
        ... # doctest: +NORMALIZE_WHITESPACE
             a    b
        0    0  100
        1    1  101
        2    2  102
        3    3  103
        4    4  104
        ..  ..  ...
        95  95  195
        96  96  196
        97  97  197
        98  98  198
        99  99  199
        [100 rows x 2 columns]
        >>> ddf = dask.dataframe.from_pandas(df, npartitions=4)
        >>> def build_df_from_row(row):
        ...     pdf = pandas.DataFrame({'values': [row.a, row.a + 1, row.a + 2], 'support': [row.b, row.b, row.b]})
        ...     return pdf
        >>> df_ds = ddf.apply(build_df_from_row, axis=1)
        >>> print(df_ds)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask Series Structure:
        npartitions=4
        0     object
        25       ...
        50       ...
        75       ...
        99       ...
        dtype: object
        Dask Name: apply, 8 tasks
        >>> result = dataframe_from_series_of_pandas(df_ds)
        >>> print(result)
        ... # doctest: +NORMALIZE_WHITESPACE
        Dask DataFrame Structure:
                      values support
        npartitions=4
        0              int64   int64
        25               ...     ...
        50               ...     ...
        75               ...     ...
        99               ...     ...
        Dask Name: create_pandas_dataframe_in_partition, 12 tasks
        >>> print(result.compute())
        ... # doctest: +NORMALIZE_WHITESPACE
            values  support
        0        0      100
        0        1      100
        0        2      100
        1        1      101
        1        2      101
        ..     ...      ...
        98      99      198
        98     100      198
        99      99      199
        99     100      199
        99     101      199
        [300 rows x 2 columns]
    """
    if schema is None:
        # Infer the result's schema from the dataframe stored in the first row.
        schema = series_of_pandas_dataframes.head(1).iloc[0]

    def create_pandas_dataframe_in_partition(series_chunk: pandas.Series) -> pandas.DataFrame:
        # Replace each inner dataframe's index with the index of the series row that
        # holds it, then concatenate the partition's inner dataframes into one.
        frames = [v.set_axis([i] * len(v.index), axis=0) for i, v in series_chunk.items()]
        return pandas.concat(frames, axis=0)

    ddf = series_of_pandas_dataframes.map_partitions(create_pandas_dataframe_in_partition, meta=schema)
    return ddf
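# An optional sketch of the schema argument (illustrative, not part of the doctest above):
# passing an explicit empty pandas.DataFrame as the schema skips the head(1) read that is
# otherwise needed to infer the result's columns and dtypes. df_ds refers to the dask
# Series built in the Examples section.
#
#   meta = pandas.DataFrame({'values': pandas.Series(dtype='int64'),
#                            'support': pandas.Series(dtype='int64')})
#   result = dataframe_from_series_of_pandas(df_ds, schema=meta)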