import math
from typing import Any, Dict

import dask
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd

# NOTE (assumed context): the statistical helpers used below (chisquare,
# normaltest, skewtest, skew, kurtosis, gaussian_kde) and the Config object
# are expected to be provided by the surrounding package as dask-aware
# implementations (e.g. dask.array.stats.chisquare / normaltest / skewtest,
# and a delayed scipy.stats.gaussian_kde), as are the helpers _calc_box,
# _calc_nom_stats, _calc_word_freq, calc_box, calc_cat_stats and
# calc_word_freq referenced further down.


def _nom_calcs(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a nominal column in plot(df)
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    # value counts for the bar chart and uniformity insight
    grps = srs.value_counts(sort=False)

    if cfg.bar.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )
        data["nuniq"] = grps.shape[0]

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)  # chi-squared test for uniformity
        data["nuniq"] = grps.shape[0]  # number of unique values
        data["npres"] = grps.sum()  # number of present (not null) values
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
        data["min_len"], data["max_len"] = srs.str.len().min(), srs.str.len().max()

    return data
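# Illustrative usage sketch (hypothetical, not part of the library): shows how
# the lazy results of _nom_calcs are evaluated with a single dask.compute call.
# The SimpleNamespace stub stands in for the real Config object and exposes
# only the attributes _nom_calcs reads.
def _example_nom_calcs() -> None:
    from types import SimpleNamespace as NS

    pdf = pd.Series(["cat", "dog", "cat", "bird", "dog", "dog"], name="animal")
    srs = dd.from_pandas(pdf, npartitions=2)
    cfg = NS(bar=NS(enable=True, bars=3, sort_descending=True), insight=NS(enable=False))
    (data,) = dask.compute(_nom_calcs(srs, pdf.head(), cfg))
    print(data["nuniq"], data["bar"])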
def _cont_calcs(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    Computations for a continuous column in plot(df)
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    if cfg.insight.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values

    # drop infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # histogram
    data["hist"] = da.histogram(srs, bins=cfg.hist.bins, range=(srs.min(), srs.max()))

    if cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])
        data["norm"] = normaltest(data["hist"][0])
        data["skew"] = skewtest(data["hist"][0])
        data["nneg"] = (srs < 0).sum()  # number of negative values
        data["nuniq"] = srs.nunique_approx()  # number of unique values
        data["nzero"] = (srs == 0).sum()  # number of zeros
        data["nreals"] = srs.shape[0]  # number of non-inf values

    return data
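# Illustrative usage sketch (hypothetical, not part of the library). The stub
# config stands in for the real Config object, and the example assumes, as the
# code above does, that da.histogram accepts the lazy series and range. The
# "hist" entry comes back as a (counts, bin_edges) pair.
def _example_cont_calcs() -> None:
    from types import SimpleNamespace as NS

    srs = dd.from_pandas(pd.Series([1.0, 2.5, np.inf, -3.0, 0.0, 2.5]), npartitions=2)
    cfg = NS(insight=NS(enable=False), hist=NS(bins=5))
    (data,) = dask.compute(_cont_calcs(srs, cfg))
    counts, edges = data["hist"]
    print(counts.sum(), edges[0], edges[-1])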
def calc_nom_col(
    srs: dd.Series, first_rows: pd.Series, ngroups: int, largest: bool
) -> Dict[str, Any]:
    """
    Computations for a categorical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the bar chart and insights
    first_rows
        first rows of the dataset read into memory
    ngroups
        number of groups to show in the bar chart
    largest
        whether to show the largest or smallest groups
    """
    # dictionary of data for the bar chart and related insights
    data: Dict[str, Any] = {}

    ## if cfg.barchart_enable or cfg.insight.uniform_enable:
    grps = srs.value_counts(sort=False)

    ## if cfg.barchart_enable:
    ##     nbars = cfg.barchart_nbars
    ##     largest = cfg.barchart_largest
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)

    ## if cfg.insight.uniform_enable:
    # compute a chi-squared test on the frequency distribution
    data["chisq"] = chisquare(grps.values)

    ## if cfg.barchart_enable or cfg.insight.unique_enable:
    # total number of groups
    data["nuniq"] = grps.shape[0]

    ## if cfg.insight.missing_enable:
    # number of present (not null) values
    data["npres"] = grps.sum()

    ## if cfg.insight.unique_enable and not cfg.barchart_enable:
    ##     data["nuniq"] = srs.nunique()
    ## if cfg.insight.missing_enable and not cfg.barchart_enable:
    ##     data["npres"] = srs.shape[0]

    ## if cfg.insight.constant_length_enable:
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths
    length = srs.str.len()
    data["min_len"], data["max_len"] = length.min(), length.max()

    return data
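# Illustrative usage sketch (hypothetical, not part of the library): the
# parameter-driven variant needs no Config, and all returned values stay lazy
# until the single dask.compute call at the end.
def _example_calc_nom_col() -> None:
    pdf = pd.Series(["a", "b", "a", "c", "a", None], name="letters")
    srs = dd.from_pandas(pdf, npartitions=2).dropna()
    (data,) = dask.compute(calc_nom_col(srs, pdf.head(), ngroups=2, largest=True))
    print(data["nuniq"], data["npres"], data["min_len"], data["max_len"])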
def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    Computations for a numerical column in plot(df)

    Parameters
    ----------
    srs
        srs over which to compute the histogram and insights
    bins
        number of bins in the histogram
    """
    # dictionary of data for the histogram and related insights
    data: Dict[str, Any] = {}

    ## if cfg.insight.missing_enable:
    data["npres"] = srs.shape[0]

    ## if cfg.insight.infinity_enable:
    is_inf_srs = srs.isin({np.inf, -np.inf})
    data["ninf"] = is_inf_srs.sum()

    # remove infinite values
    srs = srs[~is_inf_srs]

    ## if cfg.hist_enable or cfg.insight.uniform_enable or cfg.insight.normal_enable:
    ##     bins = cfg.hist_bins
    data["hist"] = da.histogram(srs, bins=bins, range=[srs.min(), srs.max()])

    ## if cfg.insight.uniform_enable:
    data["chisq"] = chisquare(data["hist"][0])

    ## if cfg.insight.normal_enable:
    data["norm"] = normaltest(data["hist"][0])

    ## if cfg.insight.negative_enable:
    data["nneg"] = (srs < 0).sum()

    ## if cfg.insight.skew_enable:
    data["skew"] = skewtest(data["hist"][0])

    ## if cfg.insight.unique_enable:
    data["nuniq"] = srs.nunique()

    ## if cfg.insight.zero_enable:
    data["nzero"] = (srs == 0).sum()

    return data
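# Illustrative usage sketch (hypothetical, not part of the library): shows the
# infinity/negative/zero counts. The bin count is kept at 20 because the
# normality and skewness tests above run on the histogram counts and need a
# reasonable sample size to be meaningful.
def _example_calc_cont_col() -> None:
    vals = pd.Series(np.r_[np.arange(50, dtype=float), [-1.0, 0.0, np.inf]])
    srs = dd.from_pandas(vals, npartitions=2)
    (data,) = dask.compute(calc_cont_col(srs, bins=20))
    print(data["ninf"], data["nneg"], data["nzero"])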
def cont_comps(srs: dd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Continuous)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    if cfg.stats.enable or cfg.hist.enable:
        data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()
    if cfg.stats.enable:
        data["npres"] = srs.shape[0]  # number of present (not null) values
    srs = srs[~srs.isin({np.inf, -np.inf})]  # remove infinite values

    if cfg.hist.enable or (cfg.qqnorm.enable and cfg.insight.enable):
        data["hist"] = da.histogram(srs, cfg.hist.bins, (srs.min(), srs.max()))
        if cfg.insight.enable:
            data["norm"] = normaltest(data["hist"][0])
    if cfg.hist.enable and cfg.insight.enable:
        data["chisq"] = chisquare(data["hist"][0])

    # compute only the required quantiles
    if cfg.qqnorm.enable:
        data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    elif cfg.stats.enable:
        data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    elif cfg.box.enable:
        data["qntls"] = srs.quantile([0.25, 0.5, 0.75])

    if cfg.stats.enable or (cfg.hist.enable and cfg.insight.enable):
        data["skew"] = skew(srs)
    if cfg.stats.enable or cfg.qqnorm.enable:
        data["mean"] = srs.mean()
        data["std"] = srs.std()
    if cfg.stats.enable:
        data["min"] = srs.min()
        data["max"] = srs.max()
        data["nreals"] = srs.shape[0]
        data["nzero"] = (srs == 0).sum()
        data["nneg"] = (srs < 0).sum()
        data["kurt"] = kurtosis(srs)
        data["mem_use"] = srs.memory_usage(deep=True)

    # compute the density histogram
    if cfg.kde.enable:
        # To avoid the singular matrix problem, gaussian_kde needs a non-zero std.
        # Compute the extrema from srs directly so this works even when the
        # stats section (and hence data["min"]/data["max"]) is disabled.
        smin, smax = dask.compute(srs.min(), srs.max())
        if not math.isclose(smin, smax):
            data["dens"] = da.histogram(srs, cfg.kde.bins, (srs.min(), srs.max()), density=True)
            # gaussian kernel density estimate over a sample of each partition
            data["kde"] = gaussian_kde(
                srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
            )
        else:
            data["kde"] = None

    if cfg.box.enable:
        data.update(_calc_box(srs, data["qntls"], cfg))

    if cfg.value_table.enable:
        value_counts = srs.value_counts(sort=False)
        if cfg.stats.enable:
            data["nuniq"] = value_counts.shape[0]
        data["value_table"] = value_counts.nlargest(cfg.value_table.ngroups)
    elif cfg.stats.enable:
        data["nuniq"] = srs.nunique_approx()

    return data
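# Illustrative usage sketch (hypothetical, not part of the library): only the
# stats and histogram sections are enabled, so the quantiles, skew, mean/std
# and approximate unique count are all evaluated in one graph pass. The
# SimpleNamespace stub is a stand-in exposing only the attributes cont_comps
# reads; the real Config object should be used in practice.
def _example_cont_comps_cfg() -> None:
    from types import SimpleNamespace as NS

    cfg = NS(
        stats=NS(enable=True),
        hist=NS(enable=True, bins=20),
        qqnorm=NS(enable=False),
        insight=NS(enable=False),
        box=NS(enable=False),
        kde=NS(enable=False),
        value_table=NS(enable=False),
    )
    srs = dd.from_pandas(pd.Series(np.random.randn(1_000)), npartitions=4)
    (data,) = dask.compute(cont_comps(srs, cfg))
    print(data["mean"], data["std"], data["nuniq"])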
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = {}

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (
            grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending else grps.nsmallest(cfg.bar.bars)
        )

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:
            data["pie"] = (
                grps.nlargest(cfg.pie.slices)
                if cfg.pie.sort_descending
                else grps.nsmallest(cfg.pie.slices)
            )

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

    if cfg.insight.enable:
        data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(str)  # srs must be a string to compute the value lengths
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {"Minimum": srs.str.len().min(), "Maximum": srs.str.len().max()}

    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins, (lens.min(), lens.max()))

    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        # compute the word frequencies once when the word cloud and word
        # frequency parameters match, otherwise compute them separately
        if all(
            getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
            for att in ("top_words", "stopword", "stem", "lemmatize")
        ):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs_cloud["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
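# Illustrative usage sketch (hypothetical, not part of the library): with
# matching bar and pie parameters, nom_comps reuses the bar groups for the pie
# chart and the value table rather than recomputing them. The SimpleNamespace
# stub is a stand-in exposing only the attributes nom_comps reads here.
def _example_nom_comps_cfg() -> None:
    from types import SimpleNamespace as NS

    cfg = NS(
        stats=NS(enable=False),
        bar=NS(enable=True, bars=3, sort_descending=True),
        pie=NS(enable=False, slices=3, sort_descending=True),
        value_table=NS(enable=False, ngroups=3),
        insight=NS(enable=False),
        wordlen=NS(enable=False),
        wordcloud=NS(enable=False),
        wordfreq=NS(enable=False),
    )
    pdf = pd.Series(["x", "y", "x", "z", "x", "y"], name="label")
    srs = dd.from_pandas(pdf, npartitions=2)
    (data,) = dask.compute(nom_comps(srs, pdf.head(), cfg))
    print(data["bar"])  # data["pie"] and data["value_table"] reuse these groups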
def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for
    plot(df, Continuous())

    Parameters
    ----------
    srs
        one numerical column
    bins
        the number of bins in the histogram
    """
    data: Dict[str, Any] = {}

    ## if cfg.stats_enable or cfg.hist_enable:
    # calculate the total number of rows, then drop the missing values
    data["nrows"] = srs.shape[0]
    srs = srs.dropna()
    ## if cfg.stats_enable:
    # number of not null (present) values
    data["npres"] = srs.shape[0]
    # remove infinite values
    srs = srs[~srs.isin({np.inf, -np.inf})]

    # shared computations
    ## if cfg.stats_enable or cfg.hist_enable or (cfg.qqplot_enable and cfg.insights_enable):
    data["min"], data["max"] = srs.min(), srs.max()
    ## if cfg.hist_enable or (cfg.qqplot_enable and cfg.insights_enable):
    data["hist"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]])
    ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
    data["norm"] = normaltest(data["hist"][0])
    ## if cfg.qqplot_enable:
    data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))
    ## elif cfg.stats_enable:
    ##     data["qntls"] = srs.quantile([0.05, 0.25, 0.5, 0.75, 0.95])
    ## elif cfg.boxplot_enable:
    ##     data["qntls"] = srs.quantile([0.25, 0.5, 0.75])
    ## if cfg.stats_enable or (cfg.hist_enable and cfg.insights_enable):
    data["skew"] = skew(srs)

    ## if cfg.stats_enable:
    data["nuniq"] = srs.nunique()
    data["nreals"] = srs.shape[0]
    data["nzero"] = (srs == 0).sum()
    data["nneg"] = (srs < 0).sum()
    data["mean"] = srs.mean()
    data["std"] = srs.std()
    data["kurt"] = kurtosis(srs)
    data["mem_use"] = srs.memory_usage(deep=True)

    ## if cfg.hist_enable and cfg.insights_enable:
    data["chisq"] = chisquare(data["hist"][0])

    # compute the density histogram
    data["dens"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]], density=True)
    # gaussian kernel density estimate
    data["kde"] = gaussian_kde(
        srs.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=srs)
    )

    ## if cfg.box_enable:
    data.update(calc_box(srs, data["qntls"]))

    return data
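# Illustrative usage sketch (hypothetical, not part of the library) for the
# parameter-driven cont_comps variant above. Everything, including the KDE and
# the box plot stats, is evaluated in one compute pass; this assumes the
# calc_box and gaussian_kde helpers noted at the top of the module exist.
def _example_cont_comps_params() -> None:
    srs = dd.from_pandas(pd.Series(np.random.randn(500)), npartitions=4)
    (data,) = dask.compute(cont_comps(srs, bins=25))
    print(data["mean"], data["skew"], data["nuniq"])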
def nom_comps(
    srs: dd.Series,
    first_rows: pd.Series,
    ngroups: int,
    largest: bool,
    bins: int,
    top_words: int,
    stopword: bool,
    lemmatize: bool,
    stem: bool,
) -> Dict[str, Any]:
    """
    This function aggregates all of the computations required for
    plot(df, Nominal())

    Parameters
    ----------
    srs
        one categorical column
    first_rows
        first rows of the dataset read into memory
    ngroups
        number of groups to return
    largest
        if True, show the groups with the largest count,
        else show the groups with the smallest count
    bins
        number of bins for the category length frequency histogram
    top_words
        number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        if True, remove stop words, else keep them
    lemmatize
        if True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        if True, extract the stem of the words before
        computing the word frequencies, else don't
    """
    # pylint: disable=too-many-arguments
    data: Dict[str, Any] = {}

    # total rows
    data["nrows"] = srs.shape[0]
    # drop null values
    srs = srs.dropna()

    ## if cfg.bar_enable or cfg.pie_enable:
    # counts of unique values in the series
    grps = srs.value_counts(sort=False)
    # total number of groups
    data["nuniq"] = grps.shape[0]
    # select the largest or smallest groups
    data["bar"] = grps.nlargest(ngroups) if largest else grps.nsmallest(ngroups)
    ## if cfg.barchart_bars == cfg.piechart_slices:
    data["pie"] = data["bar"]
    ## else:
    ##     data["pie"] = grps.nlargest(cfg.piechart_slices) if largest else grps.nsmallest(cfg.piechart_slices)

    ## if cfg.insights.evenness_enable:
    data["chisq"] = chisquare(grps.values)

    ## if cfg.stats_enable:
    df = grps.reset_index()

    ## if cfg.stats_enable or cfg.word_freq_enable:
    if not first_rows.apply(lambda x: isinstance(x, str)).all():
        srs = srs.astype(str)  # srs must be a string to compute the value lengths
        df[df.columns[0]] = df[df.columns[0]].astype(str)

    data.update(calc_cat_stats(srs, df, bins, data["nrows"], data["nuniq"]))
    ## if cfg.word_freq_enable:
    data.update(calc_word_freq(df, top_words, stopword, lemmatize, stem))

    return data
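# Illustrative usage sketch (hypothetical, not part of the library) for the
# parameter-driven nom_comps variant above; it assumes the calc_cat_stats and
# calc_word_freq helpers noted at the top of the module exist. Stopword
# removal is on, lemmatization and stemming are off.
def _example_nom_comps_params() -> None:
    pdf = pd.Series(["red apple", "green apple", "ripe banana", "red apple"], name="fruit")
    srs = dd.from_pandas(pdf, npartitions=2)
    (data,) = dask.compute(
        nom_comps(
            srs,
            first_rows=pdf.head(),
            ngroups=3,
            largest=True,
            bins=10,
            top_words=5,
            stopword=True,
            lemmatize=False,
            stem=False,
        )
    )
    print(data["bar"], data["nuniq"])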