def describe_date_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Add date-specific statistics to the description of a series.

    The summary is updated in place with ``min``, ``max``, ``range``, the
    histogram entries returned by ``histogram_compute`` and, when the
    configured chi-squared threshold is positive, ``chi_squared``.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already provide ``n_distinct``.

    Returns:
        A tuple ``(values, summary)`` where ``values`` is the series data cast
        to int64 and floor-divided by 10**9, and ``summary`` is the updated
        description dict.
    """
    threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)

    earliest = pd.Timestamp.to_pydatetime(series.min())
    latest = pd.Timestamp.to_pydatetime(series.max())
    summary["min"] = earliest
    summary["max"] = latest
    summary["range"] = latest - earliest

    # Integer view of the data, reduced by a factor of 10**9
    # (presumably nanosecond timestamps turned into whole seconds - confirm).
    epoch_values = series.values.astype(np.int64) // 10**9

    if threshold > 0.0:
        summary["chi_squared"] = chi_square(epoch_values)

    histogram = histogram_compute(epoch_values, summary["n_distinct"])
    summary.update(histogram)

    return epoch_values, summary
def describe_categorical_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Add categorical statistics to the description of a series.

    The summary is updated in place. Depending on the ``vars.cat`` settings it
    receives a frequency histogram, the first rows, a chi-squared statistic,
    length statistics, character statistics and word statistics.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already provide ``value_counts_without_nan`` and ``n_distinct``.

    Returns:
        A tuple ``(series, summary)`` where ``series`` is the input converted
        to strings and ``summary`` is the updated description dict.
    """
    # Work on the string representation of every value (Issue #100).
    series = series.astype(str)

    counts = summary["value_counts_without_nan"]
    frequency_histogram = histogram_compute(
        counts, summary["n_distinct"], name="histogram_frequencies"
    )
    summary.update(frequency_histogram)

    cat_settings = config["vars"]["cat"]

    # NOTE(review): the flag is fetched with the ``float`` template although it
    # is only used for its truthiness - confirm this is intended.
    if not cat_settings["redact"].get(float):
        summary["first_rows"] = series.head(5)

    if config["vars"]["num"]["chi_squared_threshold"].get(float) > 0.0:
        summary["chi_squared"] = chi_square(histogram=counts.values)

    if cat_settings["length"].get(bool):
        summary.update(length_summary(series))
        lengths = summary["length"]
        summary.update(
            histogram_compute(lengths, lengths.nunique(), name="histogram_length")
        )

    if cat_settings["characters"].get(bool):
        summary.update(unicode_summary(series))
        # Keep the previous "n_characters" value under a new key, then replace
        # it with the sum of all character counts.
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        # Best effort: an index without the ``.str`` accessor is left as is.
        try:
            alias_counts = summary["category_alias_counts"]
            alias_counts.index = alias_counts.index.str.replace("_", " ")
        except AttributeError:
            pass

    if cat_settings["words"]:
        summary.update(word_summary(series))

    return series, summary
def describe_numeric_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Add numeric statistics to the description of a series.

    The summary is updated in place with counts and proportions of zero,
    negative and infinite values, ``mad``, ``range``, one entry per configured
    quantile, ``iqr``, ``cv``, monotonicity flags, the histogram entries
    returned by ``histogram_compute`` and, when the configured chi-squared
    threshold is positive, ``chi_squared``.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already provide ``value_counts_without_nan``, ``n`` and
            ``n_distinct``.

    Returns:
        A tuple ``(series, summary)`` with the unchanged input series and the
        updated description dict (the same object that was passed in).

    Raises:
        KeyError: If the configured quantiles do not include 0.25 and 0.75,
            or the statistics helpers do not provide ``min``, ``max``,
            ``std`` and ``mean``.
    """
    # Config
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    value_counts = summary["value_counts_without_nan"]

    summary["n_zeros"] = 0

    negative_index = value_counts.index < 0
    summary["n_negative"] = value_counts.loc[negative_index].sum()
    summary["p_negative"] = summary["n_negative"] / summary["n"]

    infinity_index = value_counts.index.isin([np.inf, -np.inf])
    summary["n_infinite"] = value_counts.loc[infinity_index].sum()

    if 0 in value_counts.index:
        summary["n_zeros"] = value_counts.loc[0]

    if isinstance(series.dtype, _IntegerDtype):
        # Extension integer dtype: compute the statistics through pandas and
        # cast to the lower-cased dtype name for the remaining calculations.
        summary.update(numeric_stats_pandas(series))
        present_values = series.astype(str(series.dtype).lower())
        finite_values = present_values
    else:
        present_values = series.values
        finite_values = present_values[np.isfinite(present_values)]
        summary.update(numeric_stats_numpy(present_values, series, summary))

    summary["mad"] = mad(present_values)

    if chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(finite_values)

    summary["range"] = summary["max"] - summary["min"]
    summary.update(
        {
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(quantiles).to_dict().items()
        }
    )
    summary["iqr"] = summary["75%"] - summary["25%"]
    # ``np.nan`` instead of ``np.NaN``: the capitalised alias was removed in
    # NumPy 2.0 and would raise AttributeError whenever the mean is zero.
    summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.nan
    summary["p_zeros"] = summary["n_zeros"] / summary["n"]
    summary["p_infinite"] = summary["n_infinite"] / summary["n"]

    summary["monotonic_increase"] = series.is_monotonic_increasing
    summary["monotonic_decrease"] = series.is_monotonic_decreasing

    # Strict monotonicity additionally requires that no value is repeated.
    summary["monotonic_increase_strict"] = (
        summary["monotonic_increase"] and series.is_unique
    )
    summary["monotonic_decrease_strict"] = (
        summary["monotonic_decrease"] and series.is_unique
    )

    # The histogram is built from the distinct non-infinite values, weighted
    # by how often each one occurs.
    finite_counts = value_counts[~infinity_index]
    summary.update(
        histogram_compute(
            finite_counts.index.values,
            summary["n_distinct"],
            weights=finite_counts.values,
        )
    )

    return series, summary
def describe_categorical_1d(
    config: Settings, series: pd.Series, summary: dict
) -> Tuple[Settings, pd.Series, dict]:
    """Add categorical statistics to the description of a series.

    The summary is updated in place. Depending on ``config.vars.cat`` it
    receives a frequency histogram (optionally limited to the most frequent
    values), the first rows, a chi-squared statistic, length statistics,
    character statistics and word statistics.

    Args:
        config: report Settings
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already provide ``value_counts_without_nan`` and ``n_distinct``.

    Returns:
        A tuple ``(config, series, summary)`` with the unchanged settings, the
        series converted to strings and the updated description dict.
    """
    # Work on the string representation of every value (Issue #100).
    series = series.astype(str)

    counts = summary["value_counts_without_nan"]
    cat_settings = config.vars.cat

    # A positive limit restricts the frequency histogram to the largest counts.
    top_n = cat_settings.histogram_largest
    histogram_source = counts.nlargest(top_n) if top_n > 0 else counts
    frequency_histogram = histogram_compute(
        config,
        histogram_source,
        summary["n_distinct"],
        name="histogram_frequencies",
    )
    summary.update(frequency_histogram)

    if not cat_settings.redact:
        summary["first_rows"] = series.head(5)

    if config.vars.num.chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=counts.values)

    if cat_settings.length:
        summary.update(length_summary(series))
        lengths = summary["length"]
        summary.update(
            histogram_compute(
                config,
                lengths,
                lengths.nunique(),
                name="histogram_length",
            )
        )

    if cat_settings.characters:
        summary.update(unicode_summary(series))
        # Keep the previous "n_characters" value under a new key, then replace
        # it with the sum of all character counts.
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        # Best effort: an index without the ``.str`` accessor is left as is.
        with contextlib.suppress(AttributeError):
            alias_counts = summary["category_alias_counts"]
            alias_counts.index = alias_counts.index.str.replace("_", " ")

    if cat_settings.words:
        summary.update(word_summary(series))

    return config, series, summary