def describe_categorical_1d(series: pd.Series, summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a categorical series.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already hold ``value_counts_without_nan`` and ``n_distinct``, and
            it is updated in place.

    Returns:
        A tuple ``(series, summary)``: the series cast to ``str`` and the
        ``summary`` dict extended with the values calculated here.
    """

    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    value_counts = summary["value_counts_without_nan"]

    summary.update(
        histogram_compute(
            value_counts, summary["n_distinct"], name="histogram_frequencies"
        )
    )

    # ``redact`` is an on/off switch, so read it as a bool like the other
    # categorical flags below. When it is set, no sample rows are stored.
    redact = config["vars"]["cat"]["redact"].get(bool)
    if not redact:
        summary.update({"first_rows": series.head(5)})

    # A threshold of 0 (or less) disables the chi-squared computation.
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=value_counts.values)

    check_length = config["vars"]["cat"]["length"].get(bool)
    if check_length:
        summary.update(length_summary(series))
        summary.update(
            histogram_compute(
                summary["length"], summary["length"].nunique(), name="histogram_length"
            )
        )

    check_unicode = config["vars"]["cat"]["characters"].get(bool)
    if check_unicode:
        summary.update(unicode_summary(series))
        # Keep the previous ``n_characters`` value under a more specific key,
        # then redefine ``n_characters`` as the sum of all character counts.
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        # Best effort: show alias names with spaces instead of underscores.
        # An index without the ``.str`` accessor is left as it is.
        try:
            summary["category_alias_counts"].index = summary[
                "category_alias_counts"
            ].index.str.replace("_", " ")
        except AttributeError:
            pass

    # Resolve the setting to a plain bool instead of relying on the
    # truthiness of the config view object.
    check_words = config["vars"]["cat"]["words"].get(bool)
    if check_words:
        summary.update(word_summary(series))

    return series, summary
Пример #2
0
def describe_date_1d(series: pd.Series,
                     summary: dict) -> Tuple[pd.Series, dict]:
    """Compute descriptive values for a date series.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already hold ``n_distinct`` and is updated in place.

    Returns:
        A tuple ``(values, summary)``. ``values`` is the series' underlying
        array cast to ``int64`` and floor-divided by ``10**9`` (note: this is
        not the input Series); ``summary`` is the updated description dict.
    """
    threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)

    earliest = pd.Timestamp.to_pydatetime(series.min())
    latest = pd.Timestamp.to_pydatetime(series.max())
    summary["min"] = earliest
    summary["max"] = latest
    summary["range"] = latest - earliest

    # Integer view of the timestamps scaled down by 1e9
    # (presumably nanoseconds -> seconds; assumes datetime64[ns] -- confirm).
    as_integers = series.values.astype(np.int64)
    values = as_integers // 10**9

    # A threshold of 0 (or less) disables the chi-squared computation.
    if threshold > 0.0:
        summary["chi_squared"] = chi_square(values)

    histogram = histogram_compute(values, summary["n_distinct"])
    summary.update(histogram)
    return values, summary
Пример #3
0
def describe_file_1d(series: pd.Series,
                     summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a file series.

    Args:
        series: The Series to describe. It must not contain missing values
            and must expose the ``.str`` accessor.
        summary: The dict containing the series description so far; it is
            updated in place.

    Returns:
        A tuple ``(series, summary)``: the unchanged series and the
        ``summary`` dict extended with the file summary and a
        ``histogram_file_size`` histogram.

    Raises:
        ValueError: If the series contains missing values or has no ``.str``
            accessor.
    """
    # Explicit checks rather than ``assert`` so the validation still runs
    # when Python is started with ``-O`` (which strips assert statements).
    if series.hasnans:
        raise ValueError("May not contain NaNs")
    if not hasattr(series, "str"):
        raise ValueError("series should have .str accessor")

    summary.update(file_summary(series))

    file_sizes = summary["file_size"]
    summary.update(
        histogram_compute(
            file_sizes,
            file_sizes.nunique(),
            name="histogram_file_size",
        ))

    return series, summary
Пример #4
0
def describe_numeric_1d(series: pd.Series,
                        summary: dict) -> Tuple[pd.Series, dict]:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already hold ``value_counts_without_nan``, ``n`` and
            ``n_distinct``, and it is updated in place.

    Returns:
        A tuple ``(series, summary)``: the unchanged series and the
        ``summary`` dict extended with the values calculated here.
    """

    # Config
    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float)
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    value_counts = summary["value_counts_without_nan"]

    # Counts derived from the value counts (index = value, data = frequency).
    summary["n_zeros"] = 0
    negative_index = value_counts.index < 0
    summary["n_negative"] = value_counts.loc[negative_index].sum()
    summary["p_negative"] = summary["n_negative"] / summary["n"]

    infinity_values = [np.inf, -np.inf]
    infinity_index = value_counts.index.isin(infinity_values)
    summary["n_infinite"] = value_counts.loc[infinity_index].sum()

    if 0 in value_counts.index:
        summary["n_zeros"] = value_counts.loc[0]

    if isinstance(series.dtype, _IntegerDtype):
        # Pandas extension integer dtype: cast to the dtype named by the
        # lower-cased dtype string and use every value for chi-squared.
        summary.update(numeric_stats_pandas(series))
        present_values = series.astype(str(series.dtype).lower())
        finite_values = present_values
    else:
        present_values = series.values
        finite_values = present_values[np.isfinite(present_values)]
        summary.update(numeric_stats_numpy(present_values, series, summary))

    summary.update({
        "mad": mad(present_values),
    })

    # A threshold of 0 (or less) disables the chi-squared computation.
    if chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(finite_values)

    # NOTE(review): "min", "max", "mean" and "std" are expected to be in
    # ``summary`` by now, presumably set by the numeric_stats_* helpers.
    summary["range"] = summary["max"] - summary["min"]
    # Quantiles are keyed by their percentage label, e.g. 0.25 -> "25%".
    summary.update({
        f"{percentile:.0%}": value
        for percentile, value in series.quantile(quantiles).to_dict().items()
    })
    # Requires 0.25 and 0.75 to be among the configured quantiles.
    summary["iqr"] = summary["75%"] - summary["25%"]
    # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed in
    # NumPy 2.0. A zero mean leaves the coefficient of variation undefined.
    summary["cv"] = (summary["std"] / summary["mean"]
                     if summary["mean"] else np.nan)
    summary["p_zeros"] = summary["n_zeros"] / summary["n"]
    summary["p_infinite"] = summary["n_infinite"] / summary["n"]

    summary["monotonic_increase"] = series.is_monotonic_increasing
    summary["monotonic_decrease"] = series.is_monotonic_decreasing

    # Strict monotonicity additionally requires all values to be unique.
    summary["monotonic_increase_strict"] = (summary["monotonic_increase"]
                                            and series.is_unique)
    summary["monotonic_decrease_strict"] = (summary["monotonic_decrease"]
                                            and series.is_unique)

    # Infinite values are left out of the histogram; the remaining distinct
    # values are weighted by how often they occur.
    finite_counts = value_counts[~infinity_index]
    summary.update(
        histogram_compute(
            finite_counts.index.values,
            summary["n_distinct"],
            weights=finite_counts.values,
        ))

    return series, summary
def describe_categorical_1d(config: Settings, series: pd.Series,
                            summary: dict) -> Tuple[Settings, pd.Series, dict]:
    """Compute descriptive values for a categorical series.

    Args:
        config: report Settings
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already hold ``value_counts_without_nan`` and ``n_distinct``, and
            it is updated in place.

    Returns:
        A tuple ``(config, series, summary)``: the settings as passed in, the
        series cast to ``str``, and the updated ``summary`` dict.
    """

    # Work on string values only (Issue #100)
    series = series.astype(str)

    counts = summary["value_counts_without_nan"]

    # A positive ``histogram_largest`` limits the frequency histogram to that
    # many of the largest counts; otherwise all counts are used.
    top_n = config.vars.cat.histogram_largest
    frequencies = counts.nlargest(top_n) if top_n > 0 else counts

    frequency_histogram = histogram_compute(
        config,
        frequencies,
        summary["n_distinct"],
        name="histogram_frequencies",
    )
    summary.update(frequency_histogram)

    # Sample rows are only stored when redaction is off.
    if not config.vars.cat.redact:
        summary["first_rows"] = series.head(5)

    # A threshold of 0 (or less) disables the chi-squared computation.
    if config.vars.num.chi_squared_threshold > 0.0:
        summary["chi_squared"] = chi_square(histogram=counts.values)

    if config.vars.cat.length:
        summary.update(length_summary(series))
        lengths = summary["length"]
        length_histogram = histogram_compute(
            config,
            lengths,
            lengths.nunique(),
            name="histogram_length",
        )
        summary.update(length_histogram)

    if config.vars.cat.characters:
        summary.update(unicode_summary(series))
        # Keep the previous ``n_characters`` value under a more specific key,
        # then redefine ``n_characters`` as the sum of all character counts.
        summary["n_characters_distinct"] = summary["n_characters"]
        summary["n_characters"] = summary["character_counts"].values.sum()

        # Best effort: show alias names with spaces instead of underscores.
        # An index without the ``.str`` accessor is left as it is.
        with contextlib.suppress(AttributeError):
            alias_counts = summary["category_alias_counts"]
            alias_counts.index = alias_counts.index.str.replace("_", " ")

    if config.vars.cat.words:
        summary.update(word_summary(series))

    return config, series, summary