def test_summarize():
    """The "types" entry of the summary equals the column-to-type mapping given."""
    brands = ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"]
    prices = [22000, 25000, 27000, 35000]
    cars = pd.DataFrame({"Brand": brands, "Price": prices}, columns=["Brand", "Price"])
    column_types = {"Brand": String, "Price": Integer}

    result = summarize(cars, column_types, CompleteSummary())

    # Compare against a fresh literal rather than the mapping that was passed in.
    assert result["types"] == {"Brand": String, "Price": Integer}
def test_summarize_frame():
    """Frame-level summary of a 4-row, 2-column frame, compared without "type_counts"."""
    brands = ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"]
    prices = [22000, 25000, 27000, 35000]
    cars = pd.DataFrame({"Brand": brands, "Price": prices}, columns=["Brand", "Price"])

    frame_summary = summarize_frame(
        cars, {"Brand": String, "Price": Integer}, CompleteSummary()
    )

    # "type_counts" is dropped before the comparison; every remaining entry
    # must match exactly.
    frame_summary.pop("type_counts")
    expected = {
        "n_observations": 4,
        "n_variables": 2,
        "memory_size": 430,
        "na_count": 0,
        "n_vars_missing": 0,
    }
    assert frame_summary == expected
def test_summarize_series():
    """Compare the full summary of a string series and of an integer series.

    The string expectations contain five per-character tables that all share
    the same 20 keys (the distinct characters of the brand names); they are
    assembled from character groups instead of being spelled out key by key.
    """
    brands = ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"]
    prices = [22000, 25000, 27000, 35000]

    # Distinct characters of the brand names, grouped by kind.
    upper_chars = "HCTFA"
    lower_chars = "ondaivcytrlus"
    space_char = " "
    digit_char = "4"

    def per_character(upper_value, lower_value, space_value, digit_value):
        """Build a character -> value table with one value per character group."""
        table = dict.fromkeys(upper_chars, upper_value)
        table.update(dict.fromkeys(lower_chars, lower_value))
        table[space_char] = space_value
        table[digit_char] = digit_value
        return table

    # --- string series ---------------------------------------------------
    string_summary = summarize_series(pd.Series(brands), String, CompleteSummary())
    expected_string_summary = {
        "n_unique": 4,
        "length": {7: 1, 14: 1, 11: 1, 10: 1},
        "category_short_values": per_character("Lu", "Ll", "Zs", "Nd"),
        "category_alias_values": per_character(
            "Uppercase_Letter",
            "Lowercase_Letter",
            "Space_Separator",
            "Decimal_Number",
        ),
        "script_values": per_character("Latin", "Latin", "Common", "Common"),
        "block_values": per_character(
            "Basic Latin", "Basic Latin", "Basic Latin", "Basic Latin"
        ),
        "block_alias_values": per_character("ASCII", "ASCII", "ASCII", "ASCII"),
        # Every brand occurs exactly once.
        "frequencies": dict.fromkeys(brands, 1),
        "n_records": 4,
        "memory_size": 398,
        "dtype": np.object_,
        "types": {"str": 4},
        "na_count": 0,
    }
    assert string_summary == expected_string_summary

    # --- integer series --------------------------------------------------
    integer_summary = summarize_series(pd.Series(prices), Integer, CompleteSummary())
    expected_integer_summary = {
        "inf_count": 0,
        "mean": 27250.0,
        "std": 5560.275772537426,
        "var": 30916666.666666668,
        "max": 35000.0,
        "min": 22000.0,
        "median": 26000.0,
        "kurt": 1.8192544372679649,
        "skew": 1.19978923754086,
        "sum": 109000.0,
        "mad": 3875.0,
        "quantile_5": 22450.0,
        "quantile_25": 24250.0,
        "quantile_50": 26000.0,
        "quantile_75": 29000.0,
        "quantile_95": 33800.0,
        "iqr": 4750.0,
        "range": 13000.0,
        "cv": 0.20404681734082297,
        "n_zeros": 0,
        "n_unique": 4,
        # Every price occurs exactly once.
        "frequencies": dict.fromkeys(prices, 1),
        "n_records": 4,
        "memory_size": 160,
        "dtype": np.int64,
        "types": {"int": 4},
        "na_count": 0,
        "monotonic_decrease": False,
        "monotonic_increase": True,
    }
    assert integer_summary == expected_integer_summary
示例#4
0
    "digits": ["01234", "121223", "123123"],
    "specials": ["$", "%^&*(", "!!!~``"],
    "whitespace": ["\t", "\n", " "],
    "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
    "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
    "playing_cards": ["🂶", "🃁", "🂻"],
})

# Initialize the typeset
typeset = CompleteSet()

# Infer the column type
types = detect_type(df, typeset)

# Generate a summary
summarizer = CompleteSummary()
summary = summarizer.summarize(df, types)

# Print the table header: four left-aligned, space-padded cells
# (widths 15, 17, 84 and 25), each introduced by "| ".
print("| {h1: <15}| {h2: <17}| {h3: <84}| {h4: <25}|".format(h1="Column",
                                                             h2="Scripts",
                                                             h3="Categories",
                                                             h4="Blocks"))
# Print the rule under the header: the empty string is padded with "-" as the
# fill character, so each field becomes a run of dashes; the widths are chosen
# so that every "+" lines up with a "|" of the header row.
print("{e:-<17}+{e:-<18}+{e:-<85}+{e:-<26}+".format(e=""))
for column, variable_summary in summary["series"].items():
    scripts = ", ".join(set(variable_summary["script_values"].values()))
    categories = ", ".join(
        set(variable_summary["category_alias_values"].values()))
    blocks = ", ".join(set(variable_summary["block_values"].values()))

    print("| {column: <15}| {scripts: <17}| {categories: <84}| {blocks: <25}|".
          format(column=column,
示例#5
0
from pprint import pprint

import numpy as np
import pandas as pd

import visions as v
from visions.application.summaries import CompleteSummary

# Datetime example: three timestamps followed by a missing value (NaT).
# pd.Timestamp is used instead of the `pd.datetime` alias, which pandas
# deprecated in 1.0 and removed in 2.0 (where it raises AttributeError).
datetime_series = pd.Series([
    pd.Timestamp(2010, 1, 1),
    pd.Timestamp(2010, 8, 2),
    pd.Timestamp(2011, 2, 1),
    np.datetime64("NaT"),
])

# Generate a summary
summarizer = CompleteSummary()
summary = summarizer.summarize_series(datetime_series, v.DateTime)

pprint(summary)
示例#6
0
def summary():
    """Provide a freshly constructed ``CompleteSummary`` instance."""
    complete_summary = CompleteSummary()
    return complete_summary
示例#7
0
import numpy as np
import pandas as pd

import visions.types as vt
from visions.application.summaries import CompleteSummary

# Example input: five small integers, one large negative value and one missing
# value (np.nan), stored with the "Int64" dtype.
integer_series = pd.Series([1, 2, 3, 4, 5, -100000, np.nan], dtype="Int64")

# Summarize the series as a visions Integer and print the resulting summary.
summarizer = CompleteSummary()
summary = summarizer.summarize_series(integer_series, vt.Integer)
print(summary)

# Output:
# {
#     "inf_count": 0,
#     "mean": -16664.166666666668,
#     "std": 40826.05381575185,
#     "var": 1666766670.1666665,
#     "max": 5.0,
#     "min": -100000.0,
#     "median": 2.5,
#     "kurt": 5.999999974801513,
#     "skew": -2.449489736169953,
#     "sum": -99985.0,
#     "mad": 27778.611111111113,
#     "quantile_5": -74999.75,
#     "quantile_25": 1.25,
#     "quantile_50": 2.5,
#     "quantile_75": 3.75,
#     "quantile_95": 4.75,
#     "iqr": 2.5,
def test_summarize_series():
    """Check selected entries of a string summary and the full integer summary.

    The string series is checked entry by entry (length statistics, the
    per-character category aliases and the value frequencies); the integer
    series is compared against the complete expected summary.
    """
    brands = ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"]
    prices = [22000, 25000, 27000, 35000]

    # --- string series ---------------------------------------------------
    string_summary = summarize_series(pd.Series(brands), String, CompleteSummary())

    assert string_summary["n_unique"] == 4
    # "length" is keyed by position in the series.
    assert string_summary["length"].to_dict() == {0: 11, 1: 14, 2: 10, 3: 7}

    scalar_expectations = (
        ("max_length", 14),
        ("min_length", 7),
        ("mean_length", 10.5),
        ("median_length", 10.5),
        ("n_characters", 20),
    )
    for key, expected_value in scalar_expectations:
        assert string_summary[key] == expected_value

    # Distinct characters of the brand names, grouped by expected category.
    expected_aliases = dict.fromkeys("HCTFA", "Uppercase_Letter")
    expected_aliases.update(dict.fromkeys("ondaivcytrlus", "Lowercase_Letter"))
    expected_aliases[" "] = "Space_Separator"
    expected_aliases["4"] = "Decimal_Number"
    assert string_summary["category_alias_values"] == expected_aliases

    # Every brand occurs exactly once.
    assert string_summary["frequencies"] == dict.fromkeys(brands, 1)

    # --- integer series --------------------------------------------------
    integer_summary = summarize_series(pd.Series(prices), Integer, CompleteSummary())
    expected_integer_summary = {
        "n_infinite": 0,
        "mean": 27250.0,
        "std": 5560.275772537426,
        "variance": 30916666.666666668,
        "max": 35000.0,
        "min": 22000.0,
        "median": 26000.0,
        "kurt": 1.8192544372679649,
        "skew": 1.19978923754086,
        "sum": 109000.0,
        "mad": 2500.0,
        "quantile_5": 22450.0,
        "quantile_25": 24250.0,
        "quantile_50": 26000.0,
        "quantile_75": 29000.0,
        "quantile_95": 33800.0,
        "iqr": 4750.0,
        "range": 13000.0,
        "cv": 0.20404681734082297,
        "n_zeros": 0,
        "is_unique": True,
        "n_unique": 4,
        # Every price occurs exactly once.
        "frequencies": dict.fromkeys(prices, 1),
        "n_records": 4,
        "memory_size": 160,
        "dtype": np.int64,
        "types": {"int": 4},
        "na_count": 0,
        "monotonic_decrease": False,
        "monotonic_decrease_strict": False,
        "monotonic_increase": True,
        "monotonic_increase_strict": True,
    }
    assert integer_summary == expected_integer_summary
from pprint import pprint

import numpy as np
import pandas as pd

import visions as v
from visions.application.summaries import CompleteSummary

# Example input: a categorical series mixing booleans, a string and a missing
# value (np.nan). The declared categories also include "missing", which does
# not occur among the values.
category_series = pd.Series(
    pd.Categorical(
        [True, False, np.nan, "test"], categories=[True, False, "test", "missing"]
    )
)

# Generate a summary
summarizer = CompleteSummary()
summary = summarizer.summarize_series(category_series, v.Categorical)

pprint(summary)
示例#10
0
import pandas as pd

import visions as v
from visions.application.summaries import CompleteSummary

# Example input: three ASCII words followed by three single playing-card symbols.
string_series = pd.Series(["orange", "apple", "pear", "🂶", "🃁", "🂻"])

# Summarize the series as a visions String and print the resulting summary.
summarizer = CompleteSummary()
summary = summarizer.summarize_series(string_series, v.String)
print(summary)

# Output:
# {
#     "n_unique": 6,
#     "length": {1: 3, 6: 1, 5: 1, 4: 1},
#     "category_short_values": {
#         "o": "Ll",
#         "r": "Ll",
#         "a": "Ll",
#         "n": "Ll",
#         "g": "Ll",
#         "e": "Ll",
#         "p": "Ll",
#         "l": "Ll",
#         "🂶": "So",
#         "🃁": "So",
#         "🂻": "So",
#     },
#     "category_alias_values": {
#         "o": "Lowercase_Letter",
#         "r": "Lowercase_Letter",