def test_summarize():
    """Check that ``summarize`` reports the supplied column types under ``"types"``."""
    records = {
        "Brand": ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"],
        "Price": [22000, 25000, 27000, 35000],
    }
    df = pd.DataFrame(records, columns=["Brand", "Price"])
    column_types = {"Brand": String, "Price": Integer}

    summary = summarize(df, column_types, CompleteSummary())

    # Compare against a fresh literal rather than ``column_types`` so the check
    # does not depend on whether ``summarize`` touched the mapping it was given.
    assert summary["types"] == {"Brand": String, "Price": Integer}
def test_summarize_frame():
    """Check the frame-level statistics returned by ``summarize_frame``.

    The ``"type_counts"`` entry is removed before comparing; every remaining
    key of the summary must match the expected mapping exactly.
    """
    records = {
        "Brand": ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"],
        "Price": [22000, 25000, 27000, 35000],
    }
    df = pd.DataFrame(records, columns=["Brand", "Price"])
    column_types = {"Brand": String, "Price": Integer}

    summary = summarize_frame(df, column_types, CompleteSummary())
    # ``pop`` without a default: a summary lacking "type_counts" fails here with KeyError.
    summary.pop("type_counts")

    expected = {
        "n_observations": 4,
        "n_variables": 2,
        "memory_size": 430,
        "na_count": 0,
        "n_vars_missing": 0,
    }
    assert summary == expected
def test_summarize_series():
    """Check the full ``summarize_series`` output for a String and an Integer series.

    Both comparisons are exact dictionary equality, so any extra or missing key
    in the summary makes the test fail.
    """
    # --- String series -----------------------------------------------------
    brand_series = pd.Series(["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"])
    summary = summarize_series(brand_series, String, CompleteSummary())

    # Per-character expectations, keyed by every distinct character in the series.
    category_short_values = {
        "H": "Lu", "o": "Ll", "n": "Ll", "d": "Ll", "a": "Ll",
        " ": "Zs", "C": "Lu", "i": "Ll", "v": "Ll", "c": "Ll",
        "T": "Lu", "y": "Ll", "t": "Ll", "r": "Ll", "l": "Ll",
        "F": "Lu", "u": "Ll", "s": "Ll", "A": "Lu", "4": "Nd",
    }
    category_alias_values = {
        "H": "Uppercase_Letter",
        "o": "Lowercase_Letter",
        "n": "Lowercase_Letter",
        "d": "Lowercase_Letter",
        "a": "Lowercase_Letter",
        " ": "Space_Separator",
        "C": "Uppercase_Letter",
        "i": "Lowercase_Letter",
        "v": "Lowercase_Letter",
        "c": "Lowercase_Letter",
        "T": "Uppercase_Letter",
        "y": "Lowercase_Letter",
        "t": "Lowercase_Letter",
        "r": "Lowercase_Letter",
        "l": "Lowercase_Letter",
        "F": "Uppercase_Letter",
        "u": "Lowercase_Letter",
        "s": "Lowercase_Letter",
        "A": "Uppercase_Letter",
        "4": "Decimal_Number",
    }
    script_values = {
        "H": "Latin", "o": "Latin", "n": "Latin", "d": "Latin", "a": "Latin",
        " ": "Common", "C": "Latin", "i": "Latin", "v": "Latin", "c": "Latin",
        "T": "Latin", "y": "Latin", "t": "Latin", "r": "Latin", "l": "Latin",
        "F": "Latin", "u": "Latin", "s": "Latin", "A": "Latin", "4": "Common",
    }
    block_values = {
        "H": "Basic Latin", "o": "Basic Latin", "n": "Basic Latin", "d": "Basic Latin",
        "a": "Basic Latin", " ": "Basic Latin", "C": "Basic Latin", "i": "Basic Latin",
        "v": "Basic Latin", "c": "Basic Latin", "T": "Basic Latin", "y": "Basic Latin",
        "t": "Basic Latin", "r": "Basic Latin", "l": "Basic Latin", "F": "Basic Latin",
        "u": "Basic Latin", "s": "Basic Latin", "A": "Basic Latin", "4": "Basic Latin",
    }
    block_alias_values = {
        "H": "ASCII", "o": "ASCII", "n": "ASCII", "d": "ASCII", "a": "ASCII",
        " ": "ASCII", "C": "ASCII", "i": "ASCII", "v": "ASCII", "c": "ASCII",
        "T": "ASCII", "y": "ASCII", "t": "ASCII", "r": "ASCII", "l": "ASCII",
        "F": "ASCII", "u": "ASCII", "s": "ASCII", "A": "ASCII", "4": "ASCII",
    }
    expected_brand_summary = {
        "n_unique": 4,
        "length": {7: 1, 14: 1, 11: 1, 10: 1},
        "category_short_values": category_short_values,
        "category_alias_values": category_alias_values,
        "script_values": script_values,
        "block_values": block_values,
        "block_alias_values": block_alias_values,
        "frequencies": {
            "Audi A4": 1,
            "Honda Civic": 1,
            "Ford Focus": 1,
            "Toyota Corolla": 1,
        },
        "n_records": 4,
        "memory_size": 398,
        "dtype": np.object_,
        "types": {"str": 4},
        "na_count": 0,
    }
    assert summary == expected_brand_summary

    # --- Integer series ----------------------------------------------------
    price_series = pd.Series([22000, 25000, 27000, 35000])
    summary = summarize_series(price_series, Integer, CompleteSummary())

    expected_price_summary = {
        "inf_count": 0,
        "mean": 27250.0,
        "std": 5560.275772537426,
        "var": 30916666.666666668,
        "max": 35000.0,
        "min": 22000.0,
        "median": 26000.0,
        "kurt": 1.8192544372679649,
        "skew": 1.19978923754086,
        "sum": 109000.0,
        "mad": 3875.0,
        "quantile_5": 22450.0,
        "quantile_25": 24250.0,
        "quantile_50": 26000.0,
        "quantile_75": 29000.0,
        "quantile_95": 33800.0,
        "iqr": 4750.0,
        "range": 13000.0,
        "cv": 0.20404681734082297,
        "n_zeros": 0,
        "n_unique": 4,
        "frequencies": {27000: 1, 35000: 1, 25000: 1, 22000: 1},
        "n_records": 4,
        "memory_size": 160,
        "dtype": np.int64,
        "types": {"int": 4},
        "na_count": 0,
        "monotonic_decrease": False,
        "monotonic_increase": True,
    }
    assert summary == expected_price_summary
"digits": ["01234", "121223", "123123"], "specials": ["$", "%^&*(", "!!!~``"], "whitespace": ["\t", "\n", " "], "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"], "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"], "playing_cards": ["🂶", "🃁", "🂻"], }) # Initialize the typeset typeset = CompleteSet() # Infer the column type types = detect_type(df, typeset) # Generate a summary summarizer = CompleteSummary() summary = summarizer.summarize(df, types) print("| {h1: <15}| {h2: <17}| {h3: <84}| {h4: <25}|".format(h1="Column", h2="Scripts", h3="Categories", h4="Blocks")) print("{e:-<17}+{e:-<18}+{e:-<85}+{e:-<26}+".format(e="")) for column, variable_summary in summary["series"].items(): scripts = ", ".join(set(variable_summary["script_values"].values())) categories = ", ".join( set(variable_summary["category_alias_values"].values())) blocks = ", ".join(set(variable_summary["block_values"].values())) print("| {column: <15}| {scripts: <17}| {categories: <84}| {blocks: <25}|". format(column=column,
# Example: summarize a datetime series that contains a missing value (NaT).
from datetime import datetime
from pprint import pprint

import numpy as np
import pandas as pd

import visions as v
from visions.application.summaries import CompleteSummary

# ``pd.datetime`` was deprecated in pandas 1.0 and removed in pandas 2.0. It was
# only an alias of the standard-library ``datetime.datetime``, so using that
# class directly builds the same values and works on every pandas version.
datetime_series = pd.Series(
    [
        datetime(2010, 1, 1),
        datetime(2010, 8, 2),
        datetime(2011, 2, 1),
        np.datetime64("NaT"),  # missing timestamp
    ]
)

# Generate a summary
summarizer = CompleteSummary()
summary = summarizer.summarize_series(datetime_series, v.DateTime)
pprint(summary)
def summary():
    """Return a newly constructed ``CompleteSummary``."""
    complete_summary = CompleteSummary()
    return complete_summary
# Example: summarize an integer series stored with the "Int64" dtype.
import numpy as np
import pandas as pd

import visions.types as vt
from visions.application.summaries import CompleteSummary

# Six integers followed by np.nan, all held in a single "Int64" column.
integer_series = pd.Series(
    data=[1, 2, 3, 4, 5, -100000, np.nan],
    dtype="Int64",
)

# Build the summarizer, summarize the series as an Integer, and show the result.
summarizer = CompleteSummary()
summary = summarizer.summarize_series(integer_series, vt.Integer)
print(summary)

# Output:
# {
#     "inf_count": 0,
#     "mean": -16664.166666666668,
#     "std": 40826.05381575185,
#     "var": 1666766670.1666665,
#     "max": 5.0,
#     "min": -100000.0,
#     "median": 2.5,
#     "kurt": 5.999999974801513,
#     "skew": -2.449489736169953,
#     "sum": -99985.0,
#     "mad": 27778.611111111113,
#     "quantile_5": -74999.75,
#     "quantile_25": 1.25,
#     "quantile_50": 2.5,
#     "quantile_75": 3.75,
#     "quantile_95": 4.75,
#     "iqr": 2.5,
def test_summarize_series():
    """Check ``summarize_series`` for a String series and an Integer series.

    The String summary is checked key by key; the Integer summary is compared
    as a whole, so any extra or missing key there makes the test fail.
    """
    # --- String series -----------------------------------------------------
    brand_names = ["Honda Civic", "Toyota Corolla", "Ford Focus", "Audi A4"]
    brand_series = pd.Series(brand_names)
    summary = summarize_series(brand_series, String, CompleteSummary())

    assert summary["n_unique"] == 4
    # ``length`` must expose ``to_dict``; the expected mapping is index -> string length.
    assert summary["length"].to_dict() == {0: 11, 1: 14, 2: 10, 3: 7}

    # Scalar statistics, checked in the same order as the individual asserts they replace.
    scalar_expectations = (
        ("max_length", 14),
        ("min_length", 7),
        ("mean_length", 10.5),
        ("median_length", 10.5),
        ("n_characters", 20),
    )
    for key, expected_value in scalar_expectations:
        assert summary[key] == expected_value

    expected_category_aliases = {
        "H": "Uppercase_Letter",
        "o": "Lowercase_Letter",
        "n": "Lowercase_Letter",
        "d": "Lowercase_Letter",
        "a": "Lowercase_Letter",
        " ": "Space_Separator",
        "C": "Uppercase_Letter",
        "i": "Lowercase_Letter",
        "v": "Lowercase_Letter",
        "c": "Lowercase_Letter",
        "T": "Uppercase_Letter",
        "y": "Lowercase_Letter",
        "t": "Lowercase_Letter",
        "r": "Lowercase_Letter",
        "l": "Lowercase_Letter",
        "F": "Uppercase_Letter",
        "u": "Lowercase_Letter",
        "s": "Lowercase_Letter",
        "A": "Uppercase_Letter",
        "4": "Decimal_Number",
    }
    assert summary["category_alias_values"] == expected_category_aliases

    expected_frequencies = {
        "Audi A4": 1,
        "Honda Civic": 1,
        "Ford Focus": 1,
        "Toyota Corolla": 1,
    }
    assert summary["frequencies"] == expected_frequencies

    # --- Integer series ----------------------------------------------------
    price_series = pd.Series([22000, 25000, 27000, 35000])
    summary = summarize_series(price_series, Integer, CompleteSummary())

    expected_price_summary = {
        "n_infinite": 0,
        "mean": 27250.0,
        "std": 5560.275772537426,
        "variance": 30916666.666666668,
        "max": 35000.0,
        "min": 22000.0,
        "median": 26000.0,
        "kurt": 1.8192544372679649,
        "skew": 1.19978923754086,
        "sum": 109000.0,
        "mad": 2500.0,
        "quantile_5": 22450.0,
        "quantile_25": 24250.0,
        "quantile_50": 26000.0,
        "quantile_75": 29000.0,
        "quantile_95": 33800.0,
        "iqr": 4750.0,
        "range": 13000.0,
        "cv": 0.20404681734082297,
        "n_zeros": 0,
        "is_unique": True,
        "n_unique": 4,
        "frequencies": {27000: 1, 35000: 1, 25000: 1, 22000: 1},
        "n_records": 4,
        "memory_size": 160,
        "dtype": np.int64,
        "types": {"int": 4},
        "na_count": 0,
        "monotonic_decrease": False,
        "monotonic_decrease_strict": False,
        "monotonic_increase": True,
        "monotonic_increase_strict": True,
    }
    assert summary == expected_price_summary
# Example: summarize a categorical series.
from pprint import pprint

import numpy as np
import pandas as pd

import visions as v
from visions.application.summaries import CompleteSummary

# The values include np.nan, and the declared categories include "missing",
# which none of the values use.
category_series = pd.Series(
    pd.Categorical(
        values=[True, False, np.nan, "test"],
        categories=[True, False, "test", "missing"],
    )
)

# Generate a summary
summarizer = CompleteSummary()
summary = summarizer.summarize_series(category_series, v.Categorical)
pprint(summary)
# Example: summarize a string series mixing plain words and playing-card symbols.
import pandas as pd

import visions as v
from visions.application.summaries import CompleteSummary

string_series = pd.Series(
    data=["orange", "apple", "pear", "🂶", "🃁", "🂻"],
)

# Build the summarizer, summarize the series as a String, and show the result.
summarizer = CompleteSummary()
summary = summarizer.summarize_series(string_series, v.String)
print(summary)

# Output:
# {
#     "n_unique": 6,
#     "length": {1: 3, 6: 1, 5: 1, 4: 1},
#     "category_short_values": {
#         "o": "Ll",
#         "r": "Ll",
#         "a": "Ll",
#         "n": "Ll",
#         "g": "Ll",
#         "e": "Ll",
#         "p": "Ll",
#         "l": "Ll",
#         "🂶": "So",
#         "🃁": "So",
#         "🂻": "So",
#     },
#     "category_alias_values": {
#         "o": "Lowercase_Letter",
#         "r": "Lowercase_Letter",