def test_type_detect_frame():
    """Check that text columns in many scripts are all detected as ``String``."""
    # One column per script / character class under test
    columns = {
        "latin": ["orange", "apple", "pear"],
        "cyrillic": ["Кириллица", "гласность", "демократија"],
        "mixed": ["Кириллица", "soep", "демократија"],
        "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
        "digits": ["01234", "121223", "12312"],
        "specials": ["$", "%^&*(", "!!!~``"],
        "whitespace": ["\t", "\n", " "],
        "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
        "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
        "playing_cards": ["🂶", "🃁", "🂻"],
    }
    frame = pd.DataFrame(columns)

    # Detect the column types against the complete typeset
    detected = detect_type(frame, CompleteSet())

    # Each column is expected to map to String, regardless of its content
    expected = dict.fromkeys(columns, String)
    assert detected == expected
def test_type_detect_series():
    """Check that a datetime series containing a NaT is detected as ``DateTime``."""
    timestamps = [
        datetime.datetime(2010, 1, 1),
        datetime.datetime(2010, 8, 2),
        datetime.datetime(2011, 2, 1),
        np.datetime64("NaT"),  # missing value alongside the real dates
    ]
    series = pd.Series(timestamps)

    # Detection is run against the standard typeset
    result = detect_type(series, StandardSet())
    assert result == DateTime
"url": [ "http://www.cwi.nl:80/%7Eguido/Python.html", "https://numpy.org/", "https://github.com/pandas-profiling/pandas-profiling", ], "uuid": [ "0b8a22ca-80ad-4df5-85ac-fa49c44b7ede", "aaa381d6-8442-4f63-88c8-7c900e9a23c6", "00000000-0000-0000-0000-000000000000", ], }) # Choose the complete typeset, which includes URLs typeset = CompleteSet() # Detect the type (without casting) print(detect_type(df, typeset)) # {'numbers_with_nan': Float, 'url': String, 'uuid': String} # Cast the dataframe to inferred types cast_df = cast_to_inferred(df, typeset) print(cast_df.to_string()) # numbers_with_nan url uuid # 0 3 (http, www.cwi.nl:80, /%7Eguido/Python.html, ,... 0b8a22ca-80ad-4df5-85ac-fa49c44b7ede # 1 7 (https, numpy.org, /, , , ) aaa381d6-8442-4f63-88c8-7c900e9a23c6 # 2 NaN (https, github.com, /pandas-profiling/pandas-p... 00000000-0000-0000-0000-000000000000 # Print the inferred types print(infer_type(df, typeset)) # {'numbers_with_nan': Integer, 'url': URL, 'uuid': UUID}
"cyrillic": ["Кириллица", "гласность", "демократија"], "mixed": ["Кириллица", "soep", "демократија"], "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"], "digits": ["01234", "121223", "123123"], "specials": ["$", "%^&*(", "!!!~``"], "whitespace": ["\t", "\n", " "], "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"], "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"], "playing_cards": ["🂶", "🃁", "🂻"], }) # Initialize the typeset typeset = CompleteSet() # Infer the column type types = detect_type(df, typeset) # Generate a summary summarizer = CompleteSummary() summary = summarizer.summarize(df, types) print("| {h1: <15}| {h2: <17}| {h3: <84}| {h4: <25}|".format(h1="Column", h2="Scripts", h3="Categories", h4="Blocks")) print("{e:-<17}+{e:-<18}+{e:-<85}+{e:-<26}+".format(e="")) for column, variable_summary in summary["series"].items(): scripts = ", ".join(set(variable_summary["script_values"].values())) categories = ", ".join( set(variable_summary["category_alias_values"].values())) blocks = ", ".join(set(variable_summary["block_values"].values()))
import pandas as pd

from examples.data_analysis.categorical import Category
from visions.functional import detect_type
from visions.types import Boolean, Categorical
from visions.typesets import StandardSet

# Start from the standard typeset, drop its Boolean and Categorical types,
# and add the example's Category type instead.
ts = StandardSet()
ts -= Boolean
ts -= Categorical
ts += Category

# A pandas "category" series and a plain boolean series to compare
s1 = pd.Series(["A", "B", "C"] * 1000, dtype="category")
s2 = pd.Series([True, False] * 1000)

# For each series: print the membership check against Category, then the
# type detected with the customised typeset.
for series in (s1, s2):
    print(series in Category)
    print(detect_type(series, ts))