def test_cast_copy():
    """Check that ``cast_series`` returns a different object for numeric strings."""
    original = pd.Series(["1", "2", "3", "4"])
    typeset = CompleteSet()
    casted = typeset.cast_series(original)
    # Identity comparison: the result must not be the very object passed in.
    assert casted is not original
def test_noncast_noncopy():
    """Check that ``cast_series`` hands back the same object for plain text."""
    original = pd.Series(["asdasd", "asdasda", "asdasd", "sadasd"])
    typeset = CompleteSet()
    result = typeset.cast_series(original)
    # Identity comparison: the result must be the very object passed in.
    assert result is original
def all_relations_tested(series_map):
    """Verify that every inferential relation of ``CompleteSet`` has test items.

    Args:
        series_map: Iterable of ``(map_to_type, map_from_type, items)`` triples.

    Raises:
        ValueError: If at least one inferential relation has no entry in
            ``series_map`` or its entry has length zero.
    """
    typeset = CompleteSet()

    # Index the triples as lookup[to_type][from_type] -> items.
    lookup = {}
    for target, source, items in series_map:
        lookup.setdefault(target, {})[source] = items

    untested = set()
    for node in typeset.types:
        for relation in node.relations:
            source, target = relation.related_type, relation.type
            if not relation.inferential:
                continue
            covered = (
                target in lookup
                and source in lookup[target]
                and len(lookup[target][source]) != 0
            )
            if not covered:
                untested.add(str(relation))

    if untested:
        raise ValueError(
            f"Not all inferential relations are tested {untested}"
        )
def test_type_inference_frame():
    """Infer column types for a frame of string columns in several scripts."""
    columns = {
        "latin": ["orange", "apple", "pear"],
        "cyrillic": ["Кириллица", "гласность", "демократија"],
        "mixed": ["Кириллица", "soep", "демократија"],
        "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
        "digits": ["1234", "121223", "12312"],
        "specials": ["$", "%^&*(", "!!!~``"],
        "whitespace": ["\t", "\n", " "],
        "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
        "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
        "playing_cards": ["🂶", "🃁", "🂻"],
    }
    frame = pd.DataFrame(columns)

    # Every column is expected to be String, except the digit-only column.
    expected = {column: String for column in columns}
    expected["digits"] = Integer

    inferred = infer_type(frame, CompleteSet())
    assert inferred == expected
def __init__(self):
    """Set up the per-type summary operations and pass them to the parent."""
    # Each type maps to the list of summary functions registered for it.
    # An empty list means no type-specific operation is registered here.
    summary_ops = {
        Boolean: [],
        Categorical: [category_summary, unique_summary],
        Complex: [infinite_summary, numerical_basic_summary, unique_summary_complex],
        DateTime: [range_summary, unique_summary],
        Date: [],
        ExistingPath: [existing_path_summary, path_summary, text_summary],
        Float: [infinite_summary, numerical_summary, zero_summary, unique_summary],
        Geometry: [],
        ImagePath: [],
        Integer: [infinite_summary, numerical_summary, zero_summary, unique_summary],
        Object: [unique_summary],
        Path: [path_summary, text_summary],
        String: [text_summary, unique_summary],
        Time: [],
        TimeDelta: [],
        UUID: [],
        URL: [url_summary, unique_summary],
        Generic: [base_summary, missing_summary],
    }
    super().__init__(summary_ops, CompleteSet())
def write_circular_packing_files() -> None:
    """Build the sorted tree of the complete typeset and write JSON and HTML."""
    graph = CompleteSet().base_graph.copy()

    # Relabel the copied graph in place so that every node is its string form.
    labels = {node: str(node) for node in graph.nodes}
    nx.relabel_nodes(graph, labels, copy=False)

    tree = update(to_json_tree_sorted(graph, root="Generic"))
    write_json(tree)
    write_html(tree)
def main():
    """Build the tree data of the complete typeset and write JSON and HTML."""
    graph = CompleteSet().base_graph.copy()

    # Relabel the copied graph in place so that every node is its string form.
    labels = {node: str(node) for node in graph.nodes}
    nx.relabel_nodes(graph, labels, copy=False)

    tree = update(json_graph.tree_data(graph, root="Generic"))
    write_json(tree)
    write_html(tree)
def test_consistency(name, series):
    """Check that casting changes a series exactly when its inferred type differs.

    When the detected and inferred types match, casting must leave the values
    equal. When they differ, the cast result must differ from the input, either
    in dtype kind or in values.
    """
    typeset = CompleteSet()

    nat_series = ("timedelta_series_nat", "date_series_nat", "timestamp_series_nat")
    if name in nat_series and sys.version_info[:2] == (3, 7):
        pytest.skip("unsupported configuration")

    detected = str(typeset.detect_type(series))
    inferred = str(typeset.infer_type(series))

    if detected == inferred:
        converted = typeset.cast_to_inferred(series)
        assert sequences_equal(series, converted)
        return

    # Types differ: cast a deep copy and require an observable difference.
    converted = typeset.cast_to_inferred(series.copy(deep=True))
    if hasattr(series, "dtype") and hasattr(converted, "dtype"):
        assert (
            series.dtype.kind != converted.dtype.kind
            or not sequences_equal(series, converted)
        )
    else:
        assert not sequences_equal(series, converted)
def test_side_effects(name, series):
    """Check that type detection and inference leave the input series equal."""
    snapshot = series.copy()
    typeset = CompleteSet()

    # Run both operations on the original object; results are not needed.
    for operation in (typeset.detect_type, typeset.infer_type):
        operation(series)

    assert sequences_equal(series, snapshot)
def test_side_effects(series):
    """Check that series type detection and inference do not alter the input."""
    snapshot = series.copy()
    typeset = CompleteSet()

    typeset.detect_series_type(series)
    typeset.infer_series_type(series)

    present_now = series.notna()
    present_before = snapshot.notna()

    # The NaN mask must be unchanged.
    assert present_now.eq(present_before).all()
    # The non-NaN values must be unchanged.
    assert series[present_now].eq(snapshot[present_before]).all()
def test_type_cast_frame():
    """Cast a frame of string columns and check the resulting value behaviour."""
    frame = pd.DataFrame(
        {
            "latin": ["orange", "apple", "pear"],
            "cyrillic": ["Кириллица", "гласность", "демократија"],
            "mixed": ["Кириллица", "soep", "демократија"],
            "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
            "digits": ["01234", "121223", "12312"],
            "specials": ["$", "%^&*(", "!!!~``"],
            "whitespace": ["\t", "\n", " "],
            "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
            "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
            "playing_cards": ["🂶", "🃁", "🂻"],
        }
    )
    casted = cast_frame(frame, CompleteSet())

    # The digit column must support integer arithmetic after the cast.
    second_digits = casted["digits"].iloc[1]
    assert second_digits - 3 == 121220

    # The latin column must still support string concatenation.
    second_latin = casted["latin"].iloc[1]
    assert second_latin + "1" == "apple1"
def generate_typeset_plots() -> None:
    """Write dot, svg and png graph outputs for three typesets into ``typesets/``."""
    out_dir = Path("typesets/")
    out_dir.mkdir(exist_ok=True)

    # Instantiated in this order; the dict keeps insertion order for the loop.
    typesets = {
        "typeset_complete": CompleteSet(),
        "typeset_geometry": GeometrySet(),
        "typeset_standard": StandardSet(),
    }

    for name, typeset in typesets.items():
        # Graph in dot format
        typeset.output_graph(out_dir / f"{name}.dot")
        # Graph as svg, full and base-only
        typeset.output_graph(out_dir / f"{name}.svg")
        typeset.output_graph(out_dir / f"{name}_base.svg", base_only=True)
        # Graph as png
        typeset.output_graph(out_dir / f"{name}.png", dpi=150)
def test_multiple_inference(name, series):
    """Check that casting to the inferred type is stable when applied twice.

    The first cast works on a shallow copy of ``series`` so that the test does
    not hand the original object to ``cast_to_inferred``.
    """
    typeset = CompleteSet()

    first_inferred = str(typeset.infer_type(series))
    first_cast = typeset.cast_to_inferred(copy.copy(series))

    # After one cast, plain detection must already yield the inferred type.
    detected_after_cast = str(typeset.detect_type(first_cast))
    assert first_inferred == detected_after_cast

    # A second cast must change neither the type nor the values.
    second_cast = typeset.cast_to_inferred(first_cast)
    inferred_after_recast = str(typeset.infer_type(second_cast))
    assert detected_after_cast == inferred_after_recast
    assert sequences_equal(first_cast, second_cast)
def test_multiple_inference(series):
    """Check that casting to the inferred type is stable when applied twice.

    Every typeset call receives a deep copy so that no call can affect the
    series used by a later step.
    """
    typeset = CompleteSet()

    first_inferred = typeset.infer_type(series)
    first_cast = typeset.cast_to_inferred(series.copy(deep=True))

    # After one cast, plain detection must already yield the inferred type.
    detected_after_cast = typeset.detect_type(first_cast.copy(deep=True))
    assert first_inferred == detected_after_cast

    # A second cast must not change the type.
    second_cast = typeset.cast_to_inferred(first_cast.copy(deep=True))
    inferred_after_recast = typeset.infer_type(second_cast.copy(deep=True))
    assert detected_after_cast == inferred_after_recast

    # Missing-value masks must agree between the two casts.
    assert first_cast.isna().eq(second_cast.isna()).all()

    # Non-missing values must agree between the two casts.
    first_present = first_cast[first_cast.notna()]
    second_present = second_cast[second_cast.notna()]
    assert first_present.eq(second_present).all()
def test_consistency(series):
    """Check that casting changes a series exactly when its inferred type differs.

    Two positions count as equal when the values compare equal or when both
    are missing.
    """
    typeset = CompleteSet()

    nat_series = ("timedelta_series_nat", "date_series_nat", "timestamp_series_nat")
    if series.name in nat_series and sys.version_info[:2] == (3, 7):
        pytest.skip("unsupported configuration")

    detected = typeset.detect_series_type(series.copy(deep=True))
    inferred = typeset.infer_series_type(series.copy(deep=True))
    converted = typeset.cast_series(series.copy(deep=True))

    def unchanged():
        # Evaluated lazily so the dtype-kind check can short-circuit first.
        same_values = converted.eq(series)
        both_missing = converted.isna() & series.isna()
        return (same_values | both_missing).all()

    if detected != inferred:
        assert series.dtype.kind != converted.dtype.kind or not unchanged()
    else:
        assert unchanged()
def test_plotting(tmp_path):
    """Run the circular-packing plot for ``CompleteSet`` into ``tmp_path``."""
    typeset = CompleteSet()
    destination = tmp_path / "circular_packing_file.html"
    plot_graph_circular_packing(typeset, destination)
df = pd.DataFrame({ "numbers_with_nan": [3, 7, np.nan], "url": [ "http://www.cwi.nl:80/%7Eguido/Python.html", "https://numpy.org/", "https://github.com/pandas-profiling/pandas-profiling", ], "uuid": [ "0b8a22ca-80ad-4df5-85ac-fa49c44b7ede", "aaa381d6-8442-4f63-88c8-7c900e9a23c6", "00000000-0000-0000-0000-000000000000", ], }) # Choose the complete typeset, which includes URLs typeset = CompleteSet() # Detect the type (without casting) print(detect_frame_type(df, typeset)) # {'numbers_with_nan': Float, 'url': String, 'uuid': String} # Cast the dataframe to inferred types cast_df = cast_frame(df, typeset) print(cast_df.to_string()) # numbers_with_nan url uuid # 0 3 (http, www.cwi.nl:80, /%7Eguido/Python.html, ,... 0b8a22ca-80ad-4df5-85ac-fa49c44b7ede # 1 7 (https, numpy.org, /, , , ) aaa381d6-8442-4f63-88c8-7c900e9a23c6 # 2 NaN (https, github.com, /pandas-profiling/pandas-p... 00000000-0000-0000-0000-000000000000 # Print the inferred types print(infer_frame_type(df, typeset))
from pathlib import Path

from visions.typesets import CompleteSet, StandardSet, GeometrySet

# Windows Note
# Tip for Python3/64-bit compatible version of pygraphviz
# https://github.com/CristiFati/Prebuilt-Binaries/raw/master/Windows/PyGraphviz/pygraphviz-1.5-cp37-cp37m-win_amd64.whl

# Output directory for all generated graph files
typesets_dir = Path("typesets/")
typesets_dir.mkdir(exist_ok=True)

# One instance per typeset, paired with the base name of its output files
for name, tsc in [
    ("typeset_complete", CompleteSet()),
    ("typeset_geometry", GeometrySet()),
    ("typeset_standard", StandardSet()),
]:
    # Graph in dot format
    tsc.output_graph(typesets_dir / f"{name}.dot")

    # Graph as svg, full and base-only
    tsc.output_graph(typesets_dir / f"{name}.svg")
    tsc.output_graph(typesets_dir / f"{name}_base.svg", base_only=True)

    # Graph as png
    tsc.output_graph(typesets_dir / f"{name}.png", dpi=150)
# NOTE(review): the nesting of the statements through `return data` was
# reconstructed from statement order; confirm it against the enclosing definition.
        if len(nbrs) == 0:
            return []
        children_ = []
        for child in nbrs:
            # Node attributes plus an explicit "id" entry for the child
            d = dict(chain(G.nodes[child].items(), [("id", child)]))
            c = add_children(child, G)
            if c:
                d["children"] = c
            children_.append(d)
        # Order siblings by their "id" entry
        children_ = sorted(children_, key=lambda x: x["id"])
        return children_

    # Root entry: node attributes plus "id", with the nested children attached
    data = dict(chain(G.nodes[root].items(), [("id", root)]))
    data["children"] = add_children(root, G)
    return data


def plot_graph_circular_packing(typeset, output_file) -> None:
    """Build the sorted tree of ``typeset``'s base graph and pass it to ``write_html``.

    Args:
        typeset: Object providing ``base_graph`` and ``root_node``.
        output_file: Destination forwarded to ``write_html``.
    """
    # Work on a copy so the typeset's own graph is not relabelled
    graph = typeset.base_graph.copy()
    # Relabel in place: every node becomes its string form
    nx.relabel_nodes(graph, {n: str(n) for n in graph.nodes}, copy=False)
    data = to_json_tree_sorted(graph, root=str(typeset.root_node))
    data = update(data)
    write_html(data, output_file)


if __name__ == "__main__":
    complete_set = CompleteSet()
    plot_graph_circular_packing(complete_set, "circular_packing.html")
import pytest

from visions import EmailAddress
from visions.test.series import get_series
from visions.test.series_geometry import get_geometry_series
from visions.types import Generic
from visions.typesets import CompleteSet

# Module-level typeset used by the tests below: CompleteSet with EmailAddress added
typeset = CompleteSet()
typeset += EmailAddress


def pytest_generate_tests(metafunc):
    """Parametrize selected tests with every named series of the test suite.

    The suite is the result of ``get_series()`` updated with
    ``get_geometry_series()``. Only ``test_consistency`` and
    ``test_traversal_mutex`` are parametrized; each case receives
    ``(name, series)`` and uses the series name as its test id.
    """
    _test_suite = get_series()
    _test_suite.update(get_geometry_series())
    if metafunc.function.__name__ in ["test_consistency", "test_traversal_mutex"]:
        argsvalues = []
        for name, series in _test_suite.items():
            # The series name doubles as the pytest id
            args = {"id": name}
            argsvalues.append(pytest.param(name, series, **args))
        metafunc.parametrize(argnames=["name", "series"], argvalues=argsvalues)


def test_consistency(name, series):
    """Check that the type detected for ``series`` also contains ``series``."""
    detected_type = typeset.detect_type(series)
    message = f"Detected type {detected_type} for series {name} but {detected_type}.contains_op(series) -> False"
    assert series in detected_type, message


def _traverse_relation_graph(series, G, node=Generic):