示例#1
0
def test_cast_copy():
    """Check that ``cast_series`` returns a different object for digit strings."""
    original = pd.Series(["1", "2", "3", "4"])
    casted = CompleteSet().cast_series(original)

    # Both objects are alive here, so an identity check is equivalent to
    # comparing their ids.
    assert casted is not original
示例#2
0
def test_noncast_noncopy():
    """Check that ``cast_series`` returns the very same object for plain words."""
    original = pd.Series(["asdasd", "asdasda", "asdasd", "sadasd"])
    casted = CompleteSet().cast_series(original)

    # Both objects are alive here, so an identity check is equivalent to
    # comparing their ids.
    assert casted is original
示例#3
0
def all_relations_tested(series_map):
    """Ensure every inferential relation of ``CompleteSet`` has test data.

    Args:
        series_map: Iterable of ``(to_type, from_type, items)`` triples.

    Raises:
        ValueError: If an inferential relation has no entry in ``series_map``
            or its entry has length zero.
    """
    typeset = CompleteSet()

    # Index the triples as lookup[to_type][from_type] -> items.
    lookup = {}
    for target, source, items in series_map:
        lookup.setdefault(target, {})[source] = items

    def is_covered(relation):
        # A relation is covered when non-empty items exist for its type pair.
        by_source = lookup.get(relation.type)
        return (
            by_source is not None
            and relation.related_type in by_source
            and len(by_source[relation.related_type]) != 0
        )

    missing_relations = {
        str(relation)
        for node in typeset.types
        for relation in node.relations
        if relation.inferential and not is_covered(relation)
    }

    if missing_relations:
        raise ValueError(
            "Not all inferential relations are tested {missing_relations}".format(
                missing_relations=missing_relations
            )
        )
示例#4
0
def test_type_inference_frame():
    """Infer the type of every column in a frame of multi-script strings."""
    columns = {
        "latin": ["orange", "apple", "pear"],
        "cyrillic": ["Кириллица", "гласность", "демократија"],
        "mixed": ["Кириллица", "soep", "демократија"],
        "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
        "digits": ["1234", "121223", "12312"],
        "specials": ["$", "%^&*(", "!!!~``"],
        "whitespace": ["\t", "\n", " "],
        "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
        "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
        "playing_cards": ["🂶", "🃁", "🂻"],
    }
    frame = pd.DataFrame(columns)

    inferred = infer_type(frame, CompleteSet())

    # Only the digit strings are expected to be inferred as Integer; every
    # other column is expected to stay String.
    expected = {column: String for column in columns}
    expected["digits"] = Integer
    assert inferred == expected
示例#5
0
 def __init__(self):
     """Map each type to its summary functions and pass them to the parent.

     The mapping is handed to the parent initializer together with a fresh
     ``CompleteSet`` instance.
     """
     # Float and Integer list the same operations but keep separate list objects.
     summary_ops_by_type = {
         Boolean: [],
         Categorical: [category_summary, unique_summary],
         Complex: [infinite_summary, numerical_basic_summary, unique_summary_complex],
         DateTime: [range_summary, unique_summary],
         Date: [],
         ExistingPath: [existing_path_summary, path_summary, text_summary],
         Float: [infinite_summary, numerical_summary, zero_summary, unique_summary],
         Geometry: [],
         ImagePath: [],
         Integer: [infinite_summary, numerical_summary, zero_summary, unique_summary],
         Object: [unique_summary],
         Path: [path_summary, text_summary],
         String: [text_summary, unique_summary],
         Time: [],
         TimeDelta: [],
         UUID: [],
         URL: [url_summary, unique_summary],
         Generic: [base_summary, missing_summary],
     }
     super().__init__(summary_ops_by_type, CompleteSet())
示例#6
0
def write_circular_packing_files() -> None:
    """Export the ``CompleteSet`` base graph as a sorted tree via JSON and HTML writers."""
    graph = CompleteSet().base_graph.copy()

    # Relabel in place so every node is identified by its string form.
    labels = {node: str(node) for node in graph.nodes}
    nx.relabel_nodes(graph, labels, copy=False)

    tree = update(to_json_tree_sorted(graph, root="Generic"))

    write_json(tree)
    write_html(tree)
示例#7
0
def main():
    """Export the ``CompleteSet`` base graph as tree data via JSON and HTML writers."""
    graph = CompleteSet().base_graph.copy()

    # Relabel in place so every node is identified by its string form.
    labels = {node: str(node) for node in graph.nodes}
    nx.relabel_nodes(graph, labels, copy=False)

    tree = update(json_graph.tree_data(graph, root="Generic"))

    write_json(tree)
    write_html(tree)
示例#8
0
def test_consistency(name, series):
    """Casting must change a series exactly when inference differs from detection.

    Skipped for the NaT series on Python 3.7.
    """
    typeset = CompleteSet()

    nat_series = ("timedelta_series_nat", "date_series_nat", "timestamp_series_nat")
    if name in nat_series and sys.version_info[:2] == (3, 7):
        pytest.skip("unsupported configuration")

    detected = str(typeset.detect_type(series))
    inferred = str(typeset.infer_type(series))

    if detected == inferred:
        # Nothing to cast: the series must come back with equal content.
        assert sequences_equal(series, typeset.cast_to_inferred(series))
        return

    converted = typeset.cast_to_inferred(series.copy(deep=True))

    # A cast is visible either as a different dtype kind (only checked when
    # both sides expose a dtype) or as different values.
    both_have_dtype = hasattr(series, "dtype") and hasattr(converted, "dtype")
    assert (
        both_have_dtype and series.dtype.kind != converted.dtype.kind
    ) or not sequences_equal(series, converted)
示例#9
0
def test_side_effects(name, series):
    """Type detection and inference must leave the input series unchanged."""
    snapshot = series.copy()

    typeset = CompleteSet()
    for probe in (typeset.detect_type, typeset.infer_type):
        probe(series)

    assert sequences_equal(series, snapshot)
示例#10
0
def test_side_effects(series):
    """Type detection and inference must leave the input series unchanged."""
    snapshot = series.copy()

    typeset = CompleteSet()
    typeset.detect_series_type(series)
    typeset.infer_series_type(series)

    present_now = series.notna()
    present_before = snapshot.notna()

    # The NaN mask must be identical ...
    assert present_now.eq(present_before).all()
    # ... and so must the non-NaN values.
    assert series[present_now].eq(snapshot[present_before]).all()
示例#11
0
def test_type_cast_frame():
    """Cast a frame of multi-script strings and check two of the resulting columns."""
    frame = pd.DataFrame(
        {
            "latin": ["orange", "apple", "pear"],
            "cyrillic": ["Кириллица", "гласность", "демократија"],
            "mixed": ["Кириллица", "soep", "демократија"],
            "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
            "digits": ["01234", "121223", "12312"],
            "specials": ["$", "%^&*(", "!!!~``"],
            "whitespace": ["\t", "\n", " "],
            "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
            "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
            "playing_cards": ["🂶", "🃁", "🂻"],
        }
    )

    casted = cast_frame(frame, CompleteSet())

    # After casting, the digit strings must support integer arithmetic ...
    second_digits = casted["digits"].iloc[1]
    assert second_digits - 3 == 121220
    # ... while the latin words must still support string concatenation.
    second_latin = casted["latin"].iloc[1]
    assert second_latin + "1" == "apple1"
def generate_typeset_plots() -> None:
    """Write dot, svg and png graphs of three typesets into ``typesets/``.

    The complete, geometry and standard typesets are each written through
    ``output_graph``.
    """
    typesets_dir = Path("typesets/")
    typesets_dir.mkdir(exist_ok=True)

    # (file name template, extra keyword arguments for output_graph), in the
    # order the files are written for each typeset.
    outputs = (
        ("{name}.dot", {}),
        ("{name}.svg", {}),
        ("{name}_base.svg", {"base_only": True}),
        ("{name}.png", {"dpi": 150}),
    )

    typesets = {
        "typeset_complete": CompleteSet(),
        "typeset_geometry": GeometrySet(),
        "typeset_standard": StandardSet(),
    }

    for name, tsc in typesets.items():
        for template, options in outputs:
            tsc.output_graph(typesets_dir / template.format(name=name), **options)
示例#13
0
def test_multiple_inference(name, series):
    """Casting to the inferred type twice must give the same result as once.

    Notes:
        The series is copied before the first cast to prevent possible side
        effects; this is only needed for testing.
    """
    ts = CompleteSet()

    inferred = str(ts.infer_type(series))

    # First cast: the result must be detected as the type that was inferred.
    first_cast = ts.cast_to_inferred(copy.copy(series))
    detected_after_cast = str(ts.detect_type(first_cast))
    assert inferred == detected_after_cast

    # Second cast: neither the inferred type nor the values may change.
    second_cast = ts.cast_to_inferred(first_cast)
    inferred_after_recast = str(ts.infer_type(second_cast))
    assert detected_after_cast == inferred_after_recast
    assert sequences_equal(first_cast, second_cast)
示例#14
0
def test_multiple_inference(series):
    """Casting to the inferred type twice must give the same result as once.

    Notes:
        Deep copies are passed around to prevent possible side effects; this
        is only needed for testing.
    """
    ts = CompleteSet()

    inferred = ts.infer_type(series)

    # First cast: the result must be detected as the type that was inferred.
    first_cast = ts.cast_to_inferred(series.copy(deep=True))
    detected_after_cast = ts.detect_type(first_cast.copy(deep=True))
    assert inferred == detected_after_cast

    # Second cast: the inferred type must not change any further.
    second_cast = ts.cast_to_inferred(first_cast.copy(deep=True))
    inferred_after_recast = ts.infer_type(second_cast.copy(deep=True))
    assert detected_after_cast == inferred_after_recast

    # Same missing-value mask and same non-missing values after both casts.
    assert first_cast.isna().eq(second_cast.isna()).all()
    first_values = first_cast[first_cast.notna()]
    second_values = second_cast[second_cast.notna()]
    assert first_values.eq(second_values).all()
示例#15
0
def test_consistency(series):
    """Casting must change a series exactly when inference differs from detection.

    Skipped for the NaT series on Python 3.7.
    """
    typeset = CompleteSet()

    nat_series = ("timedelta_series_nat", "date_series_nat", "timestamp_series_nat")
    if series.name in nat_series and sys.version_info[:2] == (3, 7):
        pytest.skip("unsupported configuration")

    detected = typeset.detect_series_type(series.copy(deep=True))
    inferred = typeset.infer_series_type(series.copy(deep=True))
    converted = typeset.cast_series(series.copy(deep=True))

    def values_preserved():
        # Element-wise equality in which two missing values count as equal.
        both_missing = converted.isna() & series.isna()
        return (converted.eq(series) | both_missing).all()

    if detected != inferred:
        # A cast must show up as a different dtype kind or as different values.
        assert series.dtype.kind != converted.dtype.kind or not values_preserved()
    else:
        assert values_preserved()
示例#16
0
def test_plotting(tmp_path):
    """Smoke test: plot the ``CompleteSet`` graph into a file under ``tmp_path``."""
    output_file = tmp_path / "circular_packing_file.html"
    plot_graph_circular_packing(CompleteSet(), output_file)
示例#17
0
# Example frame: numbers with a missing value, URLs and UUIDs stored as strings.
df = pd.DataFrame(
    {
        "numbers_with_nan": [3, 7, np.nan],
        "url": [
            "http://www.cwi.nl:80/%7Eguido/Python.html",
            "https://numpy.org/",
            "https://github.com/pandas-profiling/pandas-profiling",
        ],
        "uuid": [
            "0b8a22ca-80ad-4df5-85ac-fa49c44b7ede",
            "aaa381d6-8442-4f63-88c8-7c900e9a23c6",
            "00000000-0000-0000-0000-000000000000",
        ],
    }
)

# Use the complete typeset, which includes URLs
typeset = CompleteSet()

# Step 1: detect the types as stored (no casting involved)
detected_types = detect_frame_type(df, typeset)
print(detected_types)
# {'numbers_with_nan': Float, 'url': String, 'uuid': String}

# Step 2: cast the dataframe to its inferred types
cast_df = cast_frame(df, typeset)
print(cast_df.to_string())
#    numbers_with_nan                                                url                                  uuid
# 0                 3  (http, www.cwi.nl:80, /%7Eguido/Python.html, ,...  0b8a22ca-80ad-4df5-85ac-fa49c44b7ede
# 1                 7                        (https, numpy.org, /, , , )  aaa381d6-8442-4f63-88c8-7c900e9a23c6
# 2               NaN  (https, github.com, /pandas-profiling/pandas-p...  00000000-0000-0000-0000-000000000000

# Step 3: show the types inferred for the original frame
inferred_types = infer_frame_type(df, typeset)
print(inferred_types)
示例#18
0
from pathlib import Path

from visions.typesets import CompleteSet, StandardSet, GeometrySet

# Windows Note
# Tip for Python3/64-bit compatible version of pygraphviz
# https://github.com/CristiFati/Prebuilt-Binaries/raw/master/Windows/PyGraphviz/pygraphviz-1.5-cp37-cp37m-win_amd64.whl

# Output directory for the generated graph files.
typesets_dir = Path("typesets/")
typesets_dir.mkdir(exist_ok=True)

# (file name template, extra keyword arguments for output_graph), in the order
# the files are written for each typeset: dot, svg, base-only svg, png.
outputs = (
    ("{name}.dot", {}),
    ("{name}.svg", {}),
    ("{name}_base.svg", {"base_only": True}),
    ("{name}.png", {"dpi": 150}),
)

# Instantiate every typeset up front, then write all outputs for each of them.
for name, tsc in [
    ("typeset_complete", CompleteSet()),
    ("typeset_geometry", GeometrySet()),
    ("typeset_standard", StandardSet()),
]:
    for template, options in outputs:
        tsc.output_graph(typesets_dir / template.format(name=name), **options)
        if len(nbrs) == 0:
            return []
        children_ = []
        for child in nbrs:
            d = dict(chain(G.nodes[child].items(), [("id", child)]))
            c = add_children(child, G)
            if c:
                d["children"] = c
            children_.append(d)

        children_ = sorted(children_, key=lambda x: x["id"])
        return children_

    data = dict(chain(G.nodes[root].items(), [("id", root)]))
    data["children"] = add_children(root, G)
    return data


def plot_graph_circular_packing(typeset, output_file) -> None:
    """Convert ``typeset``'s base graph to a sorted tree and write it as HTML.

    Args:
        typeset: Object exposing ``base_graph`` and ``root_node``.
        output_file: Destination forwarded to ``write_html``.
    """
    graph = typeset.base_graph.copy()

    # Relabel in place so every node is identified by its string form.
    labels = {node: str(node) for node in graph.nodes}
    nx.relabel_nodes(graph, labels, copy=False)

    tree = to_json_tree_sorted(graph, root=str(typeset.root_node))
    write_html(update(tree), output_file)


if __name__ == "__main__":
    # Script entry point: plot the complete typeset into the working directory.
    plot_graph_circular_packing(CompleteSet(), "circular_packing.html")
示例#20
0
import pytest

from visions import EmailAddress
from visions.test.series import get_series
from visions.test.series_geometry import get_geometry_series
from visions.types import Generic
from visions.typesets import CompleteSet

# Module-level typeset used by the tests in this module: the complete set
# extended with the EmailAddress type.
typeset = CompleteSet()
typeset += EmailAddress


def pytest_generate_tests(metafunc):
    """Parametrize the series-based tests with every available test series.

    Only ``test_consistency`` and ``test_traversal_mutex`` are parametrized.
    Each gets one ``(name, series)`` case per series, with the series name as
    the test id.
    """
    _test_suite = get_series()
    _test_suite.update(get_geometry_series())

    if metafunc.function.__name__ not in ("test_consistency", "test_traversal_mutex"):
        return

    cases = [
        pytest.param(name, series, id=name) for name, series in _test_suite.items()
    ]
    metafunc.parametrize(argnames=["name", "series"], argvalues=cases)


def test_consistency(name, series):
    """The type detected for ``series`` must itself contain ``series``."""
    detected_type = typeset.detect_type(series)
    message = f"Detected type {detected_type} for series {name} but {detected_type}.contains_op(series) -> False"

    is_member = series in detected_type
    assert is_member, message


def _traverse_relation_graph(series, G, node=Generic):