Пример #1
0
def test_recoding_reject(recoding_data):
    """Run ``describe`` with the recoded check enabled and verify the outcome.

    One of the variables ``x``/``y`` must be typed as recoded and reference
    the other through ``correlation_var``, while the other one stays
    categorical. The summary table must match the expected counts.
    """
    config["check_recoded"] = True
    config["correlations"]["recoded"] = True
    results = describe(recoding_data)

    var_y = results["variables"]["y"]
    var_x = results["variables"]["x"]

    # Either variable may be the one flagged as recoded; accept both outcomes.
    y_recoded_x_cat = (var_y["type"] == Variable.S_TYPE_RECODED
                       and var_x["type"] == Variable.TYPE_CAT)
    x_recoded_y_cat = (var_x["type"] == Variable.S_TYPE_RECODED
                       and var_y["type"] == Variable.TYPE_CAT)
    assert y_recoded_x_cat or x_recoded_y_cat, "Type is wrong"

    # The flagged variable has to point at its counterpart.
    y_refers_to_x = ("correlation_var" in var_y
                     and var_y["correlation_var"] == "x")
    x_refers_to_y = ("correlation_var" in var_x
                     and var_x["correlation_var"] == "y")
    assert y_refers_to_x or x_refers_to_y, "Values should be equal"

    expected_table = {
        "n_cells_missing": 0.0,
        Variable.S_TYPE_UNIQUE.value: 0,
        Variable.S_TYPE_CONST.value: 0,
        "nvar": 2,
        Variable.S_TYPE_REJECTED.value: 1,
        "n": 8,
        Variable.S_TYPE_RECODED.value: 1,
        Variable.S_TYPE_CORR.value: 0,
        Variable.TYPE_DATE.value: 0,
        Variable.TYPE_NUM.value: 0,
        Variable.TYPE_CAT.value: 1,
        "n_duplicates": 5,
    }
    for key, expected in expected_table.items():
        assert results["table"][key] == expected, "recoding error {}".format(
            key)
Пример #2
0
def test_describe_df(describe_data, expected_results):
    """Check the output of ``describe`` for a complete data frame.

    The low-categorical threshold is set to 0, the ``somedate`` column is
    converted to datetime, and the resulting description is compared with
    ``expected_results`` column by column.

    Args:
        describe_data: column name -> values mapping used to build the frame
            (fixture defined elsewhere).
        expected_results: column name -> {statistic: expected value}. An
            expected value equal to ``check_is_NaN`` means the statistic must
            be absent from the results for that column.
    """
    config["low_categorical_threshold"].set(0)
    describe_data_frame = pd.DataFrame(describe_data)
    describe_data_frame["somedate"] = pd.to_datetime(
        describe_data_frame["somedate"])

    results = describe(describe_data_frame)

    assert {
        "table",
        "variables",
        "correlations",
        "missing",
        "messages",
        "package",
    } == set(results.keys()), "Not in results"

    expected_table = {
        "CAT": 1,
        "CONST": 2,
        "DATE": 1,
        "NUM": 2,
        "UNIQUE": 2,
        "BOOL": 4,
        "REJECTED": 2,
        "RECODED": 0,
        "CORR": 0,
        "UNSUPPORTED": 3,
        "n": 9,
        "nvar": 15,
        "n_cells_missing": 6,
        "n_duplicates": 0,
    }
    # The table may carry extra entries; only the expected ones are checked.
    assert set(expected_table.items()).issubset(set(
        results["table"].items())), "Variable analysis failed"

    # Loop over variables
    for col in describe_data:
        col_results = results["variables"][col]
        for k, v in expected_results[col].items():
            if v == check_is_NaN:
                # Sentinel: the statistic must not be reported at all.
                assert (
                    k not in col_results
                ), "Value {} for key {} in column {} is not NaN".format(
                    col_results[k], k, col)
            elif isinstance(v, float):
                # Floats are compared with a tolerance.
                assert (
                    pytest.approx(v) == col_results[k]
                ), ("Value {} for key {} in column {} is not approximately "
                    "equal to expected {}").format(col_results[k], k, col, v)
            else:
                assert (
                    v == col_results[k]
                ), ("Value {} for key {} in column {} is not equal to "
                    "expected {}").format(col_results[k], k, col, v)

        if col_results["type"].value in ["NUM", "DATE"]:
            assert ("histogramdata" in col_results
                    ), "Mini-histogram missing for column {} ".format(col)
def test_describe_df(describe_data, expected_results):
    """Check the output of ``describe("title", frame)`` for a complete data frame.

    The numeric low-categorical threshold is set to 0, the ``somedate``
    column is converted to datetime, and the resulting description is
    compared with ``expected_results`` column by column.

    Args:
        describe_data: column name -> values mapping used to build the frame
            (fixture defined elsewhere).
        expected_results: column name -> {statistic: expected value}. An
            expected value equal to ``check_is_NaN`` means the statistic must
            be absent from the results for that column.
    """
    config["vars"]["num"]["low_categorical_threshold"].set(0)
    describe_data_frame = pd.DataFrame(describe_data)
    describe_data_frame["somedate"] = pd.to_datetime(
        describe_data_frame["somedate"])

    results = describe("title", describe_data_frame)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Not in results"

    assert {
        "BOOL": 5,
        "CAT": 3,
        "UNSUPPORTED": 4,
        "NUM": 2,
        "DATE": 1
    } == results["table"]["types"], "Variable analysis failed"

    # Loop over variables
    for col in describe_data:
        col_results = results["variables"][col]
        for k, v in expected_results[col].items():
            if v == check_is_NaN:
                # Sentinel: the statistic must not be reported at all.
                assert (
                    k not in col_results
                ), "Value `{}` for key `{}` in column `{}` is not NaN".format(
                    col_results[k], k, col)
            elif isinstance(v, float):
                # Floats are compared with a tolerance.
                assert (
                    pytest.approx(v) == col_results[k]
                ), ("Value `{}` for key `{}` in column `{}` is not "
                    "approximately equal to expected `{}`").format(
                        col_results[k], k, col, v)
            else:
                assert (
                    v == col_results[k]
                ), ("Value `{}` for key `{}` in column `{}` is not equal "
                    "to expected `{}`").format(col_results[k], k, col, v)

        if col_results["type"].value in ["NUM", "DATE"]:
            assert ("histogram" in col_results
                    ), "Histogram missing for column {} ".format(col)
Пример #4
0
def test_describe_df(column, describe_data, expected_results, summarizer):
    """Check the output of ``describe`` for a single-column data frame.

    A fresh ``Settings`` object with the numeric low-categorical threshold
    set to 0 is used. The frame contains only ``column``; the ``somedate``
    column is converted to datetime first.

    Args:
        column: name of the column under test.
        describe_data: column name -> values mapping (fixture defined
            elsewhere).
        expected_results: column name -> {statistic: expected value}. An
            expected value equal to ``check_is_NaN`` means the statistic must
            be absent from the results for that column.
        summarizer: summarizer passed through to ``describe``.
    """
    config = Settings()
    config.vars.num.low_categorical_threshold = 0

    typeset = ProfilingTypeSet(config)

    describe_data_frame = pd.DataFrame({column: describe_data[column]})
    if column == "somedate":
        describe_data_frame["somedate"] = pd.to_datetime(
            describe_data_frame["somedate"]
        )

    results = describe(config, describe_data_frame, summarizer, typeset)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Not in results"

    column_results = results["variables"][column]

    # Loop over the expected statistics; each kind of expectation gets a
    # failure message that describes the check that was actually performed.
    for k, v in expected_results[column].items():
        if v == check_is_NaN:
            # Sentinel: the statistic must not be reported at all.
            assert (
                k not in column_results
            ), f"Value `{column_results[k]}` for key `{k}` in column `{column}` is not NaN"
        elif isinstance(v, float):
            # Floats are compared with a tolerance.
            assert pytest.approx(v) == column_results[k], (
                f"Value `{column_results[k]}` for key `{k}` in column "
                f"`{column}` is not approximately equal to expected `{v}`"
            )
        else:
            assert v == column_results[k], (
                f"Value `{column_results[k]}` for key `{k}` in column "
                f"`{column}` is not equal to expected `{v}`"
            )

    if column_results["type"] in ["Numeric", "DateTime"]:
        assert (
            "histogram" in column_results
        ), f"Histogram missing for column {column}"
Пример #5
0
def test_cramers_reject(recoding_data):
    """Run ``describe`` with the Cramér's check enabled and verify the outcome.

    An extra row (``x="chat"``, ``y="dog"``) is appended to the fixture
    before describing it. One of ``x``/``y`` must be typed as correlated and
    reference the other through ``correlation_var``, while the other one
    stays categorical. The summary table must match the expected counts.
    """
    recoding_data.loc[len(recoding_data)] = {"x": "chat", "y": "dog"}
    config["check_correlation_cramers"] = True
    config["correlation_threshold_cramers"] = 0.1
    config["correlations"]["cramers"] = True
    results = describe(recoding_data)

    var_y = results["variables"]["y"]
    var_x = results["variables"]["x"]

    # Dict ordering is not preserved in Python 3.5 and not guaranteed in
    # Python 3.6, so either variable may be the one flagged as correlated.
    y_corr_x_cat = (
        var_y["type"] == Variable.S_TYPE_CORR
        and var_x["type"] == Variable.TYPE_CAT
    )
    x_corr_y_cat = (
        var_x["type"] == Variable.S_TYPE_CORR
        and var_y["type"] == Variable.TYPE_CAT
    )
    assert y_corr_x_cat or x_corr_y_cat, "Type is wrong"

    # The flagged variable has to point at its counterpart.
    y_refers_to_x = (
        "correlation_var" in var_y and var_y["correlation_var"] == "x"
    )
    x_refers_to_y = (
        "correlation_var" in var_x and var_x["correlation_var"] == "y"
    )
    assert y_refers_to_x or x_refers_to_y, "Values should be equal"

    expected_table = {
        "n_cells_missing": 0.0,
        Variable.S_TYPE_UNIQUE.value: 0,
        Variable.S_TYPE_CONST.value: 0,
        "nvar": 2,
        Variable.S_TYPE_REJECTED.value: 1,
        "n": 9,
        Variable.S_TYPE_RECODED.value: 0,
        Variable.S_TYPE_CORR.value: 1,
        Variable.TYPE_DATE.value: 0,
        Variable.TYPE_NUM.value: 0,
        Variable.TYPE_CAT.value: 1,
        "n_duplicates": 5,
    }
    for key, expected in expected_table.items():
        assert results["table"][key] == expected, "recoding error {}".format(
            key
        )
def test_describe_list():
    """Describing a plain list must raise AttributeError and emit a UserWarning."""
    not_a_frame = [1, 2, 3]
    # Equivalent to nesting the warning check inside the exception check.
    with pytest.raises(AttributeError), pytest.warns(UserWarning):
        describe("", not_a_frame)
def test_describe_empty():
    """Describing a data frame without rows or columns must raise ValueError."""
    frame_without_data = pd.DataFrame()
    with pytest.raises(ValueError):
        describe("", frame_without_data)
Пример #8
0
def test_describe_list():
    """Passing a plain list to ``describe`` must raise TypeError."""
    not_a_frame = [1, 2, 3]
    with pytest.raises(TypeError):
        describe(not_a_frame)
Пример #9
0
def test_describe_list(summarizer, typeset):
    """Describing a plain list must raise AttributeError and emit a UserWarning."""
    settings = Settings()
    not_a_frame = [1, 2, 3]

    # The warning check sits inside the exception check, as with a combined
    # ``with`` statement.
    with pytest.raises(AttributeError):
        with pytest.warns(UserWarning):
            describe(settings, "", not_a_frame, summarizer, typeset)
Пример #10
0
def test_describe_list(summarizer, typeset):
    """Describing a plain list must raise AttributeError and emit a UserWarning."""
    not_a_frame = [1, 2, 3]
    # Equivalent to nesting the warning check inside the exception check.
    with pytest.raises(AttributeError), pytest.warns(UserWarning):
        describe("", not_a_frame, summarizer, typeset)
Пример #11
0
def test_describe_empty(summarizer, typeset):
    """Describing a data frame without rows or columns must raise ValueError."""
    frame_without_data = pd.DataFrame()
    with pytest.raises(ValueError):
        describe("", frame_without_data, summarizer, typeset)
Пример #12
0
def test_describe_list(summarizer, typeset):
    """Describing a plain list with default settings must raise NotImplementedError."""
    settings = Settings()
    not_a_frame = [1, 2, 3]

    with pytest.raises(NotImplementedError):
        describe(settings, "", not_a_frame, summarizer, typeset)