예제 #1
0
def tests_multi_index_subindex_coerce():
    """Verify that a higher-level ``coerce`` wins over per-level settings.

    Four ``Index(String)`` components with alternating ``coerce`` flags are
    validated against a four-level MultiIndex built from integer values.
    """
    level_schemas = [
        Index(String, coerce=flag) for flag in (True, False, True, False)
    ]
    frame = pd.DataFrame(
        index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4)
    )

    # Turning coerce on at either the MultiIndex or the DataFrameSchema is
    # expected to take precedence over the flags of the individual levels.
    overriding_schemas = [
        DataFrameSchema(index=MultiIndex(level_schemas, coerce=True)),
        DataFrameSchema(index=MultiIndex(level_schemas), coerce=True),
    ]
    for overriding_schema in overriding_schemas:
        coerced = overriding_schema(frame)
        for level in range(coerced.index.nlevels):
            level_dtype = coerced.index.get_level_values(level).dtype
            assert level_dtype == "object"

    # With no coerce override, lazy validation is expected to collect
    # exactly two schema errors.
    plain_schema = DataFrameSchema(index=MultiIndex(level_schemas))
    expected_message = "A total of 2 schema errors were found"
    with pytest.raises(errors.SchemaErrors, match=expected_message):
        plain_schema(frame, lazy=True)
예제 #2
0
def test_add_and_remove_columns():
    """Exercise ``add_columns`` and ``remove_columns`` on a strict schema.

    Each operation is checked from two angles: the schema it is called on
    must still equal a deep copy taken beforehand, and the schema it returns
    must equal one built directly with the expected columns.
    """
    base = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    base_snapshot = copy.deepcopy(base)

    # add_columns must hand back a new schema and leave ``base`` untouched.
    extended = base.add_columns(
        {
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        }
    )
    extended_snapshot = copy.deepcopy(extended)
    assert base == base_snapshot

    # The returned schema carries the original column plus the two new ones.
    expected_extended = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert extended == expected_extended

    # remove_columns must likewise leave the schema it was called on alone.
    without_col2 = extended.remove_columns(["col2"])
    assert extended == extended_snapshot

    expected_without_col2 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert without_col2 == expected_without_col2

    # Removing both added columns at once leads back to the base schema.
    only_col1 = extended.remove_columns(["col2", "col3"])
    expected_only_col1 = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))}, strict=True
    )
    assert only_col1 == expected_only_col1
    assert expected_only_col1 == base
예제 #3
0
def test_check_function_decorator_transform():
    """Check that a schema ``transformer`` is applied by ``check_input``.

    The input schema only declares ``column1`` but its transformer adds
    ``column2``; the output schema requires both columns.
    """

    def _add_column2(df):
        return df.assign(column2="foo")

    in_schema = DataFrameSchema(
        {"column1": Column(Int)},
        transformer=_add_column2,
    )
    out_schema = DataFrameSchema({
        "column1": Column(Int),
        "column2": Column(String),
    })

    # Dataframe passed as the only argument.
    @check_input(in_schema)
    @check_output(out_schema)
    def func_input_transform1(df):
        return df

    single_result = func_input_transform1(
        pd.DataFrame({"column1": [1, 2, 3]})
    )
    assert "column2" in single_result

    # Dataframe passed as the second argument and returned as the second
    # element of a tuple; both decorators are told to look at position 1.
    @check_input(in_schema, 1)
    @check_output(out_schema, 1)
    def func_input_transform2(_, df):
        return _, df

    tuple_result = func_input_transform2(
        None, pd.DataFrame({"column1": [1, 2, 3]})
    )
    assert "column2" in tuple_result[1]
예제 #4
0
def test_two_sample_ttest_hypothesis_relationships():
    """Check which ``relationship`` values ``two_sample_ttest`` accepts.

    Every entry of ``Hypothesis.RELATIONSHIPS`` must produce a schema, while
    a handful of unsupported values must raise ``SchemaInitError``.
    """

    def _build_schema(relationship):
        # The schema is identical each time; only the relationship varies.
        ttest = Hypothesis.two_sample_ttest(
            sample1="M",
            sample2="F",
            groupby="sex",
            relationship=relationship,
            alpha=0.5,
        )
        return DataFrameSchema({
            "height_in_feet": Column(Float, [ttest]),
            "sex": Column(String),
        })

    for supported in Hypothesis.RELATIONSHIPS:
        assert isinstance(_build_schema(supported), DataFrameSchema)

    for unsupported in ["foo", "bar", 1, 2, 3, None]:
        with pytest.raises(errors.SchemaInitError):
            _build_schema(unsupported)
예제 #5
0
def test_coerce_dtype_nullable_str():
    """Test how null values are handled when coercing to a string dtype.

    Two frames are used whose last two values are null, once as ``np.nan``
    and once as ``None``. A non-nullable string column must reject both; a
    nullable one must keep the nulls and yield ``str`` for the other values.
    """
    # dataframes with columns where the last two values are null
    df_nans = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", np.nan, np.nan],
    })
    df_nones = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", None, None],
    })
    null_frames = [df_nans, df_nones]

    non_nullable_schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=False)
    })
    # Use one ``pytest.raises`` block per dataframe. With the loop inside a
    # single block, the first SchemaError ends the block and the remaining
    # dataframes are never validated.
    for df in null_frames:
        with pytest.raises(errors.SchemaError):
            non_nullable_schema.validate(df)

    schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=True)
    })

    for df in null_frames:
        validated_df = schema.validate(df)
        assert isinstance(validated_df, pd.DataFrame)
        # the two trailing nulls must survive coercion as nulls
        assert pd.isna(validated_df["col"].iloc[-1])
        assert pd.isna(validated_df["col"].iloc[-2])
        for i in range(4):
            assert isinstance(validated_df["col"].iloc[i], str)
예제 #6
0
def test_check_function_decorator_errors():
    """Check the errors raised by ``check_input`` and ``check_output``.

    Both decorators are expected to raise ``SchemaError`` with a message
    that starts by naming the decorator that failed.
    """
    input_error = r"^error in check_input decorator of function"
    output_error = r"^error in check_output decorator of function"

    # case 1: the input schema wants "column1" while the output schema wants
    # "column2", so a frame can satisfy at most one of the two decorators.
    @check_input(DataFrameSchema({"column1": Column(Int)}))
    @check_output(DataFrameSchema({"column2": Column(Float)}))
    def test_func(df):
        return df

    frame_without_column1 = pd.DataFrame({"column2": ["a", "b", "c"]})
    with pytest.raises(errors.SchemaError, match=input_error):
        test_func(frame_without_column1)

    frame_without_column2 = pd.DataFrame({"column1": [1, 2, 3]})
    with pytest.raises(errors.SchemaError, match=output_error):
        test_func(frame_without_column2)

    # case 2: the decorator points at positional index 1, but the function
    # only takes a single argument; this must fail with the same readable
    # check_input error.
    @check_input(DataFrameSchema({"column1": Column(Int)}), 1)
    def test_incorrect_check_input_index(df):
        return df

    with pytest.raises(errors.SchemaError, match=input_error):
        test_incorrect_check_input_index(
            pd.DataFrame({"column1": [1, 2, 3]})
        )
예제 #7
0
def test_column_regex_non_str_types() -> None:
    """Check regex column matching on frames with non-string column keys.

    Covers a flat column index mixing int, float, Timestamp and string keys,
    and a MultiIndex column case with tuple keys.
    """
    mixed_keys = (
        1,
        2.2,
        pd.Timestamp("2018/01/01"),
        "foo_1",
        "foo_2",
        "foo_3",
    )
    flat_frame = pd.DataFrame({key: [1, 2, 3] for key in mixed_keys})

    patterns = ("foo_", r"\d+", r"\d+\.\d+", "2018-01-01")
    flat_schema = DataFrameSchema(
        columns={
            pattern: Column(Int, Check.gt(0), regex=True)
            for pattern in patterns
        },
    )
    validated = flat_schema.validate(flat_frame)
    assert isinstance(validated, pd.DataFrame)

    # MultiIndex column case: tuple keys, one regex per level.
    tuple_keys = ((1, 1), (2.2, 4.5), ("foo", "bar"))
    multi_frame = pd.DataFrame({key: [1, 2, 3] for key in tuple_keys})
    multi_schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    multi_schema.validate(multi_frame)
예제 #8
0
def test_coerce_dtype_in_dataframe():
    """Test dtype coercion, especially regarding nullable integers.

    ``coerce`` is switched on once per Column and once on the whole
    DataFrameSchema; both variants must produce the same dtypes. Coercing a
    float column that contains a null to ``Int`` must raise ``ValueError``.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # Iterate the values directly: ``Series.iteritems`` was removed in
        # pandas 2.0 and the index labels were never used here anyway.
        for value in result.column3:
            assert pd.isna(value) or isinstance(value, str)

    # make sure that correct error is raised when null values are present
    # in a float column that's coerced to an int. This does not depend on
    # the schemas above, so it runs once, outside the loop, under its own
    # name instead of rebinding the loop variable.
    null_int_schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        null_int_schema.validate(df)
예제 #9
0
def test_one_sample_hypothesis():
    """Check the one-sample t-test on a whole column and on one group.

    Both schemas test that ``height_in_feet`` is greater than a population
    mean of 5 at ``alpha=0.1``; the second restricts the sample to group "A".
    """
    heights = pd.DataFrame({
        "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
        "group": ["A", "A", "B", "B", "A"],
    })

    # Arguments shared by both hypothesis checks.
    ttest_kwargs = {
        "popmean": 5,
        "relationship": "greater_than",
        "alpha": 0.1,
    }

    whole_column_schema = DataFrameSchema({
        "height_in_feet": Column(
            Float, [Hypothesis.one_sample_ttest(**ttest_kwargs)]
        ),
    })

    group_a_check = Hypothesis.one_sample_ttest(
        sample="A", groupby="group", **ttest_kwargs
    )
    group_a_schema = DataFrameSchema({
        "group": Column(String),
        "height_in_feet": Column(Float, [group_a_check]),
    })

    whole_column_schema.validate(heights)
    group_a_schema.validate(heights)
예제 #10
0
def test_schema_equality_operators():
    """Exercise ``==`` / ``!=`` on the three schema classes.

    Covers DataFrameSchema, SeriesSchema and SeriesSchemaBase: each must
    equal a deep copy of itself and differ from an unrelated schema, and a
    DataFrameSchema must not care about the order of its columns.
    """
    reference = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda s: s >= 2)),
        },
        strict=True,
    )
    reordered = DataFrameSchema(
        {
            "col2": Column(String, Check(lambda s: s >= 2)),
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )
    unrelated = DataFrameSchema({"col1": Column(String)}, strict=False)

    series_schema = SeriesSchema(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    series_schema_base = SeriesSchemaBase(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )

    # DataFrameSchema comparisons
    assert reference == copy.deepcopy(reference)
    assert reference != unrelated
    assert reference == reordered

    # SeriesSchema comparisons
    assert series_schema == copy.deepcopy(series_schema)
    assert series_schema != unrelated

    # SeriesSchemaBase comparisons
    assert series_schema_base == copy.deepcopy(series_schema_base)
    assert series_schema_base != unrelated
예제 #11
0
def tests_multi_index_subindex_coerce():
    """Check how per-level and schema-level ``coerce`` settings interact.

    Four ``Index(String)`` components with alternating ``coerce`` flags are
    validated against an integer-valued four-level MultiIndex, first with
    ``coerce=False`` on the DataFrameSchema and then with ``coerce=True``.
    """
    indexes = [
        Index(String, coerce=True),
        Index(String, coerce=False),
        Index(String, coerce=True),
        Index(String, coerce=False),
    ]

    data = pd.DataFrame(index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4))

    schema = DataFrameSchema(index=MultiIndex(indexes), coerce=False)
    validated_df = schema(data)
    for level_i in range(validated_df.index.nlevels):
        level_dtype = validated_df.index.get_level_values(level_i).dtype
        if indexes[level_i].coerce:
            assert level_dtype == indexes[level_i].dtype
        else:
            # levels that are not coerced are expected to report "object"
            assert level_dtype == "object"

    # coerce=True on the schema should override subindex coerce setting
    schema_override = DataFrameSchema(index=MultiIndex(indexes), coerce=True)
    validated_df_override = schema_override(data)
    # Take the level count from the frame under test; the original looped
    # over the levels of the earlier ``validated_df`` instead.
    for level_i in range(validated_df_override.index.nlevels):
        level_dtype = \
            validated_df_override.index.get_level_values(level_i).dtype
        assert level_dtype == indexes[level_i].dtype
예제 #12
0
def test_dataframe_checks():
    """Exercise dataframe-level checks on a DataFrameSchema.

    Covers a passing frame, a frame that violates a check, checks that use
    ``groupby`` and an element-wise (row by row) check.
    """
    frame = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })

    cross_column_checks = [
        Check(lambda df: df["col1"] < df["col2"]),
        Check(lambda df: df["col3"] == df["col4"]),
    ]
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=cross_column_checks,
    )
    assert isinstance(schema.validate(frame), pd.DataFrame)

    # Tripling col1 breaks the col1 < col2 check.
    broken_frame = frame.copy()
    broken_frame["col1"] = broken_frame["col1"] * 3
    with pytest.raises(errors.SchemaError):
        schema.validate(broken_frame)

    # Checks that receive groups keyed by one or by two columns.
    grouped_checks = [
        Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
        Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
        Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
        Check(
            lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
            groupby=["col3", "col4"],
        ),
    ]
    groupby_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=grouped_checks,
    )
    assert isinstance(groupby_check_schema.validate(frame), pd.DataFrame)

    # Element-wise check: the function is given a row at a time.
    row_check = Check(
        lambda row: row["col1"] < row["col2"], element_wise=True
    )
    element_wise_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=row_check,
    )
    assert isinstance(
        element_wise_check_schema.validate(frame), pd.DataFrame
    )
예제 #13
0
def test_coerce_without_dtype():
    """Requesting coercion for a column without a dtype must be rejected.

    ``SchemaInitError`` is expected whether ``coerce=True`` is given on the
    Column or on the DataFrameSchema.
    """
    invalid_builders = (
        # coerce requested on the dtype-less column itself
        lambda: DataFrameSchema({"col": Column(coerce=True)}),
        # coerce requested on the schema holding a dtype-less column
        lambda: DataFrameSchema({"col": Column()}, coerce=True),
    )
    for build_schema in invalid_builders:
        with pytest.raises(errors.SchemaInitError):
            build_schema()
예제 #14
0
def test_check_groups():
    """Exercise ``groupby`` together with ``groups`` on column checks.

    All schemas share the same two columns; only the checks attached to
    ``col1`` differ between the cases.
    """

    def _schema_with_col1_checks(col1_checks):
        return DataFrameSchema({
            "col1": Column(Int, col1_checks),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        })

    frame = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    # ``groups`` may be given as a list or as a single group name.
    passing_schema = _schema_with_col1_checks([
        Check(lambda s: s["foo"] > 10, groupby="col2", groups=["foo"]),
        Check(lambda s: s["foo"] > 10, groupby="col2", groups="foo"),
    ])
    validated = passing_schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}

    # KeyError: the check reads group "bar" but only "foo" was requested.
    schema_fail_key_error = _schema_with_col1_checks([
        Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
    ])
    with pytest.raises(KeyError, match="^'bar'"):
        schema_fail_key_error.validate(frame)

    # KeyError: the check reads "baz", a value that col2 never takes.
    schema_fail_nonexistent_key_in_fn = _schema_with_col1_checks([
        Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
    ])
    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(frame)

    # KeyError: ``groups`` itself names "baz", a value col2 never takes.
    schema_fail_nonexistent_key_in_groups = _schema_with_col1_checks([
        Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
    ])
    with pytest.raises(KeyError):
        schema_fail_nonexistent_key_in_groups.validate(frame)
예제 #15
0
def test_dataframe_hypothesis_checks():
    """Run dataframe-level Hypothesis checks built from scipy tests.

    A two-sample and a one-sample t-test must validate on the sample frame,
    and grouping by a column that the frame does not have must raise
    ``SchemaDefinitionError``.
    """

    def _greater_one_sided(stat, pvalue, alpha=0.01):
        # positive statistic and halved p-value below alpha
        return stat > 0 and pvalue / 2 < alpha

    columns = {
        "col1": Column(Int),
        "col2": Column(Int),
    }
    frame = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    two_sample = Hypothesis(
        test=stats.ttest_ind,
        samples=["col1", "col2"],
        relationship=_greater_one_sided,
        relationship_kwargs={"alpha": 0.5},
    )
    one_sample = Hypothesis(
        test=stats.ttest_1samp,
        samples=["col1"],
        relationship=_greater_one_sided,
        test_kwargs={"popmean": 50},
        relationship_kwargs={"alpha": 0.01},
    )
    hypothesis_check_schema = DataFrameSchema(
        columns=dict(columns),
        checks=[two_sample, one_sample],
    )
    hypothesis_check_schema.validate(frame)

    # "col3" is not a column of the frame, so grouping by it must fail.
    two_sample_grouped = Hypothesis(
        test=stats.ttest_ind,
        samples=["col1", "col2"],
        groupby="col3",
        relationship=_greater_one_sided,
        relationship_kwargs={"alpha": 0.5},
    )
    hypothesis_check_schema_groupby = DataFrameSchema(
        columns=dict(columns),
        checks=[two_sample_grouped],
    )
    with pytest.raises(errors.SchemaDefinitionError):
        hypothesis_check_schema_groupby.validate(frame)
예제 #16
0
def test_dataframe_hypothesis_checks():
    """Validate a frame against Hypothesis checks set on the schema.

    The first schema combines a two-sample and a one-sample t-test and must
    validate. The second adds ``groupby="col3"``, a column the frame lacks,
    and must raise ``SchemaDefinitionError``.
    """

    def _stat_positive_and_significant(stat, pvalue, alpha=0.01):
        return stat > 0 and pvalue / 2 < alpha

    sample_frame = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    passing_checks = [
        # two-sample test
        Hypothesis(
            test=stats.ttest_ind,
            samples=["col1", "col2"],
            relationship=_stat_positive_and_significant,
            relationship_kwargs={"alpha": 0.5},
        ),
        # one-sample test
        Hypothesis(
            test=stats.ttest_1samp,
            samples=["col1"],
            relationship=_stat_positive_and_significant,
            test_kwargs={"popmean": 50},
            relationship_kwargs={"alpha": 0.01},
        ),
    ]
    passing_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col2": Column(Int)},
        checks=passing_checks,
    )
    passing_schema.validate(sample_frame)

    # Same two-sample test, now grouped by a column that does not exist.
    grouped_checks = [
        Hypothesis(
            test=stats.ttest_ind,
            samples=["col1", "col2"],
            groupby="col3",
            relationship=_stat_positive_and_significant,
            relationship_kwargs={"alpha": 0.5},
        ),
    ]
    grouped_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col2": Column(Int)},
        checks=grouped_checks,
    )
    with pytest.raises(errors.SchemaDefinitionError):
        grouped_schema.validate(sample_frame)
예제 #17
0
def test_no_dtype_dataframe():
    """Check nullability handling for a column declared without a dtype."""
    # non-nullable column, no nulls in the data: passes
    strict_schema = DataFrameSchema({"col": Column(nullable=False)})
    complete = pd.DataFrame({"col": [-123.1, -76.3, 1.0]})
    assert isinstance(strict_schema.validate(complete), pd.DataFrame)

    # nullable column, one null in the data: passes
    lenient_schema = DataFrameSchema({"col": Column(nullable=True)})
    with_null = pd.DataFrame({"col": [-123.1, None, 1.0]})
    assert isinstance(lenient_schema.validate(with_null), pd.DataFrame)

    # non-nullable column, one null in the data: must be rejected
    with pytest.raises(errors.SchemaError):
        DataFrameSchema({"col": Column(nullable=False)}).validate(
            pd.DataFrame({"col": [-123.1, None, 1.0]})
        )
예제 #18
0
def test_check_input_method_decorators():
    """Check ``check_input`` / ``check_output`` on instance methods.

    Every method receives a frame matching the input schema and returns a
    frame with an extra ``column2`` that the output schema requires. The
    methods differ in how ``check_input`` is told where the frame is.
    """
    in_schema = DataFrameSchema({"column1": Column(String)})
    out_schema = DataFrameSchema({"column2": Column(Int)})
    dataframe = pd.DataFrame({"column1": ["a", "b", "c"]})

    def _transform_helper(df):
        return df.assign(column2=[1, 2, 3])

    class TransformerClass:
        """Methods covering the supported ways of locating the dataframe."""
        # pylint: disable=E0012,C0111,C0116,W0613, R0201
        # missing-function-docstring: the methods are self-explanatory
        # unused-argument: the extra positional argument is the point of
        # the test
        # no-self-use: grouping the methods on a class keeps the test tidy

        # no location given
        @check_input(in_schema)
        @check_output(out_schema)
        def transform_first_arg(self, df):
            return _transform_helper(df)

        # positional index 0
        @check_input(in_schema, 0)
        @check_output(out_schema)
        def transform_first_arg_with_list_getter(self, df):
            return _transform_helper(df)

        # positional index 1
        @check_input(in_schema, 1)
        @check_output(out_schema)
        def transform_secord_arg_with_list_getter(self, x, df):
            return _transform_helper(df)

        # argument name
        @check_input(in_schema, "df")
        @check_output(out_schema)
        def transform_secord_arg_with_dict_getter(self, x, df):
            return _transform_helper(df)

    transformer = TransformerClass()
    invocations = [
        (transformer.transform_first_arg, (dataframe,)),
        (transformer.transform_first_arg_with_list_getter, (dataframe,)),
        (
            transformer.transform_secord_arg_with_list_getter,
            (None, dataframe),
        ),
        (
            transformer.transform_secord_arg_with_dict_getter,
            (None, dataframe),
        ),
    ]
    for method, args in invocations:
        result_df = method(*args)
        assert isinstance(result_df, pd.DataFrame)
        assert "column2" in result_df.columns
예제 #19
0
def test_dataframe_schema_check():
    """Check that DataFrameSchema-level Checks accept several result shapes.

    The three check functions return, in turn, the frame comparison reduced
    with ``.all()``, a comparison on a single column, and a comparison on
    the whole frame.
    """
    frame = pd.DataFrame([range(10) for _ in range(10)])

    check_functions = (
        lambda df: (df < 10).all(),
        lambda df: df[0] < 10,
        lambda df: df < 10,
    )
    for check_fn in check_functions:
        schema = DataFrameSchema(checks=Check(check_fn))
        assert isinstance(schema.validate(frame), pd.DataFrame)
예제 #20
0
def test_dataframe_checks():
    """Validate frames against schemas with dataframe-level checks.

    Steps: a frame that satisfies two cross-column checks, a modified frame
    that violates one of them, a schema with ``groupby`` checks, and a
    schema with an element-wise check.
    """
    valid_frame = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })

    full_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=[
            Check(lambda df: df["col1"] < df["col2"]),
            Check(lambda df: df["col3"] == df["col4"]),
        ],
    )
    validated = full_schema.validate(valid_frame)
    assert isinstance(validated, pd.DataFrame)

    # Scaling col1 by three makes it exceed col2, so validation must fail.
    invalid_frame = valid_frame.copy()
    invalid_frame["col1"] = invalid_frame["col1"] * 3
    with pytest.raises(errors.SchemaError):
        full_schema.validate(invalid_frame)

    # groupby checks, keyed by one column and by a pair of columns
    by_col3 = "col3"
    groupby_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=[
            Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby=by_col3),
            Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby=by_col3),
            Check(
                lambda g: g["foo"]["col3"].iat[0] == "foo",
                groupby=by_col3,
            ),
            Check(
                lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
                groupby=["col3", "col4"],
            ),
        ],
    )
    validated = groupby_schema.validate(valid_frame)
    assert isinstance(validated, pd.DataFrame)

    # element-wise check applied to each row
    element_wise_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=Check(
            lambda row: row["col1"] < row["col2"],
            element_wise=True,
        ),
    )
    validated = element_wise_schema.validate(valid_frame)
    assert isinstance(validated, pd.DataFrame)
예제 #21
0
def test_no_dtype_dataframe():
    """Check ``nullable`` on a column that has no dtype specified.

    A frame without nulls passes a non-nullable column, a frame with a null
    passes a nullable column, and a frame with a null fails a non-nullable
    column with ``SchemaError``.
    """
    no_null_frame = pd.DataFrame({"col": [-123.1, -76.3, 1.0]})
    non_nullable = DataFrameSchema({"col": Column(nullable=False)})
    result = non_nullable.validate(no_null_frame)
    assert isinstance(result, pd.DataFrame)

    nullable = DataFrameSchema({"col": Column(nullable=True)})
    result = nullable.validate(pd.DataFrame({"col": [-123.1, None, 1.0]}))
    assert isinstance(result, pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        rejecting = DataFrameSchema({"col": Column(nullable=False)})
        null_frame = pd.DataFrame({"col": [-123.1, None, 1.0]})
        rejecting.validate(null_frame)
예제 #22
0
def test_python_builtin_types():
    """Check that Python builtin types can be used as column dtypes.

    After validating a matching frame, the schema's reported dtype for each
    column must equal the ``str_alias`` of the corresponding PandasDtype.
    """
    builtin_by_column = {
        "int_col": int,
        "float_col": float,
        "str_col": str,
        "bool_col": bool,
        "object_col": object,
        "complex_col": complex,
    }
    schema = DataFrameSchema(
        {name: Column(builtin) for name, builtin in builtin_by_column.items()}
    )

    frame = pd.DataFrame(
        {
            "int_col": [1, 2, 3],
            "float_col": [1.0, 2.0, 3.0],
            "str_col": list("abc"),
            "bool_col": [True, False, True],
            "object_col": [[1], 1, {"foo": "bar"}],
            "complex_col": [complex(1), complex(2), complex(3)],
        }
    )
    assert isinstance(schema(frame), pd.DataFrame)

    expected_dtypes = (
        ("int_col", PandasDtype.Int),
        ("float_col", PandasDtype.Float),
        ("str_col", PandasDtype.Str),
        ("bool_col", PandasDtype.Bool),
        ("object_col", PandasDtype.Object),
        ("complex_col", PandasDtype.Complex),
    )
    for column_name, pandas_dtype in expected_dtypes:
        assert schema.dtype[column_name] == pandas_dtype.str_alias
예제 #23
0
def test_schema_get_dtype():
    """Check ``dtype`` and ``get_dtype`` on a schema with a regex column.

    ``dtype`` must warn once and report only the non-regex column, while
    ``get_dtype`` must list every frame column that the schema covers.
    """
    schema = DataFrameSchema({
        "col1": Column(Int),
        "var*": Column(Float, regex=True),
    })

    # The property is read exactly once inside the block so that exactly
    # one warning is recorded.
    with pytest.warns(UserWarning) as record:
        assert schema.dtype == {"col1": Int.str_alias}
    assert len(record) == 1
    warning_text = record[0].message.args[0]
    assert warning_text.startswith(
        "Schema has columns specified as regex column names:")

    frame = pd.DataFrame({
        "col1": [1, 2, 3],
        "var1": [1.0, 1.1, 1.2],
        "var2": [1.0, 1.1, 1.2],
        "var3": [1.0, 1.1, 1.2],
    })
    expected_dtypes = {"col1": Int.str_alias}
    for regex_match in ("var1", "var2", "var3"):
        expected_dtypes[regex_match] = Float.str_alias
    assert schema.get_dtype(frame) == expected_dtypes
예제 #24
0
def test_required():
    """Check how ``required=False`` columns are treated during validation.

    ``col1`` is optional and ``col2`` is left at the default. Frames with
    and without ``col1`` must validate unchanged in their column set, and a
    frame that lacks ``col2`` must be rejected.
    """
    schema = DataFrameSchema({
        "col1": Column(Int, required=False),
        "col2": Column(String),
    })

    accepted_cases = (
        # optional column absent
        (pd.DataFrame({"col2": ["hello", "world"]}), {"col2"}),
        # optional column present
        (
            pd.DataFrame({"col1": [1, 2], "col2": ["hello", "world"]}),
            {"col1", "col2"},
        ),
    )
    for frame, expected_columns in accepted_cases:
        validated = schema.validate(frame)
        assert isinstance(validated, pd.DataFrame)
        assert len(validated.columns) == len(expected_columns)
        assert set(validated.columns) == expected_columns

    # col2 is missing here, so validation must raise.
    missing_col2 = pd.DataFrame({"col1": [1, 2]})
    with pytest.raises(Exception):
        schema.validate(missing_col2)
예제 #25
0
def test_column_regex_strict() -> None:
    """Check regex columns together with ``strict=True``.

    Columns matched by a regex pattern count as declared; a column that no
    pattern covers must make strict validation fail.
    """
    frame = pd.DataFrame(
        {name: [1, 2, 3] for name in ("foo_1", "foo_2", "foo_3")}
    )
    strict_schema = DataFrameSchema(
        columns={"foo_*": Column(Int, regex=True)},
        strict=True,
    )
    assert isinstance(strict_schema.validate(frame), pd.DataFrame)

    # an extra column in the frame is expected to raise an error
    frame = frame.assign(bar=[1, 2, 3])
    with pytest.raises(errors.SchemaError):
        strict_schema.validate(frame)

    # with a second regex column added to the schema, the frame (plus a
    # "bar_1" column) is expected to pass the strictness test
    extended_schema = strict_schema.add_columns(
        {"bar_*": Column(Int, regex=True)}
    )
    frame_with_bar_1 = frame.assign(bar_1=[1, 2, 3])
    validated = extended_schema.validate(frame_with_bar_1)
    assert isinstance(validated, pd.DataFrame)
예제 #26
0
def test_multi_index_columns() -> None:
    """Validate a frame whose columns are keyed by two-level tuples."""
    frame = pd.DataFrame(
        {
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
        }
    )

    # One Column per tuple key, each with a check fitting the data above.
    in_unit_interval = Check(lambda s: (s > 0) & (s < 1))
    in_letters = Check(lambda s: s.isin(["a", "b", "c", "d"]))
    below_ten = Check(lambda s: (s > 0) & (s < 10))
    is_new_year_2019 = Check(lambda s: s == pd.Timestamp(2019, 1, 1))
    schema = DataFrameSchema(
        {
            ("zero", "foo"): Column(Float, in_unit_interval),
            ("zero", "bar"): Column(String, in_letters),
            ("one", "foo"): Column(Int, below_ten),
            ("one", "bar"): Column(DateTime, is_new_year_2019),
        }
    )

    assert isinstance(schema.validate(frame), pd.DataFrame)
예제 #27
0
 def init_schema_no_groupby_column():
     """Build a schema whose check groups by a column the schema lacks.

     Only "col1" is declared, while its Check groups by "col2".
     NOTE(review): presumably passed to an exception assertion by the
     enclosing test, which is outside this view -- confirm.
     """
     groupby_check = Check(lambda s: s["foo"] > 10, groupby=["col2"])
     DataFrameSchema({"col1": Column(Int, [groupby_check])})
예제 #28
0
def test_datetime():
    """Check that datetime columns are validated by ``schema.validate``.

    The column check requires the earliest timestamp to be later than 2015.
    """
    after_2015 = Check(lambda s: s.min() > pd.Timestamp("2015"))
    schema = DataFrameSchema(
        columns={"col": Column(pa.DateTime, checks=after_2015)}
    )

    recent_dates = pd.to_datetime(["2019/01/01", "2018/05/21", "2016/03/10"])
    validated_df = schema.validate(pd.DataFrame({"col": recent_dates}))
    assert isinstance(validated_df, pd.DataFrame)

    # A date in 2010 violates the check.
    with pytest.raises(SchemaError):
        too_early = pd.DataFrame({"col": pd.to_datetime(["2010/01/01"])})
        schema.validate(too_early)
예제 #29
0
def test_check_groupby_multiple_columns():
    """Check a column Check that groups by two other columns.

    ``col1`` is grouped by ``col2`` and ``col3``; the check reads the group
    keyed ``("bar", True)`` and expects its values to sum to 16.
    """
    frame = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    # rows 0 and 2 form the ("bar", True) group: 7 + 9 == 16
    bar_true_sum_check = Check(
        lambda s: s[("bar", True)].sum() == 16,
        groupby=["col2", "col3"],
    )
    schema = DataFrameSchema({
        "col1": Column(Int, [bar_true_sum_check]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3": Column(Bool),
    })

    validated = schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 3
    assert set(validated.columns) == {"col1", "col2", "col3"}
예제 #30
0
def test_column_in_dataframe_schema():
    """Check that validating with an element-wise Column check returns a
    DataFrame."""
    positive = Check(lambda x: x > 0, element_wise=True)
    schema = DataFrameSchema({"a": Column(Int, positive)})
    frame = pd.DataFrame({"a": [1, 2, 3]})
    validated = schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)