示例#1
0
def test_dataframe_schema_dtype_property():
    """``schema.dtype`` should map column names to dtype strings."""
    column_specs = {
        "col1": Column(Int),
        "col2": Column(String),
        "col3": Column(DateTime),
        "col4": Column("uint16"),
    }
    expected_dtypes = {
        "col1": "int64",
        "col2": "object",
        "col3": "datetime64[ns]",
        "col4": "uint16",
    }

    schema = DataFrameSchema(columns=column_specs)

    assert schema.dtype == expected_dtypes
示例#2
0
def test_column_regex_matching(
        column_name_regex, expected_matches, error):
    """
    A regex-named Column should select ``expected_matches`` from a two-level
    column index, or raise ``error`` when one is given.
    """
    column_tuples = (
        ("foo_1", "biz_1"),
        ("foo_2", "baz_1"),
        ("foo_3", "baz_2"),
        ("bar_1", "biz_2"),
        ("bar_2", "biz_3"),
        ("bar_3", "biz_3"),
    )
    columns = pd.MultiIndex.from_tuples(column_tuples)

    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=column_name_regex,
        regex=True,
    )

    # success path: no error expected, compare the matched column labels
    if error is None:
        matched_columns = column_schema.get_regex_columns(columns)
        assert expected_matches == matched_columns.tolist()
        return

    with pytest.raises(error):
        column_schema.get_regex_columns(columns)
示例#3
0
文件: io.py 项目: seb84924/pandera
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping.

    :param serialized_schema: mapping with the keys ``"columns"``,
        ``"index"``, ``"coerce"`` and ``"strict"``. ``"columns"`` and
        ``"index"`` may be ``None``.
    :returns: the reconstructed ``DataFrameSchema``. A single index
        component becomes an ``Index``; any other number of components is
        wrapped in a ``MultiIndex``.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Column, DataFrameSchema, Index, MultiIndex

    columns = None
    column_specs = serialized_schema["columns"]
    if column_specs is not None:
        columns = {}
        for col_name, column_stats in column_specs.items():
            columns[col_name] = Column(
                **_deserialize_component_stats(column_stats))

    index = None
    index_specs = serialized_schema["index"]
    if index_specs is not None:
        # deserialize every component before constructing any Index object
        index_kwargs = [
            _deserialize_component_stats(index_component)
            for index_component in index_specs
        ]
        index_components = [
            Index(**index_properties) for index_properties in index_kwargs
        ]
        if len(index_components) == 1:
            index = index_components[0]
        else:
            index = MultiIndex(indexes=index_components)

    return DataFrameSchema(
        columns=columns,
        index=index,
        coerce=serialized_schema["coerce"],
        strict=serialized_schema["strict"],
    )
示例#4
0
def test_dataframe_checks():
    """Exercise dataframe-level checks on ``DataFrameSchema``.

    Covers a passing validation, a ``SchemaError`` for non-compliant data,
    groupby-based checks, and an element-wise (row-wise) check.
    """
    frame = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })

    wide_checks = [
        Check(lambda df: df["col1"] < df["col2"]),
        Check(lambda df: df["col3"] == df["col4"]),
    ]
    wide_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=wide_checks,
    )
    assert isinstance(wide_schema.validate(frame), pd.DataFrame)

    # tripling col1 breaks the col1 < col2 dataframe-level check
    tripled_col1 = frame.copy()
    tripled_col1["col1"] = tripled_col1["col1"] * 3
    with pytest.raises(errors.SchemaError):
        wide_schema.validate(tripled_col1)

    # groupby checks: single-column and multi-column group keys
    grouped_checks = [
        Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
        Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
        Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
        Check(lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
              groupby=["col3", "col4"]),
    ]
    groupby_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=grouped_checks,
    )
    assert isinstance(groupby_check_schema.validate(frame), pd.DataFrame)

    # element-wise check: the function receives one row at a time
    row_check = Check(
        lambda row: row["col1"] < row["col2"], element_wise=True)
    element_wise_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=row_check,
    )
    assert isinstance(
        element_wise_check_schema.validate(frame), pd.DataFrame)
示例#5
0
def test_check_groups():
    """Exercise the ``groups`` argument of groupby checks.

    ``groups`` is accepted as a list or a single string; referencing a group
    key that is not available to the check function, or listing a group that
    is absent from the data, must raise ``KeyError``.
    """
    def build_schema(col1_checks):
        # every scenario shares the same col2 definition
        return DataFrameSchema({
            "col1": Column(Int, col1_checks),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        })

    df = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    schema = build_schema([
        Check(lambda s: s["foo"] > 10, groupby="col2", groups=["foo"]),
        Check(lambda s: s["foo"] > 10, groupby="col2", groups="foo"),
    ])
    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert len(validated_df.columns) == 2
    assert set(validated_df.columns) == {"col1", "col2"}

    # the check function asks for "bar" but groups only exposes "foo"
    schema_fail_key_error = build_schema([
        Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
    ])
    with pytest.raises(KeyError, match="^'bar'"):
        schema_fail_key_error.validate(df)

    # the check function asks for "baz", which is not a value of col2
    schema_fail_nonexistent_key_in_fn = build_schema([
        Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
    ])
    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(df)

    # the groups argument itself names "baz", which is not a value of col2
    schema_fail_nonexistent_key_in_groups = build_schema([
        Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
    ])
    with pytest.raises(KeyError):
        schema_fail_nonexistent_key_in_groups.validate(df)
示例#6
0
def test_dataframe_schema():
    """Validate a frame whose columns span several dtypes.

    A compliant dataframe must pass; dropping column ``a``, giving it
    non-positive values, or giving it float values must each raise
    ``SchemaError``.
    """
    column_specs = {
        "a": Column(Int, Check(lambda x: x > 0, element_wise=True)),
        "b": Column(
            Float, Check(lambda x: 0 <= x <= 10, element_wise=True)),
        "c": Column(String, Check(lambda x: set(x) == {"x", "y", "z"})),
        "d": Column(Bool, Check(lambda x: x.mean() > 0.5)),
        "e": Column(
            Category, Check(lambda x: set(x) == {"c1", "c2", "c3"})),
        "f": Column(Object, Check(lambda x: x.isin([(1, ), (2, ), (3, )]))),
        "g": Column(
            DateTime,
            Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                  element_wise=True)),
        "i": Column(
            Timedelta,
            Check(lambda x: x < pd.Timedelta(10, unit="D"),
                  element_wise=True)),
    }
    schema = DataFrameSchema(column_specs)

    timestamps = [
        pd.Timestamp("2015-02-01"),
        pd.Timestamp("2015-02-02"),
        pd.Timestamp("2015-02-03"),
    ]
    timedeltas = [
        pd.Timedelta(1, unit="D"),
        pd.Timedelta(5, unit="D"),
        pd.Timedelta(9, unit="D"),
    ]
    valid_data = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": timestamps,
        "i": timedeltas,
    })
    assert isinstance(schema.validate(valid_data), pd.DataFrame)

    # required column "a" is missing
    with pytest.raises(errors.SchemaError):
        schema.validate(valid_data.drop("a", axis=1))

    # values of "a" fail the x > 0 check
    with pytest.raises(errors.SchemaError):
        schema.validate(valid_data.assign(a=[-1, -2, -1]))

    # "a" holds floats although the schema declares Int
    with pytest.raises(errors.SchemaError):
        schema.validate(valid_data.assign(a=[1.7, 2.3, 3.1]))
示例#7
0
def test_raise_warning_dataframe():
    """A failing check raises ``SchemaError`` by default and emits a
    ``UserWarning`` when the check is built with ``raise_warning=True``."""
    def schema_with(check):
        return DataFrameSchema({"positive_numbers": Column(checks=check)})

    # all values are negative, so the s > 0 check fails in both schemas
    data = pd.DataFrame({"positive_numbers": [-1, -2, -3]})
    error_schema = schema_with(Check(lambda s: s > 0))
    warning_schema = schema_with(Check(lambda s: s > 0, raise_warning=True))

    with pytest.raises(errors.SchemaError):
        error_schema(data)

    with pytest.warns(UserWarning):
        warning_schema(data)
示例#8
0
def test_schema_component_equality_operators():
    """``==``/``!=`` on Column, Index and MultiIndex: each component equals
    its deep copy and differs from an unrelated DataFrameSchema."""
    column = Column(Int, Check(lambda s: s >= 0))
    index = Index(Int, [Check(lambda x: 1 <= x <= 11, element_wise=True)])
    multi_index = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    not_equal_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))})

    for component in (column, index, multi_index):
        assert component == copy.deepcopy(component)
        assert component != not_equal_schema
示例#9
0
def test_dataframe_schema_str_repr():
    """``__str__`` and ``__repr__`` of a DataFrameSchema return strings that
    mention the class name and every column/index name."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(String),
            "col3": Column(DateTime),
        },
        index=Index(Int, name="my_index"),
    )
    component_names = ["col1", "col2", "col3", "my_index"]

    renderings = [schema.__str__(), schema.__repr__()]
    for rendered in renderings:
        assert isinstance(rendered, str)
        assert schema.__class__.__name__ in rendered
        for component_name in component_names:
            assert component_name in rendered
示例#10
0
def test_dataframe_schema_strict():
    """A ``strict=True`` schema declaring only column 'a' must raise a
    schema error for a dataframe that only contains column 'b'."""
    strict_schema = DataFrameSchema(
        {"a": Column(Int, nullable=True)},
        strict=True,
    )
    mismatched_frame = pd.DataFrame({"b": [1, 2, 3]})

    with pytest.raises(errors.SchemaError):
        strict_schema.validate(mismatched_frame)
示例#11
0
def test_column():
    """Named Column objects can be piped over a dataframe to validate it;
    calling a Column without a name on a dataframe raises SchemaError."""
    data = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [2.0, 3.0, 4.0],
        "c": ["foo", "bar", "baz"],
    })

    named_columns = (
        Column(Int, name="a"),
        Column(Float, name="b"),
        Column(String, name="c"),
    )

    # chain the column validators one after another over the same frame
    validated = data
    for column in named_columns:
        validated = validated.pipe(column)
    assert isinstance(validated, pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        Column(Int)(data)
示例#12
0
def test_column_regex_matching_non_str_types(
    column_name_regex: str, expected_matches: List
) -> None:
    """Regex matching should also work on non-string column labels."""
    mixed_labels = [1, 2.2, 3.1415, -1, -3.6, pd.Timestamp("2018/01/01")]
    columns = pd.Index(mixed_labels)

    column_schema = Column(name=column_name_regex, regex=True)
    matched_labels = column_schema.get_regex_columns(columns).tolist()

    assert expected_matches == matched_labels
示例#13
0
def test_coerce_dtype_in_dataframe():
    """Tests coercions of datatypes, especially regarding nullable integers.

    Coercion is exercised both per column (``Column(..., coerce=True)``) and
    for the whole schema (``DataFrameSchema(..., coerce=True)``). Coercing a
    float column that contains nulls to ``Int`` must raise ``ValueError``.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

    # make sure that correct error is raised when null values are present
    # in a float column that's coerced to an int. This check does not depend
    # on schema1/schema2, so it runs once, outside the loop, and under its
    # own name instead of rebinding the loop variable.
    null_float_to_int_schema = DataFrameSchema(
        {"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        null_float_to_int_schema.validate(df)
示例#14
0
    def validate(self):
        """ Check if the evaluation data is valid.

        The following constraints are checked on ``self.table``:

            * CHROM has to be in ``{"1",...,"22","X","Y"}``
            * POS has to be ``>= 1``
            * REF has to match with ``re.compile("^[ACGT]+$")``
            * ALT has to match with ``re.compile("^[ACGT]+$")``
            * RG has to be of type :class:`vpmbench.enums.ReferenceGenome`
            * CLASS has to be of type :class:`vpmbench.enums.PathogencityClass`
            * TYPE has to be of type :class:`vpmbench.enums.VariationType`
            * UID has to be ``>= 0``

        All columns are required. Validation runs with ``lazy=True``.

        Raises
        ------
        :class:`~pandera.errors.SchemaErrors`
            If the validation of the data fails
        """
        chroms = {str(x) for x in range(1, 23)} | {"X", "Y"}
        # REF and ALT share the same allowed alphabet, so one compiled
        # pattern serves both columns.
        allele_validator = re.compile("^[ACGT]+$")
        schema = DataFrameSchema({
            "CHROM":
            Column(String,
                   Check(lambda chrom: chrom in chroms, element_wise=True),
                   required=True),
            "POS":
            Column(Int, Check(lambda pos: pos >= 1), required=True),
            "REF":
            Column(String,
                   Check(lambda ref: allele_validator.match(ref) is not None,
                         element_wise=True),
                   required=True),
            "ALT":
            Column(String,
                   Check(lambda alt: allele_validator.match(alt) is not None,
                         element_wise=True),
                   required=True),
            "CLASS":
            Column(checks=Check(lambda cl: isinstance(cl, PathogencityClass),
                                element_wise=True),
                   required=True),
            "UID":
            Column(Int, Check(lambda x: x >= 0), required=True),
            "TYPE":
            Column(checks=Check(lambda cl: isinstance(cl, VariationType),
                                element_wise=True),
                   required=True),
            "RG":
            Column(checks=Check(lambda cl: isinstance(cl, ReferenceGenome),
                                element_wise=True),
                   required=True)
        })
        schema.validate(self.table, lazy=True)
示例#15
0
def test_dataframe_schema():
    """Validate a frame whose columns span several ``PandasDtype`` members.

    A compliant dataframe must pass; dropping column ``a`` or giving it
    non-positive values must raise ``SchemaError``.
    """
    column_specs = {
        "a": Column(PandasDtype.Int, Check(lambda x: x > 0)),
        "b": Column(PandasDtype.Float, Check(lambda x: 0 <= x <= 10)),
        "c": Column(
            PandasDtype.String,
            Check(lambda x: set(x) == {"x", "y", "z"}, element_wise=False)),
        "d": Column(
            PandasDtype.Bool,
            Check(lambda x: x.mean() > 0.5, element_wise=False)),
        "e": Column(
            PandasDtype.Category,
            Check(lambda x: set(x) == {"c1", "c2", "c3"}, element_wise=False)),
        "f": Column(
            PandasDtype.Object,
            Check(lambda x: x.isin([(1, ), (2, ), (3, )]),
                  element_wise=False)),
        "g": Column(
            PandasDtype.DateTime,
            Check(lambda x: x >= pd.Timestamp("2015-01-01"))),
        "i": Column(
            PandasDtype.Timedelta,
            Check(lambda x: x < pd.Timedelta(10, unit="D"))),
    }
    schema = DataFrameSchema(column_specs)

    timestamps = [
        pd.Timestamp("2015-02-01"),
        pd.Timestamp("2015-02-02"),
        pd.Timestamp("2015-02-03"),
    ]
    timedeltas = [
        pd.Timedelta(1, unit="D"),
        pd.Timedelta(5, unit="D"),
        pd.Timedelta(9, unit="D"),
    ]
    valid_data = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": timestamps,
        "i": timedeltas,
    })
    assert isinstance(schema.validate(valid_data), pd.DataFrame)

    # required column "a" is missing
    with pytest.raises(SchemaError):
        schema.validate(valid_data.drop("a", axis=1))

    # values of "a" fail the x > 0 check
    with pytest.raises(SchemaError):
        schema.validate(valid_data.assign(a=[-1, -2, -1]))
示例#16
0
def test_nullable_int_in_dataframe():
    """A nullable ``Int`` column containing NaN validates, both in the dtype
    pandas infers for it and after casting the column to ``object``."""
    null_schema = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), nullable=True),
    })

    frame = pd.DataFrame({"column1": [5, 1, np.nan]})
    assert isinstance(null_schema.validate(frame), pd.DataFrame)

    # same data, but stored with object dtype
    object_frame = frame.astype({"column1": "object"})
    assert isinstance(null_schema.validate(object_frame), pd.DataFrame)
示例#17
0
def test_rename_columns():
    """Check that DataFrameSchema.rename_columns() returns a renamed schema
    and leaves the original schema's column names in place."""
    schema_original = DataFrameSchema(columns={
        "col1": Column(Int),
        "col2": Column(Float),
    })
    rename_dict = {"col1": "col1_new_name", "col2": "col2_new_name"}

    schema_renamed = schema_original.rename_columns(rename_dict)

    # every column of the renamed schema carries one of the new names
    new_names = rename_dict.values()
    assert all(
        col_name in new_names for col_name in schema_renamed.columns)

    # the original schema still has every old name
    assert all(
        old_name in schema_original.columns for old_name in rename_dict)
示例#18
0
 def init_schema_element_wise():
     """Construct a schema whose col1 check combines ``element_wise=True``
     with a ``groupby`` argument. The schema object is discarded."""
     # NOTE(review): presumably invoked under pytest.raises by the enclosing
     # test to assert an initialization error - confirm against the caller.
     element_wise_groupby_check = Check(
         lambda s: s["foo"] > 10,
         element_wise=True,
         groupby=["col2"],
     )
     col1 = Column(Int, [element_wise_groupby_check])
     col2 = Column(String, Check(lambda s: s.isin(["foo", "bar"])))
     DataFrameSchema({"col1": col1, "col2": col2})
示例#19
0
def test_coerce_not_required(data, required):
    """With schema-level ``coerce=True``, validating ``data`` succeeds unless
    the column is required and ``data`` is empty, which raises SchemaError."""
    schema = DataFrameSchema(
        {"col": Column(int, required=required)},
        coerce=True,
    )

    expect_schema_error = required and data.empty
    if not expect_schema_error:
        schema(data)
        return

    with pytest.raises(errors.SchemaError):
        schema(data)
示例#20
0
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Apply ``check_function`` as a non-element-wise Check to an Int and a
    Float column, expecting SchemaError exactly when ``should_fail`` is set."""
    column_dtypes = (("a", Int), ("b", Float))
    schema = DataFrameSchema({
        name: Column(dtype, Check(fn=check_function, element_wise=False))
        for name, dtype in column_dtypes
    })
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
    })

    if not should_fail:
        schema.validate(df)
        return

    with pytest.raises(errors.SchemaError):
        schema.validate(df)
示例#21
0
def test_pandas_nullable_int_dtype(dtype, coerce):
    """A pandas nullable int dtype can be given to a schema either as the
    dtype object itself or as its string alias."""
    # without coercion the frame must already carry the target dtype
    frame_kwargs = {} if coerce else {"dtype": dtype.str_alias}

    schemas = [
        DataFrameSchema({"col": Column(dtype, nullable=False)},
                        coerce=coerce),
        DataFrameSchema({"col": Column(dtype.str_alias, nullable=False)},
                        coerce=coerce),
    ]
    for schema in schemas:
        # keep max range to 127 in order to support Int8
        data = pd.DataFrame({"col": range(128)}, **frame_kwargs)
        assert isinstance(schema.validate(data), pd.DataFrame)
示例#22
0
def test_add_and_remove_columns():
    """``add_columns`` and ``remove_columns`` must return the expected new
    schema while leaving the schema they were called on unchanged."""
    base_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    base_snapshot = copy.deepcopy(base_schema)

    # --- add_columns ------------------------------------------------------
    extended_schema = base_schema.add_columns({
        "col2": Column(String, Check(lambda x: x <= 0)),
        "col3": Column(Object, Check(lambda x: x == 0)),
    })
    extended_snapshot = copy.deepcopy(extended_schema)

    # the schema add_columns was called on is untouched
    assert base_schema == base_snapshot

    # the returned schema holds the old and the new columns
    expected_extended = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert extended_schema == expected_extended

    # --- remove_columns: one column ---------------------------------------
    without_col2 = extended_schema.remove_columns(["col2"])

    # the schema remove_columns was called on is untouched
    assert extended_schema == extended_snapshot

    expected_without_col2 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert without_col2 == expected_without_col2

    # --- remove_columns: two columns --------------------------------------
    only_col1 = extended_schema.remove_columns(["col2", "col3"])

    expected_only_col1 = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    assert only_col1 == expected_only_col1 == base_schema
示例#23
0
def _boolean_update_column_case(bool_kwarg):
    """Build a four-item case for switching a boolean Column keyword on.

    :param bool_kwarg: name of the boolean Column keyword/attribute.
    :returns: list of ``[column, column name, update kwargs, assertion]``,
        where the column starts with ``bool_kwarg`` set to False, the update
        sets it to True, and the assertion compares old and new schemas.
    """
    def _assert_bool_case(old_schema, new_schema):
        value_before = getattr(old_schema.columns["col"], bool_kwarg)
        assert not value_before
        value_after = getattr(new_schema.columns["col"], bool_kwarg)
        assert value_after

    initial_column = Column(Int, **{bool_kwarg: False})
    update_kwargs = {bool_kwarg: True}
    return [initial_column, "col", update_kwargs, _assert_bool_case]
示例#24
0
def test_non_str_column_name_regex(column_key):
    """``regex=True`` with ``column_key`` as the column name must raise
    ValueError, whether the name comes from the schema key or ``name=``."""

    def regex_column(**name_kwargs):
        return Column(
            Float,
            checks=Check.greater_than_or_equal_to(0),
            regex=True,
            **name_kwargs,
        )

    # name supplied through the DataFrameSchema dictionary key
    with pytest.raises(ValueError):
        DataFrameSchema({column_key: regex_column()})

    # name supplied directly to the Column constructor
    with pytest.raises(ValueError):
        regex_column(name=column_key)
示例#25
0
def test_python_builtin_types():
    """Python builtin types are accepted as Column dtypes and are reported
    by ``schema.dtype`` as the matching ``PandasDtype`` string alias."""
    builtin_types = {
        "int_col": int,
        "float_col": float,
        "str_col": str,
        "bool_col": bool,
    }
    schema = DataFrameSchema({
        name: Column(builtin) for name, builtin in builtin_types.items()
    })
    df = pd.DataFrame({
        "int_col": [1, 2, 3],
        "float_col": [1., 2., 3.],
        "str_col": list("abc"),
        "bool_col": [True, False, True],
    })
    assert isinstance(schema(df), pd.DataFrame)

    expected_dtypes = (
        ("int_col", PandasDtype.Int),
        ("float_col", PandasDtype.Float),
        ("str_col", PandasDtype.String),
        ("bool_col", PandasDtype.Bool),
    )
    for name, pandas_dtype in expected_dtypes:
        assert schema.dtype[name] == pandas_dtype.str_alias
示例#26
0
def test_multi_index_columns():
    """A schema keyed by tuple column names validates a dataframe whose
    columns are keyed by the same tuples."""
    column_specs = {
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1))),
    }
    schema = DataFrameSchema(column_specs)

    data = pd.DataFrame({
        ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
        ("zero", "bar"): ["a", "b", "c", "d"],
        ("one", "foo"): [1, 6, 4, 7],
        ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
    })

    validated_df = schema.validate(data)
    assert isinstance(validated_df, pd.DataFrame)
示例#27
0
 def init_schema_no_groupby_column():
     """Construct a schema whose col1 check groups by "col2" although the
     schema declares no "col2" column. The schema object is discarded."""
     # NOTE(review): presumably invoked under pytest.raises by the enclosing
     # test to assert an initialization error - confirm against the caller.
     groupby_check = Check(lambda s: s["foo"] > 10, groupby=["col2"])
     col1 = Column(Int, [groupby_check])
     DataFrameSchema({"col1": col1})
示例#28
0
def test_column_regex_multiindex() -> None:
    """Test that a tuple-valued column regex works on multi-index columns.

    Both a standalone Column and a DataFrameSchema keyed by the tuple must
    validate a two-level frame, and must raise ``IndexError`` once the
    frame's columns are flat or have three levels.
    """
    regex_name = ("foo_*", "baz_*")
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=regex_name,
        regex=True,
    )
    dataframe_schema = DataFrameSchema({
        regex_name: Column(Int, Check(lambda s: s >= 0), regex=True),
    })
    schemas = (column_schema, dataframe_schema)

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    for schema in schemas:
        assert isinstance(schema.validate(data), pd.DataFrame)

    # A two-element tuple name cannot be matched against flat column labels
    # or against a column index with a different number of levels.
    flat_columns = [f"foo_{i}" for i in range(6)]
    three_level_columns = pd.MultiIndex.from_tuples(
        [(f"foo_{i}", f"bar_{i}", f"baz_{i}") for i in range(6)]
    )
    for columns in (flat_columns, three_level_columns):
        data.columns = columns
        for schema in schemas:
            with pytest.raises(IndexError):
                schema.validate(data)
示例#29
0
def test_multi_index_index() -> None:
    """A schema with a two-level ``MultiIndex`` validates a matching
    dataframe and raises SchemaError when an index value fails its check."""
    index_schema = MultiIndex(
        indexes=[
            Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
            Index(
                String,
                Check(lambda s: s.isin(["foo", "bar"])),
                name="index1",
            ),
        ]
    )
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=index_schema,
    )

    level_names = ["index0", "index1"]
    labels = ["foo", "bar", "foo", "bar", "foo"]
    values = [0.1, 0.5, 123.1, 10.6, 22.31]
    df = pd.DataFrame(
        data={"column1": values, "column2": values},
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], labels],
            names=level_names,
        ),
    )

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert schema.index.names == ["index0", "index1"]

    # failure case: -1 in index0 violates the s >= 0 check
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], labels],
        names=level_names,
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
示例#30
0
def test_check_groupby():
    """Exercise groupby checks given as a column name, a list of column
    names, and a callable returning a groupby object.

    Data satisfying every group condition must validate; data violating the
    "bar" or "foo" condition, or lacking the groupby column, must raise
    ``SchemaError``.
    """
    col1_checks = [
        Check(lambda s: s["foo"] > 10, groupby="col2"),
        Check(lambda s: s["bar"] < 10, groupby=["col2"]),
        Check(lambda s: s["foo"] > 10,
              groupby=lambda df: df.groupby("col2")),
        Check(lambda s: s["bar"] < 10,
              groupby=lambda df: df.groupby("col2")),
    ]
    schema = DataFrameSchema({
        "col1": Column(Int, col1_checks),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    group_labels = ["bar", "bar", "bar", "foo", "foo", "foo"]

    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": group_labels,
    })
    validated = schema.validate(df_pass)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}

    failing_frames = [
        # 20 in the "bar" group violates s["bar"] < 10
        pd.DataFrame({
            "col1": [7, 8, 20, 11, 12, 13],
            "col2": group_labels,
        }),
        # 1 in the "foo" group violates s["foo"] > 10
        pd.DataFrame({
            "col1": [7, 8, 9, 11, 1, 13],
            "col2": group_labels,
        }),
        # the groupby column "col2" is missing altogether
        pd.DataFrame({
            "col1": [7, 8, 20, 11, 12, 13],
        }),
    ]
    for failing_frame in failing_frames:
        with pytest.raises(errors.SchemaError):
            schema.validate(failing_frame)