def test_dataframe_schema_dtype_property():
    """schema.dtype should map each column name to its dtype string alias."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(String),
            "col3": Column(DateTime),
            "col4": Column("uint16"),
        }
    )
    expected_dtypes = {
        "col1": "int64",
        "col2": "object",
        "col3": "datetime64[ns]",
        "col4": "uint16",
    }
    assert schema.dtype == expected_dtypes
def test_column_regex_matching(
        column_name_regex, expected_matches, error):
    """
    Column regex pattern matching should yield correct matches and raise
    expected errors.

    Parametrized (fixtures supplied by a decorator outside this view):
    ``column_name_regex`` is a tuple of per-level patterns, ``expected_matches``
    the column tuples that should match, and ``error`` the exception type
    expected from ``get_regex_columns`` (or None for the success path).
    """
    # two-level MultiIndex so each regex component matches one level
    columns = pd.MultiIndex.from_tuples(
        (
            ("foo_1", "biz_1"),
            ("foo_2", "baz_1"),
            ("foo_3", "baz_2"),
            ("bar_1", "biz_2"),
            ("bar_2", "biz_3"),
            ("bar_3", "biz_3"),
        )
    )
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=column_name_regex,
        regex=True,
    )
    if error is not None:
        with pytest.raises(error):
            column_schema.get_regex_columns(columns)
    else:
        matched_columns = column_schema.get_regex_columns(columns)
        assert expected_matches == matched_columns.tolist()
def _deserialize_schema(serialized_schema):
    """Reconstruct a ``DataFrameSchema`` from its serialized dict form.

    :param serialized_schema: mapping with keys ``"columns"``, ``"index"``,
        ``"coerce"`` and ``"strict"``, as produced by the matching
        serialization routine. ``"columns"``/``"index"`` may be ``None``.
    :returns: the deserialized :class:`~pandera.DataFrameSchema`.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Column, DataFrameSchema, Index, MultiIndex

    columns, index = None, None
    if serialized_schema["columns"] is not None:
        columns = {
            col_name: Column(**_deserialize_component_stats(column_stats))
            for col_name, column_stats in serialized_schema["columns"].items()
        }
    if serialized_schema["index"] is not None:
        index_stats = [
            _deserialize_component_stats(index_component)
            for index_component in serialized_schema["index"]
        ]
        # a single component maps to a plain Index, several to a MultiIndex
        if len(index_stats) == 1:
            index = Index(**index_stats[0])
        else:
            index = MultiIndex(
                indexes=[
                    Index(**index_properties)
                    for index_properties in index_stats
                ]
            )
    return DataFrameSchema(
        columns=columns,
        index=index,
        coerce=serialized_schema["coerce"],
        strict=serialized_schema["strict"],
    )
def test_dataframe_checks():
    """Tests that dataframe checks validate, error when a DataFrame doesn't
    comply with the schema, simple tests of the groupby checks which are
    covered in more detail above."""
    # dataframe-level checks receive the whole DataFrame, not a Series
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=[
            Check(lambda df: df["col1"] < df["col2"]),
            Check(lambda df: df["col3"] == df["col4"]),
        ]
    )
    df = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # test invalid schema error raising: tripling col1 breaks col1 < col2
    invalid_df = df.copy()
    invalid_df["col1"] = invalid_df["col1"] * 3
    with pytest.raises(errors.SchemaError):
        schema.validate(invalid_df)

    # test groupby checks: check functions receive a dict of group -> DataFrame
    groupby_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=[
            Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
            Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
            Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
            # grouping by two columns keys the dict with tuples
            Check(lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
                  groupby=["col3", "col4"]),
        ]
    )
    assert isinstance(groupby_check_schema.validate(df), pd.DataFrame)

    # test element-wise checks: the check function receives one row at a time
    element_wise_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=Check(lambda row: row["col1"] < row["col2"], element_wise=True)
    )
    assert isinstance(element_wise_check_schema.validate(df), pd.DataFrame)
def test_check_groups():
    """Test the ``groups`` argument of Check with groupby: validation passes
    when the named groups satisfy the check, and KeyError is raised when a
    referenced group is missing from ``groups`` or from the data."""
    # restrict the groupby dict passed to the check to the "foo" group only;
    # both list and scalar forms of `groups` are exercised
    schema = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["foo"] > 10, groupby="col2", groups=["foo"]),
            Check(lambda s: s["foo"] > 10, groupby="col2", groups="foo"),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    df = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert len(validated_df.columns) == 2
    assert set(validated_df.columns) == {"col1", "col2"}

    # raise KeyError when groups does not include a particular group name
    schema_fail_key_error = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo"),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    with pytest.raises(KeyError, match="^'bar'"):
        schema_fail_key_error.validate(df)

    # raise KeyError when the group does not exist in the groupby column when
    # referenced in the Check function
    schema_fail_nonexistent_key_in_fn = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    with pytest.raises(KeyError, match="^'baz'"):
        schema_fail_nonexistent_key_in_fn.validate(df)

    # raise KeyError when the group does not exist in the groups argument.
    schema_fail_nonexistent_key_in_groups = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    with pytest.raises(KeyError):
        schema_fail_nonexistent_key_in_groups.validate(df)
def test_dataframe_schema():
    """Validate a DataFrame against a schema covering all core dtypes, then
    verify that missing columns, failing checks, and dtype mismatches each
    raise SchemaError."""
    schema = DataFrameSchema({
        "a": Column(Int,
                    Check(lambda x: x > 0, element_wise=True)),
        "b": Column(Float,
                    Check(lambda x: 0 <= x <= 10, element_wise=True)),
        "c": Column(String,
                    Check(lambda x: set(x) == {"x", "y", "z"})),
        "d": Column(Bool,
                    Check(lambda x: x.mean() > 0.5)),
        "e": Column(Category,
                    Check(lambda x: set(x) == {"c1", "c2", "c3"})),
        "f": Column(Object,
                    Check(lambda x: x.isin([(1, ), (2, ), (3, )]))),
        "g": Column(DateTime,
                    Check(lambda x: x >= pd.Timestamp("2015-01-01"),
                          element_wise=True)),
        "i": Column(Timedelta,
                    Check(lambda x: x < pd.Timedelta(10, unit="D"),
                          element_wise=True))
    })
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": [pd.Timestamp("2015-02-01"),
              pd.Timestamp("2015-02-02"),
              pd.Timestamp("2015-02-03")],
        "i": [pd.Timedelta(1, unit="D"),
              pd.Timedelta(5, unit="D"),
              pd.Timedelta(9, unit="D")]
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # error case: a required column is missing
    with pytest.raises(errors.SchemaError):
        schema.validate(df.drop("a", axis=1))

    # error case: values fail the element-wise check on "a"
    with pytest.raises(errors.SchemaError):
        schema.validate(df.assign(a=[-1, -2, -1]))

    # checks if 'a' is converted to float, while schema says int, will a schema
    # error be thrown
    with pytest.raises(errors.SchemaError):
        schema.validate(df.assign(a=[1.7, 2.3, 3.1]))
def test_raise_warning_dataframe():
    """Checks with raise_warning=True should warn instead of raising."""
    data = pd.DataFrame({"positive_numbers": [-1, -2, -3]})

    def _is_positive(series):
        return series > 0

    error_schema = DataFrameSchema({
        "positive_numbers": Column(checks=Check(_is_positive)),
    })
    warning_schema = DataFrameSchema({
        "positive_numbers": Column(
            checks=Check(_is_positive, raise_warning=True)),
    })

    # without raise_warning the failing check raises a SchemaError
    with pytest.raises(errors.SchemaError):
        error_schema(data)
    # with raise_warning the same failure only emits a UserWarning
    with pytest.warns(UserWarning):
        warning_schema(data)
def test_schema_component_equality_operators():
    """Test the usage of == for Column, Index and MultiIndex."""
    components = [
        Column(Int, Check(lambda s: s >= 0)),
        Index(Int, [Check(lambda x: 1 <= x <= 11, element_wise=True)]),
        MultiIndex(indexes=[
            Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
            Index(String, Check(lambda s: s.isin(["foo", "bar"])),
                  name="index1"),
        ]),
    ]
    not_equal_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))})

    for component in components:
        # each component equals a deep copy of itself...
        assert component == copy.deepcopy(component)
        # ...but never a DataFrameSchema
        assert component != not_equal_schema
def test_dataframe_schema_str_repr():
    """Test the __str__ and __repr__ methods which are used for cleanly
    printing/logging of a DataFrameSchema."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(String),
            "col3": Column(DateTime),
        },
        index=Index(Int, name="my_index"),
    )

    for rendered in (schema.__str__(), schema.__repr__()):
        assert isinstance(rendered, str)
        # the class name and every column/index name must appear
        assert schema.__class__.__name__ in rendered
        for name in ("col1", "col2", "col3", "my_index"):
            assert name in rendered
def test_dataframe_schema_strict():
    """checks if strict=True whether a schema error is raised because 'a' is
    not present in the dataframe."""
    strict_schema = DataFrameSchema(
        {"a": Column(Int, nullable=True)},
        strict=True,
    )
    frame_without_a = pd.DataFrame({"b": [1, 2, 3]})
    with pytest.raises(errors.SchemaError):
        strict_schema.validate(frame_without_a)
def test_column():
    """Test that the Column object can be used to check dataframe."""
    data = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [2.0, 3.0, 4.0],
        "c": ["foo", "bar", "baz"],
    })

    # named Columns are callable and can be chained through pipe()
    checked = data
    for col_name, col_dtype in (("a", Int), ("b", Float), ("c", String)):
        checked = checked.pipe(Column(col_dtype, name=col_name))
    assert isinstance(checked, pd.DataFrame)

    # an unnamed Column cannot validate a dataframe on its own
    with pytest.raises(errors.SchemaError):
        Column(Int)(data)
def test_column_regex_matching_non_str_types(
    column_name_regex: str, expected_matches: List
) -> None:
    """Non-string column names should be cast into str for regex matching."""
    # a mix of ints, floats, negatives and a Timestamp as column labels
    columns = pd.Index([1, 2.2, 3.1415, -1, -3.6, pd.Timestamp("2018/01/01")])
    regex_column = Column(name=column_name_regex, regex=True)
    assert regex_column.get_regex_columns(columns).tolist() == expected_matches
def test_coerce_dtype_in_dataframe():
    """Tests coercions of datatypes, especially regarding nullable integers.

    `coerce` specified at the Column level and at the DataFrameSchema level
    must behave identically, and coercing nulls into an int column must fail.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # `Series.iteritems` was removed in pandas 2.0; `items` is equivalent
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

    # make sure that correct error is raised when null values are present
    # in a float column that's coerced to an int
    schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        schema.validate(df)
def validate(self):
    """ Check if the evaluation data is valid.

        The following constraints are checked:

        * CHROM has to be in ``{"1",...,"22","X","Y"}``
        * POS has to be ``> 1``
        * REF has to match with ``re.compile("^[ACGT]+$")``
        * ALT has to match with ``re.compile("^[ACGT]+$")``
        * RG has to be of type :class:`vpmbench.enums.ReferenceGenome`
        * CLASS has to be of type :class:`vpmbench.enums.PathogencityClass`
        * TYPE has to be of type :class:`vpmbench.enums.VariationType`
        * UID has to be ``> 0``

        Raises
        ------
        :class:`~pandera.errors.SchemaErrors`
            If the validation of the data fails
    """
    # valid chromosome labels: autosomes 1-22 plus the sex chromosomes
    valid_chroms = {str(autosome) for autosome in range(1, 23)} | {"X", "Y"}
    ref_validator = re.compile("^[ACGT]+$")
    alt_validator = re.compile("^[ACGT]+$")

    def _is_ref(ref):
        return ref_validator.match(ref) is not None

    def _is_alt(alt):
        return alt_validator.match(alt) is not None

    schema = DataFrameSchema({
        "CHROM": Column(
            String,
            Check(lambda chrom: chrom in valid_chroms, element_wise=True),
            required=True),
        "POS": Column(Int, Check(lambda pos: pos >= 1), required=True),
        "REF": Column(String, Check(_is_ref, element_wise=True),
                      required=True),
        "ALT": Column(String, Check(_is_alt, element_wise=True),
                      required=True),
        "CLASS": Column(
            checks=Check(lambda cl: isinstance(cl, PathogencityClass),
                         element_wise=True),
            required=True),
        "UID": Column(Int, Check(lambda x: x >= 0), required=True),
        "TYPE": Column(
            checks=Check(lambda cl: isinstance(cl, VariationType),
                         element_wise=True),
            required=True),
        "RG": Column(
            checks=Check(lambda cl: isinstance(cl, ReferenceGenome),
                         element_wise=True),
            required=True),
    })
    # lazy=True collects all failures into a single SchemaErrors report
    schema.validate(self.table, lazy=True)
def test_dataframe_schema():
    """Validate a DataFrame against a schema built from explicit PandasDtype
    members, and verify SchemaError on a missing column and on a failing
    check."""
    schema = DataFrameSchema({
        "a": Column(PandasDtype.Int,
                    Check(lambda x: x > 0)),
        "b": Column(PandasDtype.Float,
                    Check(lambda x: 0 <= x <= 10)),
        "c": Column(PandasDtype.String,
                    Check(lambda x: set(x) == {"x", "y", "z"},
                          element_wise=False)),
        "d": Column(PandasDtype.Bool,
                    Check(lambda x: x.mean() > 0.5, element_wise=False)),
        "e": Column(PandasDtype.Category,
                    Check(lambda x: set(x) == {"c1", "c2", "c3"},
                          element_wise=False)),
        "f": Column(PandasDtype.Object,
                    Check(lambda x: x.isin([(1, ), (2, ), (3, )]),
                          element_wise=False)),
        "g": Column(PandasDtype.DateTime,
                    Check(lambda x: x >= pd.Timestamp("2015-01-01"))),
        "i": Column(PandasDtype.Timedelta,
                    Check(lambda x: x < pd.Timedelta(10, unit="D")))
    })
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": [pd.Timestamp("2015-02-01"),
              pd.Timestamp("2015-02-02"),
              pd.Timestamp("2015-02-03")],
        "i": [pd.Timedelta(1, unit="D"),
              pd.Timedelta(5, unit="D"),
              pd.Timedelta(9, unit="D")]
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # error case: required column "a" is missing
    with pytest.raises(SchemaError):
        schema.validate(df.drop("a", axis=1))

    # error case: negative values fail the check on "a"
    with pytest.raises(SchemaError):
        schema.validate(df.assign(a=[-1, -2, -1]))
def test_nullable_int_in_dataframe():
    """A nullable Int column should validate NaN values, both as a float
    column and after casting the column to object dtype."""
    df = pd.DataFrame({"column1": [5, 1, np.nan]})
    null_schema = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), nullable=True)
    })
    assert isinstance(null_schema.validate(df), pd.DataFrame)

    # test case where column is an object
    object_df = df.astype({"column1": "object"})
    assert isinstance(null_schema.validate(object_df), pd.DataFrame)
def test_rename_columns():
    """Check that DataFrameSchema.rename_columns() method does it's job"""
    rename_dict = {"col1": "col1_new_name", "col2": "col2_new_name"}
    schema_original = DataFrameSchema(columns={
        "col1": Column(Int),
        "col2": Column(Float),
    })

    schema_renamed = schema_original.rename_columns(rename_dict)

    # Check if new column names are indeed present in the new schema
    for new_name in rename_dict.values():
        assert new_name in schema_renamed.columns
    # Check if original schema didn't change in the process
    for old_name in rename_dict:
        assert old_name in schema_original.columns
def init_schema_element_wise():
    """Construct a schema that combines element_wise=True with groupby —
    an invalid combination (callers assert this raises)."""
    DataFrameSchema(
        {
            "col1": Column(
                Int,
                [
                    Check(
                        lambda s: s["foo"] > 10,
                        element_wise=True,
                        groupby=["col2"],
                    ),
                ],
            ),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        }
    )
def test_coerce_not_required(data, required):
    """Test that not required columns are not coerced."""
    schema = DataFrameSchema(
        {"col": Column(int, required=required)}, coerce=True)

    # an empty frame with a required column must fail validation
    if required and data.empty:
        with pytest.raises(errors.SchemaError):
            schema(data)
        return

    schema(data)
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Tests a DataFrameSchema against a variety of Check conditions."""
    schema = DataFrameSchema({
        "a": Column(Int, Check(fn=check_function, element_wise=False)),
        "b": Column(Float, Check(fn=check_function, element_wise=False)),
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})

    if not should_fail:
        schema.validate(df)
        return
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
def test_pandas_nullable_int_dtype(dtype, coerce):
    """Test that pandas nullable int dtype can be specified in a schema."""
    # when not coercing, the frame must already carry the nullable dtype
    frame_kwargs = {} if coerce else {"dtype": dtype.str_alias}
    schemas = (
        DataFrameSchema({"col": Column(dtype, nullable=False)},
                        coerce=coerce),
        DataFrameSchema({"col": Column(dtype.str_alias, nullable=False)},
                        coerce=coerce),
    )
    for schema in schemas:
        # keep max range to 127 in order to support Int8
        df = pd.DataFrame({"col": range(128)}, **frame_kwargs)
        assert isinstance(schema.validate(df), pd.DataFrame)
def test_add_and_remove_columns():
    """Check that adding and removing columns works as expected and doesn't
    modify the original underlying DataFrameSchema."""
    schema1 = DataFrameSchema({
        "col1": Column(Int, Check(lambda s: s >= 0)),
    }, strict=True)

    # deep copies are kept to prove the originals are never mutated
    schema1_exact_copy = copy.deepcopy(schema1)

    # test that add_columns doesn't modify schema1 after add_columns:
    schema2 = schema1.add_columns({
        "col2": Column(String, Check(lambda x: x <= 0)),
        "col3": Column(Object, Check(lambda x: x == 0))
    })

    schema2_exact_copy = copy.deepcopy(schema2)

    assert schema1 == schema1_exact_copy

    # test that add_columns changed schema1 into schema2:
    expected_schema_2 = DataFrameSchema({
        "col1": Column(Int, Check(lambda s: s >= 0)),
        "col2": Column(String, Check(lambda x: x <= 0)),
        "col3": Column(Object, Check(lambda x: x == 0))
    }, strict=True)

    assert schema2 == expected_schema_2

    # test that remove_columns doesn't modify schema2:
    schema3 = schema2.remove_columns(["col2"])

    assert schema2 == schema2_exact_copy

    # test that remove_columns has removed the changes as expected:
    expected_schema_3 = DataFrameSchema({
        "col1": Column(Int, Check(lambda s: s >= 0)),
        "col3": Column(Object, Check(lambda x: x == 0))
    }, strict=True)

    assert schema3 == expected_schema_3

    # test that remove_columns can remove two columns:
    schema4 = schema2.remove_columns(["col2", "col3"])

    expected_schema_4 = DataFrameSchema({
        "col1": Column(Int, Check(lambda s: s >= 0))
    }, strict=True)

    # removing both added columns round-trips back to the original schema
    assert schema4 == expected_schema_4 == schema1
def _boolean_update_column_case(bool_kwarg):
    """Build an update_column test case that flips *bool_kwarg* from False
    to True and returns [column, column_name, update_kwargs, assert_fn]."""
    def _assert_bool_case(old_schema, new_schema):
        # the original column keeps False, the updated one has True
        assert not getattr(old_schema.columns["col"], bool_kwarg)
        assert getattr(new_schema.columns["col"], bool_kwarg)

    column = Column(Int, **{bool_kwarg: False})
    return [column, "col", {bool_kwarg: True}, _assert_bool_case]
def test_non_str_column_name_regex(column_key):
    """Check that Columns with non-str names cannot have regex=True."""
    # non-string name via the schema's column mapping
    with pytest.raises(ValueError):
        DataFrameSchema({
            column_key: Column(
                Float,
                checks=Check.greater_than_or_equal_to(0),
                regex=True,
            ),
        })
    # non-string name passed directly to the Column constructor
    with pytest.raises(ValueError):
        Column(
            Float,
            checks=Check.greater_than_or_equal_to(0),
            name=column_key,
            regex=True,
        )
def test_python_builtin_types():
    """Test support python data types can be used for validation."""
    # column name -> (builtin type for the schema, expected PandasDtype)
    expectations = {
        "int_col": (int, PandasDtype.Int),
        "float_col": (float, PandasDtype.Float),
        "str_col": (str, PandasDtype.String),
        "bool_col": (bool, PandasDtype.Bool),
    }
    schema = DataFrameSchema({
        name: Column(py_type)
        for name, (py_type, _) in expectations.items()
    })
    df = pd.DataFrame({
        "int_col": [1, 2, 3],
        "float_col": [1., 2., 3.],
        "str_col": list("abc"),
        "bool_col": [True, False, True],
    })
    assert isinstance(schema(df), pd.DataFrame)
    for name, (_, pandas_dtype) in expectations.items():
        assert schema.dtype[name] == pandas_dtype.str_alias
def test_multi_index_columns():
    """Tests that multi-index Columns within DataFrames validate correctly."""
    # column keys are tuples, matching the DataFrame's MultiIndex columns
    schema = DataFrameSchema({
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1))),
    })
    df = pd.DataFrame({
        ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
        ("zero", "bar"): ["a", "b", "c", "d"],
        ("one", "foo"): [1, 6, 4, 7],
        ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
    })
    assert isinstance(schema.validate(df), pd.DataFrame)
def init_schema_no_groupby_column():
    """Construct a schema whose check groups by a column the schema doesn't
    define (callers assert this raises)."""
    DataFrameSchema({
        "col1": Column(
            Int,
            [
                Check(lambda s: s["foo"] > 10, groupby=["col2"]),
            ],
        ),
    })
def test_column_regex_multiindex() -> None:
    """Text that column regex works on multi-index column."""
    # tuple name: one regex pattern per MultiIndex level
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=("foo_*", "baz_*"),
        regex=True,
    )
    dataframe_schema = DataFrameSchema(
        {
            ("foo_*", "baz_*"): Column(
                Int, Check(lambda s: s >= 0), regex=True
            ),
        }
    )

    data = pd.DataFrame(
        {
            ("foo_1", "biz_1"): range(10),
            ("foo_2", "baz_1"): range(10, 20),
            ("foo_3", "baz_2"): range(20, 30),
            ("bar_1", "biz_2"): range(10),
            ("bar_2", "biz_3"): range(10, 20),
            ("bar_3", "biz_3"): range(20, 30),
        }
    )
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error if tuple column name is applied to a dataframe with a
    # flat pd.Index object.
    failure_column_cases = (
        # flat index: too few levels for the 2-tuple regex name
        [f"foo_{i}" for i in range(6)],
        # 3-level MultiIndex: too many levels for the 2-tuple regex name
        pd.MultiIndex.from_tuples(
            [(f"foo_{i}", f"bar_{i}", f"baz_{i}") for i in range(6)]
        ),
    )
    for columns in failure_column_cases:
        data.columns = columns
        with pytest.raises(IndexError):
            column_schema.validate(data)
        with pytest.raises(IndexError):
            dataframe_schema.validate(data)
def test_multi_index_index() -> None:
    """Tests that multi-index Indexes within DataFrames validate correctly."""
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(
            indexes=[
                Index(Int, Check(lambda s: (s < 5) & (s >= 0)),
                      name="index0"),
                Index(
                    String,
                    Check(lambda s: s.isin(["foo", "bar"])),
                    name="index1",
                ),
            ]
        ),
    )

    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
            names=["index0", "index1"],
        ),
    )

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert schema.index.names == ["index0", "index1"]

    # failure case: -1 violates the (s < 5) & (s >= 0) check on index0
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=["index0", "index1"],
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
def test_check_groupby():
    """Test that groupby checks (column name, list of names, and callable
    forms) validate correctly and raise SchemaError on failing groups or a
    missing groupby column."""
    schema = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s["foo"] > 10, groupby="col2"),
            Check(lambda s: s["bar"] < 10, groupby=["col2"]),
            # groupby may also be a callable returning a GroupBy object
            Check(lambda s: s["foo"] > 10,
                  groupby=lambda df: df.groupby("col2")),
            Check(lambda s: s["bar"] < 10,
                  groupby=lambda df: df.groupby("col2"))
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
    })

    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 2
    assert set(df.columns) == {"col1", "col2"}

    # raise errors.SchemaError when Check fails
    df_fail_on_bar = pd.DataFrame({
        "col1": [7, 8, 20, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })
    df_fail_on_foo = pd.DataFrame({
        "col1": [7, 8, 9, 11, 1, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
    })
    # raise errors.SchemaError when groupby column doesn't exist
    df_fail_no_column = pd.DataFrame({
        "col1": [7, 8, 20, 11, 12, 13],
    })

    for df in [df_fail_on_bar, df_fail_on_foo, df_fail_no_column]:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)