def test_multi_index_index():
    """Test that a MultiIndex index is validated and that a check failure on
    an index level raises a SchemaError."""
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(indexes=[
            Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
            Index(String, Check(lambda s: s.isin(["foo", "bar"])),
                  name="index1"),
        ]))

    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
            names=["index0", "index1"],
        ))

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)

    # failure case: -1 violates the index0 check (s >= 0)
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=["index0", "index1"],
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
def test_one_sample_hypothesis():
    """Check one sample ttest."""
    schema = DataFrameSchema({
        "height_in_feet": Column(
            Float, [
                Hypothesis.one_sample_ttest(
                    popmean=5,
                    relationship="greater_than",
                    alpha=0.1),
            ]),
    })

    subset_schema = DataFrameSchema({
        "group": Column(String),
        "height_in_feet": Column(
            Float, [
                Hypothesis.one_sample_ttest(
                    sample="A",
                    groupby="group",
                    popmean=5,
                    relationship="greater_than",
                    alpha=0.1),
            ]),
    })

    df = pd.DataFrame({
        "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
        "group": ["A", "A", "B", "B", "A"],
    })
    schema.validate(df)
    subset_schema.validate(df)
def test_column_regex_non_str_types() -> None:
    """Check that column name regex matching excludes non-string types."""
    data = pd.DataFrame(
        {
            1: [1, 2, 3],
            2.2: [1, 2, 3],
            pd.Timestamp("2018/01/01"): [1, 2, 3],
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={
            "foo_": Column(Int, Check.gt(0), regex=True),
            r"\d+": Column(Int, Check.gt(0), regex=True),
            r"\d+\.\d+": Column(Int, Check.gt(0), regex=True),
            "2018-01-01": Column(Int, Check.gt(0), regex=True),
        },
    )
    assert isinstance(schema.validate(data), pd.DataFrame)

    # test MultiIndex column case
    data = pd.DataFrame(
        {
            (1, 1): [1, 2, 3],
            (2.2, 4.5): [1, 2, 3],
            ("foo", "bar"): [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    schema.validate(data)
def test_lazy_dataframe_validation_nullable():
    """
    Test that non-nullable column failure cases are correctly processed
    during lazy validation.
    """
    schema = DataFrameSchema(
        columns={
            "int_column": Column(Int, nullable=False),
            "float_column": Column(Float, nullable=False),
            "str_column": Column(String, nullable=False),
        },
        strict=True,
    )

    df = pd.DataFrame({
        "int_column": [1, None, 3],
        "float_column": [0.1, 1.2, None],
        "str_column": [None, "foo", "bar"],
    })

    try:
        schema.validate(df, lazy=True)
    except errors.SchemaErrors as err:
        # all failure cases are null values
        assert err.failure_cases.failure_case.isna().all()
        for col, index in [
            ("int_column", 1),
            ("float_column", 2),
            ("str_column", 0),
        ]:
            # pylint: disable=cell-var-from-loop
            assert (
                err.failure_cases.loc[
                    lambda df: df.column == col, "index"
                ].iloc[0]
                == index
            )
    else:
        pytest.fail("SchemaErrors not raised")
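# Hedged companion sketch (not in the original suite): what err.failure_cases
# holds in the test above. It is a DataFrame with one row per failing element;
# the test relies on its "failure_case", "column", and "index" columns, though
# the full set of columns may vary across pandera versions.
def _demo_inspect_failure_cases():
    schema = DataFrameSchema({"int_column": Column(Int, nullable=False)})
    try:
        schema.validate(pd.DataFrame({"int_column": [1, None, 3]}), lazy=True)
    except errors.SchemaErrors as err:
        print(err.failure_cases[["column", "failure_case", "index"]])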
def test_coerce_dtype_in_dataframe():
    """Tests coercion of datatypes, especially regarding nullable integers."""
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })
    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

    # make sure that the correct error is raised when null values are present
    # in a float column that's coerced to an int
    schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        schema.validate(df)
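# Hedged background sketch (plain pandas, not part of the original tests) for
# the ValueError above: NaN has no representation in a plain numpy int dtype,
# so the cast fails; pandas' nullable "Int64" extension dtype (assumed
# available in the installed pandas version) is the usual workaround.
def _demo_nan_to_int_coercion():
    s = pd.Series([1.0, 1.0, np.nan])
    try:
        s.astype(int)  # raises: cannot convert non-finite values to integer
    except ValueError as exc:
        print(f"plain int cast fails: {exc}")
    print(s.astype("Int64"))  # nullable integer dtype keeps the null as <NA>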
def test_column_regex_multiindex():
    """Test that column regex works on a multi-index column."""
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=("foo_*", "baz_*"),
        regex=True,
    )
    dataframe_schema = DataFrameSchema({
        ("foo_*", "baz_*"): Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error if a tuple column name is applied to a dataframe with a
    # flat pd.Index object.
    failure_column_cases = (
        ["foo_%s" % i for i in range(6)],
        pd.MultiIndex.from_tuples([
            ("foo_%s" % i, "bar_%s" % i, "baz_%s" % i) for i in range(6)
        ]),
    )
    for columns in failure_column_cases:
        data.columns = columns
        with pytest.raises(IndexError):
            column_schema.validate(data)
        with pytest.raises(IndexError):
            dataframe_schema.validate(data)
def _validate_score_table(variant_information_table: DataFrame,
                          score_table: DataFrame):
    """ Validate the results of the prioritization method.

    The following constraints are checked:

    * Each UID from the variant_information_table is also in the score_table
    * Each SCORE in the score_table is a numerical value

    Parameters
    ----------
    variant_information_table :
        The variant information table
    score_table :
        The scoring results from the prioritization method

    Raises
    ------
    :class:`~pandera.errors.SchemaErrors`
        If the validation of the data fails
    """
    variants_uid = variant_information_table["UID"]
    schema = DataFrameSchema({
        "UID": Column(
            Int,
            Check(lambda x: variants_uid.isin(x) & x.isin(variants_uid)),
            required=True),
        "SCORE": Column(Float, coerce=True, required=True)
    })
    schema.validate(score_table, lazy=True)
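# Hedged usage sketch for _validate_score_table (toy data, not part of the
# original code): the UID check above requires the UIDs of both tables to
# match in both directions, so a score table containing a UID that is absent
# from the variant information table fails lazy validation.
def _demo_validate_score_table():
    variant_information_table = pd.DataFrame({"UID": [1, 2, 3]})
    score_table = pd.DataFrame({"UID": [1, 2, 3], "SCORE": [0.1, 0.9, 0.5]})
    _validate_score_table(variant_information_table, score_table)  # passes

    unknown_uid_scores = pd.DataFrame(
        {"UID": [1, 2, 3, 4], "SCORE": [0.1, 0.9, 0.5, 0.7]})
    with pytest.raises(errors.SchemaErrors):
        _validate_score_table(variant_information_table, unknown_uid_scores)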
def test_required():
    """Tests that a column with required=False is optional: validation passes
    whether or not it is present, while a column that is implicitly required
    (the default) raises an error when missing."""
    schema = DataFrameSchema({
        "col1": Column(Int, required=False),
        "col2": Column(String)
    })

    df_ok_1 = pd.DataFrame({
        "col2": ['hello', 'world']
    })

    df = schema.validate(df_ok_1)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 1
    assert set(df.columns) == {"col2"}

    df_ok_2 = pd.DataFrame({
        "col1": [1, 2],
        "col2": ['hello', 'world']
    })

    df = schema.validate(df_ok_2)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 2
    assert set(df.columns) == {"col1", "col2"}

    df_not_ok = pd.DataFrame({
        "col1": [1, 2]
    })

    with pytest.raises(Exception):
        schema.validate(df_not_ok)
def test_dataframe_schema_strict():
    """Checks that validation of a strict schema raises a SchemaError: column
    "a" defined in the schema is missing from the dataframe, and column "b"
    in the dataframe is not defined in the schema."""
    schema = DataFrameSchema({"a": Column(Int, nullable=True)}, strict=True)
    df = pd.DataFrame({"b": [1, 2, 3]})
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
def test_datetime():
    """Test datetime types can be validated properly by schema.validate"""
    schema = DataFrameSchema(
        columns={
            "col": Column(
                pa.DateTime,
                checks=Check(lambda s: s.min() > pd.Timestamp("2015")),
            )
        }
    )

    validated_df = schema.validate(
        pd.DataFrame(
            {"col": pd.to_datetime(["2019/01/01", "2018/05/21", "2016/03/10"])}
        )
    )
    assert isinstance(validated_df, pd.DataFrame)

    with pytest.raises(SchemaError):
        schema.validate(
            pd.DataFrame(
                {"col": pd.to_datetime(["2010/01/01"])}
            )
        )
def test_column_regex_strict() -> None:
    """Test that Column regex patterns are correctly parsed in a strict
    DataFrameSchema."""
    data = pd.DataFrame(
        {
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    schema = DataFrameSchema(
        columns={"foo_*": Column(Int, regex=True)},
        strict=True,
    )
    assert isinstance(schema.validate(data), pd.DataFrame)

    # adding an extra column to the dataframe should cause an error
    data = data.assign(bar=[1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(data)

    # adding an extra regex column to the schema should pass the strictness
    # test
    validated_data = schema.add_columns(
        {"bar_*": Column(Int, regex=True)}
    ).validate(data.assign(bar_1=[1, 2, 3]))
    assert isinstance(validated_data, pd.DataFrame)
def test_column_regex():
    """Test that column regex works on a single-level column index."""
    column_schema = Column(
        Int, Check(lambda s: s >= 0), name="foo_*", regex=True)
    dataframe_schema = DataFrameSchema({
        "foo_*": Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        "foo_1": range(10),
        "foo_2": range(10, 20),
        "foo_3": range(20, 30),
        "bar_1": range(10),
        "bar_2": range(10, 20),
        "bar_3": range(20, 30),
    })
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error on the multi-index column case
    data.columns = pd.MultiIndex.from_tuples(
        (
            ("foo_1", "biz_1"),
            ("foo_2", "baz_1"),
            ("foo_3", "baz_2"),
            ("bar_1", "biz_2"),
            ("bar_2", "biz_3"),
            ("bar_3", "biz_3"),
        )
    )
    with pytest.raises(IndexError):
        column_schema.validate(data)
    with pytest.raises(IndexError):
        dataframe_schema.validate(data)
def test_dataframe_checks():
    """Tests that dataframe-level checks validate passing data, raise an
    error when a DataFrame doesn't comply with the schema, and support the
    groupby and element-wise variants."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=[
            Check(lambda df: df["col1"] < df["col2"]),
            Check(lambda df: df["col3"] == df["col4"]),
        ],
    )
    df = pd.DataFrame({
        "col1": [1, 2, 3],
        "col2": [2.0, 3.0, 4.0],
        "col3": ["foo", "bar", "baz"],
        "col4": ["foo", "bar", "baz"],
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # test invalid schema error raising
    invalid_df = df.copy()
    invalid_df["col1"] = invalid_df["col1"] * 3
    with pytest.raises(errors.SchemaError):
        schema.validate(invalid_df)

    # test groupby checks
    groupby_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=[
            Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
            Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
            Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
            Check(
                lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
                groupby=["col3", "col4"],
            ),
        ],
    )
    assert isinstance(groupby_check_schema.validate(df), pd.DataFrame)

    # test element-wise checks
    element_wise_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=Check(lambda row: row["col1"] < row["col2"],
                     element_wise=True),
    )
    assert isinstance(element_wise_check_schema.validate(df), pd.DataFrame)
def test_check_groupby():
    """Tests uses of groupby to specify dependencies between one column and a
    single other column, including error handling."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int, [
                Check(lambda s: s["foo"] > 10, groupby="col2"),
                Check(lambda s: s["bar"] < 10, groupby=["col2"]),
                Check(lambda s: s["foo"] > 10,
                      groupby=lambda df: df.groupby("col2")),
                Check(lambda s: s["bar"] < 10,
                      groupby=lambda df: df.groupby("col2")),
            ]),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        },
        index=Index(Int, name="data_id"),
    )

    df_pass = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 2
    assert set(df.columns) == {"col1", "col2"}

    # raise errors.SchemaError when Check fails
    df_fail_on_bar = pd.DataFrame(
        data={
            "col1": [7, 8, 20, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )
    df_fail_on_foo = pd.DataFrame(
        data={
            "col1": [7, 8, 9, 11, 1, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )
    # raise errors.SchemaError when groupby column doesn't exist
    df_fail_no_column = pd.DataFrame(
        data={
            "col1": [7, 8, 20, 11, 12, 13],
        },
        index=pd.Series([1, 2, 3, 4, 5, 6], name="data_id"),
    )

    for df in [df_fail_on_bar, df_fail_on_foo, df_fail_no_column]:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
def test_dataframe_hypothesis_checks():
    """
    Test that two specific implementations of a Hypothesis work as expected
    and that using a Column that wasn't defined will error.
    """
    df = pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    })

    hypothesis_check_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                relationship_kwargs={"alpha": 0.5},
            ),
            # one-sample test
            Hypothesis(
                test=stats.ttest_1samp,
                samples=["col1"],
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                test_kwargs={"popmean": 50},
                relationship_kwargs={"alpha": 0.01},
            ),
        ],
    )
    hypothesis_check_schema.validate(df)

    # raise error when using groupby for a column that doesn't exist
    hypothesis_check_schema_groupby = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=[
            # two-sample test
            Hypothesis(
                test=stats.ttest_ind,
                samples=["col1", "col2"],
                groupby="col3",
                relationship=lambda stat, pvalue, alpha=0.01: (
                    stat > 0 and pvalue / 2 < alpha
                ),
                relationship_kwargs={"alpha": 0.5},
            ),
        ],
    )
    with pytest.raises(errors.SchemaDefinitionError):
        hypothesis_check_schema_groupby.validate(df)
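# Hedged sketch (not in the original suite): the hand-rolled two-sample
# Hypothesis above can also be phrased with pandera's built-in
# Hypothesis.two_sample_ttest wrapper, assuming its wide-form usage at the
# DataFrameSchema level; relationship="greater_than" corresponds to the
# relationship lambda used in the test.
def _demo_two_sample_ttest_builtin():
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Int),
        },
        checks=Hypothesis.two_sample_ttest(
            sample1="col1",
            sample2="col2",
            relationship="greater_than",
            alpha=0.5,
        ),
    )
    schema.validate(pd.DataFrame({
        "col1": range(100, 201),
        "col2": range(0, 101),
    }))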
def test_pickling(int_dataframe: pd.DataFrame, schema: DataFrameSchema):
    """Test for a non-empty pickled object."""
    try:
        schema.validate(int_dataframe, lazy=True)
    except SchemaErrors as exc:
        # expect non-empty bytes
        assert pickle.dumps(exc)
    else:
        pytest.fail("SchemaErrors not raised")
def test_nullable_int_in_dataframe():
    """Test that a nullable Int column validates when nulls are present."""
    df = pd.DataFrame({"column1": [5, 1, np.nan]})
    null_schema = DataFrameSchema(
        {"column1": Column(Int, Check(lambda x: x > 0), nullable=True)})
    assert isinstance(null_schema.validate(df), pd.DataFrame)

    # test case where the column has an object dtype
    df = df.astype({"column1": "object"})
    assert isinstance(null_schema.validate(df), pd.DataFrame)
def validate(self):
    """ Check if the evaluation data is valid.

    The following constraints are checked:

    * CHROM has to be in ``{"1",...,"22","X","Y"}``
    * POS has to be ``>= 1``
    * REF has to match with ``re.compile("^[ACGT]+$")``
    * ALT has to match with ``re.compile("^[ACGT]+$")``
    * RG has to be of type :class:`vpmbench.enums.ReferenceGenome`
    * CLASS has to be of type :class:`vpmbench.enums.PathogencityClass`
    * TYPE has to be of type :class:`vpmbench.enums.VariationType`
    * UID has to be ``>= 0``

    Raises
    ------
    :class:`~pandera.errors.SchemaErrors`
        If the validation of the data fails
    """
    chroms = set([str(x) for x in range(1, 23)] + ["X", "Y"])
    ref_validator = re.compile("^[ACGT]+$")
    alt_validator = re.compile("^[ACGT]+$")
    schema = DataFrameSchema({
        "CHROM": Column(
            String,
            Check(lambda chrom: chrom in chroms, element_wise=True),
            required=True),
        "POS": Column(Int, Check(lambda pos: pos >= 1), required=True),
        "REF": Column(
            String,
            Check(lambda ref: ref_validator.match(ref) is not None,
                  element_wise=True),
            required=True),
        "ALT": Column(
            String,
            Check(lambda alt: alt_validator.match(alt) is not None,
                  element_wise=True),
            required=True),
        "CLASS": Column(
            checks=Check(lambda cl: isinstance(cl, PathogencityClass),
                         element_wise=True),
            required=True),
        "UID": Column(Int, Check(lambda x: x >= 0), required=True),
        "TYPE": Column(
            checks=Check(lambda cl: isinstance(cl, VariationType),
                         element_wise=True),
            required=True),
        "RG": Column(
            checks=Check(lambda cl: isinstance(cl, ReferenceGenome),
                         element_wise=True),
            required=True),
    })
    schema.validate(self.table, lazy=True)
def test_dataframe_schema():
    """Test that a DataFrameSchema validates columns of all supported dtypes
    and raises SchemaError on missing columns and failed checks."""
    schema = DataFrameSchema({
        "a": Column(PandasDtype.Int, Check(lambda x: x > 0)),
        "b": Column(PandasDtype.Float, Check(lambda x: 0 <= x <= 10)),
        "c": Column(PandasDtype.String,
                    Check(lambda x: set(x) == {"x", "y", "z"},
                          element_wise=False)),
        "d": Column(PandasDtype.Bool,
                    Check(lambda x: x.mean() > 0.5, element_wise=False)),
        "e": Column(
            PandasDtype.Category,
            Check(lambda x: set(x) == {"c1", "c2", "c3"},
                  element_wise=False)),
        "f": Column(
            PandasDtype.Object,
            Check(lambda x: x.isin([(1, ), (2, ), (3, )]),
                  element_wise=False)),
        "g": Column(PandasDtype.DateTime,
                    Check(lambda x: x >= pd.Timestamp("2015-01-01"))),
        "i": Column(PandasDtype.Timedelta,
                    Check(lambda x: x < pd.Timedelta(10, unit="D"))),
    })
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": [1.1, 2.5, 9.9],
        "c": ["z", "y", "x"],
        "d": [True, True, False],
        "e": pd.Series(["c2", "c1", "c3"], dtype="category"),
        "f": [(3, ), (2, ), (1, )],
        "g": [
            pd.Timestamp("2015-02-01"),
            pd.Timestamp("2015-02-02"),
            pd.Timestamp("2015-02-03"),
        ],
        "i": [
            pd.Timedelta(1, unit="D"),
            pd.Timedelta(5, unit="D"),
            pd.Timedelta(9, unit="D"),
        ],
    })
    assert isinstance(schema.validate(df), pd.DataFrame)

    # error cases
    with pytest.raises(SchemaError):
        schema.validate(df.drop("a", axis=1))

    with pytest.raises(SchemaError):
        schema.validate(df.assign(a=[-1, -2, -1]))
def test_pickling(int_dataframe: pd.DataFrame, check_obj: Check):
    """Test for a non-empty pickled object."""
    schema = DataFrameSchema({"a": Column(int, check_obj)})
    try:
        # fails for element -1
        schema.validate(int_dataframe)
    except SchemaError as exc:
        # must be non-empty byte-array
        assert pickle.dumps(exc)
    else:
        pytest.fail("SchemaError not raised")
def test_index_schema():
    """Tests that when specifying a DataFrameSchema Index, pandera validates
    and errors appropriately."""
    schema = DataFrameSchema(index=Index(Int, [
        Check(lambda x: 1 <= x <= 11, element_wise=True),
        Check(lambda index: index.mean() > 1),
    ]))
    df = pd.DataFrame(index=range(1, 11), dtype="int64")
    assert isinstance(schema.validate(df), pd.DataFrame)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.DataFrame(index=range(1, 20)))
def test_no_dtype_dataframe():
    """Test that columns without a specified dtype still validate their
    nullability constraints."""
    schema = DataFrameSchema({"col": Column(nullable=False)})
    validated_df = schema.validate(
        pd.DataFrame({"col": [-123.1, -76.3, 1.0]}))
    assert isinstance(validated_df, pd.DataFrame)

    schema = DataFrameSchema({"col": Column(nullable=True)})
    validated_df = schema.validate(
        pd.DataFrame({"col": [-123.1, None, 1.0]}))
    assert isinstance(validated_df, pd.DataFrame)

    with pytest.raises(errors.SchemaError):
        schema = DataFrameSchema({"col": Column(nullable=False)})
        schema.validate(pd.DataFrame({"col": [-123.1, None, 1.0]}))
def test_unpickling(
    self, int_dataframe: pd.DataFrame, multi_check_schema: DataFrameSchema
):
    """Tests content validity of unpickled SchemaErrors."""
    try:
        multi_check_schema.validate(int_dataframe, lazy=True)
    except SchemaErrors as exc:
        loaded = pickle.loads(pickle.dumps(exc))
        assert loaded is not None
        self._compare_exception_with_unpickled(exc, loaded)
    else:
        pytest.fail("SchemaErrors not raised")
def test_tail_dataframe_schema():
    """Checks that validating only the tail of a dataframe works
    correctly."""
    df = pd.DataFrame(
        {"col1": list(range(0, 100)) + list(range(-1, -1001, -1))})
    schema = DataFrameSchema(
        columns={"col1": Column(Int, Check(lambda s: s < 0))})

    # validating with a tail of 1000 should pass, since the negative values
    # are at the end of the dataframe
    assert schema.validate(df, tail=1000).equals(df)
    with pytest.raises(errors.SchemaError):
        schema.validate(df)
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Tests that DataFrameSchema checks accept the parametrized check
    function types and pass or fail as expected."""
    schema = DataFrameSchema({
        "a": Column(Int, Check(fn=check_function, element_wise=False)),
        "b": Column(Float, Check(fn=check_function, element_wise=False)),
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    if should_fail:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
    else:
        schema.validate(df)
def test_single_index_multi_index_mismatch() -> None:
    """Tests the failure case that attempting to validate a MultiIndex
    DataFrame against a single index schema raises a SchemaError with a
    constructive error message."""
    ind = pd.MultiIndex.from_tuples(
        [("a", "b"), ("c", "d"), ("e", "f")],
        names=("one", "two"),
    )
    df_fail = pd.DataFrame(index=ind)
    schema = DataFrameSchema(index=Index(name="key"))
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
def test_dataframe_schema_check():
    """Test that DataFrameSchema-level Checks work properly."""
    data = pd.DataFrame([range(10) for _ in range(10)])

    schema_check_return_bool = DataFrameSchema(
        checks=Check(lambda df: (df < 10).all()))
    assert isinstance(schema_check_return_bool.validate(data), pd.DataFrame)

    schema_check_return_series = DataFrameSchema(
        checks=Check(lambda df: df[0] < 10))
    assert isinstance(schema_check_return_series.validate(data), pd.DataFrame)

    schema_check_return_df = DataFrameSchema(checks=Check(lambda df: df < 10))
    assert isinstance(schema_check_return_df.validate(data), pd.DataFrame)