def test_lazy_dataframe_validation_error():
    """Test exceptions on lazy dataframe validation.

    Validates a dataframe that violates the schema in several ways with
    ``lazy=True`` and checks that the raised ``SchemaErrors``:

    * exposes the dataframe that was passed to ``validate`` as ``data``;
    * lists every expected check, grouped by schema context, in
      ``schema_errors`` with failure cases drawn from the expected values.
    """
    schema = DataFrameSchema(
        columns={
            "int_col": Column(Int, Check.greater_than(5)),
            "int_col2": Column(Int),
            "float_col": Column(Float, Check.less_than(0)),
            "str_col": Column(String, Check.isin(["foo", "bar"])),
            "not_in_dataframe": Column(Int),
        },
        checks=Check(
            lambda df: df != 1, error="dataframe_not_equal_1", ignore_na=False
        ),
        index=Index(String, name="str_index"),
        strict=True,
    )

    dataframe = pd.DataFrame(
        data={
            "int_col": [1, 2, 6],
            "int_col2": ["a", "b", "c"],
            "float_col": [1., -2., 3.],
            "str_col": ["foo", "b", "c"],
            "unknown_col": [None, None, None],
        },
        index=pd.Index(
            ["index0", "index1", "index2"], name="str_index"
        ),
    )

    expectation = {
        # schema object context -> check failure cases
        "DataFrameSchema": {
            # check name -> failure cases
            "column_in_schema": ["unknown_col"],
            "dataframe_not_equal_1": [1],
            "column_in_dataframe": ["not_in_dataframe"],
        },
        "Column": {
            "greater_than(5)": [1, 2],
            "pandas_dtype('int64')": ["object"],
            "less_than(0)": [1, 3],
        },
    }

    # Capture the exception from a single validate() call. Asserting on
    # ``excinfo.value`` outside the context guarantees the checks below always
    # run; a try/except would skip them silently if nothing was raised.
    with pytest.raises(
        errors.SchemaErrors, match="^A total of .+ schema errors were found"
    ) as excinfo:
        schema.validate(dataframe, lazy=True)

    err = excinfo.value

    # data in the caught exception should be equal to the dataframe
    # passed into validate
    assert err.data.equals(dataframe)

    # make sure all expected check errors are in schema errors
    for schema_context, check_failure_cases in expectation.items():
        err_df = err.schema_errors.loc[
            err.schema_errors.schema_context == schema_context
        ]
        for check, failure_cases in check_failure_cases.items():
            assert check in err_df.check.values
            assert (
                err_df.loc[err_df.check == check]
                .failure_case.isin(failure_cases)
                .all()
            )
def test_index_schema_coerce():
    """Check that an ``Index`` schema with ``coerce=True`` converts the index.

    An int64-indexed, column-less dataframe is passed through the schema and
    the resulting index dtype must equal ``Float.value``.
    """
    # Arrange: integer index on the input, float-coercing index schema.
    int_indexed_frame = pd.DataFrame(
        index=pd.Index([1, 2, 3, 4], dtype="int64")
    )
    coercing_schema = DataFrameSchema(index=Index(Float, coerce=True))

    # Act: calling the schema object validates the dataframe.
    result = coercing_schema(int_indexed_frame)

    # Assert: the index now carries the schema's dtype.
    assert result.index.dtype == Float.value
], [ Column( Int, checks=[Check.greater_than(1), Check.less_than(3)], name="column" ), pd.DataFrame({"column": [1, 2, 3]}), { "data": pd.DataFrame({"column": [1, 2, 3]}), "schema_errors": { "Column": {"greater_than(1)": [1], "less_than(3)": [3]}, }, }, ], [ Index(String, checks=Check.isin(["a", "b", "c"])), pd.DataFrame({"col": [1, 2, 3]}, index=["a", "b", "d"]), { # expect that the data in the SchemaError is the pd.Index cast # into a Series "data": pd.Series(["a", "b", "d"]), "schema_errors": { "Index": {"isin(%s)" % {'a', 'b', 'c'}: ["d"]}, } }, ], [ MultiIndex( indexes=[ Index(Int, checks=Check.greater_than(0), name="index0"), Index(Int, checks=Check.less_than(0), name="index1"),