def test_category_dtype():
    """Check that a categorical column passes ``schema.validate``.

    The column must contain exactly the values A, B and C, expose them as
    its categories in that order, and have every element within that set.
    """
    category_checks = [
        Check(lambda s: set(s) == {"A", "B", "C"}),
        Check(lambda s: s.cat.categories.tolist() == ["A", "B", "C"]),
        Check(lambda s: s.isin(["A", "B", "C"])),
    ]
    category_column = Column(
        pa.Category,
        checks=category_checks,
        nullable=False,
    )
    schema = DataFrameSchema(columns={"col": category_column}, coerce=False)

    categorical_values = pd.Series(
        ["A", "B", "A", "B", "C"], dtype="category"
    )
    result = schema.validate(pd.DataFrame({"col": categorical_values}))

    assert isinstance(result, pd.DataFrame)
def test_check_groupby_multiple_columns():
    """Exercise a groupby check that is keyed on two other columns.

    ``col1`` is grouped by ``col2`` and ``col3``; the check looks at the
    ``("bar", True)`` group and expects its values to sum to 16.
    """
    grouped_sum_check = Check(
        lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
        groupby=["col2", "col3"],
    )
    schema = DataFrameSchema({
        "col1": Column(Int, [grouped_sum_check]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3": Column(Bool),
    })

    passing_frame = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })
    validated = schema.validate(passing_frame)

    # validation must hand back a dataframe with the same three columns
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 3
    assert set(validated.columns) == {"col1", "col2", "col3"}
def test_dataframe_coerce_regex():
    """Check dataframe-level dtype coercion with a regex-matched column."""
    regex_column = Column(float, regex=True, required=False)
    schema = DataFrameSchema(
        columns={"column_": regex_column},
        pandas_dtype=int,
        coerce=True,
    )

    frame_without_match = pd.DataFrame({"foo": [1, 2, 3]})
    frame_with_matches = pd.DataFrame({
        "column_1": [1, 2, 3],
        "column_2": ["1", "2", "3"],
    })

    # while the regex column is optional, neither frame may raise
    schema(frame_without_match)
    schema(frame_with_matches)

    # once the regex column is required, a frame with no match must fail
    required_schema = schema.update_column("column_", required=True)
    with pytest.raises(
        errors.SchemaError,
        match="Column regex name='column_' did not match",
    ):
        required_schema(frame_without_match)
def _multi_check_schema() -> DataFrameSchema:
    """Return a schema whose int column `a` carries two checks.

    The checks are membership in ``[0, 1]`` and a ``>= 0`` lambda.
    """
    checks_on_a = [
        Check.isin([0, 1]),
        Check(lambda x: x >= 0),
    ]
    column_a = Column(int, checks_on_a)
    return DataFrameSchema({"a": column_a})
def test_add_and_remove_columns():
    """Verify that ``add_columns``/``remove_columns`` return new schemas.

    Neither call may mutate the schema it is invoked on, and each returned
    schema must equal one built directly with the expected columns.
    """
    base_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    base_snapshot = copy.deepcopy(base_schema)

    # adding columns must leave the base schema untouched ...
    extended_schema = base_schema.add_columns({
        "col2": Column(String, Check(lambda x: x <= 0)),
        "col3": Column(Object, Check(lambda x: x == 0)),
    })
    extended_snapshot = copy.deepcopy(extended_schema)
    assert base_schema == base_snapshot

    # ... and must yield the three-column schema
    expected_extended = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert extended_schema == expected_extended

    # removing one column must leave the extended schema untouched ...
    without_col2 = extended_schema.remove_columns(["col2"])
    assert extended_schema == extended_snapshot

    # ... and must yield the schema lacking only ``col2``
    expected_without_col2 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert without_col2 == expected_without_col2

    # removing both added columns gets back to the base schema
    only_col1 = extended_schema.remove_columns(["col2", "col3"])
    expected_only_col1 = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    assert only_col1 == expected_only_col1 == base_schema
def test_multi_index_index():
    """Check validation of a two-level MultiIndex on a dataframe.

    Covers a passing frame and a frame whose first index level contains a
    value rejected by that level's check.
    """
    level0 = Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0")
    level1 = Index(
        Str,
        Check(lambda s: s.isin(["foo", "bar"])),
        name="index1",
    )
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(indexes=[level0, level1]),
    )

    level_names = ["index0", "index1"]
    valid_frame = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
            names=level_names,
        ),
    )
    assert isinstance(schema.validate(valid_frame), pd.DataFrame)

    # failure case: -1 in the first level violates the ``s >= 0`` check
    invalid_frame = valid_frame.copy()
    invalid_frame.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=level_names,
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(invalid_frame)
def test_pandas_nullable_int_dtype(dtype, coerce):
    """Check that a pandas nullable int dtype is accepted in a schema.

    The dtype is given both as the ``dtype`` object and as its
    ``str_alias``. When ``coerce`` is false the input frame is created with
    the target dtype up front; otherwise the frame is built without an
    explicit dtype.
    """
    schemas = [
        DataFrameSchema(
            {"col": Column(dtype, nullable=False)}, coerce=coerce
        ),
        DataFrameSchema(
            {"col": Column(dtype.str_alias, nullable=False)}, coerce=coerce
        ),
    ]
    for schema in schemas:
        frame_kwargs = {} if coerce else {"dtype": dtype.str_alias}
        # keep max range to 127 in order to support Int8
        frame = pd.DataFrame({"col": range(128)}, **frame_kwargs)
        assert isinstance(schema.validate(frame), pd.DataFrame)
def test_numeric_dtypes():
    """Check that float, int and unsigned int dtypes validate.

    Each dtype is tried both as the ``dtypes`` member and as its
    ``.value``, against a frame created with that dtype.
    """
    # (dtype family, sample values used to build the input frame)
    families = [
        (
            [dtypes.Float, dtypes.Float16, dtypes.Float32, dtypes.Float64],
            [-123.1, -7654.321, 1.0, 1.1, 1199.51, 5.1],
        ),
        (
            [dtypes.Int, dtypes.Int8, dtypes.Int16, dtypes.Int32,
             dtypes.Int64],
            [-712, -4, -321, 0, 1, 777, 5, 123, 9000],
        ),
        (
            [dtypes.UInt8, dtypes.UInt16, dtypes.UInt32, dtypes.UInt64],
            [1, 777, 5, 123, 9000],
        ),
    ]
    for family, values in families:
        for dtype in family:
            schemas = [
                DataFrameSchema({"col": Column(dtype, nullable=False)}),
                DataFrameSchema({"col": Column(dtype.value, nullable=False)}),
            ]
            for schema in schemas:
                frame = pd.DataFrame({"col": values}, dtype=dtype.value)
                assert isinstance(schema.validate(frame), pd.DataFrame)
def schema_multiindex():
    """Fixture: Int/Float column schema with a two-level String MultiIndex."""
    index_levels = [
        Index(pandas_dtype=String, name=level_name)
        for level_name in ("ind0", "ind1")
    ]
    return DataFrameSchema(
        columns={
            "col1": Column(pandas_dtype=Int),
            "col2": Column(pandas_dtype=Float),
        },
        index=MultiIndex(index_levels),
    )
def test_coerce_dtype_nullable_str(data, dtype, nonnull_idx, string_type,
                                   nullable):
    """Tests how null values are handled with string dtypes.

    A single-column frame is built from ``data`` with ``dtype`` and coerced
    to ``string_type``. When ``nullable`` is false, validation is expected
    to raise ``SchemaError``. Otherwise every element whose index label is
    below ``nonnull_idx`` must be a ``str`` and every other element must be
    null.
    """
    if LEGACY_PANDAS and (
        dtype == "Int64" or string_type in {STRING, "string"}
    ):
        pytest.skip("Skipping data types that depend on pandas>1.0.0")

    dataframe = pd.DataFrame({"col": pd.Series(data, dtype=dtype)})
    schema = DataFrameSchema(
        {"col": Column(string_type, coerce=True, nullable=nullable)})

    if not nullable:
        with pytest.raises(errors.SchemaError):
            schema.validate(dataframe)
        return

    validated_df = schema.validate(dataframe)
    assert isinstance(validated_df, pd.DataFrame)

    # Series.items() is used instead of Series.iteritems(), which was
    # deprecated and then removed in pandas 2.0.
    for i, element in validated_df["col"].items():
        if i < nonnull_idx:
            assert isinstance(element, str)
        else:
            assert pd.isna(element)
def test_lazy_dataframe_validation_nullable():
    """
    Test that non-nullable column failure cases are correctly processed during
    lazy validation.

    Each column holds exactly one null, at a different position. Lazy
    validation must raise ``SchemaErrors``; all reported failure cases must
    be null and the first reported index per column must be the position of
    that column's null.
    """
    schema = DataFrameSchema(
        columns={
            "int_column": Column(Int, nullable=False),
            "float_column": Column(Float, nullable=False),
            "str_column": Column(String, nullable=False),
        },
        strict=True,
    )

    df = pd.DataFrame(
        {
            "int_column": [1, None, 3],
            "float_column": [0.1, 1.2, None],
            "str_column": [None, "foo", "bar"],
        }
    )

    try:
        schema.validate(df, lazy=True)
    except errors.SchemaErrors as err:
        failures = err.schema_errors
        assert failures.failure_case.isna().all()
        for col, index in [
            ("int_column", 1),
            ("float_column", 2),
            ("str_column", 0),
        ]:
            # A boolean mask avoids a lambda closing over the loop variable.
            first_failed_index = failures.loc[
                failures.column == col, "index"
            ].iloc[0]
            assert first_failed_index == index
    else:
        # Without this branch the test would pass vacuously whenever
        # validation stops raising.
        raise AssertionError(
            "lazy validation of null values in non-nullable columns "
            "did not raise SchemaErrors"
        )
def test_check_function_decorator_errors():
    """Check the errors raised by ``check_input`` and ``check_output``."""
    input_schema = DataFrameSchema({"column1": Column(Int)})
    output_schema = DataFrameSchema({"column2": Column(Float)})

    # case 1: the input and output schemas disagree, so any frame passed
    # straight through violates one of the two decorators
    @check_input(input_schema)
    @check_output(output_schema)
    def test_func(df):
        return df

    input_error = r"^error in check_input decorator of function"
    output_error = r"^error in check_output decorator of function"

    # positional argument failing the input schema
    with pytest.raises(errors.SchemaError, match=input_error):
        test_func(pd.DataFrame({"column2": ["a", "b", "c"]}))

    # same frame passed by keyword
    with pytest.raises(errors.SchemaError, match=input_error):
        test_func(df=pd.DataFrame({"column2": ["a", "b", "c"]}))

    # frame satisfying the input schema but not the output schema
    with pytest.raises(errors.SchemaError, match=output_error):
        test_func(pd.DataFrame({"column1": [1, 2, 3]}))

    # case 2: check that if the input decorator refers to an index that's
    # not in the function signature, it fails in an interpretable way
    @check_input(DataFrameSchema({"column1": Column(Int)}), 1)
    def test_incorrect_check_input_index(df):
        return df

    with pytest.raises(IndexError, match=input_error):
        test_incorrect_check_input_index(
            pd.DataFrame({"column1": [1, 2, 3]})
        )
def test_column_regex_multiindex():
    """Test that column regex works on multi-index columns.

    A tuple of patterns is used as the column name, both on a standalone
    ``Column`` and as a ``DataFrameSchema`` key.
    """
    pattern = ("foo_*", "baz_*")
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=pattern,
        regex=True,
    )
    dataframe_schema = DataFrameSchema({
        pattern: Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    for schema in (column_schema, dataframe_schema):
        assert isinstance(schema.validate(data), pd.DataFrame)

    # A two-element tuple name must raise IndexError on a flat pd.Index
    # and on a three-level MultiIndex.
    flat_columns = [f"foo_{i}" for i in range(6)]
    three_level_columns = pd.MultiIndex.from_tuples(
        [(f"foo_{i}", f"bar_{i}", f"baz_{i}") for i in range(6)]
    )
    for columns in (flat_columns, three_level_columns):
        data.columns = columns
        with pytest.raises(IndexError):
            column_schema.validate(data)
        with pytest.raises(IndexError):
            dataframe_schema.validate(data)
def test_dataframe_pandas_dtype_coerce():
    """
    Test that a pandas dtype set at the dataframe level overrides the
    per-column dtypes when coercing.
    """
    schema = DataFrameSchema(
        columns={f"column_{i}": Column(float) for i in range(5)},
        pandas_dtype=int,
        coerce=True,
    )
    float_frame = pd.DataFrame(
        {f"column_{i}": range(10) for i in range(5)}
    ).astype(float)

    # the dataframe-level int wins over the per-column float
    coerced = schema(float_frame)
    assert (coerced.dtypes == Int.str_alias).all()

    # the Column objects still carry their own declared dtype
    for column in schema.columns.values():
        assert column.pandas_dtype is float

    # coerce_dtype raises SchemaErrors when the frame can't be coerced
    with pytest.raises(errors.SchemaErrors):
        schema.coerce_dtype(pd.DataFrame({"foo": list("abcdef")}))

    # lazy validation raises SchemaErrors for the same kind of frame
    with pytest.raises(errors.SchemaErrors):
        schema(pd.DataFrame({"foo": list("abcdef")}), lazy=True)

    # the input frame's dtypes are unchanged by the calls above
    assert (float_frame.dtypes == Float.str_alias).all()

    # string dtype, given as the builtin and as the PandasDtype member
    for string_dtype in (str, PandasDtype.String):
        schema.pandas_dtype = string_dtype
        assert (schema(float_frame).dtypes == "object").all()

    # _coerce_dtype raises ValueError when pandas_dtype is None
    schema.pandas_dtype = None
    with pytest.raises(ValueError):
        schema._coerce_dtype(float_frame)
def _deserialize_schema(serialized_schema):
    """Build a ``DataFrameSchema`` from its serialized mapping form.

    A falsy ``serialized_schema`` is treated as an empty mapping. The
    ``"columns"``, ``"index"`` and ``"checks"`` entries are deserialized
    when present; ``"coerce"`` and ``"strict"`` default to ``False``.

    :param serialized_schema: mapping representation of the schema.
    :returns: the reconstructed ``DataFrameSchema``.
    :raises pandera.errors.SchemaDefinitionError: if a truthy
        ``serialized_schema`` is not a ``Mapping``.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Check, Column, DataFrameSchema, Index, MultiIndex

    # GH#475
    if not serialized_schema:
        serialized_schema = {}

    if not isinstance(serialized_schema, Mapping):
        raise pandera.errors.SchemaDefinitionError(
            "Schema representation must be a mapping.")

    raw_columns = serialized_schema.get("columns")
    raw_index = serialized_schema.get("index")
    raw_checks = serialized_schema.get("checks")

    columns = None
    if raw_columns is not None:
        columns = {}
        for col_name, column_stats in raw_columns.items():
            columns[col_name] = Column(
                **_deserialize_component_stats(column_stats))

    index_stats = None
    if raw_index is not None:
        index_stats = [
            _deserialize_component_stats(component)
            for component in raw_index
        ]

    checks = None
    if raw_checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        checks = [
            _deserialize_check_stats(getattr(Check, check_name), check_stats)
            for check_name, check_stats in raw_checks.items()
        ]

    # exactly one component becomes a plain Index; any other count
    # (including zero) becomes a MultiIndex
    index = None
    if index_stats is not None:
        if len(index_stats) == 1:
            index = Index(**index_stats[0])
        else:
            index = MultiIndex(
                indexes=[Index(**properties) for properties in index_stats])

    coerce = serialized_schema.get("coerce", False)
    strict = serialized_schema.get("strict", False)
    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=coerce,
        strict=strict,
    )
def test_schema_equality_operators():
    """Check ``==``/``!=`` on DataFrameSchema, SeriesSchema and
    SeriesSchemaBase."""
    df_schema = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda s: s >= 2)),
        },
        strict=True,
    )
    reordered_df_schema = DataFrameSchema(
        {
            "col2": Column(String, Check(lambda s: s >= 2)),
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )
    series_schema = SeriesSchema(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    series_schema_base = SeriesSchemaBase(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    different_schema = DataFrameSchema(
        {"col1": Column(String)}, strict=False
    )

    # dataframe schemas: equal to a deep copy, insensitive to column order
    assert df_schema == copy.deepcopy(df_schema)
    assert df_schema != different_schema
    assert df_schema == reordered_df_schema

    # series schemas: equal to a deep copy, unequal to an unrelated schema
    assert series_schema == copy.deepcopy(series_schema)
    assert series_schema != different_schema
    assert series_schema_base == copy.deepcopy(series_schema_base)
    assert series_schema_base != different_schema
def test_dataframe_schema_dtype_property():
    """Check that ``schema.dtype`` maps each column name to its dtype
    string."""
    columns = {
        "col1": Column(Int),
        "col2": Column(String),
        "col3": Column(DateTime),
        "col4": Column("uint16"),
    }
    expected_dtypes = {
        "col1": "int64",
        "col2": "object",
        "col3": "datetime64[ns]",
        "col4": "uint16",
    }
    schema = DataFrameSchema(columns=columns)
    assert schema.dtype == expected_dtypes
def test_column_regex():
    """Check regex column matching on a single-level column index."""
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name="foo_*",
        regex=True,
    )
    dataframe_schema = DataFrameSchema({
        "foo_*": Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        "foo_1": range(10),
        "foo_2": range(10, 20),
        "foo_3": range(20, 30),
        "bar_1": range(10),
        "bar_2": range(10, 20),
        "bar_3": range(20, 30),
    })
    for schema in (column_schema, dataframe_schema):
        assert isinstance(schema.validate(data), pd.DataFrame)

    # a string pattern must raise IndexError on multi-index columns
    multi_level_columns = (
        ("foo_1", "biz_1"),
        ("foo_2", "baz_1"),
        ("foo_3", "baz_2"),
        ("bar_1", "biz_2"),
        ("bar_2", "biz_3"),
        ("bar_3", "biz_3"),
    )
    data.columns = pd.MultiIndex.from_tuples(multi_level_columns)
    with pytest.raises(IndexError):
        column_schema.validate(data)
    with pytest.raises(IndexError):
        dataframe_schema.validate(data)
def test_category_dtype_coerce():
    """Check category coercion of an object-dtype column.

    Without coercion, validation must raise ``SchemaError``; with coercion
    enabled it must return a dataframe.
    """
    columns = {
        "col": Column(
            pa.Category,
            checks=Check(lambda s: set(s) == {"A", "B", "C"}),
            nullable=False,
        ),
    }
    raw_values = ["A", "B", "A", "B", "C"]

    # coerce=False: the object column is rejected
    strict_schema = DataFrameSchema(columns=columns, coerce=False)
    object_frame = pd.DataFrame(
        {"col": pd.Series(raw_values, dtype="object")}
    )
    with pytest.raises(SchemaError):
        strict_schema.validate(object_frame)

    # coerce=True: the same kind of frame validates
    coercing_schema = DataFrameSchema(columns=columns, coerce=True)
    another_object_frame = pd.DataFrame(
        {"col": pd.Series(raw_values, dtype="object")}
    )
    validated_df = coercing_schema.validate(another_object_frame)
    assert isinstance(validated_df, pd.DataFrame)
def init_schema_element_wise():
    """Construct a schema whose ``col1`` check is element-wise and grouped.

    The schema is built and discarded; nothing is returned.
    NOTE(review): presumably a caller uses this to probe how schema
    initialization handles ``element_wise=True`` together with ``groupby``
    -- confirm against the call site.
    """
    grouped_element_wise_check = Check(
        lambda s: s["foo"] > 10,
        element_wise=True,
        groupby=["col2"],
    )
    col1 = Column(Int, [grouped_element_wise_check])
    col2 = Column(Str, Check(lambda s: s.isin(["foo", "bar"])))
    DataFrameSchema({"col1": col1, "col2": col2})
def test_schema_component_equality_operators():
    """Check ``==``/``!=`` on Column, Index and MultiIndex components."""
    column = Column(Int, Check(lambda s: s >= 0))
    index = Index(Int, [Check(lambda x: 1 <= x <= 11, element_wise=True)])
    multi_index = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    unrelated_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))}
    )

    # every component equals its deep copy and differs from the
    # unrelated dataframe schema
    for component in (column, index, multi_index):
        assert component == copy.deepcopy(component)
        assert component != unrelated_schema
def test_multi_index_schema_coerce():
    """Check each validated MultiIndex level against its Index schema dtype."""
    indexes = [
        Index(Float),
        Index(Int),
        Index(String),
    ]
    schema = DataFrameSchema(index=MultiIndex(indexes=indexes))

    level_values = [
        [1.0, 2.1, 3.5, 4.8],
        [5, 6, 7, 8],
        ["9", "10", "11", "12"],
    ]
    df = pd.DataFrame(index=pd.MultiIndex.from_arrays(level_values))

    validated_df = schema(df)
    validated_index = validated_df.index
    for level_i in range(validated_index.nlevels):
        actual_dtype = validated_index.get_level_values(level_i).dtype
        assert actual_dtype == indexes[level_i].dtype
def test_check_input_output_unrecognized_obj_getter(obj_getter):
    """
    Check that ``check_input`` and ``check_output`` raise ``TypeError``
    when called with the given ``obj_getter``.
    """
    schema = DataFrameSchema({"column": Column(int)})

    @check_input(schema, obj_getter)
    def test_check_input_fn(df):
        return df

    @check_output(schema, obj_getter)
    def test_check_output_fn(df):
        return df

    decorated_functions = [test_check_input_fn, test_check_output_fn]
    for decorated in decorated_functions:
        with pytest.raises(TypeError):
            decorated(pd.DataFrame({"column": [1, 2, 3]}))
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Validate a two-column frame with ``check_function`` on both columns.

    When ``should_fail`` is true, validation must raise ``SchemaError``;
    otherwise it must complete without raising.
    """
    int_column = Column(Int, Check(fn=check_function, element_wise=False))
    float_column = Column(Float, Check(fn=check_function, element_wise=False))
    schema = DataFrameSchema({"a": int_column, "b": float_column})

    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})

    if not should_fail:
        schema.validate(df)
        return

    with pytest.raises(errors.SchemaError):
        schema.validate(df)
def test_dtypes():
    """Check that float, int and unsigned int ``dtypes`` members validate.

    For each dtype, a frame is created with ``dtype.value`` and validated
    against a non-nullable column of that dtype.
    """
    # (dtype family, sample values used to build the input frame)
    families = [
        (
            [dtypes.Float, dtypes.Float16, dtypes.Float32, dtypes.Float64],
            [-123.1, -7654.321, 1.0, 1.1, 1199.51, 5.1, 4.6],
        ),
        (
            [dtypes.Int, dtypes.Int8, dtypes.Int16, dtypes.Int32,
             dtypes.Int64],
            [-712, -4, -321, 0, 1, 777, 5, 123, 9000],
        ),
        (
            [dtypes.UInt8, dtypes.UInt16, dtypes.UInt32, dtypes.UInt64],
            [1, 777, 5, 123, 9000],
        ),
    ]
    for family, values in families:
        for dtype in family:
            schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
            frame = pd.DataFrame({"col": values}, dtype=dtype.value)
            validated_df = schema.validate(frame)
            assert isinstance(validated_df, pd.DataFrame)
def test_one_sample_hypothesis():
    """Run the one-sample t-test hypothesis on a whole column and on a group.

    Both schemas test ``height_in_feet`` against a population mean of 5
    with the ``greater_than`` relationship; the second restricts the
    sample to group ``"A"``.
    """
    whole_column_test = Hypothesis.one_sample_ttest(
        popmean=5, relationship="greater_than", alpha=0.1
    )
    schema = DataFrameSchema(
        {"height_in_feet": Column(Float, [whole_column_test])}
    )

    group_a_test = Hypothesis.one_sample_ttest(
        sample="A",
        groupby="group",
        popmean=5,
        relationship="greater_than",
        alpha=0.1,
    )
    subset_schema = DataFrameSchema(
        {
            "group": Column(String),
            "height_in_feet": Column(Float, [group_a_test]),
        }
    )

    df = pd.DataFrame(
        {
            "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
            "group": ["A", "A", "B", "B", "A"],
        }
    )

    # neither validation should raise
    schema.validate(df)
    subset_schema.validate(df)
def test_python_builtin_types():
    """Check that Python builtin types are accepted as column dtypes.

    The schema must validate a matching frame and report, per column, the
    ``str_alias`` of the corresponding ``PandasDtype`` member.
    """
    schema = DataFrameSchema({
        "int_col": Column(int),
        "float_col": Column(float),
        "str_col": Column(str),
        "bool_col": Column(bool),
    })
    df = pd.DataFrame({
        "int_col": [1, 2, 3],
        "float_col": [1., 2., 3.],
        "str_col": list("abc"),
        "bool_col": [True, False, True],
    })
    assert isinstance(schema(df), pd.DataFrame)

    expected_dtypes = (
        ("int_col", PandasDtype.Int),
        ("float_col", PandasDtype.Float),
        ("str_col", PandasDtype.String),
        ("bool_col", PandasDtype.Bool),
    )
    for column_name, pandas_dtype in expected_dtypes:
        assert schema.dtype[column_name] == pandas_dtype.str_alias
def test_non_str_column_name_regex(column_key):
    """Check that ``regex=True`` with ``column_key`` raises ``ValueError``.

    Covers the key used in a ``DataFrameSchema`` mapping and the same key
    passed as ``name`` to a standalone ``Column``.
    """
    # as a DataFrameSchema key
    with pytest.raises(ValueError):
        regex_column = Column(
            Float,
            checks=Check.greater_than_or_equal_to(0),
            regex=True,
        )
        DataFrameSchema({column_key: regex_column})

    # as an explicit Column name
    with pytest.raises(ValueError):
        Column(
            Float,
            checks=Check.greater_than_or_equal_to(0),
            name=column_key,
            regex=True,
        )
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Validate a DataFrameSchema using ``check_function`` on both columns.

    ``should_fail`` says whether validation is expected to raise
    ``SchemaError``.
    """
    columns = {
        "a": Column(Int, Check(check_function, element_wise=False)),
        "b": Column(Float, Check(check_function, element_wise=False)),
    }
    schema = DataFrameSchema(columns)
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})

    if not should_fail:
        schema.validate(frame)
        return

    with pytest.raises(errors.SchemaError):
        schema.validate(frame)
def test_ordered_dataframe(columns: Dict[str, Column], index: MultiIndex):
    """Check validation against a schema built with ``ordered=True``.

    Two frames are expected to validate; two others are expected to raise
    ``SchemaErrors`` under lazy validation with a specific error count.
    """
    schema = DataFrameSchema(columns=columns, index=index, ordered=True)

    in_order = pd.DataFrame(
        data=[[1, 2, 3]],
        columns=["a", "a", "b"],
        index=pd.MultiIndex.from_arrays(
            [[1], [2], [3]], names=["a", "a", "b"]
        ),
    )
    assert isinstance(schema.validate(in_order), pd.DataFrame)

    # test optional column
    without_optional = pd.DataFrame(
        data=[[1]],
        columns=["b"],
        index=pd.MultiIndex.from_arrays([[1], [2]], names=["a", "b"]),
    )
    assert isinstance(schema.validate(without_optional), pd.DataFrame)

    # columns and index levels swapped
    swapped = pd.DataFrame(
        data=[[1, 2]],
        columns=["b", "a"],
        index=pd.MultiIndex.from_arrays([[1], [2]], names=["b", "a"]),
    )
    with pytest.raises(
        errors.SchemaErrors, match="A total of 2 schema errors"
    ):
        schema.validate(swapped, lazy=True)

    # test out-of-order duplicates
    out_of_order_duplicates = pd.DataFrame(
        data=[[1, 2, 3, 4]],
        columns=["a", "b", "c", "a"],
        index=pd.MultiIndex.from_arrays(
            [[1], [2], [3], [4]], names=["a", "b", "c", "a"]
        ),
    )
    with pytest.raises(
        errors.SchemaErrors, match="A total of 1 schema errors"
    ):
        schema.validate(out_of_order_duplicates, lazy=True)