def test_pandas_extension_types():
    """Validate one sample series per pandas extension dtype (happy path).

    Every case is a ``(dtype, data, extra_kwargs)`` triple: the extension
    dtype handed to ``SeriesSchema``, a series of that dtype, and optional
    extra schema keyword arguments (``None`` when there are none).
    Validation is expected to succeed and return a ``pd.Series``.
    """
    # pylint: disable=no-member
    # Mostly-NaN float data converted to a sparse series; the schema for this
    # case is created with nullable=True.
    sparse_data = (
        pd.Series(range(100))
        .where(lambda s: s < 5, other=np.nan)
        .astype("Sparse[float]")
    )
    cases = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None,
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            None,
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (
            pd.StringDtype(),
            pd.Series(["foo", "bar", "baz"], dtype="string"),
            None,
        ),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            None,
        ),
        (pd.SparseDtype("float"), sparse_data, {"nullable": True}),
        (
            pd.BooleanDtype(),
            pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
            None,
        ),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        ),
    ]

    for dtype, data, extra_kwargs in cases:
        schema_kwargs = extra_kwargs if extra_kwargs is not None else {}
        schema = SeriesSchema(pandas_dtype=dtype, **schema_kwargs)
        validated = schema.validate(data)
        assert isinstance(validated, pd.Series)
def test_series_schema_with_index(coerce):
    """Validate series through schemas that carry Index / MultiIndex components.

    ``coerce`` is forwarded unchanged to every ``Index`` component
    (presumably supplied by a fixture or parametrization defined elsewhere).
    """
    # Single-level index.
    single_index_schema = SeriesSchema(
        pandas_dtype=Int,
        index=Index(Int, coerce=coerce),
    )
    single_result = single_index_schema(
        pd.Series([1, 2, 3], index=[1, 2, 3])
    )
    assert isinstance(single_result, pd.Series)

    # Two-level index: integer level followed by a string level.
    index_components = [
        Index(Int, coerce=coerce),
        Index(String, coerce=coerce),
    ]
    multi_index_schema = SeriesSchema(
        pandas_dtype=Int,
        index=MultiIndex(index_components),
    )
    expected_index = pd.MultiIndex.from_arrays(
        [[0, 1, 2], ["foo", "bar", "foo"]],
    )
    multi_result = multi_index_schema(
        pd.Series([1, 2, 3], index=expected_index)
    )
    assert isinstance(multi_result, pd.Series)
    # The index of the validated series must equal the input index.
    assert (multi_result.index == expected_index).all()
def test_vectorized_checks():
    """Run a series-level check (``element_wise=False``) on good and bad data.

    The check compares ``value_counts()`` to 2, i.e. every distinct value
    must occur exactly twice.
    """
    twice_each = Check(lambda s: s.value_counts() == 2, element_wise=False)
    schema = SeriesSchema(Int, twice_each)

    paired_values = pd.Series([1, 1, 2, 2, 3, 3])
    result = schema.validate(paired_values)
    assert isinstance(result, pd.Series)

    # error case
    unique_values = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(unique_values)
def test_series_schema_multiple_validators():
    """Validate a series against a schema carrying two checks.

    The first check bounds values to ``[0, 50]``; the second, declared with
    ``element_wise=False``, requires that at least one value equals 21.
    """
    validators = [
        # NOTE(review): written for scalar input, so this relies on the
        # Check default for element_wise -- confirm against the Check API.
        Check(lambda x: 0 <= x <= 50),
        Check(lambda s: (s == 21).any(), element_wise=False),
    ]
    schema = SeriesSchema(PandasDtype.Int, validators)

    passing = pd.Series([1, 5, 21, 50])
    assert isinstance(schema.validate(passing), pd.Series)

    # raise error if any of the validators fails
    without_21 = pd.Series([1, 5, 20, 50])
    with pytest.raises(SchemaError):
        schema.validate(without_21)
def test_vectorized_checks() -> None:
    """Test that a series-level check passes and errors as expected.

    The check is declared with ``element_wise=False`` and compares
    ``value_counts()`` to 2, so every distinct value must occur exactly twice.
    """
    twice_each = Check(lambda s: s.value_counts() == 2, element_wise=False)
    schema = SeriesSchema(Int, twice_each)

    paired_values = pd.Series([1, 1, 2, 2, 3, 3])
    result = schema.validate(paired_values)
    assert isinstance(result, pd.Series)

    # error case
    unique_values = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(unique_values)
def test_raise_warning_series():
    """Test that a check built with ``raise_warning=True`` warns, not raises.

    The same all-negative series is run through two schemas with an
    identical ``s > 0`` check: the plain one must raise ``SchemaError``,
    the ``raise_warning=True`` one must emit a ``UserWarning``.
    """
    negatives = pd.Series([-1, -2, -3])

    raising_schema = SeriesSchema(checks=Check(lambda s: s > 0))
    warning_schema = SeriesSchema(
        checks=Check(lambda s: s > 0, raise_warning=True)
    )

    with pytest.raises(errors.SchemaError):
        raising_schema(negatives)

    with pytest.warns(UserWarning):
        warning_schema(negatives)
def test_no_dtype_series():
    """Test nullability handling for schemas created without a dtype.

    Covers three situations: a non-nullable schema with complete data, a
    nullable schema with a missing value, and a non-nullable schema with a
    missing value (which must raise ``SchemaError``).
    """
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    # Build the schema outside the ``pytest.raises`` block so that only the
    # validation call can satisfy the expected exception.
    schema = SeriesSchema(nullable=False)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
def setup(self):
    """Create the string schema and the sample series stored on the instance.

    Sets ``self.schema`` (a named, non-nullable ``SeriesSchema`` with three
    checks) and ``self.series`` (three ``"foobar"`` values, same name).
    """
    string_checks = [
        Check(lambda s: s.str.startswith("foo")),
        Check(lambda s: s.str.endswith("bar")),
        Check(lambda x: len(x) > 3, element_wise=True),
    ]
    self.schema = SeriesSchema(
        String,
        checks=string_checks,
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    self.series = pd.Series(["foobar"] * 3, name="my_series")
def test_series_schema_multiple_validators():
    """Test a SeriesSchema with several checks, for passing and failing data.

    One check is element-wise and bounds every value to ``[0, 50]``; the
    other receives the whole series and requires at least one value of 21.
    """
    bounded = Check(lambda x: 0 <= x <= 50, element_wise=True)
    contains_21 = Check(lambda s: (s == 21).any())
    schema = SeriesSchema(Int, [bounded, contains_21])

    passing = pd.Series([1, 5, 21, 50])
    assert isinstance(schema.validate(passing), pd.Series)

    # raise error if any of the validators fails
    without_21 = pd.Series([1, 5, 20, 50])
    with pytest.raises(errors.SchemaError):
        schema.validate(without_21)
def test_no_dtype_series():
    """Test how nullability is handled in SeriesSchemas where no type is
    specified.

    Covers three situations: a non-nullable schema with complete data, a
    nullable schema with a missing value, and a non-nullable schema with a
    missing value (which must raise ``SchemaError``).
    """
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    # Build the schema outside the ``pytest.raises`` block so that only the
    # validation call can satisfy the expected exception.
    schema = SeriesSchema(nullable=False)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
def test_schema_equality_operators():
    """Test ``==`` / ``!=`` on DataFrameSchema, SeriesSchema and
    SeriesSchemaBase.

    Each schema must equal a deep copy of itself and differ from an
    unrelated DataFrameSchema; a DataFrameSchema must also equal one that
    declares the same columns in a different order.
    """

    def series_kwargs():
        # Fresh keyword arguments per call so the two series schemas do not
        # share Check instances.
        return {
            "checks": [Check(lambda s: s.str.startswith("foo"))],
            "nullable": False,
            "allow_duplicates": True,
            "name": "my_series",
        }

    df_schema = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda s: s >= 2)),
        },
        strict=True,
    )
    df_schema_columns_in_different_order = DataFrameSchema(
        {
            "col2": Column(String, Check(lambda s: s >= 2)),
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )
    series_schema = SeriesSchema(String, **series_kwargs())
    series_schema_base = SeriesSchemaBase(String, **series_kwargs())
    not_equal_schema = DataFrameSchema({"col1": Column(String)}, strict=False)

    # DataFrameSchema comparisons
    assert df_schema == copy.deepcopy(df_schema)
    assert df_schema != not_equal_schema
    assert df_schema == df_schema_columns_in_different_order

    # SeriesSchema comparisons
    assert series_schema == copy.deepcopy(series_schema)
    assert series_schema != not_equal_schema

    # SeriesSchemaBase comparisons
    assert series_schema_base == copy.deepcopy(series_schema_base)
    assert series_schema_base != not_equal_schema
def test_series_schema_pdtype(pdtype):
    """Check the ``pdtype`` property of SeriesSchema for one ``pdtype`` input.

    A ``None`` input must yield a ``None`` property. Otherwise the schema is
    built three ways -- from the enum member, its ``str_alias`` and its
    ``value`` -- and each must report the expected ``PandasDtype``.
    """
    if pdtype is None:
        assert SeriesSchema(pdtype).pdtype is None
        return

    # When LEGACY_PANDAS is set, PandasDtype.STRING is expected to be
    # reported as PandasDtype.String.
    if pdtype is PandasDtype.STRING and LEGACY_PANDAS:
        expected = PandasDtype.String
    else:
        expected = pdtype

    for dtype_spec in (pdtype, pdtype.str_alias, pdtype.value):
        assert SeriesSchema(dtype_spec).pdtype == expected
def test_series_schema_checks():
    """Test the ``checks`` property of SeriesSchema.

    Whether a schema is built with no checks, a single check, or a list of
    checks, ``checks`` must be a list of the corresponding length.
    """
    no_checks = SeriesSchema()
    one_check = SeriesSchema(checks=Check.eq(0))
    two_checks = SeriesSchema(checks=[Check.gt(0), Check.lt(100)])
    schemas = (no_checks, one_check, two_checks)

    for schema in schemas:
        assert isinstance(schema.checks, list)

    assert [len(schema.checks) for schema in schemas] == [0, 1, 2]
def test_series_strategy_undefined_check_strategy(
    schema: pa.SeriesSchema, warning: str, data
) -> None:
    """Test a series schema whose check has no defined strategy.

    Building and drawing from the strategy must emit a ``UserWarning`` whose
    message matches ``warning``, and the drawn example must validate
    against the schema.
    """
    expected_message = f"{warning} check doesn't have a defined strategy"
    with pytest.warns(UserWarning, match=expected_message):
        series_strategy = schema.strategy(size=5)
        drawn = data.draw(series_strategy)
        schema(drawn)
class Validate:
    """
    Benchmarking Series schema.validate
    """

    def setup(self):
        """Create the schema and the series used by every benchmark method."""
        self.schema = SeriesSchema(
            String,
            checks=[
                Check(lambda s: s.str.startswith("foo")),
                Check(lambda s: s.str.endswith("bar")),
                Check(lambda x: len(x) > 3, element_wise=True),
            ],
            nullable=False,
            allow_duplicates=True,
            name="my_series",
        )
        self.series = pd.Series(
            ["foobar", "foobar", "foobar"], name="my_series"
        )

    def time_series_schema(self):
        """Timing benchmark: validate the series against the schema."""
        self.schema.validate(self.series)

    def mem_series_schema(self):
        """Memory benchmark: return the validated series.

        A ``mem_*`` benchmark reports the size of the object it returns, so
        the validation result has to be returned; with no return value the
        benchmark would only measure ``None``.
        """
        return self.schema.validate(self.series)

    def peakmem_series_schema(self):
        """Peak-memory benchmark: validate the series against the schema."""
        self.schema.validate(self.series)
def test_series_schema():
    """Test SeriesSchema validation of an integer series bounded to [0, 100].

    Covers the passing case, series whose single value should be rejected
    with ``SchemaError``, and non-series inputs that should be rejected with
    ``TypeError``.
    """
    schema = SeriesSchema(PandasDtype.Int, Check(lambda x: 0 <= x <= 100))
    validated_series = schema.validate(pd.Series([0, 30, 50, 100]))
    assert isinstance(validated_series, pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(SchemaError):
            schema.validate(pd.Series([data]))

    # Non-series inputs: pass the loop value itself so that each listed
    # input is actually exercised (the loop variable was previously unused).
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            schema.validate(data)
@pytest.mark.parametrize(
    "indexes",
    [
        [Index(int)],
        [Index(int, name="a"), Index(int)],
        [Index(int), Index(int, name="a")],
    ],
)
def test_multiindex_unordered_init_exception(indexes):
    """An unordered MultiIndex containing an un-named Index fails to build.

    Every parametrized list holds at least one ``Index`` without a name, so
    constructing ``MultiIndex(..., ordered=False)`` must raise
    ``SchemaInitError``.
    """
    with pytest.raises(errors.SchemaInitError):
        MultiIndex(indexes, ordered=False)


@pytest.mark.parametrize(
    "indexes",
    [
        [Column(int)],
        [Column(int, name="a"), Index(int)],
        [Index(int), Column(int, name="a")],
        [SeriesSchema(int)],
        1,
        1.0,
        "foo",
    ],
)
def test_multiindex_incorrect_input(indexes):
    """Building a MultiIndex from anything but Index objects is rejected.

    The inputs include lists containing non-``Index`` schema objects as well
    as plain scalars; either ``SchemaInitError`` or ``TypeError`` is
    accepted as the failure.
    """
    accepted_errors = (errors.SchemaInitError, TypeError)
    with pytest.raises(accepted_errors):
        MultiIndex(indexes)
def test_series_schema():
    """Test SeriesSchema validation for integer and string series.

    Covers the passing cases plus these failures: values rejected by the
    check or the dtype, non-series input, duplicates when they are not
    allowed, a mismatching series name, floats declared as integers, and
    nulls in non-nullable schemas.
    """
    int_schema = SeriesSchema(
        Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    assert isinstance(int_schema.validate(
        pd.Series([0, 30, 50, 100])), pd.Series)

    str_schema = SeriesSchema(
        String, Check(lambda s: s.isin(["foo", "bar", "baz"])),
        nullable=True, coerce=True)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", None])), pd.Series)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", np.nan])), pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(errors.SchemaError):
            int_schema.validate(pd.Series([data]))

    # Non-series inputs: pass the loop value itself so that each listed
    # input is actually exercised (the loop variable was previously unused).
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            int_schema.validate(data)

    non_duplicate_schema = SeriesSchema(
        Int, allow_duplicates=False)
    with pytest.raises(errors.SchemaError):
        non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))

    # when series name doesn't match schema
    named_schema = SeriesSchema(Int, name="my_series")
    with pytest.raises(
            errors.SchemaError,
            match=r"^Expected .+ to have name"):
        named_schema.validate(pd.Series(range(5), name="your_series"))

    # when series floats are declared to be integer
    with pytest.raises(
            errors.SchemaError,
            match=r"^after dropping null values, expected values in series"):
        SeriesSchema(Int, nullable=True).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable
    with pytest.raises(
            errors.SchemaError,
            match=r"^non-nullable series .+ contains null values"):
        SeriesSchema(Float, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable in addition
    # to having the wrong data type
    with pytest.raises(
            errors.SchemaError,
            match=(
                r"^expected series '.+' to have type .+, got .+ and "
                "non-nullable series contains null values")):
        SeriesSchema(Int, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))
def test_series_schema_dtype_property(pandas_dtype, expected):
    """Check that ``SeriesSchema(pandas_dtype).dtype`` equals ``expected``."""
    schema = SeriesSchema(pandas_dtype)
    assert schema.dtype == expected
def test_lazy_dataframe_scalar_false_check(schema_cls, data): """Lazy validation handles checks returning scalar False values.""" # define a check that always returns a scalare False value check = Check(check_fn=lambda _: False, element_wise=False, error="failing check") schema = schema_cls(checks=check) with pytest.raises(errors.SchemaErrors): schema(data, lazy=True) @pytest.mark.parametrize( "schema, data, expectation", [ [ SeriesSchema(Int, checks=Check.greater_than(0)), pd.Series(["a", "b", "c"]), { "data": pd.Series(["a", "b", "c"]), "schema_errors": { # schema object context -> check failure cases "SeriesSchema": { # check name -> failure cases "greater_than(0)": [ "TypeError(\"'>' not supported between instances of " "'str' and 'int'\")", # TypeError raised in python=3.5 'TypeError("unorderable types: str() > int()")', ], "pandas_dtype('int64')": ["object"], },