def test_vectorized_checks():
    """Check that a vectorized (non element-wise) Check passes and fails."""
    # the check receives the whole series: every value must occur exactly twice
    occurs_twice = Check(lambda s: s.value_counts() == 2, element_wise=False)
    schema = SeriesSchema(Int, occurs_twice)

    passing = pd.Series([1, 1, 2, 2, 3, 3])
    result = schema.validate(passing)
    assert isinstance(result, pd.Series)

    # error case: each value occurs only once
    failing = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(failing)
def test_series_schema_multiple_validators():
    """Validate a series against a schema that carries two Checks."""
    checks = [
        Check(lambda x: 0 <= x <= 50),
        Check(lambda s: (s == 21).any(), element_wise=False),
    ]
    schema = SeriesSchema(PandasDtype.Int, checks)

    good_series = pd.Series([1, 5, 21, 50])
    result = schema.validate(good_series)
    assert isinstance(result, pd.Series)

    # raise error if any of the validators fails (no 21 in this series)
    bad_series = pd.Series([1, 5, 20, 50])
    with pytest.raises(SchemaError):
        schema.validate(bad_series)
def test_vectorized_checks() -> None:
    """Test that a vectorized (element_wise=False) check validates and
    errors as expected."""
    # the lambda is handed the full series rather than individual elements
    value_count_check = Check(
        lambda s: s.value_counts() == 2,
        element_wise=False,
    )
    schema = SeriesSchema(Int, value_count_check)

    paired_values = pd.Series([1, 1, 2, 2, 3, 3])
    assert isinstance(schema.validate(paired_values), pd.Series)

    # error case: no value appears twice
    unique_values = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(unique_values)
def test_no_dtype_series():
    """Test nullability handling for a SeriesSchema declared without a dtype."""
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    # Build the schema outside the ``raises`` block so that only the
    # ``validate`` call can satisfy the expected SchemaError; an error raised
    # by the constructor must not make this test pass.
    schema = SeriesSchema(nullable=False)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
def test_series_schema_multiple_validators():
    """Exercise a SeriesSchema holding several Checks, covering both the
    passing case and the case where one of the Checks fails."""
    in_range = Check(lambda x: 0 <= x <= 50, element_wise=True)
    contains_21 = Check(lambda s: (s == 21).any())
    schema = SeriesSchema(Int, [in_range, contains_21])

    result = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(result, pd.Series)

    # raise error if any of the validators fails (21 is missing here)
    series_without_21 = pd.Series([1, 5, 20, 50])
    with pytest.raises(errors.SchemaError):
        schema.validate(series_without_21)
def test_series_schema():
    """Test SeriesSchema validation of an integer series with a range Check,
    including the SchemaError and TypeError failure cases."""
    schema = SeriesSchema(PandasDtype.Int, Check(lambda x: 0 <= x <= 100))
    validated_series = schema.validate(pd.Series([0, 30, 50, 100]))
    assert isinstance(validated_series, pd.Series)

    # error cases: out-of-range or wrongly typed values inside a Series
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(SchemaError):
            schema.validate(pd.Series([data]))

    # inputs that are not a Series at all are expected to raise TypeError;
    # pass each sample itself (previously the TypeError class was passed on
    # every iteration, leaving ``data`` unused)
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            schema.validate(data)
def test_no_dtype_series():
    """Test how nullability is handled in SeriesSchemas where no type is
    specified."""
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    # Keep schema construction out of the ``raises`` block: the expected
    # SchemaError has to come from ``validate`` on the null-containing
    # series, not from building the schema.
    schema = SeriesSchema(nullable=False)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
def test_pandas_extension_types():
    """Happy-path validation for a range of pandas extension dtypes."""
    # pylint: disable=no-member
    # Each case is (dtype, data, extra SeriesSchema keyword arguments).
    cases = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            {},
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            {},
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), {}),
        (
            pd.StringDtype(),
            pd.Series(["foo", "bar", "baz"], dtype="string"),
            {},
        ),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            {},
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100))
            .where(lambda s: s < 5, other=np.nan)
            .astype("Sparse[float]"),
            # values >= 5 are replaced by NaN above, so nulls must be allowed
            {"nullable": True},
        ),
        (
            pd.BooleanDtype(),
            pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
            {},
        ),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            {},
        ),
    ]

    for extension_dtype, sample, schema_kwargs in cases:
        schema = SeriesSchema(pandas_dtype=extension_dtype, **schema_kwargs)
        validated = schema.validate(sample)
        assert isinstance(validated, pd.Series)
class Validate:
    """
    Benchmarking Series schema.validate
    """

    def setup(self):
        """Build the schema and the series shared by every benchmark."""
        self.schema = SeriesSchema(
            String,
            checks=[
                Check(lambda s: s.str.startswith("foo")),
                Check(lambda s: s.str.endswith("bar")),
                Check(lambda x: len(x) > 3, element_wise=True)
            ],
            nullable=False,
            allow_duplicates=True,
            name="my_series")
        self.series = pd.Series(["foobar", "foobar", "foobar"],
                                name="my_series")

    def time_series_schema(self):
        """Timing benchmark: run validation on the prepared series."""
        self.schema.validate(self.series)

    def mem_series_schema(self):
        """Memory benchmark: return the validated series.

        A ``mem_*`` benchmark reports the size of the object it returns, so
        the validation result has to be returned; without a return value only
        the size of ``None`` would be recorded.
        """
        return self.schema.validate(self.series)

    def peakmem_series_schema(self):
        """Peak-memory benchmark: run validation on the prepared series."""
        self.schema.validate(self.series)
def test_series_schema():
    """Tests that a SeriesSchema Check behaves as expected for integers and
    strings. Tests error cases for types, duplicates, name errors, and issues
    around float and integer handling of nulls"""
    int_schema = SeriesSchema(
        Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    assert isinstance(int_schema.validate(
        pd.Series([0, 30, 50, 100])), pd.Series)

    str_schema = SeriesSchema(
        String, Check(lambda s: s.isin(["foo", "bar", "baz"])),
        nullable=True, coerce=True)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", None])), pd.Series)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", np.nan])), pd.Series)

    # error cases: out-of-range or wrongly typed values inside a Series
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(errors.SchemaError):
            int_schema.validate(pd.Series([data]))

    # inputs that are not a Series at all are expected to raise TypeError;
    # pass each sample itself (previously the TypeError class was passed on
    # every iteration, leaving ``data`` unused)
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            int_schema.validate(data)

    non_duplicate_schema = SeriesSchema(
        Int, allow_duplicates=False)
    with pytest.raises(errors.SchemaError):
        non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))

    # when series name doesn't match schema
    named_schema = SeriesSchema(Int, name="my_series")
    with pytest.raises(
            errors.SchemaError,
            match=r"^Expected .+ to have name"):
        named_schema.validate(pd.Series(range(5), name="your_series"))

    # when series floats are declared to be integer
    with pytest.raises(
            errors.SchemaError,
            match=r"^after dropping null values, expected values in series"):
        SeriesSchema(Int, nullable=True).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable
    with pytest.raises(
            errors.SchemaError,
            match=r"^non-nullable series .+ contains null values"):
        SeriesSchema(Float, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable in addition
    # to having the wrong data type
    with pytest.raises(
            errors.SchemaError,
            match=(
                r"^expected series '.+' to have type .+, got .+ and "
                "non-nullable series contains null values")):
        SeriesSchema(Int, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))