def test_dataframe_strategy_with_indexes(pdtype, data):
    """Check that frames drawn from index-only schemas validate.

    Two schemas are built from ``pdtype``: one with a single ``pa.Index`` and
    one with a three-level ``pa.MultiIndex`` whose levels are named
    ``index0``, ``index1`` and ``index2``. For each schema a frame of size 10
    is drawn from the schema's own strategy and passed back through it.
    """
    single_index_schema = pa.DataFrameSchema(index=pa.Index(pdtype))
    levels = [pa.Index(pdtype, name=f"index{i}") for i in range(3)]
    multi_index_schema = pa.DataFrameSchema(index=pa.MultiIndex(levels))

    # Draw and validate in the same order for every run: single, then multi.
    for schema in (single_index_schema, multi_index_schema):
        drawn_frame = data.draw(schema.strategy(size=10))
        schema(drawn_frame)
def test_dataframeschema():
    """Check that schema objects are accepted as pydantic field values.

    ``DataFrameSchemaPydantic`` is defined outside this block; the test only
    requires that constructing it with an empty ``pa.DataFrameSchema`` and a
    two-level ``pa.MultiIndex`` succeeds and yields an instance of that class.
    """
    multi_index = pa.MultiIndex([pa.Index(str), pa.Index(int)])
    model = DataFrameSchemaPydantic(
        pa_schema=pa.DataFrameSchema(),
        pa_mi=multi_index,
    )
    assert isinstance(model, DataFrameSchemaPydantic)
def _create_schema(index="single"):
    """Build the DataFrameSchema fixture used by the surrounding tests.

    :param index: selects the schema index. ``"multi"`` gives a three-level
        ``pa.MultiIndex`` of int levels named ``int_index0``..``int_index2``;
        ``"single"`` gives one int ``pa.Index`` named ``int_index``; any other
        value gives a schema without an index.
    :returns: a ``pa.DataFrameSchema`` with ``coerce=False`` and
        ``strict=True`` holding int, float, str, datetime and timedelta
        columns, each with range or membership checks.
    """
    if index == "multi":
        schema_index = pa.MultiIndex(
            [pa.Index(pa.Int, name=f"int_index{level}") for level in range(3)]
        )
    elif index == "single":
        schema_index = pa.Index(pa.Int, name="int_index")
    else:
        schema_index = None

    int_column = pa.Column(
        pa.Int,
        checks=[
            pa.Check.greater_than(0),
            pa.Check.less_than(10),
            pa.Check.in_range(0, 10),
        ],
    )
    float_column = pa.Column(
        pa.Float,
        checks=[
            pa.Check.greater_than(-10),
            pa.Check.less_than(20),
            pa.Check.in_range(-10, 20),
        ],
    )
    str_column = pa.Column(
        pa.String,
        checks=[
            pa.Check.isin(["foo", "bar", "x", "xy"]),
            pa.Check.str_length(1, 3),
        ],
    )
    datetime_column = pa.Column(
        pa.DateTime,
        checks=[
            pa.Check.greater_than(pd.Timestamp("20100101")),
            pa.Check.less_than(pd.Timestamp("20200101")),
        ],
    )
    timedelta_column = pa.Column(
        pa.Timedelta,
        checks=[
            pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
            pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
        ],
    )

    columns = {
        "int_column": int_column,
        "float_column": float_column,
        "str_column": str_column,
        "datetime_column": datetime_column,
        "timedelta_column": timedelta_column,
    }
    return pa.DataFrameSchema(
        columns=columns,
        index=schema_index,
        coerce=False,
        strict=True,
    )
def test_multiindex():
    """Check that several ``Index`` annotations become one ``MultiIndex``.

    The model declares two index fields, ``a`` (int, ``gt=0``) and ``b``
    (str); its generated schema must equal a ``DataFrameSchema`` whose index
    is a two-level ``MultiIndex`` with the same names, types and check.
    """

    class Schema(pa.SchemaModel):
        a: Index[int] = pa.Field(gt=0)
        b: Index[str]

    expected_levels = [
        pa.Index(int, name="a", checks=pa.Check.gt(0)),
        pa.Index(str, name="b"),
    ]
    expected = pa.DataFrameSchema(index=pa.MultiIndex(expected_levels))

    actual = Schema.to_schema()
    assert expected == actual
def test_multiindex_example():
    """Check that ``MultiIndex.example()`` output validates against itself.

    A three-level float ``MultiIndex`` is built (first level named
    ``level_0`` with ``allow_duplicates=False``, second level nullable), then
    ten examples are generated and each is validated as the index of an
    otherwise empty ``DataFrame``.
    """
    float_dtype = pa.PandasDtype.Float
    levels = [
        pa.Index(float_dtype, allow_duplicates=False, name="level_0"),
        pa.Index(float_dtype, nullable=True),
        pa.Index(float_dtype),
    ]
    multiindex = pa.MultiIndex(indexes=levels)

    for _ in range(10):
        generated_index = multiindex.example()
        frame = pd.DataFrame(index=generated_index)
        multiindex(frame)
def test_multiindex_example() -> None:
    """Check that ``MultiIndex.example()`` output validates against itself.

    A three-level ``MultiIndex`` sharing one ``pa.Float()`` data type is built
    (first level named ``level_0`` with ``unique=True``, second level
    nullable), then ten examples are generated and each is validated as the
    index of an otherwise empty ``DataFrame``.
    """
    float_type = pa.Float()
    levels = [
        pa.Index(float_type, unique=True, name="level_0"),
        pa.Index(float_type, nullable=True),
        pa.Index(float_type),
    ]
    multiindex = pa.MultiIndex(indexes=levels)

    for _ in range(10):
        generated_index = multiindex.example()
        frame = pd.DataFrame(index=generated_index)
        multiindex(frame)
def test_multiindex_strategy(data):
    """Check the strategy produced by a ``MultiIndex`` schema component.

    A size-10 example is drawn from a three-level float ``MultiIndex`` and
    every level's dtype is compared with ``pdtype.str_alias``. The test then
    requires ``strategies.multiindex_strategy`` to raise
    ``BaseStrategyOnlyError`` when it is given a dtype strategy as its second
    argument.
    """
    pdtype = pa.PandasDtype.Float
    levels = [
        pa.Index(pdtype, allow_duplicates=False, name="level_0"),
        pa.Index(pdtype, nullable=True),
        pa.Index(pdtype),
    ]
    multiindex = pa.MultiIndex(indexes=levels)

    drawn_index = data.draw(multiindex.strategy(size=10))
    for level in range(drawn_index.nlevels):
        level_dtype = drawn_index.get_level_values(level).dtype
        assert level_dtype == pdtype.str_alias

    # Both calls stay inside the context manager so that the scope in which
    # an exception is expected matches the original test exactly.
    with pytest.raises(pa.errors.BaseStrategyOnlyError):
        strategies.multiindex_strategy(
            pdtype,
            strategies.pandas_dtype_strategy(pdtype),
        )
def test_config() -> None:
    """Check that ``Config`` options are inherited and reach the schema.

    ``Child`` overrides ``name``, ``strict`` and ``multiindex_strict`` and
    adds ``description`` and ``title``; the expected schema additionally
    carries ``coerce``, ``ordered``, the multi-index ``coerce`` flag and the
    multi-index name that are only set on ``Base.Config``.
    """

    class Base(pa.SchemaModel):
        a: Series[int]
        idx_1: Index[str]
        idx_2: Index[str]

        class Config:
            name = "Base schema"
            coerce = True
            ordered = True
            multiindex_coerce = True
            multiindex_strict = True
            multiindex_name: Optional[str] = "mi"

    class Child(Base):
        b: Series[int]

        class Config:
            name = "Child schema"
            strict = True
            multiindex_strict = False
            description = "foo"
            title = "bar"

    expected_columns = {
        column_name: pa.Column(int) for column_name in ("a", "b")
    }
    expected_index = pa.MultiIndex(
        [pa.Index(str, name=level_name) for level_name in ("idx_1", "idx_2")],
        coerce=True,
        strict=False,
        name="mi",
    )
    schema_options = {
        "name": "Child schema",
        "coerce": True,
        "strict": True,
        "ordered": True,
        "description": "foo",
        "title": "bar",
    }
    expected = pa.DataFrameSchema(
        columns=expected_columns,
        index=expected_index,
        **schema_options,
    )

    actual = Child.to_schema()
    assert expected == actual
def test_multiindex_strategy(data) -> None:
    """Check the strategy produced by a ``MultiIndex`` schema component.

    A size-10 example is drawn from a three-level ``MultiIndex`` built on one
    ``pa.Float()`` data type. Each level's dtype is converted with
    ``pandas_engine.Engine.dtype`` and checked against that data type. The
    test then requires ``strategies.multiindex_strategy`` to raise
    ``BaseStrategyOnlyError`` when it is given a dtype strategy as its second
    argument.
    """
    data_type = pa.Float()
    levels = [
        pa.Index(data_type, unique=True, name="level_0"),
        pa.Index(data_type, nullable=True),
        pa.Index(data_type),
    ]
    multiindex = pa.MultiIndex(indexes=levels)

    drawn_index = data.draw(multiindex.strategy(size=10))
    for level in range(drawn_index.nlevels):
        raw_dtype = drawn_index.get_level_values(level).dtype
        level_data_type = pandas_engine.Engine.dtype(raw_dtype)
        assert data_type.check(level_data_type)

    # Both calls stay inside the context manager so that the scope in which
    # an exception is expected matches the original test exactly.
    with pytest.raises(pa.errors.BaseStrategyOnlyError):
        strategies.multiindex_strategy(
            data_type,
            strategies.pandas_dtype_strategy(data_type),
        )
def test_field_name_access_inherit():
    """Check schema generation and class-level name access with inheritance.

    ``Child`` redefines some of ``Base``'s fields (changing types and adding
    or dropping aliases) and adds new columns and an index. The test compares
    both generated schemas with hand-built ones, then checks that each field
    read as an attribute of ``Child`` equals the expected column/index name.
    """

    class Base(pa.SchemaModel):
        a: Series[int]
        b: Series[int] = pa.Field()
        c: Series[int] = pa.Field(alias="_c")
        d: Series[int] = pa.Field(alias=123)
        i1: Index[int]
        i2: Index[int] = pa.Field()

    class Child(Base):
        b: Series[str] = pa.Field(alias="_b")
        c: Series[str]
        d: Series[str] = pa.Field()
        extra1: Series[int]
        extra2: Series[int] = pa.Field()
        extra3: Series[int] = pa.Field(alias="_extra3")
        i1: Index[str]
        i3: Index[int] = pa.Field(alias="_i3")

    base_columns = {
        "a": pa.Column(int),
        "b": pa.Column(int),
        "_c": pa.Column(int),
        123: pa.Column(int),
    }
    base_index = pa.MultiIndex(
        [
            pa.Index(int, name="i1"),
            pa.Index(int, name="i2"),
        ]
    )
    expected_base = pa.DataFrameSchema(columns=base_columns, index=base_index)

    child_columns = {
        "a": pa.Column(int),
        "_b": pa.Column(str),
        "c": pa.Column(str),
        "d": pa.Column(str),
        "extra1": pa.Column(int),
        "extra2": pa.Column(int),
        "_extra3": pa.Column(int),
    }
    child_index = pa.MultiIndex(
        [
            pa.Index(str, name="i1"),
            pa.Index(int, name="i2"),
            pa.Index(int, name="_i3"),
        ]
    )
    expected_child = pa.DataFrameSchema(columns=child_columns, index=child_index)

    assert expected_base == Base.to_schema()
    assert expected_child == Child.to_schema()

    # Attribute on Child -> name it must resolve to; checked in this order.
    expected_names = {
        "a": "a",
        "b": "_b",
        "c": "c",
        "d": "d",
        "extra1": "extra1",
        "extra2": "extra2",
        "extra3": "_extra3",
        "i1": "i1",
        "i2": "i2",
        "i3": "_i3",
    }
    for attribute, expected_name in expected_names.items():
        assert getattr(Child, attribute) == expected_name
def _create_schema(index="single"):
    """Build the DataFrameSchema fixture used by the surrounding tests.

    :param index: selects the schema index. ``"multi"`` gives a three-level
        ``pa.MultiIndex`` of int levels named ``int_index0``..``int_index2``;
        ``"single"`` gives one unnamed int ``pa.Index``; any other value gives
        a schema without an index.
    :returns: a ``pa.DataFrameSchema`` with ``coerce=False`` and
        ``strict=True`` holding int, float, str, datetime and timedelta
        columns plus one column that sets the optional column properties.
    """
    if index == "multi":
        schema_index = pa.MultiIndex(
            [pa.Index(pa.Int, name=f"int_index{level}") for level in range(3)]
        )
    elif index == "single":
        # make sure io modules can handle case when index name is None
        schema_index = pa.Index(pa.Int, name=None)
    else:
        schema_index = None

    int_column = pa.Column(
        pa.Int,
        checks=[
            pa.Check.greater_than(0),
            pa.Check.less_than(10),
            pa.Check.in_range(0, 10),
        ],
    )
    float_column = pa.Column(
        pa.Float,
        checks=[
            pa.Check.greater_than(-10),
            pa.Check.less_than(20),
            pa.Check.in_range(-10, 20),
        ],
    )
    str_column = pa.Column(
        pa.String,
        checks=[
            pa.Check.isin(["foo", "bar", "x", "xy"]),
            pa.Check.str_length(1, 3),
        ],
    )
    datetime_column = pa.Column(
        pa.DateTime,
        checks=[
            pa.Check.greater_than(pd.Timestamp("20100101")),
            pa.Check.less_than(pd.Timestamp("20200101")),
        ],
    )
    timedelta_column = pa.Column(
        pa.Timedelta,
        checks=[
            pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
            pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
        ],
    )
    # Sets every optional Column keyword explicitly.
    optional_props_column = pa.Column(
        pa.String,
        nullable=True,
        allow_duplicates=True,
        coerce=True,
        required=False,
        regex=True,
        checks=[pa.Check.str_length(1, 3)],
    )

    columns = {
        "int_column": int_column,
        "float_column": float_column,
        "str_column": str_column,
        "datetime_column": datetime_column,
        "timedelta_column": timedelta_column,
        "optional_props_column": optional_props_column,
    }
    return pa.DataFrameSchema(
        columns=columns,
        index=schema_index,
        coerce=False,
        strict=True,
    )