def test_dataframeschema():
    """Test that DataFrameSchema is compatible with pydantic."""
    # Constructing the pydantic model from pandera components must succeed;
    # the isinstance assertion is the whole test.
    model = DataFrameSchemaPydantic(
        pa_schema=pa.DataFrameSchema(),
        pa_mi=pa.MultiIndex([pa.Index(str), pa.Index(int)]),
    )
    assert isinstance(model, DataFrameSchemaPydantic)
def test_dataframe_strategy_with_indexes(pdtype, data):
    """Test dataframe strategy with index and multiindex components."""
    schema_single = pa.DataFrameSchema(index=pa.Index(pdtype))
    schema_multi = pa.DataFrameSchema(
        index=pa.MultiIndex(
            [pa.Index(pdtype, name=f"index{i}") for i in range(3)]
        )
    )
    # A dataframe drawn from a schema's strategy must validate against
    # that same schema.
    for schema in (schema_single, schema_multi):
        schema(data.draw(schema.strategy(size=10)))
def _create_schema(index="single"):
    """Build a strict DataFrameSchema fixture with the requested index kind.

    ``index`` selects the index component: "multi" -> 3-level integer
    MultiIndex, "single" -> named integer Index, anything else -> no index.
    """
    if index == "multi":
        index = pa.MultiIndex([
            pa.Index(pa.Int, name="int_index0"),
            pa.Index(pa.Int, name="int_index1"),
            pa.Index(pa.Int, name="int_index2"),
        ])
    elif index == "single":
        index = pa.Index(pa.Int, name="int_index")
    else:
        index = None
    # Each column carries value checks so downstream tests exercise
    # check serialization/validation, not just dtypes.
    return pa.DataFrameSchema(columns={
        "int_column": pa.Column(
            pa.Int,
            checks=[
                pa.Check.greater_than(0),
                pa.Check.less_than(10),
                pa.Check.in_range(0, 10),
            ],
        ),
        "float_column": pa.Column(
            pa.Float,
            checks=[
                pa.Check.greater_than(-10),
                pa.Check.less_than(20),
                pa.Check.in_range(-10, 20),
            ],
        ),
        "str_column": pa.Column(
            pa.String,
            checks=[
                pa.Check.isin(["foo", "bar", "x", "xy"]),
                pa.Check.str_length(1, 3)
            ],
        ),
        "datetime_column": pa.Column(pa.DateTime, checks=[
            pa.Check.greater_than(pd.Timestamp("20100101")),
            pa.Check.less_than(pd.Timestamp("20200101")),
        ]),
        "timedelta_column": pa.Column(pa.Timedelta, checks=[
            pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
            pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
        ]),
    }, index=index, coerce=False, strict=True)
def test_multiindex():
    """Test that multiple Index annotations create a MultiIndex."""

    class Schema(pa.SchemaModel):
        a: Index[int] = pa.Field(gt=0)
        b: Index[str]

    # Two Index annotations collapse into one MultiIndex whose levels
    # are named after the model attributes.
    levels = [
        pa.Index(int, name="a", checks=pa.Check.gt(0)),
        pa.Index(str, name="b"),
    ]
    expected = pa.DataFrameSchema(index=pa.MultiIndex(levels))
    assert Schema.to_schema() == expected
def test_multiindex_example():
    """
    Test MultiIndex schema component example method generates examples
    that pass.
    """
    pdtype = pa.PandasDtype.Float
    levels = [
        pa.Index(pdtype, allow_duplicates=False, name="level_0"),
        pa.Index(pdtype, nullable=True),
        pa.Index(pdtype),
    ]
    multiindex = pa.MultiIndex(indexes=levels)
    # Every generated example must validate against the schema that
    # produced it.
    for _ in range(10):
        multiindex(pd.DataFrame(index=multiindex.example()))
def test_multiindex_example() -> None:
    """
    Test MultiIndex schema component example method generates examples
    that pass.
    """
    data_type = pa.Float()
    levels = [
        pa.Index(data_type, unique=True, name="level_0"),
        pa.Index(data_type, nullable=True),
        pa.Index(data_type),
    ]
    multiindex = pa.MultiIndex(indexes=levels)
    # Every generated example must validate against the schema that
    # produced it.
    for _ in range(10):
        multiindex(pd.DataFrame(index=multiindex.example()))
def create_event_schema(
    coerce: bool = True,
    strict: bool = True,
    nullable: bool = False,
):
    """Build the DataFrame schema that validates event records at runtime.

    Validation failures are expected to be routed to a dead letter queue
    by the caller.

    Args:
        coerce (bool): Whether to coerce series to the declared types.
        strict (bool): Whether to reject columns not declared in the schema.
        nullable (bool): Whether the declared columns may contain nulls.

    Returns:
        A pandas DataFrame schema that validates column types and values.
    """
    event_columns = {
        "id": pa.Column(pa.String, nullable=nullable),
        "timestamp": pa.Column(pa.DateTime, nullable=nullable),
        "version": pa.Column(pa.String, nullable=nullable),
    }
    return pa.DataFrameSchema(
        event_columns,
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )
def create_invoice_stats_schema(coerce: bool = True,
                                strict: bool = True,
                                nullable: bool = True):
    """Build the DataFrame schema that validates invoice statistics.

    Args:
        coerce (bool): Whether to coerce series to the declared types.
        strict (bool): Whether to reject columns not declared in the schema.
        nullable (bool): Whether the declared columns may contain nulls.

    Returns:
        A pandas DataFrame schema that validates that the types are correct.
    """
    # Both stats columns share the same float column definition.
    stats_column = pa.Column(pa.Float64, nullable=nullable)
    stats_columns = {
        INVOICE_STATS_COLUMN_NAMES.get("invoice_median"): stats_column,
        INVOICE_STATS_COLUMN_NAMES.get("invoice_mean"): stats_column,
    }
    return pa.DataFrameSchema(
        stats_columns,
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )
def test_index_dtypes(
    dtype: pandas_engine.DataType,
    coerce: bool,
    schema_cls,
    data: st.DataObject,
):
    """Test koalas Index and MultiIndex on subset of datatypes.

    Only test basic datatypes since index handling in pandas is already
    a little finicky.
    """
    if coerce and dtype is pandas_engine.DateTime:
        pytest.skip(
            "koalas cannot coerce a koalas DateTime index to datetime.")

    if schema_cls is pa.Index:
        schema = schema_cls(dtype, name="field")
        schema.coerce = coerce
    else:
        # Fix: previously hard-coded ``coerce=True`` here, which silently
        # ignored the parametrized ``coerce`` value for the MultiIndex case
        # (the Index branch above respects it).
        schema = schema_cls(
            indexes=[pa.Index(dtype, name="field")], coerce=coerce)

    sample = data.draw(schema.strategy(size=3))

    if dtype is pandas_engine.DateTime or isinstance(
            dtype, pandas_engine.DateTime):
        # handle datetimes: timestamps below the platform minimum overflow
        # mktime inside koalas, which we assert rather than validate.
        if MIN_TIMESTAMP is not None and (
                sample.to_frame() < MIN_TIMESTAMP).any(axis=None):
            with pytest.raises(
                    OverflowError, match="mktime argument out of range"):
                ks.DataFrame(pd.DataFrame(index=sample))
            return
    else:
        assert isinstance(
            schema(ks.DataFrame(pd.DataFrame(index=sample))), ks.DataFrame)
def test_multiindex_strategy(data):
    """Test MultiIndex schema component strategy."""
    pdtype = pa.PandasDtype.Float
    multiindex = pa.MultiIndex(indexes=[
        pa.Index(pdtype, allow_duplicates=False, name="level_0"),
        pa.Index(pdtype, nullable=True),
        pa.Index(pdtype),
    ])
    example = data.draw(multiindex.strategy(size=10))
    # every level of the drawn MultiIndex must carry the target dtype
    for level in range(example.nlevels):
        assert example.get_level_values(level).dtype == pdtype.str_alias
    # multiindex strategies cannot be chained onto a base strategy
    with pytest.raises(pa.errors.BaseStrategyOnlyError):
        strategies.multiindex_strategy(
            pdtype, strategies.pandas_dtype_strategy(pdtype))
def test_index_example() -> None:
    """
    Test Index schema component example method generates examples that pass.
    """
    index_schema = pa.Index(pa.Int(), unique=True)
    # every generated example index must validate against its own schema
    for _ in range(10):
        generated = index_schema.example()
        index_schema(pd.DataFrame(index=generated))
def test_index_example():
    """
    Test Index schema component example method generates examples that pass.
    """
    index_schema = pa.Index(pa.PandasDtype.Int, allow_duplicates=False)
    # every generated example index must validate against its own schema
    for _ in range(10):
        generated = index_schema.example()
        index_schema(pd.DataFrame(index=generated))
def test_config() -> None:
    """Test that Config can be inherited and translate into DataFrameSchema options."""

    class Base(pa.SchemaModel):
        a: Series[int]
        idx_1: Index[str]
        idx_2: Index[str]

        class Config:
            # options mapping onto DataFrameSchema / MultiIndex kwargs
            name = "Base schema"
            coerce = True
            ordered = True
            multiindex_coerce = True
            multiindex_strict = True
            multiindex_name: Optional[str] = "mi"

    class Child(Base):
        b: Series[int]

        class Config:
            # partially overrides the inherited Base.Config; unset options
            # (coerce, ordered, multiindex_coerce, multiindex_name) should
            # still come from Base
            name = "Child schema"
            strict = True
            multiindex_strict = False
            description = "foo"
            title = "bar"

    expected = pa.DataFrameSchema(
        columns={
            "a": pa.Column(int),
            "b": pa.Column(int)
        },
        index=pa.MultiIndex(
            [pa.Index(str, name="idx_1"), pa.Index(str, name="idx_2")],
            coerce=True,
            strict=False,
            name="mi",
        ),
        name="Child schema",
        coerce=True,
        strict=True,
        ordered=True,
        description="foo",
        title="bar",
    )
    assert expected == Child.to_schema()
def test_register_custom_groupby_check(custom_check_teardown: None) -> None:
    """Test registering a custom groupby check."""

    # Registering makes the check available as ``Check.custom_check``
    # for both Series (column) and DataFrame (schema-wide) groupby checks.
    @extensions.register_check_method(
        statistics=["group_a", "group_b"],
        supported_types=(pd.Series, pd.DataFrame),
        check_type="groupby",
    )
    def custom_check(dict_groups, *, group_a, group_b):
        """
        Test that the mean values in group A is larger than that of group B.

        Note that this function can handle groups of both dataframes and
        series.
        """
        return (dict_groups[group_a].values.mean() >
                dict_groups[group_b].values.mean())

    # column groupby check
    data_column_check = pd.DataFrame({
        "col1": [20, 20, 10, 10],
        "col2": list("aabb"),
    })
    schema_column_check = pa.DataFrameSchema({
        "col1": pa.Column(
            int,
            Check.custom_check(group_a="a", group_b="b", groupby="col2"),
        ),
        "col2": pa.Column(str),
    })
    assert isinstance(schema_column_check(data_column_check), pd.DataFrame)

    # dataframe groupby check, grouping on the (named) index
    data_df_check = pd.DataFrame(
        {
            "col1": [20, 20, 10, 10],
            "col2": [30, 30, 5, 5],
            "col3": [10, 10, 1, 1],
        },
        index=pd.Index(list("aabb"), name="my_index"),
    )
    schema_df_check = pa.DataFrameSchema(
        columns={
            "col1": pa.Column(int),
            "col2": pa.Column(int),
            "col3": pa.Column(int),
        },
        index=pa.Index(str, name="my_index"),
        checks=Check.custom_check(group_a="a", group_b="b",
                                  groupby="my_index"),
    )
    assert isinstance(schema_df_check(data_df_check), pd.DataFrame)

    # element_wise is incompatible with a groupby check: expect a warning
    for kwargs in [{"element_wise": True}, {"element_wise": False}]:
        with pytest.warns(UserWarning):
            Check.custom_check(val=10, **kwargs)
def test_index_strategy(data):
    """Test Index schema component strategy."""
    dtype = pa.PandasDtype.Int
    index_schema = pa.Index(dtype, allow_duplicates=False, name="index")
    example = data.draw(index_schema.strategy(size=10))
    # drawn index must be duplicate-free and carry the target dtype
    assert not example.duplicated().any()
    assert example.dtype == dtype.str_alias
    index_schema(pd.DataFrame(index=example))
def test_multiindex_strategy(data) -> None:
    """Test MultiIndex schema component strategy."""
    data_type = pa.Float()
    multiindex = pa.MultiIndex(indexes=[
        pa.Index(data_type, unique=True, name="level_0"),
        pa.Index(data_type, nullable=True),
        pa.Index(data_type),
    ])
    example = data.draw(multiindex.strategy(size=10))
    # every level of the drawn MultiIndex must resolve to the target dtype
    for level in range(example.nlevels):
        level_dtype = pandas_engine.Engine.dtype(
            example.get_level_values(level).dtype)
        assert data_type.check(level_dtype)
    # multiindex strategies cannot be chained onto a base strategy
    with pytest.raises(pa.errors.BaseStrategyOnlyError):
        strategies.multiindex_strategy(
            data_type, strategies.pandas_dtype_strategy(data_type))
def test_seriesschema():
    """Test that SeriesSchemaBase is compatible with pydantic."""
    # Constructing the pydantic model from pandera components must succeed;
    # the isinstance assertion is the whole test.
    model = SeriesSchemaPydantic(
        pa_series_schema=pa.SeriesSchema(),
        pa_column=pa.Column(),
        pa_index=pa.Index(),
    )
    assert isinstance(model, SeriesSchemaPydantic)
def test_index_strategy(data) -> None:
    """Test Index schema component strategy."""
    data_type = pa.Int()
    index_schema = pa.Index(data_type, unique=True, name="index")
    example = data.draw(index_schema.strategy(size=10))
    # drawn index must be duplicate-free and resolve to the target dtype
    assert not example.duplicated().any()
    assert data_type.check(pandas_engine.Engine.dtype(example.dtype))
    index_schema(pd.DataFrame(index=example))
def database_schema() -> pa.DataFrameSchema:
    """
    Get pandera DataFrame schema for database.csv.

    Column/index kwargs are produced by the project helpers
    ``name_column_kwargs`` and ``enum_column_kwargs``; their exact output
    is defined elsewhere in the project.
    """
    schema = pa.DataFrameSchema(
        # Index is the area name
        index=pa.Index(
            **name_column_kwargs(allow_duplicates=False, geom_type=ColumnNames.AREA)
        ),
        coerce=True,
        # Columns
        columns={
            # traces, thematic and scale columns are strings
            ColumnNames.TRACES.value: pa.Column(
                **name_column_kwargs(
                    allow_duplicates=True, geom_type=ColumnNames.TRACES
                )
            ),
            ColumnNames.THEMATIC.value: pa.Column(
                **name_column_kwargs(allow_duplicates=True, geom_type=None)
            ),
            ColumnNames.SCALE.value: pa.Column(
                **name_column_kwargs(allow_duplicates=True, geom_type=None)
            ),
            # area-shape must be one of the enum values
            ColumnNames.AREA_SHAPE.value: pa.Column(
                **enum_column_kwargs(enum_class=AreaShapes)
            ),
            # validated must be one of the enum values
            ColumnNames.VALIDITY.value: pa.Column(
                **enum_column_kwargs(enum_class=ValidationResults)
            ),
            # snap threshold: non-null float constrained to [1e-8, 1e8]
            ColumnNames.SNAP_THRESHOLD.value: pa.Column(
                pa.Float,
                checks=[
                    pa.Check.greater_than_or_equal_to(1e-8),
                    pa.Check.less_than_or_equal_to(1e8),
                ],
                coerce=True,
                nullable=False,
            ),
        },
    )
    assert isinstance(schema, pa.DataFrameSchema)
    return schema
def create_invoice_schema(
    max_invoice_value: Decimal,
    min_invoice_value: Decimal,
    coerce: bool = True,
    strict: bool = True,
    nullable: bool = False,
):
    """Build the DataFrame schema that validates invoice records at runtime.

    Args:
        max_invoice_value (Decimal): Given max invoice value.
        min_invoice_value (Decimal): Given min invoice value.
        coerce (bool): Whether to coerce series to the declared types.
        strict (bool): Whether to reject columns not declared in the schema.
        nullable (bool): Whether the declared columns may contain nulls.

    Returns:
        A pandas DataFrame schema that validates column types and values.
        Rows whose invoice value falls outside
        [min_invoice_value, max_invoice_value] raise an error at runtime.
    """
    value_checks = [
        pa.Check.less_than_or_equal_to(max_invoice_value),
        pa.Check.greater_than_or_equal_to(min_invoice_value),
    ]
    invoice_columns = {
        INVOICE_COLUMN_NAMES.get("invoice_name"):
            pa.Column(pa.String, nullable=nullable),
        INVOICE_COLUMN_NAMES.get("invoice_value"):
            pa.Column(pa.Float64, checks=value_checks, nullable=nullable),
    }
    return pa.DataFrameSchema(
        invoice_columns,
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )
def test_inherit_schemamodel_fields_alias():
    """Test that columns and index aliases are inherited."""

    class Base(pa.SchemaModel):
        a: Series[int]
        idx: Index[str]

    class Mid(Base):
        b: Series[str] = pa.Field(alias="_b")
        idx: Index[str]

    class ChildOverrideAttr(Mid):
        b: Series[int]

    class ChildOverrideAlias(Mid):
        b: Series[str] = pa.Field(alias="new_b")

    class ChildNewAttr(Mid):
        c: Series[int]

    class ChildEmpty(Mid):
        pass

    # Mid exposes attribute ``b`` under its alias "_b"
    expected_mid = pa.DataFrameSchema(
        columns={
            "a": pa.Column(int),
            "_b": pa.Column(str)
        },
        index=pa.Index(str),
    )
    # overriding the attribute without a Field drops the alias
    expected_child_override_attr = expected_mid.rename_columns({
        "_b": "b"
    }).update_column("b", pandas_dtype=int)
    # overriding with a new alias renames the column
    expected_child_override_alias = expected_mid.rename_columns(
        {"_b": "new_b"})
    # a new attribute simply adds a column
    expected_child_new_attr = expected_mid.add_columns({
        "c": pa.Column(int),
    })

    assert expected_mid == Mid.to_schema()
    assert expected_child_override_attr == ChildOverrideAttr.to_schema()
    assert expected_child_override_alias == ChildOverrideAlias.to_schema()
    assert expected_child_new_attr == ChildNewAttr.to_schema()
    # an empty subclass inherits Mid's schema unchanged
    assert expected_mid == ChildEmpty.to_schema()
def test_to_schema():
    """Test that SchemaModel.to_schema() can produce the correct schema."""

    class Schema(pa.SchemaModel):
        a: Series[int]
        b: Series[str]
        idx: Index[str]

    expected = pa.DataFrameSchema(
        columns={"a": pa.Column(int), "b": pa.Column(str)},
        index=pa.Index(str),
    )
    assert Schema.to_schema() == expected

    # instantiating a SchemaModel without data is an error
    with pytest.raises(TypeError):
        Schema()
def test_schemamodel_with_fields():
    """Test that Fields are translated in the schema."""

    class Schema(pa.SchemaModel):
        a: Series[int] = pa.Field(eq=9, ne=0)
        b: Series[str]
        idx: Index[str] = pa.Field(str_length={"min_value": 1})

    # Field kwargs should translate into the equivalent Check objects.
    expected = pa.DataFrameSchema(
        columns={
            "a": pa.Column(
                int,
                checks=[pa.Check.equal_to(9), pa.Check.not_equal_to(0)],
            ),
            "b": pa.Column(str),
        },
        index=pa.Index(str, pa.Check.str_length(1)),
    )
    assert Schema.to_schema() == expected
def test_inherit_schemamodel_fields():
    """Test that columns and indices are inherited."""

    class Base(pa.SchemaModel):
        a: Series[int]
        idx: Index[str]

    class Mid(Base):
        b: Series[str]
        idx: Index[str]

    class Child(Mid):
        b: Series[int]

    # Child keeps ``a`` and ``idx`` from its ancestors and overrides the
    # dtype of ``b``.
    expected = pa.DataFrameSchema(
        columns={"a": pa.Column(int), "b": pa.Column(int)},
        index=pa.Index(str),
    )
    assert Child.to_schema() == expected
def test_index_dtypes(
    dtype: pandas_engine.DataType,
    coerce: bool,
    schema_cls,
    data: st.DataObject,
):
    """Test modin Index and MultiIndex on subset of datatypes.

    Only test basic datatypes since index handling in pandas is already
    a little finicky.
    """
    if schema_cls is pa.Index:
        schema = schema_cls(dtype, name="field", coerce=coerce)
    else:
        schema = schema_cls(indexes=[pa.Index(dtype, name="field")])
        schema.coerce = coerce

    sample = data.draw(schema.strategy(size=3))

    # pandas (and modin) use object arrays to store boolean data
    if dtype is bool:
        assert sample.dtype == "object"
        return

    validated = schema(mpd.DataFrame(pd.DataFrame(index=sample)))
    assert isinstance(validated, mpd.DataFrame)
def test_to_schema_and_validate() -> None:
    """
    Test that SchemaModel.to_schema() can produce the correct schema and
    can validate dataframe objects.
    """

    class Schema(pa.SchemaModel):
        a: Series[int]
        b: Series[str]
        c: Series[Any]
        idx: Index[str]

    expected = pa.DataFrameSchema(
        columns={
            "a": pa.Column(int),
            "b": pa.Column(str),
            "c": pa.Column(),
        },
        index=pa.Index(str),
    )
    assert Schema.to_schema() == expected

    # a conforming dataframe validates; a missing-column one raises
    conforming = pd.DataFrame(
        {"a": [1], "b": ["foo"], "c": [3.4]}, index=["1"])
    Schema(conforming)
    with pytest.raises(pa.errors.SchemaError):
        Schema(pd.DataFrame({"a": [1]}))
def test_field_name_access_inherit():
    """Test that column and index names can be accessed through the class"""

    class Base(pa.SchemaModel):
        a: Series[int]
        b: Series[int] = pa.Field()
        c: Series[int] = pa.Field(alias="_c")
        d: Series[int] = pa.Field(alias=123)
        i1: Index[int]
        i2: Index[int] = pa.Field()

    class Child(Base):
        # overrides with/without aliases plus new attributes and indices
        b: Series[str] = pa.Field(alias="_b")
        c: Series[str]
        d: Series[str] = pa.Field()
        extra1: Series[int]
        extra2: Series[int] = pa.Field()
        extra3: Series[int] = pa.Field(alias="_extra3")
        i1: Index[str]
        i3: Index[int] = pa.Field(alias="_i3")

    expected_base = pa.DataFrameSchema(
        columns={
            "a": pa.Column(int),
            "b": pa.Column(int),
            "_c": pa.Column(int),
            # aliases need not be strings
            123: pa.Column(int),
        },
        index=pa.MultiIndex([
            pa.Index(int, name="i1"),
            pa.Index(int, name="i2"),
        ]),
    )
    expected_child = pa.DataFrameSchema(
        columns={
            "a": pa.Column(int),
            "_b": pa.Column(str),
            # overriding without a Field drops the inherited alias
            "c": pa.Column(str),
            "d": pa.Column(str),
            "extra1": pa.Column(int),
            "extra2": pa.Column(int),
            "_extra3": pa.Column(int),
        },
        index=pa.MultiIndex([
            pa.Index(str, name="i1"),
            pa.Index(int, name="i2"),
            pa.Index(int, name="_i3"),
        ]),
    )
    assert expected_base == Base.to_schema()
    assert expected_child == Child.to_schema()

    # class-level attribute access returns the effective column/index name
    assert Child.a == "a"  # pylint:disable=no-member
    assert Child.b == "_b"
    assert Child.c == "c"
    assert Child.d == "d"
    assert Child.extra1 == "extra1"
    assert Child.extra2 == "extra2"
    assert Child.extra3 == "_extra3"
    assert Child.i1 == "i1"
    assert Child.i2 == "i2"
    assert Child.i3 == "_i3"
def _create_schema(index="single"):
    """Build a strict DataFrameSchema fixture with the requested index kind.

    ``index`` selects the index component: "multi" -> 3-level integer
    MultiIndex, "single" -> unnamed integer Index, anything else -> None.
    """
    if index == "multi":
        index = pa.MultiIndex([
            pa.Index(pa.Int, name="int_index0"),
            pa.Index(pa.Int, name="int_index1"),
            pa.Index(pa.Int, name="int_index2"),
        ])
    elif index == "single":
        # make sure io modules can handle case when index name is None
        index = pa.Index(pa.Int, name=None)
    else:
        index = None
    # Columns carry value checks (and, for the optional column, the full
    # set of column properties) so io round-trips exercise serialization.
    return pa.DataFrameSchema(
        columns={
            "int_column": pa.Column(
                pa.Int,
                checks=[
                    pa.Check.greater_than(0),
                    pa.Check.less_than(10),
                    pa.Check.in_range(0, 10),
                ],
            ),
            "float_column": pa.Column(
                pa.Float,
                checks=[
                    pa.Check.greater_than(-10),
                    pa.Check.less_than(20),
                    pa.Check.in_range(-10, 20),
                ],
            ),
            "str_column": pa.Column(
                pa.String,
                checks=[
                    pa.Check.isin(["foo", "bar", "x", "xy"]),
                    pa.Check.str_length(1, 3),
                ],
            ),
            "datetime_column": pa.Column(
                pa.DateTime,
                checks=[
                    pa.Check.greater_than(pd.Timestamp("20100101")),
                    pa.Check.less_than(pd.Timestamp("20200101")),
                ],
            ),
            "timedelta_column": pa.Column(
                pa.Timedelta,
                checks=[
                    pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
                    pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
                ],
            ),
            "optional_props_column": pa.Column(
                pa.String,
                nullable=True,
                allow_duplicates=True,
                coerce=True,
                required=False,
                regex=True,
                checks=[pa.Check.str_length(1, 3)],
            ),
        },
        index=index,
        coerce=False,
        strict=True,
    )
def test_get_dataframe_schema_statistics():
    """Test that dataframe schema statistics logic is correct."""
    schema = pa.DataFrameSchema(
        columns={
            "int": pa.Column(
                pa.Int,
                checks=[
                    pa.Check.greater_than_or_equal_to(0),
                    pa.Check.less_than_or_equal_to(100),
                ],
                nullable=True,
            ),
            "float": pa.Column(
                pa.Float,
                checks=[
                    pa.Check.greater_than_or_equal_to(50),
                    pa.Check.less_than_or_equal_to(100),
                ]),
            "str": pa.Column(
                pa.String,
                checks=[
                    pa.Check.isin(["foo", "bar", "baz"])
                ]
            ),
        },
        index=pa.Index(
            pa.Int,
            checks=pa.Check.greater_than_or_equal_to(0),
            nullable=False,
            name="int_index"
        )
    )
    # Expected statistics: explicit column/check options plus the
    # defaults (allow_duplicates=True, coerce=False, required=True,
    # regex=False) that get_dataframe_schema_statistics fills in.
    expectation = {
        "columns": {
            "int": {
                "pandas_dtype": pa.Int,
                "checks": {
                    "greater_than_or_equal_to": {"min_value": 0},
                    "less_than_or_equal_to": {"max_value": 100},
                },
                "nullable": True,
                "allow_duplicates": True,
                "coerce": False,
                "required": True,
                "regex": False,
            },
            "float": {
                "pandas_dtype": pa.Float,
                "checks": {
                    "greater_than_or_equal_to": {"min_value": 50},
                    "less_than_or_equal_to": {"max_value": 100},
                },
                "nullable": False,
                "allow_duplicates": True,
                "coerce": False,
                "required": True,
                "regex": False,
            },
            "str": {
                "pandas_dtype": pa.String,
                "checks": {"isin": {"allowed_values": ["foo", "bar", "baz"]}},
                "nullable": False,
                "allow_duplicates": True,
                "coerce": False,
                "required": True,
                "regex": False,
            },
        },
        # index statistics are reported as a list (one entry per level)
        "index": [
            {
                "pandas_dtype": pa.Int,
                "checks": {"greater_than_or_equal_to": {"min_value": 0}},
                "nullable": False,
                "coerce": False,
                "name": "int_index",
            }
        ],
        "coerce": False,
    }
    statistics = schema_statistics.get_dataframe_schema_statistics(schema)
    assert statistics == expectation
"checks": { "greater_than_or_equal_to": {"min_value": 0}, "less_than_or_equal_to": {"max_value": 100}, }, "name": None, "coerce": False, } @pytest.mark.parametrize("index_schema_component, expectation", [ [ pa.Index( pa.Int, checks=[ pa.Check.greater_than_or_equal_to(10), pa.Check.less_than_or_equal_to(20), ], nullable=False, name="int_index", ), [ { "pandas_dtype": pa.Int, "nullable": False, "checks": { "greater_than_or_equal_to": {"min_value": 10}, "less_than_or_equal_to": {"max_value": 20}, }, "name": "int_index", "coerce": False, }