def test_get_series_schema_statistics():
    """Test that series schema statistics logic is correct."""
    value_range_checks = [
        pa.Check.greater_than_or_equal_to(0),
        pa.Check.less_than_or_equal_to(100),
    ]
    schema = pa.SeriesSchema(int, nullable=False, checks=value_range_checks)

    statistics = schema_statistics.get_series_schema_statistics(schema)

    # every schema property must round-trip into the statistics dict
    expected = {
        "dtype": pandas_engine.Engine.dtype(int),
        "nullable": False,
        "checks": {
            "greater_than_or_equal_to": {"min_value": 0},
            "less_than_or_equal_to": {"max_value": 100},
        },
        "name": None,
        "coerce": False,
    }
    assert statistics == expected
def test_unique():
    """Test uniqueness checks on modin dataframes."""
    df_schema = pa.DataFrameSchema({"field": pa.Column(int)}, unique=["field"])
    col_schema = pa.Column(int, unique=True, name="field")
    srs_schema = pa.SeriesSchema(int, unique=True, name="field")

    unique_data = mpd.DataFrame({"field": [1, 2, 3]})
    duplicated_data = mpd.DataFrame({"field": [1, 1, 1]})

    # unique data validates cleanly through all three schema kinds
    assert isinstance(df_schema(unique_data), mpd.DataFrame)
    assert isinstance(col_schema(unique_data), mpd.DataFrame)
    assert isinstance(srs_schema(unique_data["field"]), mpd.Series)

    # duplicated data must be rejected
    with pytest.raises(pa.errors.SchemaError, match="columns .+ not unique"):
        df_schema(duplicated_data)

    duplicate_match = "series .+ contains duplicate values"
    with pytest.raises(pa.errors.SchemaError, match=duplicate_match):
        col_schema(duplicated_data)
    with pytest.raises(pa.errors.SchemaError, match=duplicate_match):
        srs_schema(duplicated_data["field"])

    # disabling uniqueness lets the duplicated data pass again
    df_schema.unique = None
    col_schema.unique = False
    srs_schema.unique = False
    assert isinstance(df_schema(duplicated_data), mpd.DataFrame)
    assert isinstance(col_schema(duplicated_data), mpd.DataFrame)
    assert isinstance(srs_schema(duplicated_data["field"]), mpd.Series)
def test_seriesschema():
    """Test that SeriesSchemaBase is compatible with pydantic."""
    # constructing the pydantic model validates all three schema fields
    pydantic_model = SeriesSchemaPydantic(
        pa_series_schema=pa.SeriesSchema(),
        pa_column=pa.Column(),
        pa_index=pa.Index(),
    )
    assert isinstance(pydantic_model, SeriesSchemaPydantic)
def test_series_schema() -> None:
    """
    Test that SeriesSchema based pandera validation works with Dask Series.
    """
    int_schema = pa.SeriesSchema(int)
    str_schema = pa.SeriesSchema(str)

    pandas_series = pd.Series(["1"])
    dask_series = dd.from_pandas(pandas_series, npartitions=1)

    # matching schema: validation is lazy and compute succeeds unchanged
    dask_series = str_schema.validate(dask_series)
    pd.testing.assert_series_equal(pandas_series, dask_series.compute())

    # mismatching schema: the error only surfaces at compute time
    dask_series = int_schema.validate(dask_series)
    with pytest.raises(pa.errors.SchemaError):
        dask_series.compute()

    # in-place validation behaves the same way
    int_schema.validate(dask_series, inplace=True)
    with pytest.raises(pa.errors.SchemaError):
        dask_series.compute()
def test_series_example():
    """Test SeriesSchema example method generate examples that pass."""
    positive_int_schema = pa.SeriesSchema(pa.Int, pa.Check.gt(0))
    # examples are randomized, so sample several and validate each one
    for _attempt in range(10):
        sampled = positive_int_schema.example()
        positive_int_schema(sampled)
def test_series_strategy(data):
    """Test SeriesSchema strategy."""
    schema = pa.SeriesSchema(pa.Int, pa.Check.gt(0))
    # a series drawn from the schema's own strategy must validate
    drawn = data.draw(schema.strategy())
    schema(drawn)
) example = data.draw(strat) if nullable: assert example.isna().any(axis=None) else: assert example.notna().all(axis=None) @pytest.mark.parametrize( "schema, warning", [ [ pa.SeriesSchema( pa.Int, checks=[ pa.Check(lambda x: x > 0, element_wise=True), pa.Check(lambda x: x > -10, element_wise=True), ], ), "Element-wise", ], [ pa.SeriesSchema( pa.Int, checks=[ pa.Check(lambda s: s > -10000), pa.Check(lambda s: s > -9999), ], ), "Vectorized", ],
import pandas as pd
import pytest

import pandera as pa


@pytest.mark.parametrize(
    "schema1, schema2, data",
    [
        (
            pa.DataFrameSchema({"col": pa.Column(int)}, coerce=True),
            pa.DataFrameSchema({"col": pa.Column(float)}, coerce=True),
            pd.DataFrame({"col": [1, 2, 3]}),
        ),
        (
            pa.SeriesSchema(int, coerce=True),
            pa.SeriesSchema(float, coerce=True),
            pd.Series([1, 2, 3]),
        ),
    ],
)
@pytest.mark.parametrize("inplace", [False, True])
def test_dataframe_series_add_schema(schema1, schema2, data, inplace):
    """
    Test that pandas object contains schema metadata after pandera validation.
    """
    validated = schema1(data, inplace=inplace)
    if inplace:
        # in-place validation attaches the schema to the original object
        assert data.pandera.schema == schema1
    else:
        # otherwise the original input object is left untouched
        assert data.pandera.schema is None
import pandera as pa


def _datetime_indexed_series(dtype, name):
    """Build a series schema of the given dtype, indexed by datetime."""
    return pa.SeriesSchema(dtype, index=pa.Index(pa.DateTime), name=name)


weight_series = _datetime_indexed_series(pa.Float64, "weight")
consumption_series = _datetime_indexed_series(pa.Float64, "consumption")
servings_series = _datetime_indexed_series(pa.Int, "servings")
probabilistic_mental_illness_schema(fatal_encounters_clean); # %% [markdown] slideshow={"slide_type": "slide"} # ### Prepare Training and Test Sets # # For functions that have tuple/list-like output, specify an integer # index `pa.check_output(schema, <int>)` to apply the schema to a # specific element in the output. # %% from sklearn.model_selection import train_test_split target_schema = pa.SeriesSchema( pa.Bool, name="disposition_accidental", checks=Hypothesis.one_sample_ttest( popmean=0.0275, relationship="equal", alpha=0.01 ) ) feature_schema = training_data_schema.remove_columns([target_schema.name]) @pa.check_input(training_data_schema) @pa.check_output(feature_schema, 0) @pa.check_output(feature_schema, 1) @pa.check_output(target_schema, 2) @pa.check_output(target_schema, 3) def split_training_data(fatal_encounters_clean): return train_test_split( fatal_encounters_clean[list(feature_schema.columns)], fatal_encounters_clean[target_schema.name],