def validate_dataframe(dataframe: DataFrame) -> bool:
    """Validate dataframe against schema.

    Builds the expected ``DataFrameSchema`` (column dtypes, per-column
    checks and two dataframe-wide consistency checks) and validates
    ``dataframe`` against it.

    Args:
        dataframe: The dataframe to validate.

    Returns:
        ``True`` if validation succeeds. ``False`` if pandera raises a
        ``SchemaError``; the error's first argument is logged as a warning.
    """
    schema = DataFrameSchema(
        {
            "source": Column(pa.String),
            "topic": Column(pa.String, nullable=True),
            "concept": Column(pa.String, nullable=True),
            "variable": Column(pa.String),
            "label": Column(pa.String),
            "value": Column(pa.Float),
            "denominator_variable": Column(pa.String, nullable=True),
            "denominator_label": Column(pa.String, nullable=True),
            "denominator": Column(pa.Float, nullable=True),
            "year": Column(
                pa.Int,
                checks=[
                    # Upper bound is the current year at the time of the call.
                    Check.less_than_or_equal_to(dt.now().year),
                    Check.greater_than_or_equal_to(2000),
                ],
            ),
            "year_date": Column(
                pa.String,
                checks=[Check(validate_year_date, element_wise=True)],
            ),
            "geo_id": Column(pa.String),
            "geo_name": Column(pa.String),
            "geo_type": Column(pa.String),
            "location": Column(
                pa.String,
                checks=[Check(validate_location, element_wise=True)],
            ),
            "row_id": Column(pa.String, allow_duplicates=False),
        },
        strict=True,
        coerce=True,
        checks=[
            # Check that year_date and year fields are aligned.
            # Compare whole columns: `.str[:4]` takes the first four characters
            # of every year_date value. The previous row-wise form called
            # `.astype(str)` on the scalar `row["year"]`, which is not reliably
            # available on a scalar pulled out of an object-dtype row.
            Check(lambda df: df["year_date"].str[:4] == df["year"].astype(str)),
            # Check that row_id field concatenates other identifying fields as expected
            Check(lambda df: df["row_id"] == df.apply(make_row_id, axis=1)),
        ],
    )

    # Validate dataframe against schema
    try:
        schema.validate(dataframe)
    except SchemaError as error:
        logger.warning(f"Failed to validate dataframe: {error.args[0]}")
        return False
    else:
        return True
"Connections per Trace": Column( pandas_dtype=PandasDtype.Float64, checks=[ Check.greater_than_or_equal_to(min_value=0.0), ], nullable=False, allow_duplicates=True, coerce=True, required=True, regex=False, ), "Connections per Branch": Column( pandas_dtype=PandasDtype.Float64, checks=[ Check.greater_than_or_equal_to(min_value=0.0), Check.less_than_or_equal_to(max_value=2.0), ], nullable=False, allow_duplicates=True, coerce=True, required=True, regex=False, ), "Fracture Intensity (Mauldon)": Column( pandas_dtype=PandasDtype.Float64, checks=[ Check.greater_than_or_equal_to(min_value=0.0), ], nullable=False, allow_duplicates=True, coerce=True,
# %% [markdown] slideshow={"slide_type": "skip"}
# ## Pandera Basics
#
# ### Step 1: Define a `DataFrameSchema`

# %% slideshow={"slide_type": "skip"}
import pandera as pa
from pandera import Column, Check

# Both columns are declared as floats and may contain nulls; `coerce=True`
# is set on the schema as a whole.
schema = pa.DataFrameSchema(
    {
        # hours_worked must lie in the closed range [0, 60].
        "hours_worked": Column(
            pa.Float,
            checks=[
                Check.greater_than_or_equal_to(min_value=0),
                Check.less_than_or_equal_to(max_value=60),
            ],
            nullable=True,
        ),
        # wage_per_hour must be at least 15.
        "wage_per_hour": Column(
            pa.Float,
            checks=Check.greater_than_or_equal_to(min_value=15),
            nullable=True,
        ),
    },
    coerce=True,
)

# %% [markdown] slideshow={"slide_type": "skip"}
# ### Step 2: Call the `schema` on some data

# %% slideshow={"slide_type": "skip"}
import pandas as pd