def sample_dataframe_schema(**kwargs):
    """Build a sample ``pa.DataFrameSchema`` with column- and frame-level checks.

    Any extra keyword arguments are forwarded to the ``DataFrameSchema``
    constructor.
    """
    columns = {
        "a": pa.Column(int, checks=pa.Check.le(10), description="a desc"),
        "b": pa.Column(float, checks=pa.Check.lt(-1.2), description="b desc"),
        "c": pa.Column(
            str,
            description="c desc",
            checks=[
                pa.Check.str_startswith("value_"),
                pa.Check(
                    lambda s: s.str.split("_", expand=True).shape[1] == 2,
                    description="Two words separated by underscore",
                ),
            ],
        ),
    }
    # Frame-level check comparing aggregate column values.
    frame_checks = [
        pa.Check(
            lambda df: df["a"].sum() > df["b"].sum(),
            description="sum(a) > sum(b)",
        ),
    ]
    return pa.DataFrameSchema(columns, checks=frame_checks, **kwargs)
# --- Example #2 (snippet separator) ---
def test_to_script_lambda_check():
    """Test writing DataFrameSchema to a script with lambda check."""
    # Column-level lambda check: serialization should emit a warning.
    column_check_schema = pandera.DataFrameSchema(
        {
            "a": pandera.Column(
                pandera.Int,
                checks=pandera.Check(lambda s: s.mean() > 5,
                                     element_wise=False),
            ),
        }
    )
    with pytest.warns(UserWarning):
        pandera.io.to_script(column_check_schema)

    # DataFrame-level lambda check: warning should mention registered checks.
    frame_check_schema = pandera.DataFrameSchema(
        {"a": pandera.Column(pandera.Int)},
        checks=pandera.Check(lambda s: s.mean() > 5, element_wise=False),
    )
    with pytest.warns(UserWarning, match=".*registered checks.*"):
        pandera.io.to_script(frame_check_schema)
 def test_csv_download(self):
     """Download the CSV and validate its expected columns and types."""
     df = data_import.download_csv()
     # Schema for the downloaded CSV; most numeric-looking columns arrive
     # as strings and are validated as pa.String here.
     schema_csv_download = pa.DataFrameSchema({
         'name':
         pa.Column(pa.String),
         'energy,calculated (kJ)':
         pa.Column(
             pa.Int,
             # NOTE(review): the check allows [0, 4000] but the error
             # message says "[0, 2000]" — confirm which bound is intended.
             pa.Check(lambda x: 0 <= x <= 4000,
                      element_wise=True,
                      error="kJ range checker [0, 2000]")),
         'fat, total (g)':
         pa.Column(pa.String),
         'carbohydrate, available (g)':
         pa.Column(pa.String),
         'protein, total (g)':
         pa.Column(pa.String),
         # 'fibre, total (g)': pa.Column(), # can have NaN values
         'sugars, total (g)':
         pa.Column(pa.String),
         'alcohol (g)':
         pa.Column(pa.String),
         # 'sodium (mg)': pa.Column(), # can have NaN values
         'salt (mg)':
         pa.Column(pa.String),
     })
     df_valid = schema_csv_download.validate(df)
     # Sanity check: presumably the downloaded CSV has more than 1000
     # rows, so index 1000 should survive validation — TODO confirm.
     self.assertTrue(1000 in df_valid.index)
# --- Example #4 (snippet separator) ---
def test_to_yaml_lambda_check():
    """Test writing DataFrameSchema to a yaml with lambda check."""
    lambda_check = pa.Check(lambda s: s.mean() > 5, element_wise=False)
    schema = pa.DataFrameSchema(
        {"a": pa.Column(pa.Int, checks=lambda_check)}
    )

    # Un-serializable lambda checks should trigger a warning.
    with pytest.warns(UserWarning):
        pa.io.to_yaml(schema)
# --- Example #5 (snippet separator) ---
def test_to_yaml_custom_dataframe_check():
    """Test that writing a schema with an unregistered check warns.

    Note: serializing a custom (unregistered) dataframe-level check does
    not raise — pandera emits a ``UserWarning`` and skips the check, which
    is what this test asserts. The original docstring incorrectly said
    "raises".
    """
    schema = pa.DataFrameSchema(
        {"a": pa.Column(pa.Int)},
        checks=[pa.Check(lambda obj: len(obj.index) > 1)],
    )

    with pytest.warns(UserWarning, match=".*registered checks.*"):
        pa.io.to_yaml(schema)
# --- Example #6 (snippet separator) ---
def test_infer_series_schema(series):
    """Test series schema is correctly inferred."""
    inferred = schema_inference.infer_series_schema(series)
    assert isinstance(inferred, pa.SeriesSchema)

    # Validating an unmodified inferred schema should emit a warning.
    with pytest.warns(
            UserWarning,
            match="^This .+ is an inferred schema that hasn't been modified"):
        inferred.validate(series)

    # set_checks returns a new schema with _is_inferred reset to False,
    # leaving the original inferred schema untouched.
    modified = inferred.set_checks([pa.Check(lambda x: x is not None)])
    assert inferred._is_inferred
    assert not modified._is_inferred
    assert isinstance(modified.validate(series), pd.Series)
# --- Example #7 (snippet separator) ---
def prioritized_values_check(named_priorities: Dict[str, int], separator: str,
                             name: str) -> pa.checks.Check:
    """
    Construct check for e.g. data source and scale columns.

    Both have fixed values and fixed order of the values.
    """
    def _validate(value):
        # Delegate the per-value validation to the shared schema check.
        return schema_checks.named_priority_check(
            value,
            named_priorities=named_priorities,
            separator=separator,
        )

    return pa.Check(_validate, element_wise=True, name=name)
def test_custom_checks(custom_check_teardown):
    """Test that custom checks can be executed."""
    @extensions.register_check_method(statistics=["value"])
    def modin_eq(modin_obj, *, value):
        return modin_obj == value

    custom_schema = pa.DataFrameSchema(
        {"field": pa.Column(checks=pa.Check(lambda s: s == 0, name="custom"))})

    custom_registered_schema = pa.DataFrameSchema(
        {"field": pa.Column(checks=pa.Check.modin_eq(0))})

    for schema in (custom_schema, custom_registered_schema):
        # Valid data passes validation.
        schema(mpd.DataFrame({"field": [0] * 100}))

        # BUG FIX: the original try/except silently passed when no
        # SchemaError was raised; assert that validation actually fails
        # on invalid data.
        raised = False
        try:
            schema(mpd.DataFrame({"field": [-1] * 100}))
        except pa.errors.SchemaError as err:
            raised = True
            assert (err.failure_cases["failure_case"] == -1).all()
        assert raised, "expected SchemaError for invalid data"
# --- Example #9 (snippet separator) ---
    )
    example = data.draw(strat)
    if nullable:
        assert example.isna().any(axis=None)
    else:
        assert example.notna().all(axis=None)


@pytest.mark.parametrize(
    "schema, warning",
    [
        [
            pa.SeriesSchema(
                pa.Int,
                checks=[
                    pa.Check(lambda x: x > 0, element_wise=True),
                    pa.Check(lambda x: x > -10, element_wise=True),
                ],
            ),
            "Element-wise",
        ],
        [
            pa.SeriesSchema(
                pa.Int,
                checks=[
                    pa.Check(lambda s: s > -10000),
                    pa.Check(lambda s: s > -9999),
                ],
            ),
            "Vectorized",
        ],
import titanic.datalake as datalake
from titanic.config import Config, parse_args

schema = pa.DataFrameSchema(
    {
        "PassengerId":
        pa.Column(int),
        "Survived":
        pa.Column(int, checks=pa.Check.isin([0, 1])),
        "Pclass":
        pa.Column(int, checks=pa.Check.isin([0, 1, 2, 3])),
        "Name":
        pa.Column(str),
        "Sex":
        pa.Column(str, checks=pa.Check(lambda s: s.isin(["male", "female"]))),
        "Age":
        pa.Column(float,
                  checks=pa.Check.less_than(100, ignore_na=True),
                  nullable=True),
        "SibSp":
        pa.Column(int),
        "Parch":
        pa.Column(int),
        "Ticket":
        pa.Column(str),
        "Fare":
        pa.Column(float),
        "Cabin":
        pa.Column(str, nullable=True),
        "Embarked":
# --- Example #11 (snippet separator) ---
def traces_schema(metadata: rules.Metadata):
    """
    Get pandera schema for traces GeoDataFrame.

    Builds per-column checks from ``metadata`` (allowed value sets,
    priority orderings and lineament id prefixes) and returns a
    ``DataFrameSchema`` covering the trace columns plus a required
    ``geometry`` column.
    """
    trace_columns: Dict[str, pa.Column] = {
        # Free-form validation error text; optional column.
        VALIDATION_ERROR_COLUMN:
        pa.Column(pa.String, **default_non_required_kwargs()),
        # Dip angle must lie within [0, 90] degrees.
        DIP_COLUMN:
        pa.Column(
            pa.Float,
            **default_non_required_kwargs(),
            checks=[pa.checks.Check.in_range(min_value=0.0, max_value=90.0)],
        ),
        # Dip direction must lie within [0, 360] degrees.
        DIP_DIR_COLUMN:
        pa.Column(
            pa.Float,
            **default_non_required_kwargs(),
            checks=[pa.checks.Check.in_range(min_value=0.0, max_value=360.0)],
        ),
        # Data source values are restricted to the metadata-defined set
        # and must respect its priority ordering.
        DATA_SOURCE_COLUMN:
        pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[
                prioritized_values_check(
                    named_priorities=metadata.data_source.order,
                    separator=metadata.data_source.separator,
                    name=
                    f"Value and priority order check for {DATA_SOURCE_COLUMN}.",
                )
            ],
        ),
        # Dates are validated element-wise by the shared datetime check.
        DATE_COLUMN:
        pa.Column(
            pa.DateTime,
            **default_non_required_kwargs(nullable=False),
            checks=[
                pa.Check(schema_checks.date_datetime_check, element_wise=True)
            ],
        ),
        # Operator must be one of the metadata-listed operators.
        OPERATOR_COLUMN:
        pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[pa.Check.isin(metadata.operators)],
        ),
        # Scale values follow the metadata-defined set and priority order.
        SCALE_COLUMN:
        pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[
                prioritized_values_check(
                    named_priorities=metadata.scale.order,
                    separator=metadata.scale.separator,
                    name=f"Value and priority order check for {SCALE_COLUMN}.",
                )
            ],
        ),
        # Certainty must be one of the metadata-listed certainty values.
        CERTAINTY_COLUMN:
        pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[pa.Check.isin(metadata.certainty)],
        ),
        # Lineament ids must be unique (allow_duplicates=False) and match
        # the metadata-defined id prefixes.
        LINEAMENT_ID_COLUMN:
        pa.Column(
            pa.String,
            **default_non_required_kwargs(nullable=False),
            checks=[
                pa.Check(
                    lambda raw_value: schema_checks.lineament_id_check(
                        raw_value=raw_value,
                        lineament_id_prefixes=metadata.lineament_id_prefixes,
                    ),
                    element_wise=True,
                    name=f"{LINEAMENT_ID_COLUMN} check.",
                )
            ],
            allow_duplicates=False,
        ),
    }
    # The geometry column is always required; trace columns are merged in.
    return pa.DataFrameSchema(
        index=pa.Index(pa.Int),
        columns={
            "geometry": pa.Column(required=True, ),
            **trace_columns,
        },
    )
# --- Example #12 (snippet separator) ---
import pandera as pa
import pandas as pd

# Load the user records to validate.
data = pd.read_json('../user.json')
print(data)

# Defining the schema
schema = pa.DataFrameSchema({
    # BUG FIX: the original dict was missing a comma after this entry,
    # which was a SyntaxError.
    "email": pa.Column(pa.String, nullable=False),
    "books": pa.Column(pa.String, nullable=True),
    # BUG FIX: ``pa.Check(len <= 120)`` compared the builtin ``len``
    # function to an int (TypeError); check each title's string length
    # instead.
    "title": pa.Column(
        pa.String,
        pa.Check(lambda s: s.str.len() <= 120),
        nullable=True,
    ),
    "isbn": pa.Column(pa.String, nullable=False),
})

# Validating the data
# BUG FIX: the original called validate(data_sample) but only ``data``
# is defined above.
schema.validate(data)
# --- Example #13 (snippet separator) ---
    def test_add_necessary_columns(self):
        """Verify add_necessary_columns derives the expected columns."""
        raw_row = {
            'name': [
                'Tämä on Ruuan kategoria, ja tämä tuotemerkki Beef, ja tää on detail'
            ],
            'energy,calculated (kJ)': [123],
            'fat, total (g)': ['3.4'],
            'carbohydrate, available (g)': ['58.8'],
            'protein, total (g)': ['<0.1'],
            'sugars, total (g)': ['1.3'],
            'fibre, total (g)': ['11.5'],
            'alcohol (g)': ['0.0'],
            'sodium (mg)': ['87.8'],
            'salt (mg)': ['470.1'],
            'lactose (g)': ['2.1'],
        }
        df_test = pd.DataFrame(raw_row)
        df = data_import.add_necessary_columns(df_test)

        # All derived numeric columns share one spec: non-negative float.
        numeric_columns = [
            'kcal', 'fat_kcal', 'carb_kcal', 'protein_kcal', 'alc_kcal',
            'sugar', 'fibre', 'alc', 'sodium', 'salt', 'kcal_ratio',
            'lactose',
        ]
        schema_added_columns = pa.DataFrameSchema({
            'category': pa.Column(pa.String),
            'extra_category': pa.Column(pa.String),
            **{
                column: pa.Column(pa.Float, pa.Check(lambda s: s >= 0))
                for column in numeric_columns
            },
        })

        df_valid = schema_added_columns.validate(df)

        # kJ -> kcal conversion factor is 4.184.
        self.assertEqual(df_valid['kcal'][0],
                         df_test['energy,calculated (kJ)'][0] / 4.184)
        self.assertEqual(df_valid['salt'][0], 470.1)
        self.assertEqual(df_valid['category'][0], 'Tämä on Ruuan kategoria')
        self.assertEqual(df_valid['extra_category'][0], 'beef')