示例#1
0
def test_pandas_extension_types():
    """Validate one sample series per pandas extension dtype.

    Every case is a ``(dtype, series, schema kwargs or None)`` triple and
    validation is expected to hand back a ``pd.Series`` for each of them.
    """
    # pylint: disable=no-member
    categorical_case = (
        pd.CategoricalDtype(),
        pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
        None,
    )
    datetime_tz_case = (
        pd.DatetimeTZDtype(tz='UTC'),
        pd.Series(
            pd.date_range(start="20200101", end="20200301"),
            dtype="datetime64[ns, utc]",
        ),
        None,
    )
    int64_case = (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None)
    string_case = (
        pd.StringDtype(),
        pd.Series(["foo", "bar", "baz"], dtype="string"),
        None,
    )
    period_case = (
        pd.PeriodDtype(freq='D'),
        pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
        None,
    )

    # keep only values below 5; everything else becomes NaN, hence the
    # nullable schema for the sparse case
    mostly_missing = pd.Series(range(100)).where(
        lambda s: s < 5, other=np.nan
    )
    sparse_case = (
        pd.SparseDtype("float"),
        mostly_missing.astype("Sparse[float]"),
        {"nullable": True},
    )
    boolean_case = (
        pd.BooleanDtype(),
        pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
        None,
    )
    interval_case = (
        pd.IntervalDtype(subtype="int64"),
        pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
        None,
    )

    all_cases = [
        categorical_case,
        datetime_tz_case,
        int64_case,
        string_case,
        period_case,
        sparse_case,
        boolean_case,
        interval_case,
    ]
    for extension_dtype, sample, extra_kwargs in all_cases:
        if extra_kwargs is None:
            extra_kwargs = {}
        schema = SeriesSchema(pandas_dtype=extension_dtype, **extra_kwargs)
        validated = schema.validate(sample)
        assert isinstance(validated, pd.Series)
示例#2
0
def test_series_schema_with_index(coerce):
    """Validate series whose schema carries an Index or a MultiIndex.

    :param coerce: forwarded as the ``coerce`` flag of every Index component.
    """
    # single-level index
    flat_series = pd.Series([1, 2, 3], index=[1, 2, 3])
    flat_schema = SeriesSchema(
        pandas_dtype=Int,
        index=Index(Int, coerce=coerce),
    )
    assert isinstance(flat_schema(flat_series), pd.Series)

    # two-level index: integers on the first level, strings on the second
    expected_index = pd.MultiIndex.from_arrays(
        [[0, 1, 2], ["foo", "bar", "foo"]],
    )
    level_schemas = [
        Index(Int, coerce=coerce),
        Index(String, coerce=coerce),
    ]
    nested_schema = SeriesSchema(
        pandas_dtype=Int,
        index=MultiIndex(level_schemas),
    )
    result = nested_schema(pd.Series([1, 2, 3], index=expected_index))
    assert isinstance(result, pd.Series)
    # validation must leave the index values untouched
    assert (result.index == expected_index).all()
示例#3
0
def test_vectorized_checks():
    """Exercise a vectorized (``element_wise=False``) check on a series.

    The check receives the whole series and requires every distinct value to
    occur exactly twice.
    """
    pairs_only = Check(lambda s: s.value_counts() == 2, element_wise=False)
    schema = SeriesSchema(Int, pairs_only)

    paired_values = pd.Series([1, 1, 2, 2, 3, 3])
    assert isinstance(schema.validate(paired_values), pd.Series)

    # each value occurs only once here, so validation has to fail
    unpaired_values = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(unpaired_values)
示例#4
0
def test_series_schema_multiple_validators():
    """Test a SeriesSchema that carries several checks at once.

    A series satisfying both checks validates; a series violating any one of
    them raises ``SchemaError``.
    """
    schema = SeriesSchema(PandasDtype.Int, [
        # the chained comparison only works on scalars, so the check is
        # declared element-wise explicitly rather than relying on the default
        Check(lambda x: 0 <= x <= 50, element_wise=True),
        Check(lambda s: (s == 21).any(), element_wise=False)
    ])
    validated_series = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(validated_series, pd.Series)

    # raise error if any of the validators fails (no element equals 21)
    with pytest.raises(SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
示例#5
0
def test_vectorized_checks() -> None:
    """Test that a vectorized check validates and errors as expected.

    The check is created with ``element_wise=False`` and requires every
    distinct value of the series to occur exactly twice.
    """
    twice_each = Check(lambda s: s.value_counts() == 2, element_wise=False)
    schema = SeriesSchema(Int, twice_each)

    good_series = pd.Series([1, 1, 2, 2, 3, 3])
    result = schema.validate(good_series)
    assert isinstance(result, pd.Series)

    # error case: every value appears only once
    bad_series = pd.Series([1, 2, 3])
    with pytest.raises(errors.SchemaError):
        schema.validate(bad_series)
示例#6
0
def test_raise_warning_series():
    """A failing check warns instead of raising when raise_warning=True."""
    negatives = pd.Series([-1, -2, -3])

    # same positivity check twice: once strict, once downgraded to a warning
    strict_check = Check(lambda s: s > 0)
    lenient_check = Check(lambda s: s > 0, raise_warning=True)
    strict_schema = SeriesSchema(checks=strict_check)
    lenient_schema = SeriesSchema(checks=lenient_check)

    with pytest.raises(errors.SchemaError):
        strict_schema(negatives)

    with pytest.warns(UserWarning):
        lenient_schema(negatives)
示例#7
0
def test_no_dtype_series():
    """Test nullability handling for a SeriesSchema without a dtype.

    A non-nullable schema accepts a series without nulls, a nullable schema
    accepts a series containing ``None``, and a non-nullable schema rejects
    a series containing ``None`` with ``SchemaError``.
    """
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    # build the schema outside of pytest.raises so that only the validate
    # call can satisfy the expected SchemaError
    schema = SeriesSchema(nullable=False)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
示例#8
0
 def setup(self):
     """Create the string schema and the sample series kept on ``self``."""
     series_name = "my_series"
     string_checks = [
         Check(lambda s: s.str.startswith("foo")),
         Check(lambda s: s.str.endswith("bar")),
         # the only element-wise check: applied to each string separately
         Check(lambda x: len(x) > 3, element_wise=True),
     ]
     self.schema = SeriesSchema(
         String,
         checks=string_checks,
         nullable=False,
         allow_duplicates=True,
         name=series_name,
     )
     self.series = pd.Series(["foobar"] * 3, name=series_name)
示例#9
0
def test_series_schema_multiple_validators():
    """Test a SeriesSchema holding several Checks, for both the passing case
    and the case where one of the checks is expected to fail."""
    within_bounds = Check(lambda x: 0 <= x <= 50, element_wise=True)
    contains_21 = Check(lambda s: (s == 21).any())
    schema = SeriesSchema(Int, [within_bounds, contains_21])

    passing = pd.Series([1, 5, 21, 50])
    assert isinstance(schema.validate(passing), pd.Series)

    # all values are within bounds but none equals 21, so one check fails
    failing = pd.Series([1, 5, 20, 50])
    with pytest.raises(errors.SchemaError):
        schema.validate(failing)
示例#10
0
def test_no_dtype_series():
    """Test how nullability is handled in SeriesSchemas where no type is
    specified.

    A non-nullable schema accepts a series without nulls, a nullable schema
    accepts a series containing ``None``, and a non-nullable schema rejects
    a series containing ``None`` with ``SchemaError``.
    """
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    # build the schema outside of pytest.raises so that only the validate
    # call can satisfy the expected SchemaError
    schema = SeriesSchema(nullable=False)
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
示例#11
0
def test_schema_equality_operators():
    """Exercise ``==`` and ``!=`` on DataFrameSchema, SeriesSchema and
    SeriesSchemaBase objects."""
    # two dataframe schemas with the same columns declared in opposite order
    frame_schema = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda s: s >= 2)),
        },
        strict=True,
    )
    frame_schema_reordered = DataFrameSchema(
        {
            "col2": Column(String, Check(lambda s: s >= 2)),
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )
    # a schema that differs from every other schema built in this test
    different_schema = DataFrameSchema({"col1": Column(String)}, strict=False)

    named_series_schema = SeriesSchema(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    named_series_schema_base = SeriesSchemaBase(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )

    # dataframe schemas: copies and reordered columns compare equal
    assert frame_schema == copy.deepcopy(frame_schema)
    assert frame_schema != different_schema
    assert frame_schema == frame_schema_reordered

    # series schemas: equal to their own deep copy, unequal to the other
    assert named_series_schema == copy.deepcopy(named_series_schema)
    assert named_series_schema != different_schema
    assert named_series_schema_base == copy.deepcopy(named_series_schema_base)
    assert named_series_schema_base != different_schema
示例#12
0
def test_series_schema_pdtype(pdtype):
    """The ``pdtype`` property of a SeriesSchema should be a PandasDtype.

    The schema is built from the enum member, its string alias and its
    value; all three spellings must resolve to the same ``pdtype``.
    """
    if pdtype is None:
        assert SeriesSchema(pdtype).pdtype is None
        return

    # with LEGACY_PANDAS set, STRING is expected to come back as String
    if pdtype is PandasDtype.STRING and LEGACY_PANDAS:
        expected = PandasDtype.String
    else:
        expected = pdtype

    for dtype_spec in (pdtype, pdtype.str_alias, pdtype.value):
        assert SeriesSchema(dtype_spec).pdtype == expected
示例#13
0
def test_series_schema_checks():
    """The ``checks`` property is always a list of the expected length."""
    # (schema, number of checks it was built with)
    schemas_and_counts = [
        (SeriesSchema(), 0),
        (SeriesSchema(checks=Check.eq(0)), 1),
        (SeriesSchema(checks=[Check.gt(0), Check.lt(100)]), 2),
    ]

    for schema, _ in schemas_and_counts:
        assert isinstance(schema.checks, list)

    for schema, expected_count in schemas_and_counts:
        assert len(schema.checks) == expected_count
示例#14
0
def test_series_strategy_undefined_check_strategy(schema: pa.SeriesSchema,
                                                  warning: str, data) -> None:
    """Building a strategy for a check without one warns but still works.

    The drawn example is validated against the schema that produced it.
    """
    expected_message = f"{warning} check doesn't have a defined strategy"
    with pytest.warns(UserWarning, match=expected_message):
        series_strategy = schema.strategy(size=5)
    drawn_series = data.draw(series_strategy)
    schema(drawn_series)
示例#15
0
class Validate:
    """
    Benchmarking Series schema.validate

    The ``time_``, ``mem_`` and ``peakmem_`` method prefixes match the
    airspeed velocity (asv) benchmark naming convention.
    """

    def setup(self):
        """Create the schema and the series validated by every benchmark."""
        self.schema = SeriesSchema(
            String,
            checks=[
                Check(lambda s: s.str.startswith("foo")),
                Check(lambda s: s.str.endswith("bar")),
                Check(lambda x: len(x) > 3, element_wise=True),
            ],
            nullable=False,
            allow_duplicates=True,
            name="my_series",
        )
        self.series = pd.Series(["foobar", "foobar", "foobar"],
                                name="my_series")

    def time_series_schema(self):
        """Run one validation of the sample series."""
        self.schema.validate(self.series)

    def mem_series_schema(self):
        """Return the validated series so its size is what gets measured."""
        # asv ``mem_*`` benchmarks report the size of the returned object;
        # without a return value only ``None`` would be measured.
        return self.schema.validate(self.series)

    def peakmem_series_schema(self):
        """Run one validation of the sample series."""
        self.schema.validate(self.series)
示例#16
0
def test_series_schema():
    """Test SeriesSchema validation of an integer series.

    Covers the passing case, values that fail the range check or the dtype
    (``SchemaError``), and inputs that are not a ``pd.Series`` at all
    (``TypeError``).
    """
    # the chained comparison only works on scalars, so the check is declared
    # element-wise explicitly rather than relying on the default
    schema = SeriesSchema(
        PandasDtype.Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    validated_series = schema.validate(pd.Series([0, 30, 50, 100]))
    assert isinstance(validated_series, pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(SchemaError):
            schema.validate(pd.Series([data]))

    # non-Series inputs: pass each sample value itself (the loop variable was
    # previously ignored and the TypeError class was validated instead)
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            schema.validate(data)
示例#17
0
@pytest.mark.parametrize(
    "indexes",
    [
        # a single un-named index
        [Index(int)],
        # un-named index in last position
        [Index(int, name="a"), Index(int)],
        # un-named index in first position
        [Index(int), Index(int, name="a")],
    ],
)
def test_multiindex_unordered_init_exception(indexes):
    """An unordered MultiIndex containing an un-named index cannot be
    created."""
    expected_error = errors.SchemaInitError
    with pytest.raises(expected_error):
        MultiIndex(indexes, ordered=False)


@pytest.mark.parametrize(
    "indexes",
    [
        # lists holding something other than Index objects
        [Column(int)],
        [Column(int, name="a"), Index(int)],
        [Index(int), Column(int, name="a")],
        [SeriesSchema(int)],
        # inputs that are not lists at all
        1,
        1.0,
        "foo",
    ],
)
def test_multiindex_incorrect_input(indexes):
    """MultiIndex rejects input that is not a list of Index objects.

    Either ``SchemaInitError`` or ``TypeError`` is accepted as the failure.
    """
    accepted_errors = (errors.SchemaInitError, TypeError)
    with pytest.raises(accepted_errors):
        MultiIndex(indexes)
示例#18
0
def test_series_schema():
    """Tests that a SeriesSchema Check behaves as expected for integers and
    strings. Tests error cases for types, duplicates, name errors, and issues
    around float and integer handling of nulls"""
    int_schema = SeriesSchema(
        Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    assert isinstance(int_schema.validate(
        pd.Series([0, 30, 50, 100])), pd.Series)

    str_schema = SeriesSchema(
        String, Check(lambda s: s.isin(["foo", "bar", "baz"])),
        nullable=True, coerce=True)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", None])), pd.Series)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", np.nan])), pd.Series)

    # error cases: values failing the range check or the integer dtype
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(errors.SchemaError):
            int_schema.validate(pd.Series([data]))

    # non-Series inputs: pass each sample value itself (the loop variable was
    # previously ignored and the TypeError class was validated instead)
    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            int_schema.validate(data)

    non_duplicate_schema = SeriesSchema(
        Int, allow_duplicates=False)
    with pytest.raises(errors.SchemaError):
        non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))

    # when series name doesn't match schema
    named_schema = SeriesSchema(Int, name="my_series")
    with pytest.raises(
            errors.SchemaError,
            match=r"^Expected .+ to have name"):
        named_schema.validate(pd.Series(range(5), name="your_series"))

    # when series floats are declared to be integer
    with pytest.raises(
            errors.SchemaError,
            match=r"^after dropping null values, expected values in series"):
        SeriesSchema(Int, nullable=True).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable
    with pytest.raises(
            errors.SchemaError,
            match=r"^non-nullable series .+ contains null values"):
        SeriesSchema(Float, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable in addition
    # to having the wrong data type
    with pytest.raises(
            errors.SchemaError,
            match=(
                r"^expected series '.+' to have type .+, got .+ and "
                "non-nullable series contains null values")):
        SeriesSchema(Int, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))
示例#19
0
def test_series_schema_dtype_property(pandas_dtype, expected):
    """The ``dtype`` property of a SeriesSchema built from ``pandas_dtype``
    equals ``expected``."""
    schema = SeriesSchema(pandas_dtype)
    actual = schema.dtype
    assert actual == expected
示例#20
0
def test_lazy_dataframe_scalar_false_check(schema_cls, data):
    """Lazy validation copes with a check that returns a scalar ``False``.

    :param schema_cls: schema class instantiated with the failing check.
    :param data: object validated lazily against that schema.
    """
    # a check that always returns a scalar False value, whatever the input
    always_false = Check(
        check_fn=lambda _: False,
        element_wise=False,
        error="failing check",
    )
    failing_schema = schema_cls(checks=always_false)
    with pytest.raises(errors.SchemaErrors):
        failing_schema(data, lazy=True)


@pytest.mark.parametrize(
    "schema, data, expectation",
    [
        [
            SeriesSchema(Int, checks=Check.greater_than(0)),
            pd.Series(["a", "b", "c"]),
            {
                "data": pd.Series(["a", "b", "c"]),
                "schema_errors": {
                    # schema object context -> check failure cases
                    "SeriesSchema": {
                        # check name -> failure cases
                        "greater_than(0)": [
                            "TypeError(\"'>' not supported between instances of "
                            "'str' and 'int'\")",
                            # TypeError raised in python=3.5
                            'TypeError("unorderable types: str() > int()")',
                        ],
                        "pandas_dtype('int64')": ["object"],
                    },