Пример #1
0
def test_vectorized_checks():
    schema = SeriesSchema(
        Int, Check(lambda s: s.value_counts() == 2, element_wise=False))
    validated_series = schema.validate(pd.Series([1, 1, 2, 2, 3, 3]))
    assert isinstance(validated_series, pd.Series)

    # error case
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([1, 2, 3]))
Пример #2
0
def test_series_schema_multiple_validators():
    schema = SeriesSchema(PandasDtype.Int, [
        Check(lambda x: 0 <= x <= 50),
        Check(lambda s: (s == 21).any(), element_wise=False)
    ])
    validated_series = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(validated_series, pd.Series)

    # raise error if any of the validators fails
    with pytest.raises(SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
Пример #3
0
def test_vectorized_checks() -> None:
    """Test that using element-wise checking returns and errors as expected."""
    schema = SeriesSchema(
        Int, Check(lambda s: s.value_counts() == 2, element_wise=False)
    )
    validated_series = schema.validate(pd.Series([1, 1, 2, 2, 3, 3]))
    assert isinstance(validated_series, pd.Series)

    # error case
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([1, 2, 3]))
Пример #4
0
def test_no_dtype_series():
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    with pytest.raises(errors.SchemaError):
        schema = SeriesSchema(nullable=False)
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
Пример #5
0
def test_series_schema_multiple_validators():
    """Tests how multiple Checks on a Series Schema are handled both
    successfully and when errors are expected."""
    schema = SeriesSchema(
        Int, [
            Check(lambda x: 0 <= x <= 50, element_wise=True),
            Check(lambda s: (s == 21).any())])
    validated_series = schema.validate(pd.Series([1, 5, 21, 50]))
    assert isinstance(validated_series, pd.Series)

    # raise error if any of the validators fails
    with pytest.raises(errors.SchemaError):
        schema.validate(pd.Series([1, 5, 20, 50]))
Пример #6
0
def test_series_schema():
    schema = SeriesSchema(PandasDtype.Int, Check(lambda x: 0 <= x <= 100))
    validated_series = schema.validate(pd.Series([0, 30, 50, 100]))
    assert isinstance(validated_series, pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(SchemaError):
            schema.validate(pd.Series([data]))

    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            schema.validate(TypeError)
Пример #7
0
def test_no_dtype_series():
    """Test how nullability is handled in SeriesSchemas where no type is
    specified."""
    schema = SeriesSchema(nullable=False)
    validated_series = schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    schema = SeriesSchema(nullable=True)
    validated_series = schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
    assert isinstance(validated_series, pd.Series)

    with pytest.raises(errors.SchemaError):
        schema = SeriesSchema(nullable=False)
        schema.validate(pd.Series([0, 1, 2, None, 4, 1]))
Пример #8
0
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (pd.CategoricalDtype(),
         pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"), None),
        (pd.DatetimeTZDtype(tz='UTC'),
         pd.Series(pd.date_range(start="20200101", end="20200301"),
                   dtype="datetime64[ns, utc]"), None),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"],
                                     dtype="string"), None),
        (pd.PeriodDtype(freq='D'),
         pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')), None),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(lambda s: s < 5,
                                        other=np.nan).astype("Sparse[float]"),
            {
                "nullable": True
            },
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        )
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)
Пример #9
0
class Validate:
    """
    Benchmarking Series schema.validate
    """

    def setup(self):
        self.schema = SeriesSchema(
                String,
                checks=[
                    Check(lambda s: s.str.startswith("foo")),
                    Check(lambda s: s.str.endswith("bar")),
                    Check(lambda x: len(x) > 3, element_wise=True)
                    ],
                nullable=False,
                allow_duplicates=True,
                name="my_series")
        self.series = pd.Series(["foobar", "foobar", "foobar"],
                                name="my_series")

    def time_series_schema(self):
        self.schema.validate(self.series)

    def mem_series_schema(self):
         self.schema.validate(self.series)

    def peakmem_series_schema(self):
         self.schema.validate(self.series)
Пример #10
0
def test_series_schema():
    """Tests that a SeriesSchema Check behaves as expected for integers and
    strings. Tests error cases for types, duplicates, name errors, and issues
    around float and integer handling of nulls"""
    int_schema = SeriesSchema(
        Int, Check(lambda x: 0 <= x <= 100, element_wise=True))
    assert isinstance(int_schema.validate(
        pd.Series([0, 30, 50, 100])), pd.Series)

    str_schema = SeriesSchema(
        String, Check(lambda s: s.isin(["foo", "bar", "baz"])),
        nullable=True, coerce=True)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", None])), pd.Series)
    assert isinstance(str_schema.validate(
        pd.Series(["foo", "bar", "baz", np.nan])), pd.Series)

    # error cases
    for data in [-1, 101, 50.1, "foo"]:
        with pytest.raises(errors.SchemaError):
            int_schema.validate(pd.Series([data]))

    for data in [-1, {"a": 1}, -1.0]:
        with pytest.raises(TypeError):
            int_schema.validate(TypeError)

    non_duplicate_schema = SeriesSchema(
        Int, allow_duplicates=False)
    with pytest.raises(errors.SchemaError):
        non_duplicate_schema.validate(pd.Series([0, 1, 2, 3, 4, 1]))

    # when series name doesn't match schema
    named_schema = SeriesSchema(Int, name="my_series")
    with pytest.raises(
            errors.SchemaError,
            match=r"^Expected .+ to have name"):
        named_schema.validate(pd.Series(range(5), name="your_series"))

    # when series floats are declared to be integer
    with pytest.raises(
            errors.SchemaError,
            match=r"^after dropping null values, expected values in series"):
        SeriesSchema(Int, nullable=True).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable
    with pytest.raises(
            errors.SchemaError,
            match=r"^non-nullable series .+ contains null values"):
        SeriesSchema(Float, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))

    # when series contains null values when schema is not nullable in addition
    # to having the wrong data type
    with pytest.raises(
            errors.SchemaError,
            match=(
                r"^expected series '.+' to have type .+, got .+ and "
                "non-nullable series contains null values")):
        SeriesSchema(Int, nullable=False).validate(
            pd.Series([1.1, 2.3, 5.5, np.nan]))