Example #1
def test_category_dtype():
    """Test the category type can be validated properly by schema.validate"""
    schema = DataFrameSchema(
        columns={
            "col":
            Column(
                pa.Category,
                checks=[
                    Check(lambda s: set(s) == {"A", "B", "C"}),
                    Check(lambda s: s.cat.categories.tolist() ==
                          ["A", "B", "C"]),
                    Check(lambda s: s.isin(["A", "B", "C"])),
                ],
                nullable=False,
            ),
        },
        coerce=False,
    )
    validated_df = schema.validate(
        pd.DataFrame(
            {"col": pd.Series(["A", "B", "A", "B", "C"], dtype="category")}))
    assert isinstance(validated_df, pd.DataFrame)
Example #2
def test_check_groupby_multiple_columns():
    """Tests uses of groupby to specify dependencies between one column and a
    number of other columns, including error handling."""
    schema = DataFrameSchema({
        "col1": Column(Int, [
            Check(lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
                  groupby=["col2", "col3"]),
        ]),
        "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
        "col3": Column(Bool),
    })

    df_pass = pd.DataFrame({
        "col1": [7, 8, 9, 11, 12, 13],
        "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        "col3": [True, False, True, False, True, False],
    })

    df = schema.validate(df_pass)
    assert isinstance(df, pd.DataFrame)
    assert len(df.columns) == 3
    assert set(df.columns) == {"col1", "col2", "col3"}
Example #3
def test_dataframe_coerce_regex():
    """Test dataframe pandas dtype coercion for regex columns"""
    schema = DataFrameSchema(
        columns={"column_": Column(float, regex=True, required=False)},
        pandas_dtype=int,
        coerce=True,
    )

    no_match_df = pd.DataFrame({"foo": [1, 2, 3]})
    match_valid_df = pd.DataFrame({
        "column_1": [1, 2, 3],
        "column_2": ["1", "2", "3"],
    })

    schema(no_match_df)
    schema(match_valid_df)

    # if the regex column is required, no matches should raise an error
    schema_required = schema.update_column("column_", required=True)
    with pytest.raises(errors.SchemaError,
                       match="Column regex name='column_' did not match"):
        schema_required(no_match_df)
Example #4
def _multi_check_schema() -> DataFrameSchema:
    """Schema with multiple positivity checks on column `a`"""
    return DataFrameSchema(
        {
            "a": Column(
                int,
                [
                    Check.isin([0, 1]),
                    Check(lambda x: x >= 0),
                ],
            ),
        }
    )
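The fixture above only builds the schema; a minimal usage sketch (not part of the original source, assuming `import pandas as pd` and the pandera names used throughout these examples) could look like:

def test_multi_check_schema_usage_sketch():
    """Hypothetical example: a frame whose values satisfy both checks on `a`."""
    schema = _multi_check_schema()
    # values are integers in {0, 1}, so they are also >= 0
    validated = schema.validate(pd.DataFrame({"a": [0, 1, 1, 0]}))
    assert isinstance(validated, pd.DataFrame)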
Example #5
def test_add_and_remove_columns():
    """Check that adding and removing columns works as expected and doesn't
    modify the original underlying DataFrameSchema."""
    schema1 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )

    schema1_exact_copy = copy.deepcopy(schema1)

    # test that add_columns doesn't modify schema1 after add_columns:
    schema2 = schema1.add_columns({
        "col2":
        Column(String, Check(lambda x: x <= 0)),
        "col3":
        Column(Object, Check(lambda x: x == 0)),
    })

    schema2_exact_copy = copy.deepcopy(schema2)

    assert schema1 == schema1_exact_copy

    # test that add_columns changed schema1 into schema2:
    expected_schema_2 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )

    assert schema2 == expected_schema_2

    # test that remove_columns doesn't modify schema2:
    schema3 = schema2.remove_columns(["col2"])

    assert schema2 == schema2_exact_copy

    # test that remove_columns has removed the changes as expected:
    expected_schema_3 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )

    assert schema3 == expected_schema_3

    # test that remove_columns can remove two columns:
    schema4 = schema2.remove_columns(["col2", "col3"])

    expected_schema_4 = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))}, strict=True)

    assert schema4 == expected_schema_4 == schema1
Example #6
def test_multi_index_index():
    """Tests that multi-index Indexes within DataFrames validate correctly."""
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(indexes=[
            Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
            Index(
                Str,
                Check(lambda s: s.isin(["foo", "bar"])),
                name="index1",
            ),
        ]),
    )

    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
            names=["index0", "index1"],
        ),
    )

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)

    # failure case
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], ["foo", "bar", "foo", "bar", "foo"]],
        names=["index0", "index1"],
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
Example #7
def test_pandas_nullable_int_dtype(dtype, coerce):
    """Test that pandas nullable int dtype can be specified in a schema."""
    assert all(
        isinstance(
            schema.validate(
                pd.DataFrame(
                    # keep max range to 127 in order to support Int8
                    {"col": range(128)},
                    **({} if coerce else {"dtype": dtype.str_alias}),
                )
            ),
            pd.DataFrame,
        )
        for schema in [
            DataFrameSchema(
                {"col": Column(dtype, nullable=False)}, coerce=coerce
            ),
            DataFrameSchema(
                {"col": Column(dtype.str_alias, nullable=False)}, coerce=coerce
            ),
        ]
    )
Example #8
def test_numeric_dtypes():
    """Test every numeric type can be validated properly by schema.validate"""
    for dtype in [
            dtypes.Float,
            dtypes.Float16,
            dtypes.Float32,
            dtypes.Float64]:
        assert all(
            isinstance(
                schema.validate(
                    pd.DataFrame(
                        {"col": [-123.1, -7654.321, 1.0, 1.1, 1199.51, 5.1]},
                        dtype=dtype.value)),
                pd.DataFrame
            )
            for schema in [
                DataFrameSchema({"col": Column(dtype, nullable=False)}),
                DataFrameSchema({"col": Column(dtype.value, nullable=False)})
            ]
        )

    for dtype in [
            dtypes.Int,
            dtypes.Int8,
            dtypes.Int16,
            dtypes.Int32,
            dtypes.Int64]:
        assert all(
            isinstance(
                schema.validate(
                    pd.DataFrame(
                        {"col": [-712, -4, -321, 0, 1, 777, 5, 123, 9000]},
                        dtype=dtype.value)),
                pd.DataFrame
            )
            for schema in [
                DataFrameSchema({"col": Column(dtype, nullable=False)}),
                DataFrameSchema({"col": Column(dtype.value, nullable=False)})
            ]
        )

    for dtype in [
            dtypes.UInt8,
            dtypes.UInt16,
            dtypes.UInt32,
            dtypes.UInt64]:
        assert all(
            isinstance(
                schema.validate(
                    pd.DataFrame(
                        {"col": [1, 777, 5, 123, 9000]},
                        dtype=dtype.value)),
                pd.DataFrame
            )
            for schema in [
                DataFrameSchema({"col": Column(dtype, nullable=False)}),
                DataFrameSchema({"col": Column(dtype.value, nullable=False)})
            ]
        )
Example #9
def schema_multiindex():
    """Fixture for schema with MultiIndex."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(pandas_dtype=Int),
            "col2": Column(pandas_dtype=Float),
        },
        index=MultiIndex([
            Index(pandas_dtype=String, name="ind0"),
            Index(pandas_dtype=String, name="ind1"),
        ]),
    )
    return schema
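As an illustration that is not part of the original source (treating schema_multiindex as a plain callable, as shown here, and assuming the usual pandas/pandera imports), a frame with matching columns and a two-level string index would pass validation:

def test_schema_multiindex_usage_sketch():
    """Hypothetical example: a frame that conforms to the fixture schema."""
    df = pd.DataFrame(
        {"col1": [1, 2], "col2": [1.0, 2.0]},
        index=pd.MultiIndex.from_arrays(
            [["a", "b"], ["x", "y"]], names=["ind0", "ind1"]
        ),
    )
    assert isinstance(schema_multiindex().validate(df), pd.DataFrame)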
Example #10
def test_coerce_dtype_nullable_str(data, dtype, nonnull_idx, string_type,
                                   nullable):
    """Tests how null values are handled with string dtypes."""
    if LEGACY_PANDAS and (dtype == "Int64"
                          or string_type in {STRING, "string"}):
        pytest.skip("Skipping data types that depend on pandas>1.0.0")
    dataframe = pd.DataFrame({"col": pd.Series(data, dtype=dtype)})
    schema = DataFrameSchema(
        {"col": Column(string_type, coerce=True, nullable=nullable)})

    if not nullable:
        with pytest.raises(errors.SchemaError):
            schema.validate(dataframe)
        return

    validated_df = schema.validate(dataframe)
    assert isinstance(validated_df, pd.DataFrame)
    for i, element in validated_df["col"].iteritems():
        if i < nonnull_idx:
            assert isinstance(element, str)
        else:
            assert pd.isna(element)
Example #11
def test_lazy_dataframe_validation_nullable():
    """
    Test that non-nullable column failure cases are correctly processed during
    lazy validation.
    """
    schema = DataFrameSchema(
        columns={
            "int_column": Column(Int, nullable=False),
            "float_column": Column(Float, nullable=False),
            "str_column": Column(String, nullable=False),
        },
        strict=True,
    )

    df = pd.DataFrame(
        {
            "int_column": [1, None, 3],
            "float_column": [0.1, 1.2, None],
            "str_column": [None, "foo", "bar"],
        }
    )

    try:
        schema.validate(df, lazy=True)
    except errors.SchemaErrors as err:
        assert err.schema_errors.failure_case.isna().all()
        for col, index in [
            ("int_column", 1),
            ("float_column", 2),
            ("str_column", 0),
        ]:
            # pylint: disable=cell-var-from-loop
            assert (
                err.schema_errors.loc[
                    lambda df: df.column == col, "index"
                ].iloc[0]
                == index
            )
Example #12
def test_check_function_decorator_errors():
    """Test that the check_input and check_output decorators error properly."""
    # case 1: checks that the input and output decorators error when different
    # types are passed in and out
    @check_input(DataFrameSchema({"column1": Column(Int)}))
    @check_output(DataFrameSchema({"column2": Column(Float)}))
    def test_func(df):
        return df

    with pytest.raises(
        errors.SchemaError,
        match=r"^error in check_input decorator of function",
    ):
        test_func(pd.DataFrame({"column2": ["a", "b", "c"]}))

    with pytest.raises(
        errors.SchemaError,
        match=r"^error in check_input decorator of function",
    ):
        test_func(df=pd.DataFrame({"column2": ["a", "b", "c"]}))

    with pytest.raises(
        errors.SchemaError,
        match=r"^error in check_output decorator of function",
    ):
        test_func(pd.DataFrame({"column1": [1, 2, 3]}))

    # case 2: check that if the input decorator refers to an index that's not
    # in the function signature, it will fail in a way that's easy to interpret
    @check_input(DataFrameSchema({"column1": Column(Int)}), 1)
    def test_incorrect_check_input_index(df):
        return df

    with pytest.raises(
        IndexError, match=r"^error in check_input decorator of function"
    ):
        test_incorrect_check_input_index(pd.DataFrame({"column1": [1, 2, 3]}))
Example #13
def test_column_regex_multiindex():
    """Text that column regex works on multi-index column."""
    column_schema = Column(
        Int,
        Check(lambda s: s >= 0),
        name=("foo_*", "baz_*"),
        regex=True,
    )
    dataframe_schema = DataFrameSchema({
        ("foo_*", "baz_*"):
        Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        ("foo_1", "biz_1"): range(10),
        ("foo_2", "baz_1"): range(10, 20),
        ("foo_3", "baz_2"): range(20, 30),
        ("bar_1", "biz_2"): range(10),
        ("bar_2", "biz_3"): range(10, 20),
        ("bar_3", "biz_3"): range(20, 30),
    })
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error if tuple column name is applied to a dataframe with a
    # flat pd.Index object.
    failure_column_cases = (
        [f"foo_{i}" for i in range(6)],
        pd.MultiIndex.from_tuples([(f"foo_{i}", f"bar_{i}", f"baz_{i}")
                                   for i in range(6)]),
    )
    for columns in failure_column_cases:
        data.columns = columns
        with pytest.raises(IndexError):
            column_schema.validate(data)
        with pytest.raises(IndexError):
            dataframe_schema.validate(data)
Example #14
def test_dataframe_pandas_dtype_coerce():
    """
    Test that pandas dtype specified at the dataframe level overrides
    column data types.
    """
    schema = DataFrameSchema(
        columns={f"column_{i}": Column(float)
                 for i in range(5)},
        pandas_dtype=int,
        coerce=True,
    )

    df = pd.DataFrame({f"column_{i}": range(10)
                       for i in range(5)}).astype(float)
    assert (schema(df).dtypes == Int.str_alias).all()

    # test that pandas_dtype in columns are preserved
    for col in schema.columns.values():
        assert col.pandas_dtype is float

    # raises SchemaErrors if the dataframe can't be coerced
    with pytest.raises(errors.SchemaErrors):
        schema.coerce_dtype(pd.DataFrame({"foo": list("abcdef")}))

    # raises SchemaErrors on lazy validation
    with pytest.raises(errors.SchemaErrors):
        schema(pd.DataFrame({"foo": list("abcdef")}), lazy=True)

    # test that original dataframe dtypes are preserved
    assert (df.dtypes == Float.str_alias).all()

    # test case where pandas_dtype is string
    schema.pandas_dtype = str
    assert (schema(df).dtypes == "object").all()

    schema.pandas_dtype = PandasDtype.String
    assert (schema(df).dtypes == "object").all()

    # raises ValueError if _coerce_dtype is called when pandas_dtype is None
    schema.pandas_dtype = None
    with pytest.raises(ValueError):
        schema._coerce_dtype(df)
Example #15
File: io.py Project: lkadin/pandera
def _deserialize_schema(serialized_schema):
    # pylint: disable=import-outside-toplevel
    from pandera import Check, Column, DataFrameSchema, Index, MultiIndex

    # GH#475
    serialized_schema = serialized_schema if serialized_schema else {}

    if not isinstance(serialized_schema, Mapping):
        raise pandera.errors.SchemaDefinitionError(
            "Schema representation must be a mapping.")

    columns = serialized_schema.get("columns")
    index = serialized_schema.get("index")
    checks = serialized_schema.get("checks")

    if columns is not None:
        columns = {
            col_name: Column(**_deserialize_component_stats(column_stats))
            for col_name, column_stats in columns.items()
        }

    if index is not None:
        index = [
            _deserialize_component_stats(index_component)
            for index_component in index
        ]

    if checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        checks = [
            _deserialize_check_stats(getattr(Check, check_name), check_stats)
            for check_name, check_stats in checks.items()
        ]

    if index is None:
        pass
    elif len(index) == 1:
        index = Index(**index[0])
    else:
        index = MultiIndex(
            indexes=[Index(**index_properties) for index_properties in index])

    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=serialized_schema.get("coerce", False),
        strict=serialized_schema.get("strict", False),
    )
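A minimal sketch of how this helper behaves for a mapping that defines no columns, index, or checks (illustrative only; it does not exercise _deserialize_component_stats, and the top-level import below is an assumption for the sketch, not part of io.py):

from pandera import DataFrameSchema  # assumed import for this sketch only

schema = _deserialize_schema({"coerce": True, "strict": True})
assert isinstance(schema, DataFrameSchema)
assert schema.coerce and schema.strict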
Example #16
def test_schema_equality_operators():
    """Test the usage of == for DataFrameSchema, SeriesSchema and
    SeriesSchemaBase."""
    df_schema = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda s: s >= 2)),
        },
        strict=True)
    df_schema_columns_in_different_order = DataFrameSchema(
        {
            "col2": Column(String, Check(lambda s: s >= 2)),
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True)
    series_schema = SeriesSchema(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series")
    series_schema_base = SeriesSchemaBase(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series")
    not_equal_schema = DataFrameSchema({"col1": Column(String)}, strict=False)

    assert df_schema == copy.deepcopy(df_schema)
    assert df_schema != not_equal_schema
    assert df_schema == df_schema_columns_in_different_order
    assert series_schema == copy.deepcopy(series_schema)
    assert series_schema != not_equal_schema
    assert series_schema_base == copy.deepcopy(series_schema_base)
    assert series_schema_base != not_equal_schema
Example #17
def test_dataframe_schema_dtype_property():
    """Test that schema.dtype returns the matching Column types."""
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(String),
            "col3": Column(DateTime),
            "col4": Column("uint16"),
        })
    assert schema.dtype == {
        "col1": "int64",
        "col2": "object",
        "col3": "datetime64[ns]",
        "col4": "uint16"
    }
Example #18
def test_column_regex():
    """Test that column regex work on single-level column index."""
    column_schema = Column(Int,
                           Check(lambda s: s >= 0),
                           name="foo_*",
                           regex=True)

    dataframe_schema = DataFrameSchema({
        "foo_*":
        Column(Int, Check(lambda s: s >= 0), regex=True),
    })

    data = pd.DataFrame({
        "foo_1": range(10),
        "foo_2": range(10, 20),
        "foo_3": range(20, 30),
        "bar_1": range(10),
        "bar_2": range(10, 20),
        "bar_3": range(20, 30),
    })
    assert isinstance(column_schema.validate(data), pd.DataFrame)
    assert isinstance(dataframe_schema.validate(data), pd.DataFrame)

    # Raise an error on multi-index column case
    data.columns = pd.MultiIndex.from_tuples((
        ("foo_1", "biz_1"),
        ("foo_2", "baz_1"),
        ("foo_3", "baz_2"),
        ("bar_1", "biz_2"),
        ("bar_2", "biz_3"),
        ("bar_3", "biz_3"),
    ))
    with pytest.raises(IndexError):
        column_schema.validate(data)
    with pytest.raises(IndexError):
        dataframe_schema.validate(data)
Example #19
def test_category_dtype_coerce():
    """Test coercion of the category type is validated properly by
    schema.validate and fails safely."""
    columns = {
        "col": Column(
            pa.Category,
            checks=Check(lambda s: set(s) == {"A", "B", "C"}),
            nullable=False
        ),
    }

    with pytest.raises(SchemaError):
        DataFrameSchema(columns=columns, coerce=False).validate(
            pd.DataFrame(
                {"col": pd.Series(["A", "B", "A", "B", "C"], dtype="object")}
            )
        )

    validated_df = DataFrameSchema(columns=columns, coerce=True).validate(
        pd.DataFrame(
            {"col": pd.Series(["A", "B", "A", "B", "C"], dtype="object")}
        )
    )
    assert isinstance(validated_df, pd.DataFrame)
Example #20
def init_schema_element_wise():
    """Construct a schema with an element-wise check that also uses groupby."""
    DataFrameSchema(
        {
            "col1": Column(
                Int,
                [
                    Check(
                        lambda s: s["foo"] > 10,
                        element_wise=True,
                        groupby=["col2"],
                    ),
                ],
            ),
            "col2": Column(Str, Check(lambda s: s.isin(["foo", "bar"]))),
        }
    )
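The helper above only constructs the schema; presumably the surrounding test wraps the call in pytest.raises. A hedged sketch of such a call, assuming (not confirmed by this snippet) that pandera rejects a Check combining element_wise=True with groupby by raising errors.SchemaInitError:

def test_init_schema_element_wise_sketch():
    """Hypothetical example: constructing the schema is expected to fail."""
    with pytest.raises(errors.SchemaInitError):
        init_schema_element_wise()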
Example #21
def test_schema_component_equality_operators():
    """Test the usage of == for Column, Index and MultiIndex."""
    column = Column(Int, Check(lambda s: s >= 0))
    index = Index(Int, [Check(lambda x: 1 <= x <= 11, element_wise=True)])
    multi_index = MultiIndex(indexes=[
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ])
    not_equal_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))})

    assert column == copy.deepcopy(column)
    assert column != not_equal_schema
    assert index == copy.deepcopy(index)
    assert index != not_equal_schema
    assert multi_index == copy.deepcopy(multi_index)
    assert multi_index != not_equal_schema
Example #22
def test_multi_index_schema_coerce():
    """Test that multi index can be type-coerced."""
    indexes = [
        Index(Float),
        Index(Int),
        Index(String),
    ]
    schema = DataFrameSchema(index=MultiIndex(indexes=indexes))
    df = pd.DataFrame(index=pd.MultiIndex.from_arrays([
        [1.0, 2.1, 3.5, 4.8],
        [5, 6, 7, 8],
        ["9", "10", "11", "12"],
    ]))
    validated_df = schema(df)
    for level_i in range(validated_df.index.nlevels):
        assert (validated_df.index.get_level_values(level_i).dtype ==
                indexes[level_i].dtype)
Example #23
def test_check_input_output_unrecognized_obj_getter(obj_getter):
    """
    Test that check_input and check_output raise correct errors on unrecognized
    dataframe object getters
    """
    schema = DataFrameSchema({"column": Column(int)})

    @check_input(schema, obj_getter)
    def test_check_input_fn(df):
        return df

    @check_output(schema, obj_getter)
    def test_check_output_fn(df):
        return df

    for fn in [test_check_input_fn, test_check_output_fn]:
        with pytest.raises(TypeError):
            fn(pd.DataFrame({"column": [1, 2, 3]}))
Example #24
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Test a DataFrameSchema against a variety of check functions."""
    schema = DataFrameSchema({
        "a":
        Column(Int, Check(fn=check_function, element_wise=False)),
        "b":
        Column(Float, Check(fn=check_function, element_wise=False))
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    if should_fail:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
    else:
        schema.validate(df)
Example #25
def test_dtypes():
    """Test that float, int, and uint dtypes can be validated by schema.validate."""
    for dtype in [
            dtypes.Float,
            dtypes.Float16,
            dtypes.Float32,
            dtypes.Float64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [-123.1, -7654.321, 1.0, 1.1, 1199.51, 5.1, 4.6]},
                dtype=dtype.value))
        assert isinstance(validated_df, pd.DataFrame)

    for dtype in [
            dtypes.Int,
            dtypes.Int8,
            dtypes.Int16,
            dtypes.Int32,
            dtypes.Int64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [-712, -4, -321, 0, 1, 777, 5, 123, 9000]},
                dtype=dtype.value))
        assert isinstance(validated_df, pd.DataFrame)

    for dtype in [
            dtypes.UInt8,
            dtypes.UInt16,
            dtypes.UInt32,
            dtypes.UInt64]:
        schema = DataFrameSchema({"col": Column(dtype, nullable=False)})
        validated_df = schema.validate(
            pd.DataFrame(
                {"col": [1, 777, 5, 123, 9000]}, dtype=dtype.value))
        assert isinstance(validated_df, pd.DataFrame)
Example #26
def test_one_sample_hypothesis():
    """Check one sample ttest."""
    schema = DataFrameSchema(
        {
            "height_in_feet": Column(
                Float,
                [
                    Hypothesis.one_sample_ttest(
                        popmean=5, relationship="greater_than", alpha=0.1
                    ),
                ],
            ),
        }
    )

    subset_schema = DataFrameSchema(
        {
            "group": Column(String),
            "height_in_feet": Column(
                Float,
                [
                    Hypothesis.one_sample_ttest(
                        sample="A",
                        groupby="group",
                        popmean=5,
                        relationship="greater_than",
                        alpha=0.1,
                    ),
                ],
            ),
        }
    )

    df = pd.DataFrame(
        {
            "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
            "group": ["A", "A", "B", "B", "A"],
        }
    )
    schema.validate(df)
    subset_schema.validate(df)
Example #27
def test_python_builtin_types():
    """Test support python data types can be used for validation."""
    schema = DataFrameSchema({
        "int_col": Column(int),
        "float_col": Column(float),
        "str_col": Column(str),
        "bool_col": Column(bool),
    })
    df = pd.DataFrame({
        "int_col": [1, 2, 3],
        "float_col": [1., 2., 3.],
        "str_col": list("abc"),
        "bool_col": [True, False, True],
    })
    assert isinstance(schema(df), pd.DataFrame)
    assert schema.dtype["int_col"] == PandasDtype.Int.str_alias
    assert schema.dtype["float_col"] == PandasDtype.Float.str_alias
    assert schema.dtype["str_col"] == PandasDtype.String.str_alias
    assert schema.dtype["bool_col"] == PandasDtype.Bool.str_alias
Example #28
def test_non_str_column_name_regex(column_key):
    """Check that Columns with non-str names cannot have regex=True."""

    with pytest.raises(ValueError):
        DataFrameSchema({
            column_key: Column(
                Float,
                checks=Check.greater_than_or_equal_to(0),
                regex=True,
            ),
        })

    with pytest.raises(ValueError):
        Column(
            Float,
            checks=Check.greater_than_or_equal_to(0),
            name=column_key,
            regex=True,
        )
Example #29
def test_dataframe_schema_check_function_types(check_function, should_fail):
    """Tests a DataFrameSchema against a variety of Check conditions."""
    schema = DataFrameSchema({
        "a":
        Column(Int, Check(check_function, element_wise=False)),
        "b":
        Column(Float, Check(check_function, element_wise=False)),
    })
    df = pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.5, 9.9]})
    if should_fail:
        with pytest.raises(errors.SchemaError):
            schema.validate(df)
    else:
        schema.validate(df)
Example #30
def test_ordered_dataframe(columns: Dict[str, Column], index: MultiIndex):
    """Test that columns are ordered."""
    schema = DataFrameSchema(columns=columns, index=index, ordered=True)

    df = pd.DataFrame(
        data=[[1, 2, 3]],
        columns=["a", "a", "b"],
        index=pd.MultiIndex.from_arrays([[1], [2], [3]], names=["a", "a",
                                                                "b"]),
    )
    assert isinstance(schema.validate(df), pd.DataFrame)

    # test optional column
    df = pd.DataFrame(
        data=[[1]],
        columns=["b"],
        index=pd.MultiIndex.from_arrays([[1], [2]], names=["a", "b"]),
    )
    assert isinstance(schema.validate(df), pd.DataFrame)

    df = pd.DataFrame(
        data=[[1, 2]],
        columns=["b", "a"],
        index=pd.MultiIndex.from_arrays([[1], [2]], names=["b", "a"]),
    )
    with pytest.raises(errors.SchemaErrors,
                       match="A total of 2 schema errors"):
        schema.validate(df, lazy=True)

    # test out-of-order duplicates
    df = pd.DataFrame(
        data=[[1, 2, 3, 4]],
        columns=["a", "b", "c", "a"],
        index=pd.MultiIndex.from_arrays([[1], [2], [3], [4]],
                                        names=["a", "b", "c", "a"]),
    )
    with pytest.raises(errors.SchemaErrors,
                       match="A total of 1 schema errors"):
        schema.validate(df, lazy=True)