Exemplo n.º 1
0
def test_lazy_dataframe_validation_error():
    """Test exceptions on lazy dataframe validation.

    Validates a dataframe that violates the schema in several ways with
    ``lazy=True`` and checks that the single ``SchemaErrors`` exception
    raised carries the original data and that the failure cases it reports
    for each expected check are among the expected values.
    """
    schema = DataFrameSchema(
        columns={
            "int_col": Column(Int, Check.greater_than(5)),
            "int_col2": Column(Int),
            "float_col": Column(Float, Check.less_than(0)),
            "str_col": Column(String, Check.isin(["foo", "bar"])),
            "not_in_dataframe": Column(Int),
        },
        checks=Check(
            lambda df: df != 1, error="dataframe_not_equal_1", ignore_na=False
        ),
        index=Index(String, name="str_index"),
        strict=True,
    )

    dataframe = pd.DataFrame(
        data={
            "int_col": [1, 2, 6],
            # strings in a column declared as Int
            "int_col2": ["a", "b", "c"],
            "float_col": [1., -2., 3.],
            "str_col": ["foo", "b", "c"],
            # column that is not declared in the schema
            "unknown_col": [None, None, None],
        },
        index=pd.Index(
            ["index0", "index1", "index2"],
            name="str_index"
        ),
    )

    # NOTE(review): "str_col" contains values outside ["foo", "bar"], but no
    # "isin" entry is listed below - confirm whether that is intentional.
    expectation = {
        # schema object context -> check failure cases
        "DataFrameSchema": {
            # check name -> failure cases
            "column_in_schema": ["unknown_col"],
            "dataframe_not_equal_1": [1],
            "column_in_dataframe": ["not_in_dataframe"],
        },
        "Column": {
            "greater_than(5)": [1, 2],
            "pandas_dtype('int64')": ["object"],
            "less_than(0)": [1, 3],
        },
    }

    # Capture the exception from a single validation run; pytest.raises fails
    # the test if nothing is raised, so the assertions below can never be
    # silently skipped.
    with pytest.raises(
            errors.SchemaErrors,
            match="^A total of .+ schema errors were found") as exc_info:
        schema.validate(dataframe, lazy=True)

    err = exc_info.value

    # data in the caught exception should be equal to the dataframe
    # passed into validate
    assert err.data.equals(dataframe)

    # make sure all expected check errors are in schema errors
    for schema_context, check_failure_cases in expectation.items():
        err_df = err.schema_errors.loc[
            err.schema_errors.schema_context == schema_context]
        for check, failure_cases in check_failure_cases.items():
            # the check must have been reported for this schema context ...
            assert check in err_df.check.values
            # ... and every failure case reported for it must be one of
            # the expected values
            assert (
                err_df.loc[err_df.check == check]
                .failure_case.isin(failure_cases)
                .all()
            )
Exemplo n.º 2
0
def test_index_schema_coerce():
    """Check that an index schema declared with ``coerce=True`` casts dtype.

    An int64-indexed dataframe is passed through a schema whose index is
    declared as ``Float`` with coercion enabled; the index of the returned
    dataframe is expected to have the dtype given by ``Float.value``.
    """
    int_indexed = pd.DataFrame(index=pd.Index([1, 2, 3, 4], dtype="int64"))
    coercing_schema = DataFrameSchema(index=Index(Float, coerce=True))

    result = coercing_schema(int_indexed)

    assert result.index.dtype == Float.value
Exemplo n.º 3
0
 ],
 [
     Column(
         Int, checks=[Check.greater_than(1), Check.less_than(3)],
         name="column"
     ),
     pd.DataFrame({"column": [1, 2, 3]}),
     {
         "data": pd.DataFrame({"column": [1, 2, 3]}),
         "schema_errors": {
             "Column": {"greater_than(1)": [1], "less_than(3)": [3]},
         },
     },
 ],
 [
     Index(String, checks=Check.isin(["a", "b", "c"])),
     pd.DataFrame({"col": [1, 2, 3]}, index=["a", "b", "d"]),
     {
         # expect that the data in the SchemaError is the pd.Index cast
         # into a Series
         "data": pd.Series(["a", "b", "d"]),
         "schema_errors": {
             "Index": {"isin(%s)" % {'a', 'b', 'c'}: ["d"]},
         }
     },
 ],
 [
     MultiIndex(
         indexes=[
             Index(Int, checks=Check.greater_than(0), name="index0"),
             Index(Int, checks=Check.less_than(0), name="index1"),