def tests_multi_index_subindex_coerce():
    """Check that MultiIndex-level coercion overrides per-level Index settings.

    With ``coerce=True`` on the MultiIndex or on the DataFrameSchema, every
    level of the validated index is expected to have ``object`` dtype.
    Without it, lazy validation is expected to report two schema errors.
    """
    level_schemas = [
        Index(String, coerce=True),
        Index(String, coerce=False),
        Index(String, coerce=True),
        Index(String, coerce=False),
    ]
    frame = pd.DataFrame(index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4))

    # coerce=True on the MultiIndex or on the DataFrameSchema takes
    # precedence over each sub-index's own coerce flag
    overriding_schemas = [
        DataFrameSchema(index=MultiIndex(level_schemas, coerce=True)),
        DataFrameSchema(index=MultiIndex(level_schemas), coerce=True),
    ]
    for overriding_schema in overriding_schemas:
        coerced = overriding_schema(frame)
        for level in range(coerced.index.nlevels):
            level_dtype = coerced.index.get_level_values(level).dtype
            assert level_dtype == "object"

    # with no coercion at the MultiIndex level, the two non-coercing
    # sub-indexes should each produce a type error
    non_coercing_schema = DataFrameSchema(index=MultiIndex(level_schemas))
    with pytest.raises(
        errors.SchemaErrors,
        match="A total of 2 schema errors were found",
    ):
        non_coercing_schema(frame, lazy=True)
def test_add_and_remove_columns():
    """Verify add_columns/remove_columns build new schemas without side effects.

    Each operation is compared against a hand-built expected schema, and the
    schema it was called on is compared against a deep copy taken beforehand.
    """
    base_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    base_snapshot = copy.deepcopy(base_schema)

    # add_columns must leave the schema it was called on untouched
    extended_schema = base_schema.add_columns(
        {
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        }
    )
    extended_snapshot = copy.deepcopy(extended_schema)
    assert base_schema == base_snapshot

    # the result holds the original column plus the two new ones
    expected_extended = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda x: x <= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert extended_schema == expected_extended

    # remove_columns must leave the schema it was called on untouched
    without_col2 = extended_schema.remove_columns(["col2"])
    assert extended_schema == extended_snapshot

    # removing a single column yields the remaining two
    expected_without_col2 = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col3": Column(Object, Check(lambda x: x == 0)),
        },
        strict=True,
    )
    assert without_col2 == expected_without_col2

    # removing both added columns gets back to the base schema
    only_col1 = extended_schema.remove_columns(["col2", "col3"])
    expected_only_col1 = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))},
        strict=True,
    )
    assert only_col1 == expected_only_col1 == base_schema
def test_check_function_decorator_transform():
    """Check that a schema's transformer is applied by the check_input decorator.

    The input schema's transformer adds ``column2``; the output schema
    requires it, and the test asserts it is present in the returned frame.
    """
    input_schema = DataFrameSchema(
        {"column1": Column(Int)},
        transformer=lambda df: df.assign(column2="foo"),
    )
    output_schema = DataFrameSchema(
        {
            "column1": Column(Int),
            "column2": Column(String),
        }
    )

    def _fresh_frame():
        # a new frame per call, as in the original inline constructions
        return pd.DataFrame({"column1": [1, 2, 3]})

    # dataframe is the only argument and the only return value
    @check_input(input_schema)
    @check_output(output_schema)
    def passthrough_single(df):
        return df

    single_result = passthrough_single(_fresh_frame())
    assert "column2" in single_result

    # dataframe is the second positional argument and second tuple element
    @check_input(input_schema, 1)
    @check_output(output_schema, 1)
    def passthrough_pair(first, df):
        return first, df

    pair_result = passthrough_pair(None, _fresh_frame())
    assert "column2" in pair_result[1]
def test_two_sample_ttest_hypothesis_relationships():
    """Check which relationship values the two-sample t-test accepts.

    Every entry of ``Hypothesis.RELATIONSHIPS`` must yield a schema; a set of
    other values must raise ``SchemaInitError``.
    """

    def _schema_for(relationship):
        # builds the same schema for every candidate relationship value
        ttest = Hypothesis.two_sample_ttest(
            sample1="M",
            sample2="F",
            groupby="sex",
            relationship=relationship,
            alpha=0.5,
        )
        return DataFrameSchema(
            {
                "height_in_feet": Column(Float, [ttest]),
                "sex": Column(String),
            }
        )

    for valid_relationship in Hypothesis.RELATIONSHIPS:
        built = _schema_for(valid_relationship)
        assert isinstance(built, DataFrameSchema)

    invalid_relationships = ["foo", "bar", 1, 2, 3, None]
    for invalid_relationship in invalid_relationships:
        with pytest.raises(errors.SchemaInitError):
            _schema_for(invalid_relationship)
def test_coerce_dtype_nullable_str():
    """Tests how null values are handled in string dtypes.

    Two frames are used, one holding ``np.nan`` and one holding ``None`` in
    the last two rows. With ``nullable=False`` validation must raise
    ``SchemaError`` for each frame. With ``nullable=True`` the nulls must
    survive coercion and the remaining values must be ``str`` instances.
    """
    # dataframes with columns where the last two values are null
    df_nans = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", np.nan, np.nan],
    })
    df_nones = pd.DataFrame({
        "col": ["foobar", "foo", "bar", "baz", None, None],
    })
    null_frames = [df_nans, df_nones]

    non_nullable_schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=False)
    })
    for df in null_frames:
        # one raises-block per frame: wrapping the whole loop would stop at
        # the first exception and never exercise the remaining frames
        with pytest.raises(errors.SchemaError):
            non_nullable_schema.validate(df)

    schema = DataFrameSchema({
        "col": Column(String, coerce=True, nullable=True)
    })
    for df in null_frames:
        validated_df = schema.validate(df)
        assert isinstance(validated_df, pd.DataFrame)
        # the two trailing nulls must still be null after coercion
        assert pd.isna(validated_df["col"].iloc[-1])
        assert pd.isna(validated_df["col"].iloc[-2])
        for i in range(4):
            assert isinstance(validated_df["col"].iloc[i], str)
def test_check_function_decorator_errors():
    """Check the error messages raised by the check_input/check_output decorators."""
    # case 1: the input and output decorators must each raise an error
    # naming the decorator when the frame doesn't match its schema
    input_schema = DataFrameSchema({"column1": Column(Int)})
    output_schema = DataFrameSchema({"column2": Column(Float)})

    @check_input(input_schema)
    @check_output(output_schema)
    def passthrough(df):
        return df

    frame_failing_input = pd.DataFrame({"column2": ["a", "b", "c"]})
    with pytest.raises(
        errors.SchemaError,
        match=r"^error in check_input decorator of function",
    ):
        passthrough(frame_failing_input)

    frame_failing_output = pd.DataFrame({"column1": [1, 2, 3]})
    with pytest.raises(
        errors.SchemaError,
        match=r"^error in check_output decorator of function",
    ):
        passthrough(frame_failing_output)

    # case 2: an argument index that doesn't exist in the function signature
    # must fail in a way that's easy to interpret
    @check_input(DataFrameSchema({"column1": Column(Int)}), 1)
    def single_argument_function(df):
        return df

    valid_frame = pd.DataFrame({"column1": [1, 2, 3]})
    with pytest.raises(
        errors.SchemaError,
        match=r"^error in check_input decorator of function",
    ):
        single_argument_function(valid_frame)
def test_column_regex_non_str_types() -> None:
    """Check that column name regex matching excludes non-string types.

    The frame mixes int, float, Timestamp and string column labels, and the
    schema holds regex columns whose patterns resemble the non-string labels.
    """
    mixed_label_frame = pd.DataFrame(
        {
            1: [1, 2, 3],
            2.2: [1, 2, 3],
            pd.Timestamp("2018/01/01"): [1, 2, 3],
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    regex_columns = {
        "foo_": Column(Int, Check.gt(0), regex=True),
        r"\d+": Column(Int, Check.gt(0), regex=True),
        r"\d+\.\d+": Column(Int, Check.gt(0), regex=True),
        "2018-01-01": Column(Int, Check.gt(0), regex=True),
    }
    flat_schema = DataFrameSchema(columns=regex_columns)
    validated = flat_schema.validate(mixed_label_frame)
    assert isinstance(validated, pd.DataFrame)

    # test MultiIndex column case
    tuple_label_frame = pd.DataFrame(
        {
            (1, 1): [1, 2, 3],
            (2.2, 4.5): [1, 2, 3],
            ("foo", "bar"): [1, 2, 3],
        }
    )
    tuple_schema = DataFrameSchema(
        columns={("foo_*", "bar_*"): Column(Int, regex=True)},
    )
    tuple_schema.validate(tuple_label_frame)
def test_coerce_dtype_in_dataframe():
    """Tests coercions of datatypes, especially regarding nullable integers.

    Coercion is requested once per Column and once on the DataFrameSchema,
    and both schemas must give the same resulting dtypes. Coercing a float
    column that contains nulls to Int must raise ``ValueError``.
    """
    df = pd.DataFrame({
        "column1": [10.0, 20.0, 30.0],
        "column2": ["2018-01-01", "2018-02-01", "2018-03-01"],
        "column3": [1, 2, None],
        "column4": [1., 1., np.nan],
    })

    # specify `coerce` at the Column level
    schema1 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0), coerce=True),
        "column2": Column(DateTime, coerce=True),
        "column3": Column(String, coerce=True, nullable=True),
    })
    # specify `coerce` at the DataFrameSchema level
    schema2 = DataFrameSchema({
        "column1": Column(Int, Check(lambda x: x > 0)),
        "column2": Column(DateTime),
        "column3": Column(String, nullable=True),
    }, coerce=True)

    for schema in [schema1, schema2]:
        result = schema.validate(df)
        assert result.column1.dtype == Int.value
        assert result.column2.dtype == DateTime.value
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed
        for _, x in result.column3.items():
            assert pd.isna(x) or isinstance(x, str)

    # make sure that correct error is raised when null values are present
    # in a float column that's coerced to an int
    schema = DataFrameSchema({"column4": Column(Int, coerce=True)})
    with pytest.raises(ValueError):
        schema.validate(df)
def test_one_sample_hypothesis():
    """Check the one-sample t-test on a whole column and on one group of it."""
    # test against the whole column
    whole_column_test = Hypothesis.one_sample_ttest(
        popmean=5,
        relationship="greater_than",
        alpha=0.1,
    )
    whole_column_schema = DataFrameSchema(
        {"height_in_feet": Column(Float, [whole_column_test])}
    )

    # test against only the rows where group == "A"
    group_a_test = Hypothesis.one_sample_ttest(
        sample="A",
        groupby="group",
        popmean=5,
        relationship="greater_than",
        alpha=0.1,
    )
    group_a_schema = DataFrameSchema(
        {
            "group": Column(String),
            "height_in_feet": Column(Float, [group_a_test]),
        }
    )

    heights = pd.DataFrame(
        {
            "height_in_feet": [8.1, 7, 6.5, 6.7, 5.1],
            "group": ["A", "A", "B", "B", "A"],
        }
    )
    whole_column_schema.validate(heights)
    group_a_schema.validate(heights)
def test_schema_equality_operators():
    """Exercise ``==`` and ``!=`` on DataFrameSchema, SeriesSchema and SeriesSchemaBase."""
    frame_schema = DataFrameSchema(
        {
            "col1": Column(Int, Check(lambda s: s >= 0)),
            "col2": Column(String, Check(lambda s: s >= 2)),
        },
        strict=True,
    )
    # same columns as above, declared in reverse order
    frame_schema_reordered = DataFrameSchema(
        {
            "col2": Column(String, Check(lambda s: s >= 2)),
            "col1": Column(Int, Check(lambda s: s >= 0)),
        },
        strict=True,
    )
    named_series_schema = SeriesSchema(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    named_series_schema_base = SeriesSchemaBase(
        String,
        checks=[Check(lambda s: s.str.startswith("foo"))],
        nullable=False,
        allow_duplicates=True,
        name="my_series",
    )
    different_schema = DataFrameSchema({"col1": Column(String)}, strict=False)

    # dataframe schemas: equal to a deep copy, unaffected by column order
    assert frame_schema == copy.deepcopy(frame_schema)
    assert frame_schema != different_schema
    assert frame_schema == frame_schema_reordered

    # series schemas: equal to a deep copy, unequal to an unrelated schema
    assert named_series_schema == copy.deepcopy(named_series_schema)
    assert named_series_schema != different_schema
    assert named_series_schema_base == copy.deepcopy(named_series_schema_base)
    assert named_series_schema_base != different_schema
def tests_multi_index_subindex_coerce():
    """MultiIndex component should override sub indexes.

    With ``coerce=False`` on the schema, each level's dtype is compared
    against its own Index: the Index dtype for coercing levels, ``"object"``
    otherwise. With ``coerce=True`` on the schema, every level is compared
    against its Index dtype.
    """
    indexes = [
        Index(String, coerce=True),
        Index(String, coerce=False),
        Index(String, coerce=True),
        Index(String, coerce=False),
    ]

    data = pd.DataFrame(index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4))

    schema = DataFrameSchema(index=MultiIndex(indexes), coerce=False)
    validated_df = schema(data)
    for level_i in range(validated_df.index.nlevels):
        level_dtype = validated_df.index.get_level_values(level_i).dtype
        if indexes[level_i].coerce:
            assert level_dtype == indexes[level_i].dtype
        else:
            # dtype should be string representation of pandas strings
            assert level_dtype == "object"

    # coerce=True in MultiIndex should override subindex coerce setting
    schema_override = DataFrameSchema(index=MultiIndex(indexes), coerce=True)
    validated_df_override = schema_override(data)
    # take the level count from the frame being checked, not from the
    # frame validated by the non-overriding schema
    for level_i in range(validated_df_override.index.nlevels):
        level_dtype = validated_df_override.index.get_level_values(
            level_i
        ).dtype
        assert level_dtype == indexes[level_i].dtype
def test_dataframe_checks():
    """Exercise dataframe-level checks: plain, groupby-based and element-wise.

    A compliant frame must validate under each schema, and a frame that
    breaks the ``col1 < col2`` check must raise ``SchemaError``.
    """
    frame_level_checks = [
        Check(lambda df: df["col1"] < df["col2"]),
        Check(lambda df: df["col3"] == df["col4"]),
    ]
    schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
            "col3": Column(String),
            "col4": Column(String),
        },
        checks=frame_level_checks,
    )
    valid_frame = pd.DataFrame(
        {
            "col1": [1, 2, 3],
            "col2": [2.0, 3.0, 4.0],
            "col3": ["foo", "bar", "baz"],
            "col4": ["foo", "bar", "baz"],
        }
    )
    assert isinstance(schema.validate(valid_frame), pd.DataFrame)

    # test invalid schema error raising
    broken_frame = valid_frame.copy()
    broken_frame["col1"] = broken_frame["col1"] * 3
    with pytest.raises(errors.SchemaError):
        schema.validate(broken_frame)

    # test groupby checks
    grouped_checks = [
        Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
        Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
        Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
        Check(
            lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
            groupby=["col3", "col4"],
        ),
    ]
    groupby_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col3": Column(String)},
        checks=grouped_checks,
    )
    assert isinstance(groupby_schema.validate(valid_frame), pd.DataFrame)

    # test element-wise checks
    row_check = Check(lambda row: row["col1"] < row["col2"], element_wise=True)
    row_wise_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col2": Column(Float)},
        checks=row_check,
    )
    assert isinstance(row_wise_schema.validate(valid_frame), pd.DataFrame)
def test_coerce_without_dtype():
    """Check that requesting coercion without a dtype raises SchemaInitError.

    Covers ``coerce=True`` on the Column and on the DataFrameSchema.
    """
    schema_builders = (
        # coerce requested on the column itself
        lambda: DataFrameSchema({"col": Column(coerce=True)}),
        # coerce requested on the enclosing schema
        lambda: DataFrameSchema({"col": Column()}, coerce=True),
    )
    for build_schema in schema_builders:
        with pytest.raises(errors.SchemaInitError):
            build_schema()
def test_check_groups():
    """Exercise the ``groupby`` and ``groups`` arguments of column checks.

    Covers the passing case (``groups`` as a list and as a string) and three
    ``KeyError`` cases involving group names that are unavailable.
    """

    def _schema_with(col1_checks):
        # all schemas here differ only in the checks attached to col1
        return DataFrameSchema(
            {
                "col1": Column(Int, col1_checks),
                "col2": Column(
                    String, Check(lambda s: s.isin(["foo", "bar"]))
                ),
            }
        )

    frame = pd.DataFrame(
        {
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
        }
    )

    passing_schema = _schema_with(
        [
            Check(lambda s: s["foo"] > 10, groupby="col2", groups=["foo"]),
            Check(lambda s: s["foo"] > 10, groupby="col2", groups="foo"),
        ]
    )
    validated = passing_schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}

    # raise KeyError when groups does not include a particular group name
    excluded_group_schema = _schema_with(
        [Check(lambda s: s["bar"] > 10, groupby="col2", groups="foo")]
    )
    with pytest.raises(KeyError, match="^'bar'"):
        excluded_group_schema.validate(frame)

    # raise KeyError when the group does not exist in the groupby column when
    # referenced in the Check function
    unknown_group_in_fn_schema = _schema_with(
        [Check(lambda s: s["baz"] > 10, groupby="col2", groups=["foo"])]
    )
    with pytest.raises(KeyError, match="^'baz'"):
        unknown_group_in_fn_schema.validate(frame)

    # raise KeyError when the group does not exist in the groups argument.
    unknown_group_in_groups_schema = _schema_with(
        [Check(lambda s: s["foo"] > 10, groupby="col2", groups=["baz"])]
    )
    with pytest.raises(KeyError):
        unknown_group_in_groups_schema.validate(frame)
def test_dataframe_hypothesis_checks():
    """Exercise dataframe-level Hypothesis checks.

    A two-sample and a one-sample t-test must validate the frame, and a
    Hypothesis grouping by an undefined column must raise
    ``SchemaDefinitionError``.
    """
    frame = pd.DataFrame(
        {
            "col1": range(100, 201),
            "col2": range(0, 101),
        }
    )

    two_sample_test = Hypothesis(
        test=stats.ttest_ind,
        samples=["col1", "col2"],
        relationship=lambda stat, pvalue, alpha=0.01: (
            stat > 0 and pvalue / 2 < alpha
        ),
        relationship_kwargs={"alpha": 0.5},
    )
    one_sample_test = Hypothesis(
        test=stats.ttest_1samp,
        samples=["col1"],
        relationship=lambda stat, pvalue, alpha=0.01: (
            stat > 0 and pvalue / 2 < alpha
        ),
        test_kwargs={"popmean": 50},
        relationship_kwargs={"alpha": 0.01},
    )
    passing_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col2": Column(Int)},
        checks=[two_sample_test, one_sample_test],
    )
    passing_schema.validate(frame)

    # raise error when using groupby for a column that doesn't exist
    grouped_two_sample_test = Hypothesis(
        test=stats.ttest_ind,
        samples=["col1", "col2"],
        groupby="col3",
        relationship=lambda stat, pvalue, alpha=0.01: (
            stat > 0 and pvalue / 2 < alpha
        ),
        relationship_kwargs={"alpha": 0.5},
    )
    failing_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col2": Column(Int)},
        checks=[grouped_two_sample_test],
    )
    with pytest.raises(errors.SchemaDefinitionError):
        failing_schema.validate(frame)
def test_dataframe_hypothesis_checks():
    """Check dataframe-level Hypothesis checks and the groupby error path.

    A two-sample and a one-sample t-test are validated against the frame;
    adding ``groupby="col3"`` (a column the frame lacks) must raise
    ``SchemaDefinitionError``.
    """

    def _positive_one_sided(stat, pvalue, alpha=0.01):
        # relationship shared by every Hypothesis in this test
        return stat > 0 and pvalue / 2 < alpha

    int_columns = {"col1": Column(Int), "col2": Column(Int)}
    sample_frame = pd.DataFrame(
        {"col1": range(100, 201), "col2": range(0, 101)}
    )

    passing_checks = [
        # two-sample test
        Hypothesis(
            test=stats.ttest_ind,
            samples=["col1", "col2"],
            relationship=_positive_one_sided,
            relationship_kwargs={"alpha": 0.5},
        ),
        # one-sample test
        Hypothesis(
            test=stats.ttest_1samp,
            samples=["col1"],
            relationship=_positive_one_sided,
            test_kwargs={"popmean": 50},
            relationship_kwargs={"alpha": 0.01},
        ),
    ]
    DataFrameSchema(columns=int_columns, checks=passing_checks).validate(
        sample_frame
    )

    # raise error when using groupby
    grouped_checks = [
        # two-sample test
        Hypothesis(
            test=stats.ttest_ind,
            samples=["col1", "col2"],
            groupby="col3",
            relationship=_positive_one_sided,
            relationship_kwargs={"alpha": 0.5},
        ),
    ]
    grouped_schema = DataFrameSchema(
        columns={"col1": Column(Int), "col2": Column(Int)},
        checks=grouped_checks,
    )
    with pytest.raises(errors.SchemaDefinitionError):
        grouped_schema.validate(sample_frame)
def test_no_dtype_dataframe():
    """Test nullability handling for columns declared without a dtype.

    A non-nullable column accepts a frame without nulls, a nullable column
    accepts a frame containing a null, and a non-nullable column must raise
    ``SchemaError`` for a frame containing a null.
    """
    schema = DataFrameSchema({"col": Column(nullable=False)})
    validated_df = schema.validate(pd.DataFrame({"col": [-123.1, -76.3, 1.0]}))
    assert isinstance(validated_df, pd.DataFrame)

    schema = DataFrameSchema({"col": Column(nullable=True)})
    validated_df = schema.validate(pd.DataFrame({"col": [-123.1, None, 1.0]}))
    assert isinstance(validated_df, pd.DataFrame)

    # build the schema and the frame outside the raises block so that only
    # validate() itself can satisfy the expected SchemaError
    schema = DataFrameSchema({"col": Column(nullable=False)})
    df_with_null = pd.DataFrame({"col": [-123.1, None, 1.0]})
    with pytest.raises(errors.SchemaError):
        schema.validate(df_with_null)
def test_check_input_method_decorators():
    """Check check_input/check_output on methods that add a column to the frame.

    The input schema requires ``column1`` and the output schema ``column2``,
    which each decorated method adds. The methods differ in how the
    dataframe argument is located: default, positional index, or name.
    """
    in_schema = DataFrameSchema({"column1": Column(String)})
    out_schema = DataFrameSchema({"column2": Column(Int)})
    dataframe = pd.DataFrame({"column1": ["a", "b", "c"]})

    def _transform_helper(df):
        return df.assign(column2=[1, 2, 3])

    class TransformerClass:
        """Methods with different signatures, one per way of calling the
        decorators."""

        # pylint: disable=E0012,C0111,C0116,W0613, R0201
        # disables missing-function-docstring as this is a factory method
        # disables unused-arguments because handling the second argument is
        # what is being tested and this is intentional.
        # disables no-self-use because having TransformerClass with functions
        # is cleaner.

        @check_input(in_schema)
        @check_output(out_schema)
        def transform_first_arg(self, df):
            return _transform_helper(df)

        @check_input(in_schema, 0)
        @check_output(out_schema)
        def transform_first_arg_with_list_getter(self, df):
            return _transform_helper(df)

        @check_input(in_schema, 1)
        @check_output(out_schema)
        def transform_second_arg_with_list_getter(self, x, df):
            return _transform_helper(df)

        @check_input(in_schema, "df")
        @check_output(out_schema)
        def transform_second_arg_with_dict_getter(self, x, df):
            return _transform_helper(df)

    transformer = TransformerClass()
    # each entry: bound method and the positional arguments to call it with
    invocations = [
        (transformer.transform_first_arg, (dataframe,)),
        (transformer.transform_first_arg_with_list_getter, (dataframe,)),
        (transformer.transform_second_arg_with_list_getter, (None, dataframe)),
        (transformer.transform_second_arg_with_dict_getter, (None, dataframe)),
    ]
    for bound_method, call_args in invocations:
        result_df = bound_method(*call_args)
        assert isinstance(result_df, pd.DataFrame)
        assert "column2" in result_df.columns
def test_dataframe_schema_check():
    """Check DataFrameSchema-level Checks for each shape of check result."""
    frame = pd.DataFrame([range(10) for _ in range(10)])

    check_functions = (
        # reduced with .all() over the comparison frame
        lambda df: (df < 10).all(),
        # comparison on a single column
        lambda df: df[0] < 10,
        # comparison on the whole frame
        lambda df: df < 10,
    )
    for check_fn in check_functions:
        schema = DataFrameSchema(checks=Check(check_fn))
        validated = schema.validate(frame)
        assert isinstance(validated, pd.DataFrame)
def test_dataframe_checks():
    """Validate dataframe-wide checks, groupby checks and element-wise checks.

    Also confirms that a frame violating a dataframe-wide check raises
    ``SchemaError``.
    """
    all_columns = {
        "col1": Column(Int),
        "col2": Column(Float),
        "col3": Column(String),
        "col4": Column(String),
    }
    wide_schema = DataFrameSchema(
        columns=all_columns,
        checks=[
            Check(lambda df: df["col1"] < df["col2"]),
            Check(lambda df: df["col3"] == df["col4"]),
        ],
    )
    good = pd.DataFrame(
        {
            "col1": [1, 2, 3],
            "col2": [2.0, 3.0, 4.0],
            "col3": ["foo", "bar", "baz"],
            "col4": ["foo", "bar", "baz"],
        }
    )
    result = wide_schema.validate(good)
    assert isinstance(result, pd.DataFrame)

    # test invalid schema error raising
    bad = good.copy()
    bad["col1"] = bad["col1"] * 3
    with pytest.raises(errors.SchemaError):
        wide_schema.validate(bad)

    # test groupby checks
    by_group_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col3": Column(String),
        },
        checks=[
            Check(lambda g: g["foo"]["col1"].iat[0] == 1, groupby="col3"),
            Check(lambda g: g["foo"]["col2"].iat[0] == 2.0, groupby="col3"),
            Check(lambda g: g["foo"]["col3"].iat[0] == "foo", groupby="col3"),
            Check(
                lambda g: g[("foo", "foo")]["col1"].iat[0] == 1,
                groupby=["col3", "col4"],
            ),
        ],
    )
    result = by_group_schema.validate(good)
    assert isinstance(result, pd.DataFrame)

    # test element-wise checks
    per_row_schema = DataFrameSchema(
        columns={
            "col1": Column(Int),
            "col2": Column(Float),
        },
        checks=Check(
            lambda row: row["col1"] < row["col2"], element_wise=True
        ),
    )
    result = per_row_schema.validate(good)
    assert isinstance(result, pd.DataFrame)
def test_no_dtype_dataframe():
    """Test how nullability is handled in DataFrameSchemas where no type is
    specified.

    Covers a non-nullable column without nulls (passes), a nullable column
    with a null (passes) and a non-nullable column with a null (must raise
    ``SchemaError``).
    """
    schema = DataFrameSchema({"col": Column(nullable=False)})
    validated_df = schema.validate(pd.DataFrame({"col": [-123.1, -76.3, 1.0]}))
    assert isinstance(validated_df, pd.DataFrame)

    schema = DataFrameSchema({"col": Column(nullable=True)})
    validated_df = schema.validate(pd.DataFrame({"col": [-123.1, None, 1.0]}))
    assert isinstance(validated_df, pd.DataFrame)

    # keep schema and frame construction out of the raises block: the
    # expected SchemaError must come from validate(), not from setup code
    schema = DataFrameSchema({"col": Column(nullable=False)})
    frame_with_null = pd.DataFrame({"col": [-123.1, None, 1.0]})
    with pytest.raises(errors.SchemaError):
        schema.validate(frame_with_null)
def test_python_builtin_types():
    """Check that Python builtin types are accepted as column dtypes.

    After validating a matching frame, each column's entry in
    ``schema.dtype`` must equal the ``str_alias`` of the corresponding
    ``PandasDtype`` member.
    """
    builtin_types = {
        "int_col": int,
        "float_col": float,
        "str_col": str,
        "bool_col": bool,
        "object_col": object,
        "complex_col": complex,
    }
    schema = DataFrameSchema(
        {name: Column(py_type) for name, py_type in builtin_types.items()}
    )
    frame = pd.DataFrame(
        {
            "int_col": [1, 2, 3],
            "float_col": [1.0, 2.0, 3.0],
            "str_col": list("abc"),
            "bool_col": [True, False, True],
            "object_col": [[1], 1, {"foo": "bar"}],
            "complex_col": [complex(1), complex(2), complex(3)],
        }
    )
    assert isinstance(schema(frame), pd.DataFrame)

    expected_dtypes = {
        "int_col": PandasDtype.Int,
        "float_col": PandasDtype.Float,
        "str_col": PandasDtype.Str,
        "bool_col": PandasDtype.Bool,
        "object_col": PandasDtype.Object,
        "complex_col": PandasDtype.Complex,
    }
    for column_name, pandas_dtype in expected_dtypes.items():
        assert schema.dtype[column_name] == pandas_dtype.str_alias
def test_schema_get_dtype():
    """Check how ``dtype`` and ``get_dtype`` treat regex columns.

    ``schema.dtype`` must emit one UserWarning and omit the regex column;
    ``schema.get_dtype(data)`` must list each matching column of the frame.
    """
    schema = DataFrameSchema(
        {
            "col1": Column(Int),
            "var*": Column(Float, regex=True),
        }
    )
    frame = pd.DataFrame(
        {
            "col1": [1, 2, 3],
            "var1": [1.0, 1.1, 1.2],
            "var2": [1.0, 1.1, 1.2],
            "var3": [1.0, 1.1, 1.2],
        }
    )

    # the dtype property cannot expand regex columns without data
    with pytest.warns(UserWarning) as captured:
        assert schema.dtype == {"col1": Int.str_alias}
    assert len(captured) == 1
    warning_text = captured[0].message.args[0]
    assert warning_text.startswith(
        "Schema has columns specified as regex column names:"
    )

    # given data, the regex column is expanded to the matching names
    expected_dtypes = {
        "col1": Int.str_alias,
        "var1": Float.str_alias,
        "var2": Float.str_alias,
        "var3": Float.str_alias,
    }
    assert schema.get_dtype(frame) == expected_dtypes
def test_required():
    """Check handling of ``required=False`` columns and implicitly required ones.

    ``col1`` is optional and ``col2`` keeps the default. Validation must pass
    with ``col1`` absent or present, and must raise when ``col2`` is missing.
    """
    schema = DataFrameSchema(
        {
            "col1": Column(Int, required=False),
            "col2": Column(String),
        }
    )

    # optional column absent
    only_col2 = pd.DataFrame({"col2": ["hello", "world"]})
    validated = schema.validate(only_col2)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 1
    assert set(validated.columns) == {"col2"}

    # optional column present
    both_columns = pd.DataFrame(
        {
            "col1": [1, 2],
            "col2": ["hello", "world"],
        }
    )
    validated = schema.validate(both_columns)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 2
    assert set(validated.columns) == {"col1", "col2"}

    # implicitly required column absent
    only_col1 = pd.DataFrame({"col1": [1, 2]})
    with pytest.raises(Exception):
        schema.validate(only_col1)
def test_column_regex_strict() -> None:
    """Check regex columns in combination with ``strict=True``."""
    foo_frame = pd.DataFrame(
        {
            "foo_1": [1, 2, 3],
            "foo_2": [1, 2, 3],
            "foo_3": [1, 2, 3],
        }
    )
    strict_schema = DataFrameSchema(
        columns={"foo_*": Column(Int, regex=True)},
        strict=True,
    )
    assert isinstance(strict_schema.validate(foo_frame), pd.DataFrame)

    # adding an extra column in the dataframe should cause error
    frame_with_bar = foo_frame.assign(bar=[1, 2, 3])
    with pytest.raises(errors.SchemaError):
        strict_schema.validate(frame_with_bar)

    # adding an extra regex column to the schema should pass the strictness
    # test
    widened_schema = strict_schema.add_columns(
        {"bar_*": Column(Int, regex=True)}
    )
    frame_with_bar_1 = frame_with_bar.assign(bar_1=[1, 2, 3])
    validated = widened_schema.validate(frame_with_bar_1)
    assert isinstance(validated, pd.DataFrame)
def test_multi_index_columns() -> None:
    """Check that a schema keyed by tuple column names validates a matching frame."""
    tuple_keyed_columns = {
        ("zero", "foo"): Column(Float, Check(lambda s: (s > 0) & (s < 1))),
        ("zero", "bar"): Column(
            String, Check(lambda s: s.isin(["a", "b", "c", "d"]))
        ),
        ("one", "foo"): Column(Int, Check(lambda s: (s > 0) & (s < 10))),
        ("one", "bar"): Column(
            DateTime, Check(lambda s: s == pd.Timestamp(2019, 1, 1))
        ),
    }
    schema = DataFrameSchema(tuple_keyed_columns)

    frame = pd.DataFrame(
        {
            ("zero", "foo"): [0.1, 0.2, 0.7, 0.3],
            ("zero", "bar"): ["a", "b", "c", "d"],
            ("one", "foo"): [1, 6, 4, 7],
            ("one", "bar"): pd.to_datetime(["2019/01/01"] * 4),
        }
    )
    validated = schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)
def init_schema_no_groupby_column():
    """Construct a schema whose check groups by a column it does not define.

    ``col1`` carries a check grouped by ``col2``, which is not among the
    schema's columns. The schema is built and discarded; nothing is returned.
    """
    # NOTE(review): presumably called inside a pytest.raises block by a
    # test outside this view - confirm against the caller
    grouped_check = Check(lambda s: s["foo"] > 10, groupby=["col2"])
    DataFrameSchema({"col1": Column(Int, [grouped_check])})
def test_datetime():
    """Check validation of a DateTime column with a minimum-timestamp check."""
    after_2015 = Check(lambda s: s.min() > pd.Timestamp("2015"))
    schema = DataFrameSchema(
        columns={"col": Column(pa.DateTime, checks=after_2015)}
    )

    # every timestamp is later than 2015
    recent_dates = pd.to_datetime(["2019/01/01", "2018/05/21", "2016/03/10"])
    validated = schema.validate(pd.DataFrame({"col": recent_dates}))
    assert isinstance(validated, pd.DataFrame)

    # the only timestamp is earlier than 2015
    with pytest.raises(SchemaError):
        schema.validate(pd.DataFrame({"col": pd.to_datetime(["2010/01/01"])}))
def test_check_groupby_multiple_columns():
    """Check a column check that groups by two other columns at once.

    The check on ``col1`` groups by ``col2`` and ``col3`` and inspects the
    ``("bar", True)`` group.
    """
    bar_true_sum_check = Check(
        lambda s: s[("bar", True)].sum() == 16,  # 7 + 9
        groupby=["col2", "col3"],
    )
    schema = DataFrameSchema(
        {
            "col1": Column(Int, [bar_true_sum_check]),
            "col2": Column(String, Check(lambda s: s.isin(["foo", "bar"]))),
            "col3": Column(Bool),
        }
    )

    passing_frame = pd.DataFrame(
        {
            "col1": [7, 8, 9, 11, 12, 13],
            "col2": ["bar", "bar", "bar", "foo", "foo", "foo"],
            "col3": [True, False, True, False, True, False],
        }
    )
    validated = schema.validate(passing_frame)
    assert isinstance(validated, pd.DataFrame)
    assert len(validated.columns) == 3
    assert set(validated.columns) == {"col1", "col2", "col3"}
def test_column_in_dataframe_schema():
    """Check that validating with an element-wise Column check returns a DataFrame."""
    positive_check = Check(lambda x: x > 0, element_wise=True)
    schema = DataFrameSchema({"a": Column(Int, positive_check)})

    frame = pd.DataFrame({"a": [1, 2, 3]})
    validated = schema.validate(frame)
    assert isinstance(validated, pd.DataFrame)