def tests_multi_index_subindex_coerce():
    """MultiIndex component should override sub indexes.

    Builds a four-level integer MultiIndex and validates it twice:

    * with ``coerce=False`` on the ``DataFrameSchema``, each level is
      compared against its own ``Index`` dtype when that ``Index`` has
      ``coerce=True`` and against ``"object"`` otherwise;
    * with ``coerce=True`` on the ``DataFrameSchema``, every level is
      compared against the dtype of its ``Index`` component.
    """
    indexes = [
        Index(String, coerce=True),
        Index(String, coerce=False),
        Index(String, coerce=True),
        Index(String, coerce=False),
    ]
    data = pd.DataFrame(index=pd.MultiIndex.from_arrays([[1, 2, 3, 4]] * 4))

    schema = DataFrameSchema(index=MultiIndex(indexes), coerce=False)
    validated_df = schema(data)
    for level_i in range(validated_df.index.nlevels):
        level_dtype = validated_df.index.get_level_values(level_i).dtype
        if indexes[level_i].coerce:
            assert level_dtype == indexes[level_i].dtype
        else:
            # dtype should be string representation of pandas strings
            assert level_dtype == "object"

    # coerce=True in MultiIndex should override subindex coerce setting
    schema_override = DataFrameSchema(index=MultiIndex(indexes), coerce=True)
    validated_df_override = schema_override(data)
    # Iterate over the levels of the frame under test, not the earlier one.
    for level_i in range(validated_df_override.index.nlevels):
        level_dtype = validated_df_override.index.get_level_values(
            level_i
        ).dtype
        assert level_dtype == indexes[level_i].dtype
def tests_multi_index_subindex_coerce():
    """Check how MultiIndex-level coercion interacts with sub-index settings.

    With ``coerce=True`` set either on the ``MultiIndex`` or on the
    ``DataFrameSchema``, every validated level is asserted to have dtype
    ``"object"``. Without it, lazy validation is expected to raise
    ``SchemaErrors`` reporting two errors.
    """
    indexes = [
        Index(String, coerce=True),
        Index(String, coerce=False),
        Index(String, coerce=True),
        Index(String, coerce=False),
    ]
    level_values = [[1, 2, 3, 4]] * 4
    data = pd.DataFrame(index=pd.MultiIndex.from_arrays(level_values))

    # coerce=True in MultiIndex and DataFrameSchema should override subindex
    # coerce setting
    overriding_schemas = [
        DataFrameSchema(index=MultiIndex(indexes, coerce=True)),
        DataFrameSchema(index=MultiIndex(indexes), coerce=True),
    ]
    for schema_override in overriding_schemas:
        coerced_index = schema_override(data).index
        for level_i in range(coerced_index.nlevels):
            level_dtype = coerced_index.get_level_values(level_i).dtype
            assert level_dtype == "object"

    # coerce=False at the MultiIndex level should result in two type errors
    schema = DataFrameSchema(index=MultiIndex(indexes))
    expected_message = "A total of 2 schema errors were found"
    with pytest.raises(errors.SchemaErrors, match=expected_message):
        schema(data, lazy=True)
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping form.

    :param serialized_schema: mapping with the keys ``"columns"``,
        ``"index"``, ``"coerce"`` and ``"strict"``. ``"columns"`` maps column
        names to component statistics (or is ``None``); ``"index"`` is a
        sequence of component statistics (or is ``None``).
    :returns: a ``DataFrameSchema`` built from the deserialized components.
    :raises KeyError: if one of the four keys above is missing.
    """
    # pylint: disable-all
    from pandera import DataFrameSchema, Column, Index, MultiIndex

    # Default to an empty mapping so a schema serialized without columns
    # does not fail when the columns are handed to DataFrameSchema below.
    columns = {}
    index = None

    if serialized_schema["columns"] is not None:
        columns = {
            col_name: Column(**_deserialize_component_stats(column_stats))
            for col_name, column_stats in serialized_schema["columns"].items()
        }

    if serialized_schema["index"] is not None:
        index = [
            _deserialize_component_stats(index_component)
            for index_component in serialized_schema["index"]
        ]

    # A single serialized component becomes a plain Index; any other
    # non-None list becomes a MultiIndex.
    if index is None:
        pass
    elif len(index) == 1:
        index = Index(**index[0])
    else:
        index = MultiIndex(
            indexes=[Index(**index_properties) for index_properties in index]
        )

    return DataFrameSchema(
        columns=columns,
        index=index,
        coerce=serialized_schema["coerce"],
        strict=serialized_schema["strict"],
    )
def test_multi_index_index():
    """Validate a DataFrame whose MultiIndex levels each carry a check.

    A frame with in-range index values must validate; replacing the first
    ``index0`` value with ``-1`` must raise ``SchemaError``.
    """
    level_names = ["index0", "index1"]
    labels = ["foo", "bar", "foo", "bar", "foo"]

    index_components = [
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(String, Check(lambda s: s.isin(["foo", "bar"])), name="index1"),
    ]
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(indexes=index_components),
    )

    df = pd.DataFrame(
        data={
            "column1": [0.1, 0.5, 123.1, 10.6, 22.31],
            "column2": [0.1, 0.5, 123.1, 10.6, 22.31],
        },
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], labels],
            names=level_names,
        ),
    )

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)

    # failure case
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], labels],
        names=level_names,
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
def test_series_schema_with_index(coerce):
    """Exercise SeriesSchema with a single Index and with a MultiIndex.

    :param coerce: value forwarded to the ``coerce`` argument of every
        ``Index`` component built in this test.
    """
    # Single-level index.
    schema_with_index = SeriesSchema(
        pandas_dtype=Int,
        index=Index(Int, coerce=coerce),
    )
    plain_series = pd.Series([1, 2, 3], index=[1, 2, 3])
    validated_series = schema_with_index(plain_series)
    assert isinstance(validated_series, pd.Series)

    # Two-level index.
    level_schemas = [
        Index(Int, coerce=coerce),
        Index(String, coerce=coerce),
    ]
    schema_with_multiindex = SeriesSchema(
        pandas_dtype=Int,
        index=MultiIndex(level_schemas),
    )
    multi_index = pd.MultiIndex.from_arrays(
        [[0, 1, 2], ["foo", "bar", "foo"]],
    )
    multi_series = pd.Series([1, 2, 3], index=multi_index)
    validated_series_multiindex = schema_with_multiindex(multi_series)

    assert isinstance(validated_series_multiindex, pd.Series)
    # The index of the validated series must equal the input index.
    assert (validated_series_multiindex.index == multi_index).all()
def schema_multiindex():
    """Fixture returning a DataFrameSchema with a two-level MultiIndex.

    The schema declares an ``Int`` column ``col1``, a ``Float`` column
    ``col2`` and two ``String`` index levels named ``ind0`` and ``ind1``.
    """
    columns = {
        "col1": Column(pandas_dtype=Int),
        "col2": Column(pandas_dtype=Float),
    }
    index_levels = [
        Index(pandas_dtype=String, name="ind0"),
        Index(pandas_dtype=String, name="ind1"),
    ]
    return DataFrameSchema(columns=columns, index=MultiIndex(index_levels))
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping form.

    A falsy ``serialized_schema`` is treated as an empty mapping. Missing
    ``"columns"``, ``"index"`` and ``"checks"`` entries are left as ``None``;
    missing ``"coerce"`` and ``"strict"`` entries default to ``False``.

    :param serialized_schema: mapping describing the schema, or a falsy value.
    :returns: the reconstructed ``DataFrameSchema``.
    :raises pandera.errors.SchemaDefinitionError: if a truthy
        ``serialized_schema`` is not a mapping.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Index, MultiIndex

    # GH#475
    serialized_schema = serialized_schema or {}

    if not isinstance(serialized_schema, Mapping):
        raise pandera.errors.SchemaDefinitionError(
            "Schema representation must be a mapping."
        )

    columns = serialized_schema.get("columns")
    index = serialized_schema.get("index")
    checks = serialized_schema.get("checks")

    if columns is not None:
        deserialized_columns = {}
        for col_name, column_stats in columns.items():
            column_kwargs = _deserialize_component_stats(column_stats)
            deserialized_columns[col_name] = Column(**column_kwargs)
        columns = deserialized_columns

    if index is not None:
        index = list(map(_deserialize_component_stats, index))

    if checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        deserialized_checks = []
        for check_name, check_stats in checks.items():
            check_fn = getattr(Check, check_name)
            deserialized_checks.append(
                _deserialize_check_stats(check_fn, check_stats)
            )
        checks = deserialized_checks

    # One serialized component yields an Index; any other non-None list
    # yields a MultiIndex.
    if index is not None:
        if len(index) == 1:
            index = Index(**index[0])
        else:
            index = MultiIndex(
                indexes=[Index(**index_properties) for index_properties in index]
            )

    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=serialized_schema.get("coerce", False),
        strict=serialized_schema.get("strict", False),
    )
def test_schema_component_equality_operators():
    """Test ``==`` and ``!=`` for Column, Index and MultiIndex components.

    Each component must compare equal to a deep copy of itself and unequal
    to an unrelated ``DataFrameSchema``.
    """
    column = Column(Int, Check(lambda s: s >= 0))
    index = Index(Int, [Check(lambda x: 1 <= x <= 11, element_wise=True)])
    multi_index = MultiIndex(
        indexes=[
            Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
            Index(
                String,
                Check(lambda s: s.isin(["foo", "bar"])),
                name="index1",
            ),
        ]
    )
    not_equal_schema = DataFrameSchema(
        {"col1": Column(Int, Check(lambda s: s >= 0))}
    )

    for component in (column, index, multi_index):
        assert component == copy.deepcopy(component)
        assert component != not_equal_schema
def test_multi_index_schema_coerce():
    """Check that each validated MultiIndex level has its declared dtype.

    A three-level index (float, int and string values) is validated against
    ``Float``, ``Int`` and ``String`` index components; every level of the
    result must report the dtype of the matching component.
    """
    indexes = [
        Index(Float),
        Index(Int),
        Index(String),
    ]
    schema = DataFrameSchema(index=MultiIndex(indexes=indexes))

    level_arrays = [
        [1.0, 2.1, 3.5, 4.8],
        [5, 6, 7, 8],
        ["9", "10", "11", "12"],
    ]
    df = pd.DataFrame(index=pd.MultiIndex.from_arrays(level_arrays))

    validated_index = schema(df).index
    for level_i in range(validated_index.nlevels):
        actual_dtype = validated_index.get_level_values(level_i).dtype
        assert actual_dtype == indexes[level_i].dtype
def test_multi_index_index():
    """Check that MultiIndex components validate DataFrame indexes.

    Covers a passing frame, the ``names`` and ``repr`` of the schema's
    MultiIndex, and a failing frame whose first ``index0`` value is ``-1``.
    """
    level_names = ["index0", "index1"]
    labels = ["foo", "bar", "foo", "bar", "foo"]
    values = [0.1, 0.5, 123.1, 10.6, 22.31]

    index_components = [
        Index(Int, Check(lambda s: (s < 5) & (s >= 0)), name="index0"),
        Index(
            String,
            Check(lambda s: s.isin(["foo", "bar"])),
            name="index1",
        ),
    ]
    schema = DataFrameSchema(
        columns={
            "column1": Column(Float, Check(lambda s: s > 0)),
            "column2": Column(Float, Check(lambda s: s > 0)),
        },
        index=MultiIndex(indexes=index_components),
    )

    df = pd.DataFrame(
        data={"column1": list(values), "column2": list(values)},
        index=pd.MultiIndex.from_arrays(
            [[0, 1, 2, 3, 4], labels],
            names=level_names,
        ),
    )

    validated_df = schema.validate(df)
    assert isinstance(validated_df, pd.DataFrame)
    assert schema.index.names == ["index0", "index1"]
    assert (
        repr(schema.index)
        == f"<Schema MultiIndex: '{schema.index.names}'>"
    )

    # failure case
    df_fail = df.copy()
    df_fail.index = pd.MultiIndex.from_arrays(
        [[-1, 1, 2, 3, 4], labels],
        names=level_names,
    )
    with pytest.raises(errors.SchemaError):
        schema.validate(df_fail)
def _deserialize_schema(serialized_schema):
    """Rebuild a ``DataFrameSchema`` from its serialized mapping form.

    :param serialized_schema: mapping with the keys ``"columns"``,
        ``"index"``, ``"checks"``, ``"coerce"`` and ``"strict"``. The first
        three may hold ``None``.
    :returns: the reconstructed ``DataFrameSchema``.
    :raises KeyError: if one of the keys above is missing.
    :raises AttributeError: if a serialized check name is not an attribute
        of ``Check``.
    """
    # pylint: disable=import-outside-toplevel
    from pandera import Check, Column, DataFrameSchema, Index, MultiIndex

    columns = None
    index = None
    checks = None

    serialized_columns = serialized_schema["columns"]
    if serialized_columns is not None:
        columns = {}
        for col_name, column_stats in serialized_columns.items():
            column_kwargs = _deserialize_component_stats(column_stats)
            columns[col_name] = Column(**column_kwargs)

    serialized_index = serialized_schema["index"]
    if serialized_index is not None:
        index = list(map(_deserialize_component_stats, serialized_index))

    serialized_checks = serialized_schema["checks"]
    if serialized_checks is not None:
        # handles unregistered checks by raising AttributeErrors from getattr
        checks = []
        for check_name, check_stats in serialized_checks.items():
            check_fn = getattr(Check, check_name)
            checks.append(_deserialize_check_stats(check_fn, check_stats))

    # One serialized component yields an Index; any other non-None list
    # yields a MultiIndex.
    if index is not None:
        if len(index) == 1:
            index = Index(**index[0])
        else:
            index = MultiIndex(
                indexes=[Index(**index_properties) for index_properties in index]
            )

    return DataFrameSchema(
        columns=columns,
        checks=checks,
        index=index,
        coerce=serialized_schema["coerce"],
        strict=serialized_schema["strict"],
    )
COLUMN_TEMPLATE = """ Column( dtype={dtype}, checks={checks}, nullable={nullable}, allow_duplicates={allow_duplicates}, coerce={coerce}, required={required}, regex={regex}, ) """ INDEX_TEMPLATE = ("Index(dtype={dtype},checks={checks}," "nullable={nullable},coerce={coerce},name={name})") MULTIINDEX_TEMPLATE = """ MultiIndex(indexes=[{indexes}]) """ def _format_checks(checks_dict): if checks_dict is None: return "None" checks = [] for check_name, check_kwargs in checks_dict.items(): if check_kwargs is None: warnings.warn(f"Check {check_name} cannot be serialized. " "This check will be ignored") else: args = ", ".join(f"{k}={v.__repr__()}"
def test_multiindex_incorrect_input(indexes):
    """Constructing a MultiIndex from invalid ``indexes`` must fail.

    :param indexes: the value passed to ``MultiIndex``; construction is
        expected to raise ``SchemaInitError`` or ``TypeError``.
    """
    expected_errors = (errors.SchemaInitError, TypeError)
    with pytest.raises(expected_errors):
        MultiIndex(indexes)
def test_multiindex_unordered_init_exception(indexes):
    """An unordered MultiIndex built from ``indexes`` must fail to initialize.

    :param indexes: the index components passed to ``MultiIndex`` together
        with ``ordered=False``; ``SchemaInitError`` is expected.
    """
    expected_error = errors.SchemaInitError
    with pytest.raises(expected_error):
        MultiIndex(indexes, ordered=False)
[ pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["a", "a" ]), False, ], [ pd.MultiIndex.from_arrays([[1, 2, 3], ["a", "b", "c"]], names=["a", "a"]), True, ], ], ) @pytest.mark.parametrize( "schema", [ MultiIndex([Index(int, name="a"), Index(int, name="a")]), MultiIndex( [Index(int, name="a"), Index(int, name="a")], coerce=True), ], ) def test_multiindex_duplicate_index_names(multiindex, error, schema): """Test MultiIndex schema component can handle duplicate index names.""" if error: with pytest.raises(errors.SchemaError): schema(pd.DataFrame(index=multiindex)) with pytest.raises(errors.SchemaErrors): schema(pd.DataFrame(index=multiindex), lazy=True) else: assert isinstance(schema(pd.DataFrame(index=multiindex)), pd.DataFrame)
DataFrameSchema(columns={"new_name": Column(name="old_name")})
# NOTE(review): the statement above looks like the tail of a definition that
# starts outside this view; it is kept verbatim.


@pytest.mark.parametrize(
    "columns,index",
    [
        (
            {
                "a": Column(Int, required=False),
                "b": Column(Int, required=False),
            },
            None,
        ),
        (
            None,
            MultiIndex(
                indexes=[Index(Int, name="a"), Index(Int, name="b")],
            ),
        ),
    ],
)
def test_ordered_dataframe(columns: Dict[str, Column], index: MultiIndex):
    """Check that an ``ordered=True`` schema accepts the sample frame.

    The frame has columns and index levels named ``a``, ``a``, ``b``;
    validation must return a ``DataFrame``.

    :param columns: column components for the schema, or ``None``.
    :param index: MultiIndex component for the schema, or ``None``.
    """
    schema = DataFrameSchema(columns=columns, index=index, ordered=True)

    names = ["a", "a", "b"]
    frame_index = pd.MultiIndex.from_arrays([[1], [2], [3]], names=names)
    df = pd.DataFrame(data=[[1, 2, 3]], columns=names, index=frame_index)

    validated = schema.validate(df)
    assert isinstance(validated, pd.DataFrame)