예제 #1
0
def test_dataframeschema():
    """Check that DataFrameSchema and MultiIndex can be used as pydantic fields."""
    # Build the pandera objects up front, then hand them to the model.
    empty_schema = pa.DataFrameSchema()
    two_level_index = pa.MultiIndex([pa.Index(str), pa.Index(int)])

    model = DataFrameSchemaPydantic(
        pa_schema=empty_schema,
        pa_mi=two_level_index,
    )
    assert isinstance(model, DataFrameSchemaPydantic)
예제 #2
0
def test_dataframe_strategy_with_indexes(pdtype, data):
    """Check dataframe strategies for schemas with an Index or a MultiIndex.

    For each schema, a dataframe of size 10 is drawn from the schema's own
    strategy and then validated by that same schema.
    """
    single_index_schema = pa.DataFrameSchema(index=pa.Index(pdtype))

    level_components = [
        pa.Index(pdtype, name=f"index{i}") for i in range(3)
    ]
    multi_index_schema = pa.DataFrameSchema(
        index=pa.MultiIndex(level_components))

    for schema in (single_index_schema, multi_index_schema):
        drawn = data.draw(schema.strategy(size=10))
        schema(drawn)
예제 #3
0
def _create_schema(index="single"):
    """Build a strict, non-coercing DataFrameSchema with five checked columns.

    ``index`` selects the index component of the schema:

    * ``"multi"``  -- a MultiIndex of three int levels ``int_index0..2``
    * ``"single"`` -- a single int Index named ``int_index``
    * anything else -- no index component
    """
    if index == "multi":
        index_component = pa.MultiIndex([
            pa.Index(pa.Int, name=f"int_index{level}") for level in range(3)
        ])
    elif index == "single":
        index_component = pa.Index(pa.Int, name="int_index")
    else:
        index_component = None

    int_checks = [
        pa.Check.greater_than(0),
        pa.Check.less_than(10),
        pa.Check.in_range(0, 10),
    ]
    float_checks = [
        pa.Check.greater_than(-10),
        pa.Check.less_than(20),
        pa.Check.in_range(-10, 20),
    ]
    str_checks = [
        pa.Check.isin(["foo", "bar", "x", "xy"]),
        pa.Check.str_length(1, 3),
    ]
    datetime_checks = [
        pa.Check.greater_than(pd.Timestamp("20100101")),
        pa.Check.less_than(pd.Timestamp("20200101")),
    ]
    timedelta_checks = [
        pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
        pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
    ]

    columns = {
        "int_column": pa.Column(pa.Int, checks=int_checks),
        "float_column": pa.Column(pa.Float, checks=float_checks),
        "str_column": pa.Column(pa.String, checks=str_checks),
        "datetime_column": pa.Column(pa.DateTime, checks=datetime_checks),
        "timedelta_column": pa.Column(pa.Timedelta, checks=timedelta_checks),
    }
    return pa.DataFrameSchema(
        columns=columns,
        index=index_component,
        coerce=False,
        strict=True,
    )
예제 #4
0
def test_multiindex():
    """Check that a model with several Index annotations yields a MultiIndex."""
    class Schema(pa.SchemaModel):
        a: Index[int] = pa.Field(gt=0)
        b: Index[str]

    # One schema level per annotated Index field, in declaration order.
    level_a = pa.Index(int, name="a", checks=pa.Check.gt(0))
    level_b = pa.Index(str, name="b")
    expected = pa.DataFrameSchema(index=pa.MultiIndex([level_a, level_b]))

    assert expected == Schema.to_schema()
예제 #5
0
def test_multiindex_example():
    """
    Check that examples produced by ``MultiIndex.example()`` validate against
    the MultiIndex component that produced them.
    """
    float_dtype = pa.PandasDtype.Float
    unique_level = pa.Index(float_dtype, allow_duplicates=False, name="level_0")
    nullable_level = pa.Index(float_dtype, nullable=True)
    plain_level = pa.Index(float_dtype)
    multiindex = pa.MultiIndex(
        indexes=[unique_level, nullable_level, plain_level])

    # Generate and validate ten independent examples.
    for _attempt in range(10):
        generated_index = multiindex.example()
        frame = pd.DataFrame(index=generated_index)
        multiindex(frame)
예제 #6
0
def test_multiindex_example() -> None:
    """
    Check that examples produced by ``MultiIndex.example()`` validate against
    the MultiIndex component that produced them.
    """
    float_type = pa.Float()
    # Keyword options for each level; the data type is shared by all three.
    level_options = [
        {"unique": True, "name": "level_0"},
        {"nullable": True},
        {},
    ]
    multiindex = pa.MultiIndex(
        indexes=[pa.Index(float_type, **options) for options in level_options])

    for _attempt in range(10):
        generated_index = multiindex.example()
        multiindex(pd.DataFrame(index=generated_index))
예제 #7
0
파일: event.py 프로젝트: Thelin90/heuleum
def create_event_schema(
    coerce: bool = True,
    strict: bool = True,
    nullable: bool = False,
):
    """Build the pandera schema used to validate event dataframes.

    The schema declares three columns -- ``id`` (string), ``timestamp``
    (datetime) and ``version`` (string) -- plus an integer index.

    Args:
        coerce (bool): Passed to ``DataFrameSchema``; whether to coerce series
            to the specified type.
        strict (bool): Passed to ``DataFrameSchema``; whether columns that are
            not declared in the schema are rejected.
        nullable (bool): Applied to every column; whether nulls are allowed.

    Returns: The ``pa.DataFrameSchema`` describing an event dataframe.

    """
    column_dtypes = {
        "id": pa.String,
        "timestamp": pa.DateTime,
        "version": pa.String,
    }
    columns = {
        column_name: pa.Column(dtype, nullable=nullable)
        for column_name, dtype in column_dtypes.items()
    }
    return pa.DataFrameSchema(
        columns,
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )
예제 #8
0
def create_invoice_stats_schema(coerce: bool = True,
                                strict: bool = True,
                                nullable: bool = True):
    """Build the pandera schema used to validate invoice statistics dataframes.

    The schema declares two ``Float64`` columns, whose names are looked up in
    ``INVOICE_STATS_COLUMN_NAMES`` under the keys ``invoice_median`` and
    ``invoice_mean``, plus an integer index.

    Args:
        coerce (bool): Passed to ``DataFrameSchema``; whether to coerce series
            to the specified type.
        strict (bool): Passed to ``DataFrameSchema``; whether columns that are
            not declared in the schema are rejected.
        nullable (bool): Applied to every column; whether nulls are allowed.

    Returns: The ``pa.DataFrameSchema`` describing an invoice stats dataframe.

    Raises:
        KeyError: If ``INVOICE_STATS_COLUMN_NAMES`` has no entry for one of
            the required keys.

    """
    # Subscript instead of ``.get()``: a missing key must fail loudly rather
    # than silently register a column named ``None`` (and, with two missing
    # keys, collapse both columns into a single dict entry).
    stat_keys = ("invoice_median", "invoice_mean")
    columns = {
        INVOICE_STATS_COLUMN_NAMES[key]: pa.Column(pa.Float64,
                                                   nullable=nullable)
        for key in stat_keys
    }
    return pa.DataFrameSchema(
        columns,
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )
예제 #9
0
def test_index_dtypes(
    dtype: pandas_engine.DataType,
    coerce: bool,
    schema_cls,
    data: st.DataObject,
):
    """Test koalas Index and MultiIndex on a subset of datatypes.

    Only basic datatypes are exercised since index handling in pandas is
    already a little finicky.
    """
    if coerce and dtype is pandas_engine.DateTime:
        pytest.skip(
            "koalas cannot coerce a koalas DateTime index to datetime.")

    if schema_cls is not pa.Index:
        # NOTE(review): coerce is hard-coded to True here instead of using
        # the ``coerce`` parameter -- confirm whether this is intentional.
        schema = schema_cls(indexes=[pa.Index(dtype, name="field")],
                            coerce=True)
    else:
        schema = schema_cls(dtype, name="field")
        schema.coerce = coerce
    sample = data.draw(schema.strategy(size=3))

    is_datetime = (dtype is pandas_engine.DateTime
                   or isinstance(dtype, pandas_engine.DateTime))
    if not is_datetime:
        koalas_frame = ks.DataFrame(pd.DataFrame(index=sample))
        assert isinstance(schema(koalas_frame), ks.DataFrame)
        return

    # Datetime handling: only the out-of-range case is checked.
    # NOTE(review): datetime samples at or above MIN_TIMESTAMP are not
    # validated at all -- confirm this is the intended coverage.
    if MIN_TIMESTAMP is None:
        return
    if (sample.to_frame() < MIN_TIMESTAMP).any(axis=None):
        with pytest.raises(OverflowError,
                           match="mktime argument out of range"):
            ks.DataFrame(pd.DataFrame(index=sample))
예제 #10
0
def test_multiindex_strategy(data):
    """Check the MultiIndex strategy and that it cannot be chained.

    Every level of a drawn MultiIndex must carry the float dtype alias, and
    calling ``multiindex_strategy`` with a base strategy must raise
    ``BaseStrategyOnlyError``.
    """
    float_dtype = pa.PandasDtype.Float
    levels = [
        pa.Index(float_dtype, allow_duplicates=False, name="level_0"),
        pa.Index(float_dtype, nullable=True),
        pa.Index(float_dtype),
    ]
    multiindex = pa.MultiIndex(indexes=levels)

    drawn = data.draw(multiindex.strategy(size=10))
    for level in range(drawn.nlevels):
        level_values = drawn.get_level_values(level)
        assert level_values.dtype == float_dtype.str_alias

    with pytest.raises(pa.errors.BaseStrategyOnlyError):
        strategies.multiindex_strategy(
            float_dtype, strategies.pandas_dtype_strategy(float_dtype))
예제 #11
0
def test_index_example() -> None:
    """
    Check that examples produced by ``Index.example()`` validate against the
    Index component that produced them.
    """
    unique_int_index = pa.Index(pa.Int(), unique=True)
    for _attempt in range(10):
        generated_index = unique_int_index.example()
        unique_int_index(pd.DataFrame(index=generated_index))
예제 #12
0
def test_index_example():
    """
    Check that examples produced by ``Index.example()`` validate against the
    Index component that produced them.
    """
    unique_int_index = pa.Index(pa.PandasDtype.Int, allow_duplicates=False)
    for _attempt in range(10):
        generated_index = unique_int_index.example()
        unique_int_index(pd.DataFrame(index=generated_index))
예제 #13
0
def test_config() -> None:
    """Check that Config is inherited and mapped onto DataFrameSchema options.

    ``Child`` overrides some of ``Base``'s Config values and adds new ones;
    the expected schema combines both.
    """
    class Base(pa.SchemaModel):
        a: Series[int]
        idx_1: Index[str]
        idx_2: Index[str]

        class Config:
            name = "Base schema"
            coerce = True
            ordered = True
            multiindex_coerce = True
            multiindex_strict = True
            multiindex_name: Optional[str] = "mi"

    class Child(Base):
        b: Series[int]

        class Config:
            name = "Child schema"
            strict = True
            multiindex_strict = False
            description = "foo"
            title = "bar"

    expected_columns = {
        column_name: pa.Column(int) for column_name in ("a", "b")
    }
    # The multiindex_* Config entries map onto the MultiIndex options.
    expected_index = pa.MultiIndex(
        [pa.Index(str, name=level_name) for level_name in ("idx_1", "idx_2")],
        coerce=True,
        strict=False,
        name="mi",
    )
    expected = pa.DataFrameSchema(
        columns=expected_columns,
        index=expected_index,
        name="Child schema",
        coerce=True,
        strict=True,
        ordered=True,
        description="foo",
        title="bar",
    )

    assert expected == Child.to_schema()
예제 #14
0
def test_register_custom_groupby_check(custom_check_teardown: None) -> None:
    """Check that a custom groupby check can be registered and used.

    The check is exercised at column level (grouping by another column) and
    at dataframe level (grouping by the index), and finally it is verified
    that passing ``element_wise`` emits a ``UserWarning``.
    """
    @extensions.register_check_method(
        statistics=["group_a", "group_b"],
        supported_types=(pd.Series, pd.DataFrame),
        check_type="groupby",
    )
    def custom_check(dict_groups, *, group_a, group_b):
        """
        Return whether the mean of group A's values exceeds that of group B.

        Works for groups of both dataframes and series.
        """
        mean_a = dict_groups[group_a].values.mean()
        mean_b = dict_groups[group_b].values.mean()
        return mean_a > mean_b

    # --- column-level groupby check ---
    column_frame = pd.DataFrame({
        "col1": [20, 20, 10, 10],
        "col2": list("aabb"),
    })
    column_level_check = Check.custom_check(
        group_a="a", group_b="b", groupby="col2")
    column_schema = pa.DataFrameSchema({
        "col1": pa.Column(int, column_level_check),
        "col2": pa.Column(str),
    })
    assert isinstance(column_schema(column_frame), pd.DataFrame)

    # --- dataframe-level groupby check ---
    frame_data = {
        "col1": [20, 20, 10, 10],
        "col2": [30, 30, 5, 5],
        "col3": [10, 10, 1, 1],
    }
    indexed_frame = pd.DataFrame(
        frame_data,
        index=pd.Index(list("aabb"), name="my_index"),
    )
    frame_schema = pa.DataFrameSchema(
        columns={
            column_name: pa.Column(int)
            for column_name in ("col1", "col2", "col3")
        },
        index=pa.Index(str, name="my_index"),
        checks=Check.custom_check(group_a="a", group_b="b",
                                  groupby="my_index"),
    )
    assert isinstance(frame_schema(indexed_frame), pd.DataFrame)

    # Either value of element_wise is expected to trigger a UserWarning.
    for element_wise_flag in (True, False):
        with pytest.warns(UserWarning):
            Check.custom_check(val=10, element_wise=element_wise_flag)
예제 #15
0
def test_index_strategy(data):
    """Check that the Index strategy yields unique, correctly typed values."""
    int_dtype = pa.PandasDtype.Int
    index_schema = pa.Index(int_dtype, allow_duplicates=False, name="index")

    drawn = data.draw(index_schema.strategy(size=10))

    has_no_duplicates = (~drawn.duplicated()).all()
    assert has_no_duplicates
    assert drawn.dtype == int_dtype.str_alias
    # The drawn index must also pass validation by its own schema.
    index_schema(pd.DataFrame(index=drawn))
예제 #16
0
def test_multiindex_strategy(data) -> None:
    """Check the MultiIndex strategy and that it cannot be chained.

    Every level of a drawn MultiIndex must pass the float data type check,
    and calling ``multiindex_strategy`` with a base strategy must raise
    ``BaseStrategyOnlyError``.
    """
    float_type = pa.Float()
    levels = [
        pa.Index(float_type, unique=True, name="level_0"),
        pa.Index(float_type, nullable=True),
        pa.Index(float_type),
    ]
    multiindex = pa.MultiIndex(indexes=levels)

    drawn = data.draw(multiindex.strategy(size=10))
    for level in range(drawn.nlevels):
        level_dtype = drawn.get_level_values(level).dtype
        assert float_type.check(pandas_engine.Engine.dtype(level_dtype))

    with pytest.raises(pa.errors.BaseStrategyOnlyError):
        strategies.multiindex_strategy(
            float_type, strategies.pandas_dtype_strategy(float_type))
예제 #17
0
def test_seriesschema():
    """Check that SeriesSchema, Column and Index can be used as pydantic fields."""
    field_values = {
        "pa_series_schema": pa.SeriesSchema(),
        "pa_column": pa.Column(),
        "pa_index": pa.Index(),
    }
    model = SeriesSchemaPydantic(**field_values)
    assert isinstance(model, SeriesSchemaPydantic)
예제 #18
0
def test_index_strategy(data) -> None:
    """Check that the Index strategy yields unique, correctly typed values."""
    int_type = pa.Int()
    index_schema = pa.Index(int_type, unique=True, name="index")

    drawn = data.draw(index_schema.strategy(size=10))

    has_no_duplicates = (~drawn.duplicated()).all()
    assert has_no_duplicates
    assert int_type.check(pandas_engine.Engine.dtype(drawn.dtype))
    # The drawn index must also pass validation by its own schema.
    index_schema(pd.DataFrame(index=drawn))
예제 #19
0
파일: rules.py 프로젝트: nialov/tracerepo
def database_schema() -> pa.DataFrameSchema:
    """
    Return the pandera DataFrame schema for database.csv.

    The schema coerces dtypes and consists of an area-name index plus the
    traces, thematic, scale, area-shape, validity and snap-threshold columns.
    """
    # Index is the area name (built with allow_duplicates=False).
    area_index = pa.Index(
        **name_column_kwargs(allow_duplicates=False, geom_type=ColumnNames.AREA)
    )

    def _name_column(geom_type):
        # Name-like column built with allow_duplicates=True.
        return pa.Column(
            **name_column_kwargs(allow_duplicates=True, geom_type=geom_type)
        )

    def _enum_column(enum_class):
        # Column whose values must be one of the enum values.
        return pa.Column(**enum_column_kwargs(enum_class=enum_class))

    snap_threshold_column = pa.Column(
        pa.Float,
        checks=[
            pa.Check.greater_than_or_equal_to(1e-8),
            pa.Check.less_than_or_equal_to(1e8),
        ],
        coerce=True,
        nullable=False,
    )

    columns = {
        # traces, thematic and scale columns are strings
        ColumnNames.TRACES.value: _name_column(ColumnNames.TRACES),
        ColumnNames.THEMATIC.value: _name_column(None),
        ColumnNames.SCALE.value: _name_column(None),
        ColumnNames.AREA_SHAPE.value: _enum_column(AreaShapes),
        ColumnNames.VALIDITY.value: _enum_column(ValidationResults),
        ColumnNames.SNAP_THRESHOLD.value: snap_threshold_column,
    }

    schema = pa.DataFrameSchema(index=area_index, coerce=True, columns=columns)
    assert isinstance(schema, pa.DataFrameSchema)
    return schema
예제 #20
0
def create_invoice_schema(
    max_invoice_value: Decimal,
    min_invoice_value: Decimal,
    coerce: bool = True,
    strict: bool = True,
    nullable: bool = False,
):
    """Build the pandera schema used to validate invoice dataframes.

    The schema declares a string column and a ``Float64`` column, whose names
    are looked up in ``INVOICE_COLUMN_NAMES`` under the keys ``invoice_name``
    and ``invoice_value``, plus an integer index. The value column is
    range-checked with inclusive bounds:

        min_invoice_value <= invoice_value <= max_invoice_value

    Args:
        max_invoice_value (Decimal): Inclusive upper bound for invoice values
        min_invoice_value (Decimal): Inclusive lower bound for invoice values
        coerce (bool): Passed to ``DataFrameSchema``; whether to coerce series
            to the specified type.
        strict (bool): Passed to ``DataFrameSchema``; whether columns that are
            not declared in the schema are rejected.
        nullable (bool): Applied to every column; whether nulls are allowed.

    Returns: The ``pa.DataFrameSchema`` describing an invoice dataframe.

    Raises:
        KeyError: If ``INVOICE_COLUMN_NAMES`` has no entry for one of the
            required keys.

    """
    # Subscript instead of ``.get()``: a missing key must fail loudly rather
    # than silently register a column named ``None``.
    name_column = INVOICE_COLUMN_NAMES["invoice_name"]
    value_column = INVOICE_COLUMN_NAMES["invoice_value"]

    value_checks = [
        pa.Check.less_than_or_equal_to(max_invoice_value),
        pa.Check.greater_than_or_equal_to(min_invoice_value),
    ]
    return pa.DataFrameSchema(
        {
            name_column: pa.Column(pa.String, nullable=nullable),
            value_column: pa.Column(
                pa.Float64,
                checks=value_checks,
                nullable=nullable,
            ),
        },
        index=pa.Index(pa.Int),
        strict=strict,
        coerce=coerce,
    )
예제 #21
0
def test_inherit_schemamodel_fields_alias():
    """Check that column and index aliases survive model inheritance.

    ``Mid`` introduces an aliased column; each child either overrides the
    attribute, overrides the alias, adds a new attribute, or adds nothing.
    """
    class Base(pa.SchemaModel):
        a: Series[int]
        idx: Index[str]

    class Mid(Base):
        b: Series[str] = pa.Field(alias="_b")
        idx: Index[str]

    class ChildOverrideAttr(Mid):
        b: Series[int]

    class ChildOverrideAlias(Mid):
        b: Series[str] = pa.Field(alias="new_b")

    class ChildNewAttr(Mid):
        c: Series[int]

    class ChildEmpty(Mid):
        pass

    mid_schema = pa.DataFrameSchema(
        columns={"a": pa.Column(int), "_b": pa.Column(str)},
        index=pa.Index(str),
    )
    # Re-annotating without a Field drops the alias and changes the dtype.
    override_attr_schema = mid_schema.rename_columns({"_b": "b"})
    override_attr_schema = override_attr_schema.update_column(
        "b", pandas_dtype=int)
    override_alias_schema = mid_schema.rename_columns({"_b": "new_b"})
    new_attr_schema = mid_schema.add_columns({"c": pa.Column(int)})

    cases = [
        (mid_schema, Mid),
        (override_attr_schema, ChildOverrideAttr),
        (override_alias_schema, ChildOverrideAlias),
        (new_attr_schema, ChildNewAttr),
        (mid_schema, ChildEmpty),
    ]
    for expected_schema, model in cases:
        assert expected_schema == model.to_schema()
예제 #22
0
def test_to_schema():
    """Check that SchemaModel.to_schema() builds the expected schema.

    Also checks that instantiating the model without arguments raises
    ``TypeError``.
    """
    class Schema(pa.SchemaModel):
        a: Series[int]
        b: Series[str]
        idx: Index[str]

    column_dtypes = {"a": int, "b": str}
    expected = pa.DataFrameSchema(
        columns={
            column_name: pa.Column(dtype)
            for column_name, dtype in column_dtypes.items()
        },
        index=pa.Index(str),
    )

    assert expected == Schema.to_schema()

    with pytest.raises(TypeError):
        Schema()
예제 #23
0
def test_schemamodel_with_fields():
    """Check that Field arguments are translated into schema checks."""
    class Schema(pa.SchemaModel):
        a: Series[int] = pa.Field(eq=9, ne=0)
        b: Series[str]
        idx: Index[str] = pa.Field(str_length={"min_value": 1})

    actual = Schema.to_schema()

    # eq/ne on the field correspond to equal_to/not_equal_to checks.
    column_a = pa.Column(
        int,
        checks=[pa.Check.equal_to(9), pa.Check.not_equal_to(0)],
    )
    column_b = pa.Column(str)
    expected = pa.DataFrameSchema(
        columns={"a": column_a, "b": column_b},
        index=pa.Index(str, pa.Check.str_length(1)),
    )

    assert actual == expected
예제 #24
0
def test_inherit_schemamodel_fields():
    """Check that columns and indices are inherited through a model chain.

    ``Child`` re-annotates ``b`` as int, so the expected schema carries the
    overridden dtype together with the inherited ``a`` column and index.
    """
    class Base(pa.SchemaModel):
        a: Series[int]
        idx: Index[str]

    class Mid(Base):
        b: Series[str]
        idx: Index[str]

    class Child(Mid):
        b: Series[int]

    expected_columns = {
        column_name: pa.Column(int) for column_name in ("a", "b")
    }
    expected = pa.DataFrameSchema(
        columns=expected_columns,
        index=pa.Index(str),
    )

    assert expected == Child.to_schema()
예제 #25
0
def test_index_dtypes(
    dtype: pandas_engine.DataType,
    coerce: bool,
    schema_cls,
    data: st.DataObject,
):
    """Test modin Index and MultiIndex on a subset of datatypes.

    Only basic datatypes are exercised since index handling in pandas is
    already a little finicky.
    """
    if schema_cls is not pa.Index:
        schema = schema_cls(indexes=[pa.Index(dtype, name="field")])
        schema.coerce = coerce
    else:
        schema = schema_cls(dtype, name="field", coerce=coerce)

    sample = data.draw(schema.strategy(size=3))

    # pandas (and modin) use object arrays to store boolean data
    if dtype is bool:
        assert sample.dtype == "object"
        return

    modin_frame = mpd.DataFrame(pd.DataFrame(index=sample))
    assert isinstance(schema(modin_frame), mpd.DataFrame)
예제 #26
0
def test_to_schema_and_validate() -> None:
    """
    Check that SchemaModel.to_schema() builds the expected schema and that
    calling the model validates dataframe objects.
    """
    class Schema(pa.SchemaModel):
        a: Series[int]
        b: Series[str]
        c: Series[Any]
        idx: Index[str]

    # A Series[Any] annotation corresponds to a Column without a dtype.
    expected_columns = {
        "a": pa.Column(int),
        "b": pa.Column(str),
        "c": pa.Column(),
    }
    expected = pa.DataFrameSchema(
        columns=expected_columns,
        index=pa.Index(str),
    )
    assert expected == Schema.to_schema()

    valid_frame = pd.DataFrame(
        {"a": [1], "b": ["foo"], "c": [3.4]}, index=["1"])
    Schema(valid_frame)

    with pytest.raises(pa.errors.SchemaError):
        Schema(pd.DataFrame({"a": [1]}))
예제 #27
0
def test_field_name_access_inherit():
    """Check that column and index names are accessible through the class.

    Covers inherited, overridden and newly added fields, with and without
    aliases, and compares both models against their expected schemas.
    """
    class Base(pa.SchemaModel):
        a: Series[int]
        b: Series[int] = pa.Field()
        c: Series[int] = pa.Field(alias="_c")
        d: Series[int] = pa.Field(alias=123)
        i1: Index[int]
        i2: Index[int] = pa.Field()

    class Child(Base):
        b: Series[str] = pa.Field(alias="_b")
        c: Series[str]
        d: Series[str] = pa.Field()
        extra1: Series[int]
        extra2: Series[int] = pa.Field()
        extra3: Series[int] = pa.Field(alias="_extra3")
        i1: Index[str]
        i3: Index[int] = pa.Field(alias="_i3")

    base_columns = {
        column_name: pa.Column(int) for column_name in ("a", "b", "_c", 123)
    }
    base_index = pa.MultiIndex([
        pa.Index(int, name=level_name) for level_name in ("i1", "i2")
    ])
    expected_base = pa.DataFrameSchema(columns=base_columns, index=base_index)

    child_column_dtypes = [
        ("a", int),
        ("_b", str),
        ("c", str),
        ("d", str),
        ("extra1", int),
        ("extra2", int),
        ("_extra3", int),
    ]
    child_level_dtypes = [("i1", str), ("i2", int), ("_i3", int)]
    expected_child = pa.DataFrameSchema(
        columns={
            column_name: pa.Column(dtype)
            for column_name, dtype in child_column_dtypes
        },
        index=pa.MultiIndex([
            pa.Index(dtype, name=level_name)
            for level_name, dtype in child_level_dtypes
        ]),
    )

    assert expected_base == Base.to_schema()
    assert expected_child == Child.to_schema()

    # Attribute name on the class -> column/index name it should resolve to.
    expected_names = [
        ("a", "a"),
        ("b", "_b"),
        ("c", "c"),
        ("d", "d"),
        ("extra1", "extra1"),
        ("extra2", "extra2"),
        ("extra3", "_extra3"),
        ("i1", "i1"),
        ("i2", "i2"),
        ("i3", "_i3"),
    ]
    for attribute, resolved_name in expected_names:
        assert getattr(Child, attribute) == resolved_name
예제 #28
0
def _create_schema(index="single"):
    """Build a strict, non-coercing DataFrameSchema with six checked columns.

    ``index`` selects the index component of the schema:

    * ``"multi"``  -- a MultiIndex of three int levels ``int_index0..2``
    * ``"single"`` -- a single int Index with no name
    * anything else -- no index component
    """
    if index == "multi":
        index_component = pa.MultiIndex([
            pa.Index(pa.Int, name=f"int_index{level}") for level in range(3)
        ])
    elif index == "single":
        # make sure io modules can handle case when index name is None
        index_component = pa.Index(pa.Int, name=None)
    else:
        index_component = None

    int_column = pa.Column(
        pa.Int,
        checks=[
            pa.Check.greater_than(0),
            pa.Check.less_than(10),
            pa.Check.in_range(0, 10),
        ],
    )
    float_column = pa.Column(
        pa.Float,
        checks=[
            pa.Check.greater_than(-10),
            pa.Check.less_than(20),
            pa.Check.in_range(-10, 20),
        ],
    )
    str_column = pa.Column(
        pa.String,
        checks=[
            pa.Check.isin(["foo", "bar", "x", "xy"]),
            pa.Check.str_length(1, 3),
        ],
    )
    datetime_column = pa.Column(
        pa.DateTime,
        checks=[
            pa.Check.greater_than(pd.Timestamp("20100101")),
            pa.Check.less_than(pd.Timestamp("20200101")),
        ],
    )
    timedelta_column = pa.Column(
        pa.Timedelta,
        checks=[
            pa.Check.greater_than(pd.Timedelta(1000, unit="ns")),
            pa.Check.less_than(pd.Timedelta(10000, unit="ns")),
        ],
    )
    # Sets every optional Column property to a non-default-looking value.
    optional_props_column = pa.Column(
        pa.String,
        nullable=True,
        allow_duplicates=True,
        coerce=True,
        required=False,
        regex=True,
        checks=[pa.Check.str_length(1, 3)],
    )

    return pa.DataFrameSchema(
        columns={
            "int_column": int_column,
            "float_column": float_column,
            "str_column": str_column,
            "datetime_column": datetime_column,
            "timedelta_column": timedelta_column,
            "optional_props_column": optional_props_column,
        },
        index=index_component,
        coerce=False,
        strict=True,
    )
예제 #29
0
def test_get_dataframe_schema_statistics():
    """Check the statistics extracted from a DataFrameSchema.

    Builds a schema with three checked columns and a checked, named index and
    compares ``get_dataframe_schema_statistics`` against the expected dict.
    """
    int_column = pa.Column(
        pa.Int,
        checks=[
            pa.Check.greater_than_or_equal_to(0),
            pa.Check.less_than_or_equal_to(100),
        ],
        nullable=True,
    )
    float_column = pa.Column(
        pa.Float,
        checks=[
            pa.Check.greater_than_or_equal_to(50),
            pa.Check.less_than_or_equal_to(100),
        ],
    )
    str_column = pa.Column(
        pa.String,
        checks=[pa.Check.isin(["foo", "bar", "baz"])],
    )
    int_index = pa.Index(
        pa.Int,
        checks=pa.Check.greater_than_or_equal_to(0),
        nullable=False,
        name="int_index",
    )
    schema = pa.DataFrameSchema(
        columns={"int": int_column, "float": float_column, "str": str_column},
        index=int_index,
    )

    def column_statistics(dtype, checks, nullable):
        # Expected per-column statistics; the trailing four entries are the
        # same for every column in this schema.
        return {
            "pandas_dtype": dtype,
            "checks": checks,
            "nullable": nullable,
            "allow_duplicates": True,
            "coerce": False,
            "required": True,
            "regex": False,
        }

    def range_checks(min_value, max_value):
        return {
            "greater_than_or_equal_to": {"min_value": min_value},
            "less_than_or_equal_to": {"max_value": max_value},
        }

    expectation = {
        "columns": {
            "int": column_statistics(pa.Int, range_checks(0, 100), True),
            "float": column_statistics(pa.Float, range_checks(50, 100), False),
            "str": column_statistics(
                pa.String,
                {"isin": {"allowed_values": ["foo", "bar", "baz"]}},
                False,
            ),
        },
        "index": [
            {
                "pandas_dtype": pa.Int,
                "checks": {"greater_than_or_equal_to": {"min_value": 0}},
                "nullable": False,
                "coerce": False,
                "name": "int_index",
            }
        ],
        "coerce": False,
    }

    statistics = schema_statistics.get_dataframe_schema_statistics(schema)
    assert statistics == expectation
예제 #30
0
        "checks": {
            "greater_than_or_equal_to": {"min_value": 0},
            "less_than_or_equal_to": {"max_value": 100},
        },
        "name": None,
        "coerce": False,
    }


@pytest.mark.parametrize("index_schema_component, expectation", [
    [
        pa.Index(
            pa.Int,
            checks=[
                pa.Check.greater_than_or_equal_to(10),
                pa.Check.less_than_or_equal_to(20),
            ],
            nullable=False,
            name="int_index",
        ),
        [
            {
                "pandas_dtype": pa.Int,
                "nullable": False,
                "checks": {
                    "greater_than_or_equal_to": {"min_value": 10},
                    "less_than_or_equal_to": {"max_value": 20},
                },
                "name": "int_index",
                "coerce": False,
            }