def _create_expectations_for_low_card_column(
        cls,
        dataset,
        column,
        column_cache,
        excluded_expectations=None,
        included_expectations=None,
    ):
        """Create expectations on ``dataset`` for a low-cardinality ``column``.

        Steps, in order:

        1. Delegate to ``cls._create_non_nullity_expectations``, forwarding
           both filter arguments unchanged.
        2. If ``expect_column_distinct_values_to_be_in_set`` is enabled, call it
           once with ``value_set=None`` to read the ``observed_value`` from the
           result, then call it again with that value as ``value_set``.
        3. If ``expect_column_kl_divergence_to_be_less_than`` is enabled and
           the column cardinality is ``ProfilerCardinality.TWO`` or
           ``ProfilerCardinality.VERY_FEW``, build a categorical partition for
           the column and call the expectation with ``threshold=0.6`` and
           ``catch_exceptions=True``.

        An expectation is "enabled" when it is not listed in a non-empty
        ``excluded_expectations`` and ``included_expectations`` is either
        empty/``None`` or lists it.

        Args:
            dataset: Object exposing the ``expect_column_*`` methods used here
                (presumably a Great Expectations dataset -- confirm with caller).
            column: Column identifier passed through to every expectation call.
            column_cache: Passed to ``cls._get_column_cardinality_with_caching``.
            excluded_expectations: Optional container of expectation names to skip.
            included_expectations: Optional container of expectation names to
                allow; a falsy value allows every expectation.

        Returns:
            None.
        """

        def _enabled(expectation_name):
            # An exclusion always wins; a falsy include filter allows everything.
            if excluded_expectations and expectation_name in excluded_expectations:
                return False
            return (
                not included_expectations
                or expectation_name in included_expectations
            )

        cls._create_non_nullity_expectations(
            dataset,
            column,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )

        if _enabled("expect_column_distinct_values_to_be_in_set"):
            # First call only discovers the distinct values currently present.
            discovery = dataset.expect_column_distinct_values_to_be_in_set(
                column, value_set=None, result_format="SUMMARY"
            )
            observed_values = discovery.result["observed_value"]
            # Second call uses those observed values as the value set.
            dataset.expect_column_distinct_values_to_be_in_set(
                column, value_set=observed_values, result_format="SUMMARY"
            )

        if not _enabled("expect_column_kl_divergence_to_be_less_than"):
            return

        cardinality = cls._get_column_cardinality_with_caching(
            dataset, column, column_cache
        )
        if cardinality in (ProfilerCardinality.TWO, ProfilerCardinality.VERY_FEW):
            categorical_partition = build_categorical_partition_object(
                dataset, column
            )
            dataset.expect_column_kl_divergence_to_be_less_than(
                column,
                partition_object=categorical_partition,
                threshold=0.6,
                catch_exceptions=True,
            )
示例#2
0
    def _create_expectations_for_low_card_column(cls, dataset, column,
                                                 column_cache):
        """Create expectations on ``dataset`` for a low-cardinality ``column``.

        Calls ``cls._create_non_nullity_expectations`` first, then calls
        ``expect_column_distinct_values_to_be_in_set`` twice: once with
        ``value_set=None`` to read the ``observed_value``, and once with that
        value as ``value_set``. When the column cardinality (looked up through
        ``cls._get_column_cardinality_with_caching`` with ``column_cache``) is
        ``"two"`` or ``"very few"``, it also builds a categorical partition
        and calls ``expect_column_kl_divergence_to_be_less_than`` with
        ``threshold=0.6`` and ``catch_exceptions=True``.

        Returns None.
        """
        cls._create_non_nullity_expectations(dataset, column)

        # First call only discovers the distinct values currently present.
        discovery = dataset.expect_column_distinct_values_to_be_in_set(
            column, value_set=None, result_format="SUMMARY")
        observed_values = discovery.result["observed_value"]
        # Second call uses those observed values as the value set.
        dataset.expect_column_distinct_values_to_be_in_set(
            column, value_set=observed_values, result_format="SUMMARY")

        cardinality = cls._get_column_cardinality_with_caching(
            dataset, column, column_cache)
        if cardinality not in ("two", "very few"):
            return

        categorical_partition = build_categorical_partition_object(
            dataset, column)
        dataset.expect_column_kl_divergence_to_be_less_than(
            column,
            partition_object=categorical_partition,
            threshold=0.6,
            catch_exceptions=True)
def test_build_categorical_partition(non_numeric_high_card_dataset):
    """Check the categorical partition built for the "medcardnonnum" column.

    The partition is built twice, with ``sort="count"`` and ``sort="value"``.
    The value ordering depends on the sort behavior of the underlying system
    (it may vary with locale), so two orderings are accepted for
    ``sort="value"``. The weight paired with each value is the same in every
    accepted ordering.
    """
    column = "medcardnonnum"

    # Expected weight for each distinct value, independent of ordering.
    weight_by_value = {
        "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk": 0.18,
        "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP": 0.17,
        "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer": 0.16,
        "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J": 0.145,
        "NhTsracusfp5V6zVeWqLZnychDl7jjO4": 0.125,
        "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ": 0.11,
        "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3": 0.085,
        "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe": 0.02,
        "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D": 0.005,
    }

    def expected_partition(ordered_values):
        # Pair every value with its weight, keeping the given order.
        return {
            "values": ordered_values,
            "weights": [weight_by_value[value] for value in ordered_values],
        }

    # --- sort="count": ordered by descending weight ---
    by_count = build_categorical_partition_object(
        non_numeric_high_card_dataset, column, sort="count"
    )
    assert by_count == expected_partition(
        [
            "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk",
            "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP",
            "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer",
            "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J",
            "NhTsracusfp5V6zVeWqLZnychDl7jjO4",
            "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ",
            "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3",
            "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe",
            "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D",
        ]
    )

    # --- sort="value": ordering is backend-dependent ---
    by_value = build_categorical_partition_object(
        non_numeric_high_card_dataset, column, sort="value"
    )

    # Ordering produced by Python's native string sort.
    python_sorted = expected_partition(
        [
            "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer",
            "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe",
            "NhTsracusfp5V6zVeWqLZnychDl7jjO4",
            "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP",
            "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3",
            "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk",
            "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J",
            "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D",
            "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ",
        ]
    )
    # Postgres uses a lexicographical sort that differs from Python's native
    # one. The underlying system is meant to do the computation (and the user
    # can override if desired), so this ordering is accepted explicitly.
    postgres_sorted = expected_partition(
        [
            "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer",
            "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3",
            "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk",
            "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J",
            "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D",
            "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe",
            "NhTsracusfp5V6zVeWqLZnychDl7jjO4",
            "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ",
            "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP",
        ]
    )

    assert by_value == python_sorted or by_value == postgres_sorted