Example #1
def test_batches_are_accessible(
    monkeypatch,
    multibatch_generic_csv_generator,
    multibatch_generic_csv_generator_context,
):
    """
    What does this test and why?
    Batches created in the multibatch_generic_csv_generator fixture should be accessible through the
    multibatch_generic_csv_generator_context.
    This test most likely duplicates tests elsewhere, but it primarily exercises the configurable fixture.
    """

    context: DataContext = multibatch_generic_csv_generator_context
    data_relative_path = "../data"
    data_path = os.path.join(context.root_directory, data_relative_path)
    datasource_name = "generic_csv_generator"
    data_connector_name = "daily_data_connector"
    asset_name = "daily_data_asset"

    datasource = context.datasources[datasource_name]

    data_connector = datasource.data_connectors[data_connector_name]

    total_batches: int = 20
    file_list = multibatch_generic_csv_generator(
        data_path=data_path, num_event_batches=total_batches)

    assert (
        data_connector._get_data_reference_list_from_cache_by_data_asset_name(
            data_asset_name=asset_name) == file_list)

    batch_request_1 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -1,
        },
    )
    # Should give most recent batch
    validator_1 = context.get_validator(
        batch_request=batch_request_1,
        create_expectation_suite_with_name="my_expectation_suite_name_1",
    )
    metric_max = validator_1.get_metric(
        MetricConfiguration("column.max",
                            metric_domain_kwargs={"column": "batch_num"}))
    assert metric_max == total_batches
    metric_value_set = validator_1.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        ))
    assert metric_value_set == {"category0", "category1", "category2"}

    batch_request_2 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -2,
        },
    )
    validator_2 = context.get_validator(
        batch_request=batch_request_2,
        create_expectation_suite_with_name="my_expectation_suite_name_2",
    )
    metric_max = validator_2.get_metric(
        MetricConfiguration("column.max",
                            metric_domain_kwargs={"column": "batch_num"}))
    assert metric_max == total_batches - 1
    metric_value_set = validator_2.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        ))
    assert metric_value_set == {"category0", "category1", "category2"}

    for batch_num in range(1, total_batches + 1):
        batch_request = BatchRequest(
            datasource_name="generic_csv_generator",
            data_connector_name="daily_data_connector",
            data_asset_name="daily_data_asset",
            data_connector_query={
                "index": -batch_num,
            },
        )
        validator = context.get_validator(
            batch_request=batch_request,
            create_expectation_suite_with_name=f"my_expectation_suite_name__{batch_num}",
        )
        metric_max = validator.get_metric(
            MetricConfiguration("column.max",
                                metric_domain_kwargs={"column": "batch_num"}))
        assert metric_max == (total_batches + 1) - batch_num
        metric_value_set = validator.get_metric(
            MetricConfiguration(
                "column.distinct_values",
                metric_domain_kwargs={"column": "string_cardinality_3"},
            ))
        assert metric_value_set == {"category0", "category1", "category2"}
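The three-line get_metric call repeats for every batch above; a minimal helper (illustrative only, not part of the test suite) that factors out the pattern, assuming the same Validator and metric names used in the test:

def get_column_metric(validator, metric_name: str, column: str):
    # Wraps the repeated MetricConfiguration + get_metric calls from the test.
    return validator.get_metric(
        MetricConfiguration(metric_name, metric_domain_kwargs={"column": column})
    )

# e.g. get_column_metric(validator_1, "column.max", "batch_num") == total_batches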
Example #2
    def get_validation_dependencies(
        self,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        # this calls TableExpectation.get_validation_dependencies to set baseline dependencies
        # for the aggregate version of the expectation
        dependencies = super(ColumnMapExpectation, self).get_validation_dependencies(
            configuration, execution_engine, runtime_configuration
        )

        # only PandasExecutionEngine supports the column map version of the expectation
        if isinstance(execution_engine, PandasExecutionEngine):
            column_name = configuration.kwargs.get("column")
            expected_type = configuration.kwargs.get("type_")
            metric_kwargs = get_metric_kwargs(
                configuration=configuration,
                metric_name="table.column_types",
                runtime_configuration=runtime_configuration,
            )
            metric_domain_kwargs = metric_kwargs.get("metric_domain_kwargs")
            metric_value_kwargs = metric_kwargs.get("metric_value_kwargs")
            table_column_types_configuration = MetricConfiguration(
                "table.column_types",
                metric_domain_kwargs=metric_domain_kwargs,
                metric_value_kwargs=metric_value_kwargs,
            )
            actual_column_types_list = execution_engine.resolve_metrics(
                [table_column_types_configuration]
            )[table_column_types_configuration.id]
            actual_column_type = [
                type_dict["type"]
                for type_dict in actual_column_types_list
                if type_dict["name"] == column_name
            ][0]

            # only use column map version if column dtype is object
            if actual_column_type.type.__name__ == "object_" and expected_type not in [
                "object",
                "object_",
                "O",
                None,
            ]:
                # this resets dependencies using ColumnMapExpectation.get_validation_dependencies
                dependencies = super().get_validation_dependencies(
                    configuration, execution_engine, runtime_configuration
                )

        # this adds table.column_types dependency for both aggregate and map versions of expectation
        column_types_metric_kwargs = get_metric_kwargs(
            metric_name="table.column_types",
            configuration=configuration,
            runtime_configuration=runtime_configuration,
        )
        dependencies["metrics"]["table.column_types"] = MetricConfiguration(
            metric_name="table.column_types",
            metric_domain_kwargs=column_types_metric_kwargs["metric_domain_kwargs"],
            metric_value_kwargs=column_types_metric_kwargs["metric_value_kwargs"],
        )

        return dependencies
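The super(ColumnMapExpectation, self) call above deliberately skips one level of the MRO so the table-level (aggregate) dependencies are the default. A self-contained toy sketch of the same trick, in plain Python with hypothetical class names:

class Table:
    def deps(self):
        return ["table-level"]

class ColumnMap(Table):
    def deps(self):
        return super().deps() + ["column-map"]

class MyExpectation(ColumnMap):
    def deps(self):
        # Start the MRO search *after* ColumnMap, reaching Table.deps directly,
        # just as super(ColumnMapExpectation, self) reaches TableExpectation above.
        return super(ColumnMap, self).deps()

assert MyExpectation().deps() == ["table-level"]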
Example #3
# Reconstructed opening (assumption: GE's build_batch_filter API): build a
# BatchFilter whose custom filter function keeps January and February batches only.
jan_feb_batch_filter: BatchFilter = build_batch_filter(
    data_connector_query_dict={
        "custom_filter_function":
        lambda batch_identifiers: int(batch_identifiers["month"]) < 3
    })
jan_feb_batch_definition_list: list = (
    jan_feb_batch_filter.select_from_data_connector_query(
        batch_definition_list=total_batch_definition_list))

# Get the highest max and lowest min between January and February
cumulative_max = 0
cumulative_min = np.inf
for batch_definition in jan_feb_batch_definition_list:
    batch_id: str = batch_definition.id
    current_max = validator.get_metric(
        MetricConfiguration(
            "column.max",
            metric_domain_kwargs={
                "column": "fare_amount",
                "batch_id": batch_id
            },
        ))
    cumulative_max = max(current_max, cumulative_max)

    current_min = validator.get_metric(
        MetricConfiguration(
            "column.min",
            metric_domain_kwargs={
                "column": "fare_amount",
                "batch_id": batch_id
            },
        ))
    cumulative_min = min(current_min, cumulative_min)
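An equivalent, more compact form of the loop above (same metrics and batches; assumes validator and jan_feb_batch_definition_list are defined as in the example):

fare_maxes = [
    validator.get_metric(
        MetricConfiguration(
            "column.max",
            metric_domain_kwargs={"column": "fare_amount", "batch_id": bd.id},
        ))
    for bd in jan_feb_batch_definition_list
]
cumulative_max = max(fare_maxes)  # same value the loop accumulates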
Example #4
    def _self_check_fetch_batch(
        self,
        pretty_print: bool,
        example_data_reference: Any,
        data_asset_name: str,
    ):
        """
        Helper function for self_check() to retrieve batch using example_data_reference and data_asset_name,
        all while printing helpful messages. First 5 rows of batch_data are printed by default.

        Args:
            pretty_print (bool): print to console?
            example_data_reference (Any): data_reference to retrieve
            data_asset_name (str): data_asset_name to retrieve

        """
        if pretty_print:
            print(f"\n\t\tFetching batch data...")

        batch_definition_list: List[
            BatchDefinition
        ] = self._map_data_reference_to_batch_definition_list(
            data_reference=example_data_reference,
            data_asset_name=data_asset_name,
        )
        assert len(batch_definition_list) == 1
        batch_definition: BatchDefinition = batch_definition_list[0]

        # _execution_engine might be None for some tests
        if batch_definition is None or self._execution_engine is None:
            return {}

        batch_data: Any
        batch_spec: BatchSpec
        batch_data, batch_spec, _ = self.get_batch_data_and_metadata(
            batch_definition=batch_definition
        )

        # Note: get_batch_data_and_metadata will have loaded the data into the currently-defined execution engine.
        # Consequently, when we build a Validator, we do not need to specifically load the batch into it to
        # resolve metrics.
        validator: Validator = Validator(execution_engine=batch_data.execution_engine)
        data: Any = validator.get_metric(
            metric=MetricConfiguration(
                metric_name="table.head",
                metric_domain_kwargs={
                    "batch_id": batch_definition.id,
                },
                metric_value_kwargs={
                    "n_rows": 5,
                },
            )
        )
        n_rows: int = validator.get_metric(
            metric=MetricConfiguration(
                metric_name="table.row_count",
                metric_domain_kwargs={
                    "batch_id": batch_definition.id,
                },
            )
        )

        if pretty_print and data is not None:
            print(f"\n\t\tShowing 5 rows")
            print(data)

        return {
            "batch_spec": batch_spec,
            "n_rows": n_rows,
        }
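A hypothetical call site for this helper (the argument values are assumptions; the method name and return shape come from the code above):

# report = connector._self_check_fetch_batch(
#     pretty_print=True,
#     example_data_reference="2020-01-01.csv",  # assumed data reference
#     data_asset_name="daily_data_asset",       # assumed asset name
# )
# report -> {"batch_spec": ..., "n_rows": ...}, or {} if no execution engine is set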
Example #5
def test_map_value_set_spark(spark_session, basic_spark_df_execution_engine):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 3, None]},
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )

    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 0}

    # We run the same computation again; this time spark_session.createDataFrame converts the
    # pandas None to NaN rather than NULL, so it is not filtered as missing and counts as unexpected.
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df = spark_session.createDataFrame(df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )

    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 1}
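The test above resolves each metric in two phases: first the *.aggregate_fn partial (a deferred Spark aggregate expression), then the full metric that consumes it through its metric_partial_fn dependency. The pipeline, summarized with the names from the test:

# table.columns
#   -> column_values.in_set.condition                      (boolean condition over rows)
#   -> column_values.in_set.unexpected_count.aggregate_fn  (deferred aggregate partial)
#   -> column_values.in_set.unexpected_count               (0 in the NULL pass, 1 in the NaN pass)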
Example #6
    def get_validation_dependencies(
        self,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        # This calls TableExpectation.get_validation_dependencies to set baseline dependencies for the aggregate version
        # of the expectation.
        # We need to keep this as super(ColumnMapExpectation, self), which calls
        # TableExpectation.get_validation_dependencies instead of ColumnMapExpectation.get_validation_dependencies.
        # This is because the map version of this expectation is only supported for Pandas, so we want the aggregate
        # version for the other backends.
        dependencies = super(ColumnMapExpectation,
                             self).get_validation_dependencies(
                                 configuration, execution_engine,
                                 runtime_configuration)

        # Only PandasExecutionEngine supports the column map version of the expectation.
        if isinstance(execution_engine, PandasExecutionEngine):
            column_name = configuration.kwargs.get("column")
            expected_types_list = configuration.kwargs.get("type_list")
            metric_kwargs = get_metric_kwargs(
                configuration=configuration,
                metric_name="table.column_types",
                runtime_configuration=runtime_configuration,
            )
            metric_domain_kwargs = metric_kwargs.get("metric_domain_kwargs")
            metric_value_kwargs = metric_kwargs.get("metric_value_kwargs")
            table_column_types_configuration = MetricConfiguration(
                "table.column_types",
                metric_domain_kwargs=metric_domain_kwargs,
                metric_value_kwargs=metric_value_kwargs,
            )
            actual_column_types_list = execution_engine.resolve_metrics([
                table_column_types_configuration
            ])[table_column_types_configuration.id]
            actual_column_type = [
                type_dict["type"] for type_dict in actual_column_types_list
                if type_dict["name"] == column_name
            ][0]

            # only use column map version if column dtype is object
            if (actual_column_type.type.__name__ == "object_"
                    and expected_types_list is not None):
                # this resets dependencies using ColumnMapExpectation.get_validation_dependencies
                dependencies = super().get_validation_dependencies(
                    configuration, execution_engine, runtime_configuration)

        # this adds table.column_types dependency for both aggregate and map versions of expectation
        column_types_metric_kwargs = get_metric_kwargs(
            metric_name="table.column_types",
            configuration=configuration,
            runtime_configuration=runtime_configuration,
        )
        dependencies["metrics"]["table.column_types"] = MetricConfiguration(
            metric_name="table.column_types",
            metric_domain_kwargs=column_types_metric_kwargs[
                "metric_domain_kwargs"],
            metric_value_kwargs=column_types_metric_kwargs[
                "metric_value_kwargs"],
        )

        return dependencies
Example #7
def test_map_unique_sa_column_exists(sa):
    engine = _build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )
    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # This is no longer a MAP_CONDITION because mssql does not support it. Instead, it is a WINDOW_CONDITION
    #
    # aggregate_fn = MetricConfiguration(
    #     metric_name="column_values.unique.unexpected_count.aggregate_fn",
    #     metric_domain_kwargs={"column": "a"},
    #     metric_value_kwargs=dict(),
    #     metric_dependencies={"unexpected_condition": condition_metric},
    # )
    # aggregate_fn_metrics = engine.resolve_metrics(
    #     metrics_to_resolve=(aggregate_fn,), metrics=metrics
    # )

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        # metric_dependencies={"metric_partial_fn": aggregate_fn},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,),
        metrics=metrics,  # metrics=aggregate_fn_metrics
    )
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, "baz"), (3, "qux")]
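The wiring contrast described in the comments, made explicit: the disabled aggregate path would pass metric_dependencies={"metric_partial_fn": aggregate_fn}, while the window path used here passes metric_dependencies={"unexpected_condition": condition_metric} directly, skipping the partial-function stage entirely.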
Example #8
    def __init__(
        self,
        profile_dataset,
        excluded_expectations: list = None,
        ignored_columns: list = None,
        not_null_only: bool = False,
        primary_or_compound_key: list = None,
        semantic_types_dict: dict = None,
        table_expectations_only: bool = False,
        value_set_threshold: str = "MANY",
    ):
        """
        The UserConfigurableProfiler is used to build an expectation suite from a dataset. The profiler may be
        instantiated with or without a config. The config may contain a semantic_types dict or not. Once a profiler is
        instantiated, if config items change, a new profiler will be needed.

        TODO: Write an entry on how to use the profiler for the GE docs site.

        Args:
            profile_dataset: A Great Expectations Dataset or Validator object
            excluded_expectations: A list of expectations to not include in the suite
            ignored_columns: A list of columns for which you would like to NOT create expectations
            not_null_only: Boolean, default False. By default, each column is evaluated for nullity. If the
                column values contain fewer than 50% null values, then the profiler will add
                `expect_column_values_to_not_be_null`; if greater than 50%, it will add
                `expect_column_values_to_be_null`. If not_null_only is set to True, the profiler will add a
                not_null expectation irrespective of the percent nullity (and therefore will not add an
                `expect_column_values_to_be_null`)
            primary_or_compound_key: A list containing one or more columns which form the dataset's primary or
                compound key. This will create an `expect_column_values_to_be_unique` or
                `expect_compound_columns_to_be_unique` expectation. This occurs even if one or more of the
                primary_or_compound_key columns are specified in ignored_columns
            semantic_types_dict: A dictionary where the keys are available semantic types (see
                profiler.base.profiler_semantic_types) and the values are lists of columns for which you would
                like to create semantic-type-specific expectations, e.g.:
                "semantic_types": { "value_set": ["state", "country"], "numeric": ["age", "amount_due"]}
            table_expectations_only: Boolean, default False. If True, this will only create the two table-level
                expectations available to this profiler (`expect_table_columns_to_match_ordered_list` and
                `expect_table_row_count_to_be_between`). If a primary_or_compound_key is specified, it will
                also create a uniqueness expectation for that column
            value_set_threshold: Takes a string from the following ordered list: "none", "one", "two",
                "very_few", "few", "many", "very_many", "unique". When the profiler runs without a
                semantic_types dict, each column is profiled for cardinality. This threshold determines the
                greatest cardinality for which to add `expect_column_values_to_be_in_set`. For example, if
                value_set_threshold is set to "unique", it will add a value_set expectation for every included
                column. If set to "few", it will add a value_set expectation for columns whose cardinality is
                one of "one", "two", "very_few", or "few". The default value is "many". For the purposes of
                comparing whether two tables are identical, it might make the most sense to set this to "unique"
        """
        self.column_info = {}
        self.profile_dataset = profile_dataset
        assert isinstance(self.profile_dataset, (Dataset, Validator, Batch))

        if isinstance(self.profile_dataset, Batch):
            self.profile_dataset = Validator(
                execution_engine=self.profile_dataset.data.execution_engine,
                batches=[self.profile_dataset],
            )
            self.all_table_columns = self.profile_dataset.get_metric(
                MetricConfiguration("table.columns", dict()))
        elif isinstance(self.profile_dataset, Validator):
            self.all_table_columns = self.profile_dataset.get_metric(
                MetricConfiguration("table.columns", dict()))
        else:
            self.all_table_columns = self.profile_dataset.get_table_columns()

        self.semantic_types_dict = semantic_types_dict
        assert isinstance(self.semantic_types_dict, (dict, type(None)))

        self.ignored_columns = ignored_columns or []
        assert isinstance(self.ignored_columns, list)

        self.excluded_expectations = excluded_expectations or []
        assert isinstance(self.excluded_expectations, list)

        assert isinstance(value_set_threshold,
                          str), "value_set_threshold must be a string"
        self.value_set_threshold = value_set_threshold.upper()
        assert (
            self.value_set_threshold in OrderedProfilerCardinality.__members__
        ), f"value_set_threshold must be one of {[i for i in OrderedProfilerCardinality.__members__]}"

        self.not_null_only = not_null_only
        assert isinstance(self.not_null_only, bool)

        self.table_expectations_only = table_expectations_only
        assert isinstance(self.table_expectations_only, bool)
        if self.table_expectations_only is True:
            logger.info(
                "table_expectations_only is set to True. When used to build a suite, this profiler will ignore all"
                "columns and create expectations only at the table level. If you would also like to create expectations "
                "at the column level, you can instantiate a new profiler with table_expectations_only set to False"
            )

        self.primary_or_compound_key = primary_or_compound_key or []
        assert isinstance(self.primary_or_compound_key, list)

        if self.table_expectations_only:
            self.ignored_columns = self.all_table_columns

        if self.primary_or_compound_key:
            for column in self.primary_or_compound_key:
                if column not in self.all_table_columns:
                    raise ValueError(
                        f"Column {column} not found. Please ensure that this column is in the {type(profile_dataset).__name__} "
                        f"if you would like to use it as a primary_or_compound_key."
                    )

        included_columns = [
            column_name for column_name in self.all_table_columns
            if column_name not in self.ignored_columns
        ]

        for column_name in included_columns:
            self._add_column_cardinality_to_column_info(
                self.profile_dataset, column_name)
            self._add_column_type_to_column_info(self.profile_dataset,
                                                 column_name)

        if self.semantic_types_dict is not None:
            self._validate_semantic_types_dict(self.profile_dataset)
            for column_name in included_columns:
                self._add_semantic_types_by_column_from_config_to_column_info(
                    column_name)
        self.semantic_type_functions = {
            "DATETIME": self._build_expectations_datetime,
            "NUMERIC": self._build_expectations_numeric,
            "STRING": self._build_expectations_string,
            "VALUE_SET": self._build_expectations_value_set,
            "BOOLEAN": self._build_expectations_value_set,
        }
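A minimal usage sketch for this constructor (hedged: build_suite is the profiler's suite-building method in Great Expectations, while the column names and threshold here are invented):

profiler = UserConfigurableProfiler(
    profile_dataset=validator,                     # a GE Validator, Dataset, or Batch
    ignored_columns=["internal_id"],               # hypothetical column
    semantic_types_dict={"value_set": ["state"]},  # hypothetical column
    value_set_threshold="few",
)
suite = profiler.build_suite()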
Example #9
def test_sparkdf_batch_aggregate_metrics(caplog, spark_session):
    import datetime

    engine = _build_spark_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), spark_session
    )

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        )
    )
    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_1},
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_2},
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_3},
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_4},
    )
    start = datetime.datetime.now()
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    res = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    end = datetime.datetime.now()
    print(end - start)
    assert res[desired_metric_1.id] == 3
    assert res[desired_metric_2.id] == 1
    assert res[desired_metric_3.id] == 4
    assert res[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = False
    for record in caplog.records:
        if (
            record.message
            == "SparkDFExecutionEngine computed 4 metrics on domain_id ()"
        ):
            found_message = True
    assert found_message
Example #10
def test_map_value_set_spark(spark_session):
    engine = _build_spark_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}), spark_session)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )

    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: 0}

    # We run the same computation again; this time spark_session.createDataFrame converts the
    # pandas None to NaN rather than NULL, so it is not filtered as missing and counts as unexpected.
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df = spark_session.createDataFrame(df)
    engine = SparkDFExecutionEngine(batch_data_dict={"my_id": df})

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )

    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: 1}
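Background for the NULL-vs-NaN comments above: a pandas float column holding None stores NaN, and spark_session.createDataFrame preserves that NaN instead of producing a SQL NULL, so the value is not filtered as missing and counts as unexpected. A standalone check:

import numpy as np
import pandas as pd

s = pd.DataFrame({"a": [1, 2, 3, 3, None]})["a"]
assert np.isnan(s.iloc[-1])  # the None became NaN in a float64 column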
Example #11
def test_z_score_under_threshold_spark(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 3, None]},
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": mean},
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": stdev,
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
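The metric chain this test resolves, in order (each step feeds the next through metric_dependencies):

# column.mean.aggregate_fn, column.standard_deviation.aggregate_fn
#   -> column.mean, column.standard_deviation
#   -> column_values.z_score.map
#   -> column_values.z_score.under_threshold.condition
#   -> column_values.z_score.under_threshold.unexpected_count.aggregate_fn
#   -> column_values.z_score.under_threshold.unexpected_count  (asserted == 0)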
Example #12
def test_z_score_under_threshold_pd():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
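Worked numbers behind the [False, False, False] assertion: the non-null values are [1, 2, 3], with mean 2 and sample standard deviation 1, so the z-scores are [-1, 0, 1]; with double_sided=True and threshold=2, no |z| reaches 2, so no row is flagged and the unexpected count is 0.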
Example #13
def test_map_unique_spark_column_exists(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # unique is a *window* function so does not use the aggregate_fn version of unexpected count
    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, "bar"), (3, "baz")]
Example #14
def test_sa_batch_aggregate_metrics(caplog, sa):
    import datetime

    engine = build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 1, 2, 3, 3],
            "b": [4, 4, 4, 4, 4, 4]
        }), sa)

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_1,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_2,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_3,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_4,
            "table.columns": table_columns_metric,
        },
    )
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    start = datetime.datetime.now()
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)
    end = datetime.datetime.now()
    print("t1")
    print(end - start)
    assert results[desired_metric_1.id] == 3
    assert results[desired_metric_2.id] == 1
    assert results[desired_metric_3.id] == 4
    assert results[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain
    found_message = False
    for record in caplog.records:
        if (record.message ==
                "SqlAlchemyExecutionEngine computed 4 metrics on domain_id ()"
            ):
            found_message = True
    assert found_message
Example #15
def test_map_unique_spark_column_exists(spark_session):
    engine = _build_spark_engine(
        pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        spark_session,
    )

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # unique is a *window* function so does not use the aggregate_fn version of unexpected count
    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, "bar"), (3, "baz")]
Example #16
def build_continuous_partition_object(
    execution_engine, domain_kwargs, bins="auto", n_bins=10, allow_relative_error=False
):
    """Convenience method for building a partition object on continuous data from a dataset and column

    Args:
        execution_engine (ExecutionEngine): the execution engine with which to compute the partition
        domain_kwargs (dict): The domain kwargs describing the domain for which to compute the partition
        bins (string): One of 'uniform' (for uniformly spaced bins), 'ntile' (for percentile-spaced bins), or 'auto'
            (for automatically spaced bins)
        n_bins (int): Ignored if bins is auto.
        allow_relative_error: passed to get_column_quantiles. Set to False to require exact values, True to
            allow approximate values on systems with only a binary choice (e.g. Redshift), or to a value
            between zero and one on systems that allow specification of relative error (e.g. SparkDFDataset).

    Returns:

        A new partition_object::

            {
                "bins": (list) The endpoints of the partial partition of reals,
                "weights": (list) The densities of the bins implied by the partition.
            }

            See :ref:`partition_object`.
    """
    partition_metric_configuration = MetricConfiguration(
        "column.partition",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "bins": bins,
            "n_bins": n_bins,
            "allow_relative_error": allow_relative_error,
        },
    )
    bins = execution_engine.resolve_metrics([partition_metric_configuration])[
        partition_metric_configuration.id
    ]
    if isinstance(bins, np.ndarray):
        bins = bins.tolist()
    else:
        bins = list(bins)
    hist_metric_configuration = MetricConfiguration(
        "column.histogram",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "bins": tuple(bins),
        },
    )
    nonnull_configuration = MetricConfiguration(
        "column_values.nonnull.count",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "bins": tuple(bins),
        },
    )
    metrics = execution_engine.resolve_metrics(
        (hist_metric_configuration, nonnull_configuration)
    )
    weights = list(
        np.array(metrics[hist_metric_configuration.id])
        / metrics[nonnull_configuration.id]
    )
    tail_weights = (1 - sum(weights)) / 2
    partition_object = {
        "bins": bins,
        "weights": weights,
        "tail_weights": [tail_weights, tail_weights],
    }
    return partition_object
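Illustrative output shape (numbers invented): for bins=[0, 5, 10] with 90% of the non-null values landing inside the bins, the function returns {"bins": [0, 5, 10], "weights": [0.5, 0.4], "tail_weights": [0.05, 0.05]}. Each weight is a histogram count divided by the non-null count, and the leftover mass (1 - sum(weights)) is split evenly between the two tail_weights.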
Example #17
def test_z_score_under_threshold_spark(spark_session):
    engine = _build_spark_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}), spark_session)

    mean = MetricConfiguration(
        metric_name="column.mean.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": mean},
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": stdev},
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"column.standard_deviation": stdev, "column.mean": mean},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"column_values.z_score.map": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
Example #18
    def get_validation_dependencies(
        self,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        all_dependencies = super().get_validation_dependencies(
            configuration, execution_engine, runtime_configuration
        )
        dependencies = all_dependencies["metrics"]
        partition_object = configuration.kwargs["partition_object"]
        domain_kwargs = configuration.get_domain_kwargs()
        is_categorical = None
        bins = None
        if partition_object is None:
            if configuration.kwargs.get(
                "bucketize_data", self.default_kwarg_values["bucketize_data"]
            ):
                is_categorical = False
                partition_metric_configuration = MetricConfiguration(
                    "column.partition",
                    metric_domain_kwargs=domain_kwargs,
                    metric_value_kwargs={
                        "bins": "auto",
                        "allow_relative_error": False,
                    },
                )
                #
                # Note: 20201116 - JPC - the execution engine doesn't provide the capability to evaluate
                # dependencies, so we use a validator
                #
                validator = Validator(execution_engine=execution_engine)
                graph = ValidationGraph()
                validator.build_metric_dependency_graph(
                    graph=graph,
                    child_node=partition_metric_configuration,
                    configuration=configuration,
                    execution_engine=execution_engine,
                )
                bins = validator.resolve_validation_graph(graph, metrics=dict())[
                    partition_metric_configuration.id
                ]
                hist_metric_configuration = MetricConfiguration(
                    "column.histogram",
                    metric_domain_kwargs=domain_kwargs,
                    metric_value_kwargs={
                        "bins": tuple(bins),
                    },
                )
                nonnull_configuration = MetricConfiguration(
                    "column_values.nonnull.count",
                    metric_domain_kwargs=domain_kwargs,
                    metric_value_kwargs=dict(),
                )
                #
                # NOTE 20201117 - JPC - Would prefer not to include partition_metric_configuration here,
                # since we have already evaluated it, and its result is in the kwargs for the histogram.
                # However, currently the dependencies' configurations are not passed to the _validate method
                #
                dependencies["column.partition"] = partition_metric_configuration
                dependencies["column.histogram"] = hist_metric_configuration
                dependencies["column_values.nonnull.count"] = nonnull_configuration
            else:
                is_categorical = True
                counts_configuration = MetricConfiguration(
                    "column.value_counts",
                    metric_domain_kwargs=domain_kwargs,
                    metric_value_kwargs={
                        "sort": "value",
                    },
                )
                nonnull_configuration = MetricConfiguration(
                    "column_values.nonnull.count",
                    metric_domain_kwargs=domain_kwargs,
                )
                dependencies["column.value_counts"] = counts_configuration
                dependencies["column_values.nonnull.count"] = nonnull_configuration
        if is_categorical is True or is_valid_categorical_partition_object(
            partition_object
        ):
            dependencies["column.value_counts"] = MetricConfiguration(
                "column.value_counts",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"sort": "value"},
            )
            dependencies["column_values.nonnull.count"] = MetricConfiguration(
                "column_values.nonnull.count", domain_kwargs
            )
        else:
            if (
                bins is None
            ):  # bins is None means the user supplied a partition_object, so we did not compute bins above
                if not is_valid_partition_object(partition_object):
                    raise ValueError("Invalid partition_object provided")
                bins = partition_object["bins"]
            hist_metric_configuration = MetricConfiguration(
                "column.histogram",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={
                    "bins": bins,
                },
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs=dict(),
            )
            dependencies["column.histogram"] = hist_metric_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration
            below_partition = MetricConfiguration(
                "column_values.between.count",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"max_value": bins[0]},
            )
            above_partition = MetricConfiguration(
                "column_values.between.count",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"min_value": bins[-1], "strict_min": True},
            )
            dependencies["below_partition"] = below_partition
            dependencies["above_partition"] = above_partition
        return all_dependencies
Example #19
from great_expectations.core.batch import BatchRequest
from great_expectations.data_context.data_context import DataContext
from great_expectations.validator.validation_graph import MetricConfiguration

context = DataContext()
suite = context.get_expectation_suite("yellow_trip_data_validations")

# February BatchRequest and Validator
batch_request_february = BatchRequest(
    datasource_name="taxi_pandas",
    data_connector_name="monthly",
    data_asset_name="my_reports",
    data_connector_query={"index": -2},
)
validator_february = context.get_validator(
    batch_request=batch_request_february, expectation_suite=suite)
february_table_row_count = validator_february.get_metric(
    MetricConfiguration("table.row_count", metric_domain_kwargs={}))

# March BatchRequest and Validator
batch_request_march = BatchRequest(
    datasource_name="taxi_pandas",
    data_connector_name="monthly",
    data_asset_name="my_reports",
    data_connector_query={"index": -1},
)
validator_march = context.get_validator(batch_request=batch_request_march,
                                        expectation_suite=suite)
print(
    validator_march.expect_table_row_count_to_equal(
        value=february_table_row_count))
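The expectation call returns an ExpectationValidationResult; a typical follow-up check (the success attribute is standard GE, the assertion itself is illustrative):

result = validator_march.expect_table_row_count_to_equal(
    value=february_table_row_count)
assert result.success  # March row count equals February's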