Example #1
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        """Returns a dictionary of given metric names and their corresponding configuration, specifying the metric
        types and their respective domains"""
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if metric.metric_name == "column_values.z_score.under_threshold.condition":
            dependencies["column_values.z_score.map"] = MetricConfiguration(
                metric_name="column_values.z_score.map",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )

        if metric.metric_name == "column_values.z_score.map":
            dependencies["column.mean"] = MetricConfiguration(
                metric_name="column.mean",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )
            dependencies["column.standard_deviation"] = MetricConfiguration(
                metric_name="column.standard_deviation",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )

        return dependencies
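A minimal sketch (not from the source) of how dependencies declared this way are consumed: the leaf metrics are resolved first, and their results are passed back through the metrics cache when a dependent metric is resolved. It assumes a pandas batch with a numeric column "a" and reuses the get_table_columns_metric helper shown in Example #6 below.

import pandas as pd

# Hypothetical batch; "my_batch_id" is an assumed identifier.
engine = PandasExecutionEngine(
    batch_data_dict={"my_batch_id": pd.DataFrame({"a": [1, 2, 3, 4]})}
)

# Resolve "table.columns" first (helper defined in Example #6 below).
table_columns_metric, resolved_metrics = get_table_columns_metric(engine=engine)

# Leaf dependencies of "column_values.z_score.map": mean and standard deviation.
mean = MetricConfiguration(
    metric_name="column.mean",
    metric_domain_kwargs={"column": "a"},
    metric_value_kwargs=None,
    metric_dependencies={"table.columns": table_columns_metric},
)
stdev = MetricConfiguration(
    metric_name="column.standard_deviation",
    metric_domain_kwargs={"column": "a"},
    metric_value_kwargs=None,
    metric_dependencies={"table.columns": table_columns_metric},
)
resolved_metrics.update(
    engine.resolve_metrics(
        metrics_to_resolve=(mean, stdev), metrics=resolved_metrics
    )
)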
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        """Returns a dictionary of given metric names and their corresponding configuration, specifying the metric
        types and their respective domains"""
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if isinstance(execution_engine, SqlAlchemyExecutionEngine):
            dependencies["column.mean"] = MetricConfiguration(
                metric_name="column.mean",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
            dependencies["column_values.null.unexpected_count"] = MetricConfiguration(
                metric_name="column_values.null.unexpected_count",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs=None,
                metric_dependencies=None,
            )

        return dependencies
def test_resolve_metrics_with_incomplete_metric_input():
    engine = PandasExecutionEngine()

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
        },
    )

    # Ensuring that incomplete metrics given raises a GreatExpectationsError
    with pytest.raises(GreatExpectationsError) as error:
        engine.resolve_metrics(metrics_to_resolve=(desired_metric, ),
                               metrics={})
Example #4
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        """
        Returns a dictionary of given metric names and their corresponding configuration, specifying
        the metric types and their respective domains"""
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if metric.metric_name == "data_profiler.profile_percent_diff":
            dependencies["data_profiler.profile_report"] = MetricConfiguration(
                metric_name="data_profiler.profile_report",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs=metric.metric_value_kwargs,
            )
            dependencies["data_profiler.profile_diff"] = MetricConfiguration(
                metric_name="data_profiler.profile_diff",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs=metric.metric_value_kwargs,
            )

        return dependencies
Example #5
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        dependencies["column.distinct_values.count"] = MetricConfiguration(
            metric_name="column.distinct_values.count",
            metric_domain_kwargs=metric.metric_domain_kwargs,
        )

        dependencies[
            "column_values.nonnull.unexpected_count"] = MetricConfiguration(
                metric_name="column_values.nonnull.unexpected_count",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )

        return dependencies
Example #6
def get_table_columns_metric(
    engine: ExecutionEngine,
) -> Tuple[MetricConfiguration, dict]:
    resolved_metrics: dict = {}

    results: dict

    table_column_types_metric: MetricConfiguration = MetricConfiguration(
        metric_name="table.column_types",
        metric_domain_kwargs=dict(),
        metric_value_kwargs={
            "include_nested": True,
        },
        metric_dependencies=None,
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(table_column_types_metric, ))
    resolved_metrics.update(results)

    table_columns_metric: MetricConfiguration = MetricConfiguration(
        metric_name="table.columns",
        metric_domain_kwargs=dict(),
        metric_value_kwargs=None,
        metric_dependencies={
            "table.column_types": table_column_types_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(table_columns_metric, ), metrics=resolved_metrics)
    resolved_metrics.update(results)

    return table_columns_metric, resolved_metrics
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        bins = metric.metric_value_kwargs.get("bins",
                                              cls.default_kwarg_values["bins"])
        n_bins = metric.metric_value_kwargs.get(
            "n_bins", cls.default_kwarg_values["n_bins"])
        allow_relative_error = metric.metric_value_kwargs[
            "allow_relative_error"]

        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if bins == "uniform":
            dependencies["column.min"] = MetricConfiguration(
                metric_name="column.min",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )
            dependencies["column.max"] = MetricConfiguration(
                metric_name="column.max",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )
        elif bins in ["ntile", "quantile", "percentile"]:
            dependencies["column.quantile_values"] = MetricConfiguration(
                metric_name="column.quantile_values",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs={
                    "quantiles":
                    np.linspace(start=0, stop=1, num=n_bins + 1).tolist(),
                    "allow_relative_error":
                    allow_relative_error,
                },
            )
        elif bins == "auto":
            dependencies["column_values.nonnull.count"] = MetricConfiguration(
                metric_name="column_values.nonnull.count",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )
            dependencies["column.quantile_values"] = MetricConfiguration(
                metric_name="column.quantile_values",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs={
                    "quantiles": (0.0, 0.25, 0.75, 1.0),
                    "allow_relative_error": allow_relative_error,
                },
            )
        else:
            raise ValueError("Invalid parameter for bins argument")

        return dependencies
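As a small worked illustration of the "ntile"/"quantile"/"percentile" branch above (hypothetical n_bins value, not from the source), the quantile boundaries passed to "column.quantile_values" are evenly spaced:

import numpy as np

n_bins = 4  # hypothetical value
quantiles = np.linspace(start=0, stop=1, num=n_bins + 1).tolist()
print(quantiles)  # [0.0, 0.25, 0.5, 0.75, 1.0]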
def test_sa_batch_unexpected_condition_temp_table(caplog, sa):
    def validate_tmp_tables():
        temp_tables = [
            name
            for name in get_sqlite_temp_table_names(engine.engine)
            if name.startswith("ge_temp_")
        ]
        tables = [
            name
            for name in get_sqlite_table_names(engine.engine)
            if name.startswith("ge_temp_")
        ]
        assert len(temp_tables) == 0
        assert len(tables) == 0

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    validate_tmp_tables()

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    validate_tmp_tables()

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "unexpected_condition": condition_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )

    validate_tmp_tables()
Example #9
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        """This should return a dictionary:
        {
          "dependency_name": MetricConfiguration,
          ...
        }
        """
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if isinstance(execution_engine, SqlAlchemyExecutionEngine):
            dependencies["column_values.nonnull.count"] = MetricConfiguration(
                metric_name="column_values.nonnull.count",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )

        return dependencies
    def _build_table_column_name_to_inferred_semantic_domain_type_map(
        self,
        batch_ids: List[str],
        validator: "Validator",  # noqa: F821
        column_names: List[str],
    ) -> None:
        column_types_dict_list: List[Dict[str, Any]] = validator.get_metric(
            metric=MetricConfiguration(
                metric_name="table.column_types",
                metric_domain_kwargs={
                    "batch_id": batch_ids[-1],  # active_batch_id
                },
                metric_value_kwargs={
                    "include_nested": True,
                },
                metric_dependencies=None,
            )
        )

        column_name: str
        self._table_column_name_to_inferred_semantic_domain_type_map = {
            column_name: self._infer_semantic_domain_type_from_table_column_type(
                column_types_dict_list=column_types_dict_list,
                column_name=column_name,
            ).semantic_domain_type
            for column_name in column_names
        }
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        """Returns a dictionary of given metric names and their corresponding configuration, specifying the metric
        types and their respective domains"""
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        table_domain_kwargs: dict = {
            k: v
            for k, v in metric.metric_domain_kwargs.items() if k != "column"
        }
        dependencies["table.column_types"] = MetricConfiguration(
            metric_name="table.column_types",
            metric_domain_kwargs=table_domain_kwargs,
            metric_value_kwargs={
                "include_nested": True,
            },
            metric_dependencies=None,
        )

        return dependencies
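To make the domain-kwargs filtering above concrete, here is a tiny sketch with hypothetical kwargs: the "column" key is dropped so that the dependency is computed against the whole table rather than a single column.

metric_domain_kwargs = {"column": "user_id", "batch_id": "my_batch_id"}  # hypothetical
table_domain_kwargs = {
    k: v for k, v in metric_domain_kwargs.items() if k != "column"
}
print(table_domain_kwargs)  # {'batch_id': 'my_batch_id'}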
    def get_validation_dependencies(
        self,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ) -> dict:

        dependencies = super().get_validation_dependencies(
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )
        metric_kwargs = get_metric_kwargs(
            metric_name="column_values.string_integers.increasing.map",
            configuration=configuration,
            runtime_configuration=runtime_configuration,
        )

        dependencies["metrics"][
            "column_values.string_integers.increasing.map"] = MetricConfiguration(
                metric_name="column_values.string_integers.increasing.map",
                metric_domain_kwargs=metric_kwargs["metric_domain_kwargs"],
                metric_value_kwargs=metric_kwargs["metric_value_kwargs"],
            )

        return dependencies
Example #13
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[Dict] = None,
    ):
        """Returns a dictionary of given metric names and their corresponding configuration,
        specifying the metric types and their respective domains"""
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if isinstance(
            execution_engine, (SqlAlchemyExecutionEngine, SparkDFExecutionEngine)
        ):
            dependencies["column.value_counts"] = MetricConfiguration(
                metric_name="column.value_counts",
                metric_domain_kwargs=metric.metric_domain_kwargs,
                metric_value_kwargs={
                    "sort": "value",
                    "collate": None,
                },
            )

        return dependencies
Example #14
def build_categorical_partition_object(execution_engine, domain_kwargs, sort="value"):
    """Convenience method for building a partition object on categorical data from a dataset and column

    Args:
        execution_engine (ExecutionEngine): the execution engine with which to compute the partition
        domain_kwargs (dict): The domain kwargs describing the domain for which to compute the partition
        sort (string): must be one of "value", "count", or "none".
            - if "value" then values in the resulting partition object will be sorted lexigraphically
            - if "count" then values will be sorted according to descending count (frequency)
            - if "none" then values will not be sorted

    Returns:
        A new partition_object::

        {
            "values": (list) the categorical values for which each weight applies,
            "weights": (list) The densities of the values implied by the partition.
        }
        See :ref:`partition_object`.
    """
    counts_configuration = MetricConfiguration(
        "column.partition",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "sort": sort,
        },
        metric_dependencies=None,
    )
    nonnull_configuration = MetricConfiguration(
        "column_values.nonnull.count",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs=None,
        metric_dependencies=None,
    )
    metrics = execution_engine.resolve_metrics(
        (counts_configuration, nonnull_configuration)
    )

    return {
        "values": list(metrics[counts_configuration.id].index),
        "weights": list(
            np.array(metrics[counts_configuration.id])
            / metrics[nonnull_configuration.id]
        ),
    }
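A small worked example of the weights computation above, with hypothetical counts (not from the source): if the "column.partition" counts are 3, 2, and 1 and the non-null count is 6, each weight is the count normalized by that total.

import numpy as np

counts = np.array([3, 2, 1])  # hypothetical per-value counts
nonnull_count = 6             # hypothetical non-null row count
weights = list(counts / nonnull_count)
print(weights)  # [0.5, 0.3333333333333333, 0.16666666666666666]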
Example #15
    def head(self, n_rows=5, fetch_all=False):
        # FIXME - we should use a Validator after resolving circularity
        # Validator(self._data.execution_engine, batches=(self,)).get_metric(MetricConfiguration("table.head", {"batch_id": self.id}, {"n_rows": n_rows, "fetch_all": fetch_all}))
        metric = MetricConfiguration(
            "table.head",
            {"batch_id": self.id},
            {"n_rows": n_rows, "fetch_all": fetch_all},
        )
        return self._data.execution_engine.resolve_metrics((metric,))[metric.id]
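A hypothetical usage sketch, assuming batch is an instance of the class defining head above and its data is backed by an execution engine:

# Preview the first three rows; with fetch_all=False only n_rows are requested
# via the "table.head" metric.
preview = batch.head(n_rows=3, fetch_all=False)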
Example #16
def test_resolve_metric_bundle_with_nonexistent_metric():
    df = pd.DataFrame({"a": [1, 2, 3, None]})

    # Building engine and configurations in attempt to resolve metrics
    engine = PandasExecutionEngine(batch_data_dict={"made_up_id": df})
    mean = MetricConfiguration(
        metric_name="column.i_don't_exist",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    stdev = MetricConfiguration(
        metric_name="column.nonexistent",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metrics = (mean, stdev)

    with pytest.raises(ge_exceptions.MetricProviderError) as e:
        # noinspection PyUnusedLocal
        metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)
def test_resolve_metric_bundle_with_nonexistent_metric(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame({
            "a": [1, 2, 1, 2, 3, 3],
            "b": [4, 4, 4, 4, 4, 4]
        }, ),
        batch_id="1234",
    )

    desired_metric_1 = MetricConfiguration(
        metric_name="column_values.unique",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.does_not_exist",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )

    # Ensuring a metric provider error is raised if metric does not exist
    with pytest.raises(ge_exceptions.MetricProviderError) as e:
        # noinspection PyUnusedLocal
        res = engine.resolve_metrics(metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ))
        print(e)
Example #18
def test_resolve_metric_bundle():
    df = pd.DataFrame({"a": [1, 2, 3, None]})

    # Building engine and configurations in attempt to resolve metrics
    engine = PandasExecutionEngine(batch_data_dict={"made-up-id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(metrics_to_resolve=desired_metrics,
                                     metrics=metrics)
    metrics.update(results)

    # Ensuring metrics have been properly resolved
    assert (metrics[("column.mean", "column=a",
                     ())] == 2.0), "mean metric not properly computed"
    assert metrics[("column.standard_deviation", "column=a",
                    ())] == 1.0, ("standard deviation "
                                  "metric not properly computed")
def test_resolve_metrics_with_extraneous_value_key():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)

    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    # Ensuring that an unused value key will not mess up computation
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3, 4, 5]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )

    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(metrics_to_resolve=desired_metrics,
                                     metrics=metrics)
    metrics.update(results)

    # Ensuring extraneous value key did not change computation
    assert (metrics[("column.standard_deviation", "column=a",
                     "value_set=[1, 2, 3, 4, 5]")] == 1.0)
Example #20
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )
        table_domain_kwargs: dict = {
            k: v
            for k, v in metric.metric_domain_kwargs.items() if k != "column"
        }
        dependencies["table.column_types"] = MetricConfiguration(
            metric_name="table.column_types",
            metric_domain_kwargs=table_domain_kwargs,
            metric_value_kwargs={
                "include_nested": True,
            },
            metric_dependencies=None,
        )
        dependencies["table.columns"] = MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs=table_domain_kwargs,
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
        dependencies["table.row_count"] = MetricConfiguration(
            metric_name="table.row_count",
            metric_domain_kwargs=table_domain_kwargs,
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
        return dependencies
Example #21
    def get_table_row_counts(
        self,
        validator: Optional["Validator"] = None,  # noqa: F821
        batch_ids: Optional[List[str]] = None,
        variables: Optional[ParameterContainer] = None,
    ) -> Dict[str, int]:
        if validator is None:
            validator = self.get_validator(variables=variables)

        if batch_ids is None:
            batch_ids = self.get_batch_ids(variables=variables)

        batch_id: str

        metric_configurations_by_batch_id: Dict[str, List[MetricConfiguration]] = {
            batch_id: [
                MetricConfiguration(
                    metric_name="table.row_count",
                    metric_domain_kwargs={
                        "batch_id": batch_id,
                    },
                    metric_value_kwargs={
                        "include_nested": True,
                    },
                    metric_dependencies=None,
                )
            ]
            for batch_id in batch_ids
        }

        resolved_metrics_by_batch_id: Dict[
            str, Dict[Tuple[str, str, str], Any]
        ] = get_resolved_metrics_by_key(
            validator=validator,
            metric_configurations_by_key=metric_configurations_by_batch_id,
        )

        batch_id: str
        resolved_metrics: Dict[Tuple[str, str, str], Any]
        metric_value: Any
        table_row_count_lists_by_batch_id: Dict[str, List[int]] = {
            batch_id: [metric_value for metric_value in resolved_metrics.values()]
            for batch_id, resolved_metrics in resolved_metrics_by_batch_id.items()
        }
        table_row_counts_by_batch_id: Dict[str, int] = {
            batch_id: metric_value[0]
            for batch_id, metric_value in table_row_count_lists_by_batch_id.items()
        }

        return table_row_counts_by_batch_id
def test_resolve_metric_bundle_with_nonexistent_metric(sa):
    engine = build_sa_engine(
        pd.DataFrame({
            "a": [1, 2, 1, 2, 3, 3],
            "b": [4, 4, 4, 4, 4, 4]
        }), sa)

    desired_metric_1 = MetricConfiguration(
        metric_name="column_values.unique",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=None,
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.does_not_exist",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=None,
    )

    # Ensuring a metric provider error is raised if metric does not exist
    with pytest.raises(ge_exceptions.MetricProviderError) as e:
        res = engine.resolve_metrics(metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ))
        print(e)
Example #23
def test_alice_columnar_table_single_batch_batches_are_accessible(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    """
    What does this test and why?
    Batches created in the multibatch_generic_csv_generator fixture should be available using the
    multibatch_generic_csv_generator_context
    This test most likely duplicates tests elsewhere, but it is more of a test of the configurable fixture.
    """

    context: DataContext = alice_columnar_table_single_batch_context

    datasource_name: str = "alice_columnar_table_single_batch_datasource"
    data_connector_name: str = "alice_columnar_table_single_batch_data_connector"
    data_asset_name: str = "alice_columnar_table_single_batch_data_asset"

    datasource: Datasource = cast(Datasource,
                                  context.datasources[datasource_name])
    data_connector: DataConnector = datasource.data_connectors[
        data_connector_name]

    file_list: List[str] = [
        alice_columnar_table_single_batch["sample_data_relative_path"]
    ]

    assert (
        data_connector._get_data_reference_list_from_cache_by_data_asset_name(
            data_asset_name=data_asset_name) == file_list)

    batch_request_1: BatchRequest = BatchRequest(
        datasource_name=datasource_name,
        data_connector_name=data_connector_name,
        data_asset_name=data_asset_name,
        data_connector_query={
            "index": -1,
        },
    )
    # Should give most recent batch
    validator_1: Validator = context.get_validator(
        batch_request=batch_request_1,
        create_expectation_suite_with_name="my_expectation_suite_name_1",
    )
    metric_max: int = validator_1.get_metric(
        MetricConfiguration("column.max",
                            metric_domain_kwargs={"column": "event_type"}))
    assert metric_max == 73
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Find the column suffix for each column and return all domains matching the specified suffix.
        """
        column_name_suffixes: Union[str, Iterable,
                                    List[str]] = self.column_name_suffixes
        if isinstance(column_name_suffixes, str):
            column_name_suffixes = [column_name_suffixes]
        else:
            if not isinstance(column_name_suffixes, (Iterable, List)):
                raise ValueError(
                    "Unrecognized column_name_suffixes directive -- must be a list or a string."
                )

        batch_id: str = self.get_batch_id(variables=variables)
        table_column_names: List[str] = self.get_validator(
            variables=variables).get_metric(metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_id,
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            ))

        candidate_column_names: List[str] = list(
            filter(
                lambda candidate_column_name: candidate_column_name.endswith(
                    tuple(column_name_suffixes)),
                table_column_names,
            ))

        column_name: str
        domains: List[Domain] = [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={
                    "column": column_name,
                },
            ) for column_name in candidate_column_names
        ]

        return domains
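The suffix filter used above can be illustrated in isolation with hypothetical column names and suffixes (not from the source):

table_column_names = ["user_id", "event_ts", "account_id", "amount"]  # hypothetical
column_name_suffixes = ["_id"]                                        # hypothetical

candidate_column_names = list(
    filter(
        lambda candidate_column_name: candidate_column_name.endswith(
            tuple(column_name_suffixes)
        ),
        table_column_names,
    )
)
print(candidate_column_names)  # ['user_id', 'account_id']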
Example #25
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Find the semantic column type for each column and return all domains matching the specified type or types.
        """
        batch_ids: List[str] = self.get_batch_ids(variables=variables)
        table_column_names: List[str] = self.get_validator(
            variables=variables
        ).get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_ids[-1],  # active_batch_id
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )

        # First check the column name ends in "_id".
        candidate_column_names: List[str] = list(
            filter(
                lambda candidate_column_name: candidate_column_name.endswith(
                    tuple(self.column_name_suffixes)
                ),
                table_column_names,
            )
        )

        column_name: str
        domains: List[Domain] = build_domains_from_column_names(
            rule_name=rule_name,
            column_names=candidate_column_names,
            domain_type=self.domain_type,
            table_column_name_to_inferred_semantic_domain_type_map=None,
        )

        return domains
    def _get_evaluation_dependencies(
        cls,
        metric: MetricConfiguration,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        dependencies: dict = super()._get_evaluation_dependencies(
            metric=metric,
            configuration=configuration,
            execution_engine=execution_engine,
            runtime_configuration=runtime_configuration,
        )

        if metric.metric_name == "column_values.value_length.equals.condition":
            dependencies["column_values.value_length.map"] = MetricConfiguration(
                metric_name="column_values.value_length.map",
                metric_domain_kwargs=metric.metric_domain_kwargs,
            )

        return dependencies
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Obtains and returns domains for all columns of a table (or for configured columns, if they exist in the table).
        """
        batch_id: str = self.get_batch_id(variables=variables)
        table_columns: List[str] = self.get_validator(variables=variables).get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_id,
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )
        if self.column_names is None:
            self.column_names = table_columns
        else:
            column_name: str
            for column_name in self.column_names:
                if column_name not in table_columns:
                    raise ge_exceptions.ProfilerExecutionError(
                        message=f'Error: The column "{column_name}" in BatchData does not exist.'
                    )

        column_name: str
        domains: List[Domain] = [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={
                    "column": column_name,
                },
            )
            for column_name in self.column_names
        ]

        return domains
    def _generate_metric_configurations_to_check_cardinality(
        self,
        batch_ids: List[str],
        column_names: List[str],
    ) -> Dict[str, List[MetricConfiguration]]:
        """Generate metric configurations used to compute metrics for checking cardinality.

        Args:
            batch_ids: List of batch_ids used to create metric configurations.
            column_names: List of column_names used to create metric configurations.

        Returns:
            Dictionary of the form {
                "my_column_name": List[MetricConfiguration],
            }
        """

        cardinality_limit_mode: Union[
            AbsoluteCardinalityLimit, RelativeCardinalityLimit
        ] = self.cardinality_checker.cardinality_limit_mode

        batch_id: str
        metric_configurations: Dict[str, List[MetricConfiguration]] = {
            column_name: [
                MetricConfiguration(
                    metric_name=cardinality_limit_mode.metric_name_defining_limit,
                    metric_domain_kwargs={
                        "column": column_name,
                        "batch_id": batch_id,
                    },
                    metric_value_kwargs=None,
                    metric_dependencies=None,
                )
                for batch_id in batch_ids
            ]
            for column_name in column_names
        }

        return metric_configurations
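A shape-only sketch of what the comprehension above returns, using hypothetical inputs and plain dicts in place of MetricConfiguration objects: one entry per batch, grouped by column name.

column_names = ["a", "b"]                      # hypothetical
batch_ids = ["batch_1", "batch_2"]             # hypothetical
metric_name = "column.distinct_values.count"   # stand-in for metric_name_defining_limit

metric_configurations = {
    column_name: [
        {"metric_name": metric_name, "column": column_name, "batch_id": batch_id}
        for batch_id in batch_ids
    ]
    for column_name in column_names
}
print(len(metric_configurations["a"]))  # 2: one configuration per batch_id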
Example #30
    def _generate_metric_configurations(
        map_metric_name: str,
        batch_ids: List[str],
        column_names: List[str],
    ) -> Dict[str, List[MetricConfiguration]]:
        """
        Generate metric configurations used to compute "unexpected_count" values for "map_metric_name".

        Args:
            map_metric_name: the name of a map metric (must be a supported and registered map metric); the suffix
            ".unexpected_count" will be appended to "map_metric_name" to be used in MetricConfiguration to get values.
            batch_ids: List of batch_ids used to create metric configurations.
            column_names: List of column_names used to create metric configurations.

        Returns:
            Dictionary of the form {
                column_name: List[MetricConfiguration],
            }
        """
        column_name: str
        batch_id: str
        metric_configurations: Dict[str, List[MetricConfiguration]] = {
            column_name: [
                MetricConfiguration(
                    metric_name=f"{map_metric_name}.unexpected_count",
                    metric_domain_kwargs={
                        "column": column_name,
                        "batch_id": batch_id,
                    },
                    metric_value_kwargs=None,
                    metric_dependencies=None,
                )
                for batch_id in batch_ids
            ]
            for column_name in column_names
        }

        return metric_configurations