def test_parse_validation_graph_with_bad_metrics_args():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    validator = Validator(execution_engine=engine)
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            execution_engine=engine,
        )
        for metric_configuration in validation_dependencies["metrics"].values():
            validator.build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = validator._parse_validation_graph(
        validation_graph=graph, metrics=("nonexistent", "NONE")
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
def test_parse_validation_graph():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(configuration, engine)
        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    ready_metrics, needed_metrics = Validator(engine)._parse_validation_graph(
        validation_graph=graph, metrics=dict()
    )
    assert len(ready_metrics) == 2 and len(needed_metrics) == 9
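# A short companion note (inferred from the two tests around it, not a
# separate guarantee): `_parse_validation_graph` partitions the graph's metric
# configurations into "ready" metrics, whose dependencies are all satisfied by
# the supplied `metrics` mapping, and "needed" metrics, which still have
# unresolved dependencies. With nothing resolved yet -- an empty dict or an
# unusable tuple alike -- only the dependency-free leaves are ready: 2 of the
# 11 metrics in this graph, with the other 9 still needed.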
def test_populate_dependencies():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )
        for metric_configuration in validation_dependencies["metrics"].values():
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=metric_configuration,
                configuration=configuration,
            )
    assert len(graph.edges) == 10
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        # noinspection PyUnusedLocal
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )
        # A nonexistent metric name must raise MetricProviderError when the
        # dependency graph is built.
        caught_exception = None
        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            caught_exception = e
        assert isinstance(caught_exception, ge_exceptions.MetricProviderError)
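# A minimal companion sketch (not part of the original suite; the metric name
# and kwargs are illustrative): the `metrics` mappings used in these tests are
# keyed by `MetricConfiguration.id`, a three-part tuple of (metric name,
# domain-kwargs id, value-kwargs id) -- hence the
# `Dict[Tuple[str, str, str], ...]` annotations in the next test.
def test_metric_configuration_id_shape_sketch():
    metric_configuration = MetricConfiguration(
        "column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    metric_name, domain_kwargs_id, value_kwargs_id = metric_configuration.id
    assert metric_name == "column.max"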
def test_resolve_validation_graph_with_bad_config_catch_exceptions_true(
    basic_datasource,
):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})
    batch = basic_datasource.get_single_batch_from_batch_request(
        RuntimeBatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "data_asset_name": "IN_MEMORY_DATA_ASSET",
                "runtime_parameters": {
                    "batch_data": df,
                },
                "batch_identifiers": {
                    "pipeline_stage_name": 0,
                    "airflow_run_id": 0,
                    "custom_key_0": 0,
                },
            }
        )
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "not_in_table", "min_value": 1, "max_value": 29},
    )
    runtime_configuration = {
        "catch_exceptions": True,
        "result_format": {"result_format": "BASIC"},
    }
    execution_engine = PandasExecutionEngine()
    validator = Validator(execution_engine=execution_engine, batches=[batch])
    expectation_impl = get_expectation_impl(
        expectation_configuration.expectation_type
    )
    validation_dependencies = expectation_impl().get_validation_dependencies(
        expectation_configuration, execution_engine, runtime_configuration
    )["metrics"]
    graph = ValidationGraph()
    for metric_configuration in validation_dependencies.values():
        validator.build_metric_dependency_graph(
            graph=graph,
            execution_engine=execution_engine,
            metric_configuration=metric_configuration,
            configuration=expectation_configuration,
            runtime_configuration=runtime_configuration,
        )
    metrics: Dict[Tuple[str, str, str], Any] = {}
    aborted_metrics_info: Dict[
        Tuple[str, str, str],
        Dict[str, Union[MetricConfiguration, Set[ExceptionInfo], int]],
    ] = validator.resolve_validation_graph(
        graph=graph,
        metrics=metrics,
        runtime_configuration=runtime_configuration,
    )
    assert len(aborted_metrics_info) == 1
    aborted_metric_info_item = list(aborted_metrics_info.values())[0]
    assert aborted_metric_info_item["num_failures"] == MAX_METRIC_COMPUTATION_RETRIES
    assert len(aborted_metric_info_item["exception_info"]) == 1
    exception_info = next(iter(aborted_metric_info_item["exception_info"]))
    assert (
        exception_info["exception_message"]
        == 'Error: The column "not_in_table" in BatchData does not exist.'
    )
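# A brief contract note (hedged -- only the failure path is asserted above):
# `resolve_validation_graph` fills the caller-supplied `metrics` dict in place
# with each successfully resolved metric value, keyed by metric id, and
# returns info about metrics it abandoned after MAX_METRIC_COMPUTATION_RETRIES
# consecutive failures. On a clean run -- e.g. "column": "a" instead of
# "not_in_table" -- the expected shape would be:
#
#     aborted = validator.resolve_validation_graph(graph=graph, metrics=metrics)
#     assert aborted == {}   # nothing failed
#     assert metrics         # resolved values now live in the passed-in dict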
def get_validation_dependencies(
    self,
    configuration: Optional[ExpectationConfiguration] = None,
    execution_engine: Optional[ExecutionEngine] = None,
    runtime_configuration: Optional[dict] = None,
):
    all_dependencies = super().get_validation_dependencies(
        configuration, execution_engine, runtime_configuration
    )
    dependencies = all_dependencies["metrics"]
    partition_object = configuration.kwargs["partition_object"]
    domain_kwargs = configuration.get_domain_kwargs()
    is_categorical = None
    bins = None
    if partition_object is None:
        if configuration.kwargs.get(
            "bucketize_data", self.default_kwarg_values["bucketize_data"]
        ):
            is_categorical = False
            partition_metric_configuration = MetricConfiguration(
                "column.partition",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={
                    "bins": "auto",
                    "allow_relative_error": False,
                },
            )
            #
            # Note: 20201116 - JPC - the execution engine doesn't provide the
            # capability to evaluate dependencies, so we use a validator.
            #
            validator = Validator(execution_engine=execution_engine)
            graph = ValidationGraph()
            validator.build_metric_dependency_graph(
                graph=graph,
                child_node=partition_metric_configuration,
                configuration=configuration,
                execution_engine=execution_engine,
            )
            bins = validator.resolve_validation_graph(graph, metrics=dict())[
                partition_metric_configuration.id
            ]
            hist_metric_configuration = MetricConfiguration(
                "column.histogram",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"bins": tuple(bins)},
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs=dict(),
            )
            #
            # NOTE 20201117 - JPC - Would prefer not to include
            # partition_metric_configuration here, since we have already
            # evaluated it, and its result is in the kwargs for the histogram.
            # However, currently the dependencies' configurations are not
            # passed to the _validate method.
            #
            dependencies["column.partition"] = partition_metric_configuration
            dependencies["column.histogram"] = hist_metric_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration
        else:
            is_categorical = True
            counts_configuration = MetricConfiguration(
                "column.value_counts",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"sort": "value"},
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
            )
            dependencies["column.value_counts"] = counts_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration
    if is_categorical is True or is_valid_categorical_partition_object(
        partition_object
    ):
        dependencies["column.value_counts"] = MetricConfiguration(
            "column.value_counts",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"sort": "value"},
        )
        dependencies["column_values.nonnull.count"] = MetricConfiguration(
            "column_values.nonnull.count", domain_kwargs
        )
    else:
        if bins is None:
            # bins were not computed above, so the user must have supplied a
            # continuous partition_object; validate it and take its bins.
            if not is_valid_partition_object(partition_object):
                raise ValueError("Invalid partition_object provided")
            bins = partition_object["bins"]
        hist_metric_configuration = MetricConfiguration(
            "column.histogram",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"bins": bins},
        )
        nonnull_configuration = MetricConfiguration(
            "column_values.nonnull.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs=dict(),
        )
        dependencies["column.histogram"] = hist_metric_configuration
        dependencies["column_values.nonnull.count"] = nonnull_configuration
        below_partition = MetricConfiguration(
            "column_values.between.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"max_value": bins[0]},
        )
        above_partition = MetricConfiguration(
            "column_values.between.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"min_value": bins[-1], "strict_min": True},
        )
        dependencies["below_partition"] = below_partition
        dependencies["above_partition"] = above_partition
    return all_dependencies