def test_batches_are_accessible(
    monkeypatch,
    multibatch_generic_csv_generator,
    multibatch_generic_csv_generator_context,
):
    """
    What does this test and why?
    Batches created in the multibatch_generic_csv_generator fixture should be
    available using the multibatch_generic_csv_generator_context.
    This test most likely duplicates tests elsewhere, but it is more of a test
    of the configurable fixture.
    """

    context: DataContext = multibatch_generic_csv_generator_context
    data_relative_path = "../data"
    data_path = os.path.join(context.root_directory, data_relative_path)
    datasource_name = "generic_csv_generator"
    data_connector_name = "daily_data_connector"
    asset_name = "daily_data_asset"
    datasource = context.datasources[datasource_name]
    data_connector = datasource.data_connectors[data_connector_name]

    total_batches: int = 20
    file_list = multibatch_generic_csv_generator(
        data_path=data_path, num_event_batches=total_batches
    )

    assert (
        data_connector._get_data_reference_list_from_cache_by_data_asset_name(
            data_asset_name=asset_name
        )
        == file_list
    )

    batch_request_1 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -1,
        },
    )
    # Should give most recent batch
    validator_1 = context.get_validator(
        batch_request=batch_request_1,
        create_expectation_suite_with_name="my_expectation_suite_name_1",
    )
    metric_max = validator_1.get_metric(
        MetricConfiguration(
            "column.max", metric_domain_kwargs={"column": "batch_num"}
        )
    )
    assert metric_max == total_batches
    metric_value_set = validator_1.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        )
    )
    assert metric_value_set == {"category0", "category1", "category2"}

    batch_request_2 = BatchRequest(
        datasource_name="generic_csv_generator",
        data_connector_name="daily_data_connector",
        data_asset_name="daily_data_asset",
        data_connector_query={
            "index": -2,
        },
    )
    validator_2 = context.get_validator(
        batch_request=batch_request_2,
        create_expectation_suite_with_name="my_expectation_suite_name_2",
    )
    metric_max = validator_2.get_metric(
        MetricConfiguration(
            "column.max", metric_domain_kwargs={"column": "batch_num"}
        )
    )
    assert metric_max == total_batches - 1
    metric_value_set = validator_2.get_metric(
        MetricConfiguration(
            "column.distinct_values",
            metric_domain_kwargs={"column": "string_cardinality_3"},
        )
    )
    assert metric_value_set == {"category0", "category1", "category2"}

    for batch_num in range(1, total_batches + 1):
        batch_request = BatchRequest(
            datasource_name="generic_csv_generator",
            data_connector_name="daily_data_connector",
            data_asset_name="daily_data_asset",
            data_connector_query={
                "index": -batch_num,
            },
        )
        validator = context.get_validator(
            batch_request=batch_request,
            create_expectation_suite_with_name=f"my_expectation_suite_name__{batch_num}",
        )
        metric_max = validator.get_metric(
            MetricConfiguration(
                "column.max", metric_domain_kwargs={"column": "batch_num"}
            )
        )
        assert metric_max == (total_batches + 1) - batch_num
        metric_value_set = validator.get_metric(
            MetricConfiguration(
                "column.distinct_values",
                metric_domain_kwargs={"column": "string_cardinality_3"},
            )
        )
        assert metric_value_set == {"category0", "category1", "category2"}
def get_validation_dependencies(
    self,
    configuration: Optional[ExpectationConfiguration] = None,
    execution_engine: Optional[ExecutionEngine] = None,
    runtime_configuration: Optional[dict] = None,
):
    # This calls TableExpectation.get_validation_dependencies to set baseline
    # dependencies for the aggregate version of the expectation.
    dependencies = super(ColumnMapExpectation, self).get_validation_dependencies(
        configuration, execution_engine, runtime_configuration
    )

    # Only PandasExecutionEngine supports the column map version of the expectation.
    if isinstance(execution_engine, PandasExecutionEngine):
        column_name = configuration.kwargs.get("column")
        expected_type = configuration.kwargs.get("type_")
        metric_kwargs = get_metric_kwargs(
            configuration=configuration,
            metric_name="table.column_types",
            runtime_configuration=runtime_configuration,
        )
        metric_domain_kwargs = metric_kwargs.get("metric_domain_kwargs")
        metric_value_kwargs = metric_kwargs.get("metric_value_kwargs")
        table_column_types_configuration = MetricConfiguration(
            "table.column_types",
            metric_domain_kwargs=metric_domain_kwargs,
            metric_value_kwargs=metric_value_kwargs,
        )
        actual_column_types_list = execution_engine.resolve_metrics(
            [table_column_types_configuration]
        )[table_column_types_configuration.id]
        actual_column_type = [
            type_dict["type"]
            for type_dict in actual_column_types_list
            if type_dict["name"] == column_name
        ][0]

        # Only use the column map version if the column dtype is object.
        if actual_column_type.type.__name__ == "object_" and expected_type not in [
            "object",
            "object_",
            "O",
            None,
        ]:
            # This resets dependencies using ColumnMapExpectation.get_validation_dependencies.
            dependencies = super().get_validation_dependencies(
                configuration, execution_engine, runtime_configuration
            )

    # This adds the table.column_types dependency for both the aggregate and map
    # versions of the expectation.
    column_types_metric_kwargs = get_metric_kwargs(
        metric_name="table.column_types",
        configuration=configuration,
        runtime_configuration=runtime_configuration,
    )
    dependencies["metrics"]["table.column_types"] = MetricConfiguration(
        metric_name="table.column_types",
        metric_domain_kwargs=column_types_metric_kwargs["metric_domain_kwargs"],
        metric_value_kwargs=column_types_metric_kwargs["metric_value_kwargs"],
    )

    return dependencies
jan_feb_batch_filter: BatchFilter = build_batch_filter(
    data_connector_query_dict={
        "custom_filter_function": lambda batch_identifiers: int(batch_identifiers["month"]) < 3
    }
)
jan_feb_batch_definition_list: list = (
    jan_feb_batch_filter.select_from_data_connector_query(
        batch_definition_list=total_batch_definition_list
    )
)

# Get the highest max and lowest min between January and February.
cumulative_max = 0
cumulative_min = np.inf
for batch_definition in jan_feb_batch_definition_list:
    batch_id: str = batch_definition.id
    current_max = validator.get_metric(
        MetricConfiguration(
            "column.max",
            metric_domain_kwargs={"column": "fare_amount", "batch_id": batch_id},
        )
    )
    cumulative_max = max(cumulative_max, current_max)
    current_min = validator.get_metric(
        MetricConfiguration(
            "column.min",
            metric_domain_kwargs={"column": "fare_amount", "batch_id": batch_id},
        )
    )
    cumulative_min = min(cumulative_min, current_min)
def _self_check_fetch_batch(
    self,
    pretty_print: bool,
    example_data_reference: Any,
    data_asset_name: str,
):
    """
    Helper function for self_check() to retrieve a batch using example_data_reference
    and data_asset_name, printing helpful messages along the way. The first 5 rows of
    batch_data are printed by default.

    Args:
        pretty_print (bool): print to console?
        example_data_reference (Any): data_reference to retrieve
        data_asset_name (str): data_asset_name to retrieve
    """
    if pretty_print:
        print("\n\t\tFetching batch data...")

    batch_definition_list: List[
        BatchDefinition
    ] = self._map_data_reference_to_batch_definition_list(
        data_reference=example_data_reference,
        data_asset_name=data_asset_name,
    )
    assert len(batch_definition_list) == 1
    batch_definition: BatchDefinition = batch_definition_list[0]

    # _execution_engine might be None for some tests
    if batch_definition is None or self._execution_engine is None:
        return {}

    batch_data: Any
    batch_spec: BatchSpec
    batch_data, batch_spec, _ = self.get_batch_data_and_metadata(
        batch_definition=batch_definition
    )

    # Note: get_batch_data_and_metadata will have loaded the data into the
    # currently-defined execution engine. Consequently, when we build a Validator,
    # we do not need to specifically load the batch into it to resolve metrics.
    validator: Validator = Validator(execution_engine=batch_data.execution_engine)
    data: Any = validator.get_metric(
        metric=MetricConfiguration(
            metric_name="table.head",
            metric_domain_kwargs={
                "batch_id": batch_definition.id,
            },
            metric_value_kwargs={
                "n_rows": 5,
            },
        )
    )
    n_rows: int = validator.get_metric(
        metric=MetricConfiguration(
            metric_name="table.row_count",
            metric_domain_kwargs={
                "batch_id": batch_definition.id,
            },
        )
    )

    if pretty_print and data is not None:
        print("\n\t\tShowing 5 rows")
        print(data)

    return {
        "batch_spec": batch_spec,
        "n_rows": n_rows,
    }
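# For orientation, a hedged sketch of invoking the helper above directly; the
# data connector instance and the example file name are hypothetical, and in
# practice this runs as part of the data connector's self_check() report.
report = data_connector._self_check_fetch_batch(
    pretty_print=True,
    example_data_reference="2021-01-01_example.csv",  # hypothetical data_reference
    data_asset_name="daily_data_asset",               # hypothetical asset name
)
print(report.get("n_rows"))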
def test_map_value_set_spark(spark_session, basic_spark_df_execution_engine):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 3, None]},
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 0}

    # We run the same computation again, this time with None being replaced by nan
    # instead of NULL, to demonstrate this behavior.
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df = spark_session.createDataFrame(df)
    engine = basic_spark_df_execution_engine
    engine.load_batch_data(batch_id="my_id", batch_data=df)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 1}
def get_validation_dependencies(
    self,
    configuration: Optional[ExpectationConfiguration] = None,
    execution_engine: Optional[ExecutionEngine] = None,
    runtime_configuration: Optional[dict] = None,
):
    # This calls TableExpectation.get_validation_dependencies to set baseline
    # dependencies for the aggregate version of the expectation.
    # We need to keep this as super(ColumnMapExpectation, self), which calls
    # TableExpectation.get_validation_dependencies instead of
    # ColumnMapExpectation.get_validation_dependencies. This is because the map
    # version of this expectation is only supported for Pandas, so we want the
    # aggregate version for the other backends.
    dependencies = super(ColumnMapExpectation, self).get_validation_dependencies(
        configuration, execution_engine, runtime_configuration
    )

    # Only PandasExecutionEngine supports the column map version of the expectation.
    if isinstance(execution_engine, PandasExecutionEngine):
        column_name = configuration.kwargs.get("column")
        expected_types_list = configuration.kwargs.get("type_list")
        metric_kwargs = get_metric_kwargs(
            configuration=configuration,
            metric_name="table.column_types",
            runtime_configuration=runtime_configuration,
        )
        metric_domain_kwargs = metric_kwargs.get("metric_domain_kwargs")
        metric_value_kwargs = metric_kwargs.get("metric_value_kwargs")
        table_column_types_configuration = MetricConfiguration(
            "table.column_types",
            metric_domain_kwargs=metric_domain_kwargs,
            metric_value_kwargs=metric_value_kwargs,
        )
        actual_column_types_list = execution_engine.resolve_metrics(
            [table_column_types_configuration]
        )[table_column_types_configuration.id]
        actual_column_type = [
            type_dict["type"]
            for type_dict in actual_column_types_list
            if type_dict["name"] == column_name
        ][0]

        # Only use the column map version if the column dtype is object.
        if (
            actual_column_type.type.__name__ == "object_"
            and expected_types_list is not None
        ):
            # This resets dependencies using ColumnMapExpectation.get_validation_dependencies.
            dependencies = super().get_validation_dependencies(
                configuration, execution_engine, runtime_configuration
            )

    # This adds the table.column_types dependency for both the aggregate and map
    # versions of the expectation.
    column_types_metric_kwargs = get_metric_kwargs(
        metric_name="table.column_types",
        configuration=configuration,
        runtime_configuration=runtime_configuration,
    )
    dependencies["metrics"]["table.column_types"] = MetricConfiguration(
        metric_name="table.column_types",
        metric_domain_kwargs=column_types_metric_kwargs["metric_domain_kwargs"],
        metric_value_kwargs=column_types_metric_kwargs["metric_value_kwargs"],
    )

    return dependencies
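# The two-argument super() call above is doing real work: super(ColumnMapExpectation, self)
# starts attribute lookup *above* ColumnMapExpectation in the MRO, so the TableExpectation
# (aggregate) implementation is used as the baseline, while the zero-argument super() inside
# the Pandas branch resolves normally. A minimal standalone sketch of that mechanism
# (toy classes, not Great Expectations'):
class Table:
    def deps(self):
        return ["table-level deps"]


class ColumnMap(Table):
    def deps(self):
        return ["column-map deps"]


class MyExpectation(ColumnMap):
    def deps(self):
        baseline = super(ColumnMap, self).deps()  # skips ColumnMap -> Table.deps
        fallback = super().deps()                 # normal lookup -> ColumnMap.deps
        return baseline, fallback


assert MyExpectation().deps() == (["table-level deps"], ["column-map deps"])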
def test_map_unique_sa_column_exists(sa):
    engine = _build_sa_engine(
        pd.DataFrame(
            {"a": [1, 2, 3, 3, None], "b": ["foo", "bar", "baz", "qux", "fish"]}
        ),
        sa,
    )

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # This is no longer a MAP_CONDITION because mssql does not support it.
    # Instead, it is a WINDOW_CONDITION.
    #
    # aggregate_fn = MetricConfiguration(
    #     metric_name="column_values.unique.unexpected_count.aggregate_fn",
    #     metric_domain_kwargs={"column": "a"},
    #     metric_value_kwargs=dict(),
    #     metric_dependencies={"unexpected_condition": condition_metric},
    # )
    # aggregate_fn_metrics = engine.resolve_metrics(
    #     metrics_to_resolve=(aggregate_fn,), metrics=metrics
    # )

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        # metric_dependencies={"metric_partial_fn": aggregate_fn},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,),
        metrics=metrics,
        # metrics=aggregate_fn_metrics
    )
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, "baz"), (3, "qux")]
def __init__(
    self,
    profile_dataset,
    excluded_expectations: list = None,
    ignored_columns: list = None,
    not_null_only: bool = False,
    primary_or_compound_key: list = None,
    semantic_types_dict: dict = None,
    table_expectations_only: bool = False,
    value_set_threshold: str = "MANY",
):
    """
    The UserConfigurableProfiler is used to build an expectation suite from a dataset. The profiler may be
    instantiated with or without a config, and the config may or may not contain a semantic_types dict. Once a
    profiler is instantiated, if config items change, a new profiler will be needed.

    TODO: Write an entry on how to use the profiler for the GE docs site.

    Args:
        profile_dataset: A Great Expectations Dataset or Validator object.
        excluded_expectations: A list of expectations to not include in the suite.
        ignored_columns: A list of columns for which you would like to NOT create expectations.
        not_null_only: Boolean, default False. By default, each column is evaluated for nullity. If the column
            values contain fewer than 50% null values, then the profiler will add
            `expect_column_values_to_not_be_null`; if greater than 50% it will add
            `expect_column_values_to_be_null`. If not_null_only is set to True, the profiler will add a not_null
            expectation irrespective of the percent nullity (and therefore will not add an
            `expect_column_values_to_be_null`).
        primary_or_compound_key: A list containing one or more columns which are a dataset's primary or
            compound key. This will create an `expect_column_values_to_be_unique` or
            `expect_compound_columns_to_be_unique` expectation. This will occur even if one or more of the
            primary_or_compound_key columns are specified in ignored_columns.
        semantic_types_dict: A dictionary where the keys are available semantic types
            (see profiler.base.profiler_semantic_types) and the values are lists of columns for which you would
            like to create semantic-type-specific expectations, e.g.:
                "semantic_types": {"value_set": ["state", "country"], "numeric": ["age", "amount_due"]}
        table_expectations_only: Boolean, default False. If True, this will only create the two table-level
            expectations available to this profiler (`expect_table_columns_to_match_ordered_list` and
            `expect_table_row_count_to_be_between`). If a primary_or_compound_key is specified, it will create
            a uniqueness expectation for that column as well.
        value_set_threshold: Takes a string from the following ordered list - "none", "one", "two", "very_few",
            "few", "many", "very_many", "unique". When the profiler runs without a semantic_types dict, each
            column is profiled for cardinality. This threshold determines the greatest cardinality for which to
            add `expect_column_values_to_be_in_set`. For example, if value_set_threshold is set to "unique", it
            will add a value_set expectation for every included column. If set to "few", it will add a value_set
            expectation for columns whose cardinality is one of "one", "two", "very_few" or "few". The default
            value is "many".
    For the purposes of comparing whether two tables are identical, it might make the most sense to set this
    to "unique".
    """
    self.column_info = {}
    self.profile_dataset = profile_dataset
    assert isinstance(self.profile_dataset, (Dataset, Validator, Batch))

    if isinstance(self.profile_dataset, Batch):
        self.profile_dataset = Validator(
            execution_engine=self.profile_dataset.data.execution_engine,
            batches=[self.profile_dataset],
        )
        self.all_table_columns = self.profile_dataset.get_metric(
            MetricConfiguration("table.columns", dict())
        )
    elif isinstance(self.profile_dataset, Validator):
        self.all_table_columns = self.profile_dataset.get_metric(
            MetricConfiguration("table.columns", dict())
        )
    else:
        self.all_table_columns = self.profile_dataset.get_table_columns()

    self.semantic_types_dict = semantic_types_dict
    assert isinstance(self.semantic_types_dict, (dict, type(None)))

    self.ignored_columns = ignored_columns or []
    assert isinstance(self.ignored_columns, list)

    self.excluded_expectations = excluded_expectations or []
    assert isinstance(self.excluded_expectations, list)

    assert isinstance(value_set_threshold, str), "value_set_threshold must be a string"
    self.value_set_threshold = value_set_threshold.upper()
    assert (
        self.value_set_threshold in OrderedProfilerCardinality.__members__
    ), f"value_set_threshold must be one of {[i for i in OrderedProfilerCardinality.__members__]}"

    self.not_null_only = not_null_only
    assert isinstance(self.not_null_only, bool)

    self.table_expectations_only = table_expectations_only
    assert isinstance(self.table_expectations_only, bool)
    if self.table_expectations_only is True:
        logger.info(
            "table_expectations_only is set to True. When used to build a suite, this profiler will ignore all "
            "columns and create expectations only at the table level. If you would also like to create expectations "
            "at the column level, you can instantiate a new profiler with table_expectations_only set to False."
        )

    self.primary_or_compound_key = primary_or_compound_key or []
    assert isinstance(self.primary_or_compound_key, list)

    if self.table_expectations_only:
        self.ignored_columns = self.all_table_columns

    if self.primary_or_compound_key:
        for column in self.primary_or_compound_key:
            if column not in self.all_table_columns:
                raise ValueError(
                    f"Column {column} not found. Please ensure that this column is in the "
                    f"{type(profile_dataset).__name__} if you would like to use it as a primary_or_compound_key."
                )

    included_columns = [
        column_name
        for column_name in self.all_table_columns
        if column_name not in self.ignored_columns
    ]

    for column_name in included_columns:
        self._add_column_cardinality_to_column_info(self.profile_dataset, column_name)
        self._add_column_type_to_column_info(self.profile_dataset, column_name)

    if self.semantic_types_dict is not None:
        self._validate_semantic_types_dict(self.profile_dataset)
        for column_name in included_columns:
            self._add_semantic_types_by_column_from_config_to_column_info(column_name)

    self.semantic_type_functions = {
        "DATETIME": self._build_expectations_datetime,
        "NUMERIC": self._build_expectations_numeric,
        "STRING": self._build_expectations_string,
        "VALUE_SET": self._build_expectations_value_set,
        "BOOLEAN": self._build_expectations_value_set,
    }
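# A minimal usage sketch for the constructor above; the `validator` object and the
# chosen arguments are hypothetical, and build_suite() is the profiler's
# suite-building entry point.
from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

profiler = UserConfigurableProfiler(
    profile_dataset=validator,        # an existing Validator with a loaded batch
    ignored_columns=["internal_id"],  # hypothetical column
    value_set_threshold="FEW",
)
suite = profiler.build_suite()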
def test_sparkdf_batch_aggregate_metrics(caplog, spark_session):
    import datetime

    engine = _build_spark_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), spark_session
    )

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        )
    )

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_1},
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_2},
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_3},
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": desired_metric_4},
    )
    start = datetime.datetime.now()
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    res = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    end = datetime.datetime.now()
    print(end - start)
    assert res[desired_metric_1.id] == 3
    assert res[desired_metric_2.id] == 1
    assert res[desired_metric_3.id] == 4
    assert res[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain.
    found_message = False
    for record in caplog.records:
        if record.message == "SparkDFExecutionEngine computed 4 metrics on domain_id ()":
            found_message = True
    assert found_message
def test_map_value_set_spark(spark_session):
    engine = _build_spark_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}), spark_session)

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: 0}

    # We run the same computation again, this time with None being replaced by nan
    # instead of NULL, to demonstrate this behavior.
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df = spark_session.createDataFrame(df)
    engine = SparkDFExecutionEngine(batch_data_dict={"my_id": df})

    condition_metric = MetricConfiguration(
        metric_name="column_values.in_set.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # Note: metric_dependencies is optional here in the config when called from a validator.
    aggregate_partial = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    metrics = engine.resolve_metrics(
        metrics_to_resolve=(aggregate_partial,), metrics=metrics
    )
    desired_metric = MetricConfiguration(
        metric_name="column_values.in_set.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3]},
        metric_dependencies={"metric_partial_fn": aggregate_partial},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results == {desired_metric.id: 1}
def test_z_score_under_threshold_spark(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {"a": [1, 2, 3, 3, None]},
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": mean},
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": stdev,
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
def test_z_score_under_threshold_pd():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
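# As a sanity check on the expected count above, a standalone pandas sketch of the
# double-sided z-score logic (no Great Expectations involved); the assertion holds
# for either ddof convention, so the exact standard-deviation definition does not
# matter here.
import pandas as pd

s = pd.Series([1, 2, 3, None])
z = (s - s.mean()) / s.std()  # pandas default is ddof=1
under = z.abs() < 2           # double_sided threshold of 2
assert int((~under & s.notnull()).sum()) == 0  # matches unexpected_count == 0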
def test_map_unique_spark_column_exists(spark_session):
    engine: SparkDFExecutionEngine = build_spark_engine(
        spark=spark_session,
        df=pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        batch_id="my_id",
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(condition_metric,), metrics=metrics
    )
    metrics.update(results)

    # unique is a *window* function, so it does not use the aggregate_fn version
    # of unexpected_count.
    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={
            "unexpected_condition": condition_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results[desired_metric.id] == [(3, "bar"), (3, "baz")]
def test_sa_batch_aggregate_metrics(caplog, sa):
    import datetime

    engine = build_sa_engine(
        pd.DataFrame({"a": [1, 2, 1, 2, 3, 3], "b": [4, 4, 4, 4, 4, 4]}), sa
    )

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min.aggregate_fn",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)

    desired_metric_1 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_1,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_2 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_2,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_3 = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_3,
            "table.columns": table_columns_metric,
        },
    )
    desired_metric_4 = MetricConfiguration(
        metric_name="column.min",
        metric_domain_kwargs={"column": "b"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "metric_partial_fn": desired_metric_4,
            "table.columns": table_columns_metric,
        },
    )
    caplog.clear()
    caplog.set_level(logging.DEBUG, logger="great_expectations")
    start = datetime.datetime.now()
    results = engine.resolve_metrics(
        metrics_to_resolve=(
            desired_metric_1,
            desired_metric_2,
            desired_metric_3,
            desired_metric_4,
        ),
        metrics=metrics,
    )
    metrics.update(results)
    end = datetime.datetime.now()
    print("t1")
    print(end - start)
    assert results[desired_metric_1.id] == 3
    assert results[desired_metric_2.id] == 1
    assert results[desired_metric_3.id] == 4
    assert results[desired_metric_4.id] == 4

    # Check that all four of these metrics were computed on a single domain.
    found_message = False
    for record in caplog.records:
        if record.message == "SqlAlchemyExecutionEngine computed 4 metrics on domain_id ()":
            found_message = True
    assert found_message
def test_map_unique_spark_column_exists(spark_session):
    engine = _build_spark_engine(
        pd.DataFrame(
            {
                "a": [1, 2, 3, 3, 4, None],
                "b": [None, "foo", "bar", "baz", "qux", "fish"],
            }
        ),
        spark_session,
    )

    condition_metric = MetricConfiguration(
        metric_name="column_values.unique.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(condition_metric,))

    # unique is a *window* function, so it does not use the aggregate_fn version
    # of unexpected_count.
    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 2

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_values",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [3, 3]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_value_counts",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, 2)]

    desired_metric = MetricConfiguration(
        metric_name="column_values.unique.unexpected_rows",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}
        },
        metric_dependencies={"unexpected_condition": condition_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == [(3, "bar"), (3, "baz")]
def build_continuous_partition_object(
    execution_engine, domain_kwargs, bins="auto", n_bins=10, allow_relative_error=False
):
    """Convenience method for building a partition object on continuous data from a dataset and column.

    Args:
        execution_engine (ExecutionEngine): the execution engine with which to compute the partition
        domain_kwargs (dict): The domain kwargs describing the domain for which to compute the partition
        bins (string): One of 'uniform' (for uniformly spaced bins), 'ntile' (for percentile-spaced bins),
            or 'auto' (for automatically spaced bins)
        n_bins (int): Ignored if bins is 'auto'.
        allow_relative_error: passed to get_column_quantiles. Set to False for only precise values, True to
            allow approximate values on systems with only binary choice (e.g. Redshift), and to a value between
            zero and one for systems that allow specification of relative error (e.g. SparkDFDataset).

    Returns:
        A new partition_object::

            {
                "bins": (list) The endpoints of the partial partition of reals,
                "weights": (list) The densities of the bins implied by the partition.
            }

        See :ref:`partition_object`.
    """
    partition_metric_configuration = MetricConfiguration(
        "column.partition",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "bins": bins,
            "n_bins": n_bins,
            "allow_relative_error": allow_relative_error,
        },
    )
    bins = execution_engine.resolve_metrics([partition_metric_configuration])[
        partition_metric_configuration.id
    ]
    if isinstance(bins, np.ndarray):
        bins = bins.tolist()
    else:
        bins = list(bins)

    hist_metric_configuration = MetricConfiguration(
        "column.histogram",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "bins": tuple(bins),
        },
    )
    nonnull_configuration = MetricConfiguration(
        "column_values.nonnull.count",
        metric_domain_kwargs=domain_kwargs,
        metric_value_kwargs={
            "bins": tuple(bins),
        },
    )
    metrics = execution_engine.resolve_metrics(
        (hist_metric_configuration, nonnull_configuration)
    )
    weights = list(
        np.array(metrics[hist_metric_configuration.id])
        / metrics[nonnull_configuration.id]
    )
    tail_weights = (1 - sum(weights)) / 2
    partition_object = {
        "bins": bins,
        "weights": weights,
        "tail_weights": [tail_weights, tail_weights],
    }
    return partition_object
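# A hedged usage sketch for build_continuous_partition_object; the engine variable
# and the domain kwargs (column and batch names) are hypothetical, and the engine
# is assumed to already have the batch loaded.
partition = build_continuous_partition_object(
    execution_engine=engine,
    domain_kwargs={"column": "fare_amount", "batch_id": "my_id"},
    bins="auto",
)
# bins are the partition endpoints; weights plus tail_weights account for all
# non-null values, so they sum to 1 (up to floating-point error).
assert abs(sum(partition["weights"]) + sum(partition["tail_weights"]) - 1.0) < 1e-9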
def test_z_score_under_threshold_spark(spark_session):
    engine = _build_spark_engine(pd.DataFrame({"a": [1, 2, 3, 3, None]}), spark_session)

    mean = MetricConfiguration(
        metric_name="column.mean.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": mean},
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"metric_partial_fn": stdev},
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={"column.standard_deviation": stdev, "column.mean": mean},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"column_values.z_score.map": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count.aggregate_fn",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"metric_partial_fn": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
def get_validation_dependencies(
    self,
    configuration: Optional[ExpectationConfiguration] = None,
    execution_engine: Optional[ExecutionEngine] = None,
    runtime_configuration: Optional[dict] = None,
):
    all_dependencies = super().get_validation_dependencies(
        configuration, execution_engine, runtime_configuration
    )
    dependencies = all_dependencies["metrics"]
    partition_object = configuration.kwargs["partition_object"]
    domain_kwargs = configuration.get_domain_kwargs()
    is_categorical = None
    bins = None

    if partition_object is None:
        if configuration.kwargs.get(
            "bucketize_data", self.default_kwarg_values["bucketize_data"]
        ):
            is_categorical = False
            partition_metric_configuration = MetricConfiguration(
                "column.partition",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={
                    "bins": "auto",
                    "allow_relative_error": False,
                },
            )
            #
            # Note: 20201116 - JPC - the execution engine doesn't provide the capability
            # to evaluate dependencies, so we use a validator.
            #
            validator = Validator(execution_engine=execution_engine)
            graph = ValidationGraph()
            validator.build_metric_dependency_graph(
                graph=graph,
                child_node=partition_metric_configuration,
                configuration=configuration,
                execution_engine=execution_engine,
            )
            bins = validator.resolve_validation_graph(graph, metrics=dict())[
                partition_metric_configuration.id
            ]
            hist_metric_configuration = MetricConfiguration(
                "column.histogram",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={
                    "bins": tuple(bins),
                },
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs=dict(),
            )
            #
            # NOTE 20201117 - JPC - Would prefer not to include partition_metric_configuration
            # here, since we have already evaluated it, and its result is in the kwargs for the
            # histogram. However, currently the dependencies' configurations are not passed to
            # the _validate method.
            #
            dependencies["column.partition"] = partition_metric_configuration
            dependencies["column.histogram"] = hist_metric_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration
        else:
            is_categorical = True
            counts_configuration = MetricConfiguration(
                "column.value_counts",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={
                    "sort": "value",
                },
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
            )
            dependencies["column.value_counts"] = counts_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration

    if is_categorical is True or is_valid_categorical_partition_object(partition_object):
        dependencies["column.value_counts"] = MetricConfiguration(
            "column.value_counts",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"sort": "value"},
        )
        dependencies["column_values.nonnull.count"] = MetricConfiguration(
            "column_values.nonnull.count", domain_kwargs
        )
    else:
        if bins is None:
            # bins is only None here when the user supplied a partition_object,
            # so take the bins from it rather than computing them.
            if not is_valid_partition_object(partition_object):
                raise ValueError("Invalid partition_object provided")
            bins = partition_object["bins"]
        hist_metric_configuration = MetricConfiguration(
            "column.histogram",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={
                "bins": bins,
            },
        )
        nonnull_configuration = MetricConfiguration(
            "column_values.nonnull.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs=dict(),
        )
        dependencies["column.histogram"] = hist_metric_configuration
        dependencies["column_values.nonnull.count"] = nonnull_configuration
        below_partition = MetricConfiguration(
"column_values.between.count", metric_domain_kwargs=domain_kwargs, metric_value_kwargs={"max_value": bins[0]}, ) above_partition = MetricConfiguration( "column_values.between.count", metric_domain_kwargs=domain_kwargs, metric_value_kwargs={"min_value": bins[-1], "strict_min": True}, ) dependencies["below_partition"] = below_partition dependencies["above_partition"] = above_partition return all_dependencies
from great_expectations.core.batch import BatchRequest
from great_expectations.data_context.data_context import DataContext
from great_expectations.validator.validation_graph import MetricConfiguration

context = DataContext()
suite = context.get_expectation_suite("yellow_trip_data_validations")

# February BatchRequest and Validator
batch_request_february = BatchRequest(
    datasource_name="taxi_pandas",
    data_connector_name="monthly",
    data_asset_name="my_reports",
    data_connector_query={"index": -2},
)
validator_february = context.get_validator(
    batch_request=batch_request_february, expectation_suite=suite
)
february_table_row_count = validator_february.get_metric(
    MetricConfiguration("table.row_count", metric_domain_kwargs={})
)

# March BatchRequest and Validator
batch_request_march = BatchRequest(
    datasource_name="taxi_pandas",
    data_connector_name="monthly",
    data_asset_name="my_reports",
    data_connector_query={"index": -1},
)
validator_march = context.get_validator(
    batch_request=batch_request_march, expectation_suite=suite
)

print(
    validator_march.expect_table_row_count_to_equal(value=february_table_row_count)
)
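# If this comparison is meant to gate a pipeline rather than just be printed, the
# returned validation result's success flag can be asserted instead (a sketch):
result = validator_march.expect_table_row_count_to_equal(
    value=february_table_row_count
)
assert result.success, "March row count differs from February"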