def test_resolve_config_using_acceptable_arguments(
    profiler_with_placeholder_args: RuleBasedProfiler,
) -> None:
    old_config: RuleBasedProfilerConfig = profiler_with_placeholder_args.config

    # Roundtrip through schema validation to add and/or restore any missing fields.
    old_deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        old_config.to_json_dict()
    )
    old_deserialized_config.pop("class_name")
    old_deserialized_config.pop("module_name")

    old_config = RuleBasedProfilerConfig(**old_deserialized_config)

    # A brand-new config is created, but existing attributes are unchanged.
    new_config: RuleBasedProfilerConfig = (
        RuleBasedProfilerConfig.resolve_config_using_acceptable_arguments(
            profiler=profiler_with_placeholder_args,
        )
    )

    # Roundtrip through schema validation to add and/or restore any missing fields.
    # new_deserialized_config: dict = ruleBasedProfilerConfigSchema.load(new_config.to_json_dict())
    new_deserialized_config: dict = new_config.to_json_dict()
    new_deserialized_config.pop("class_name")
    new_deserialized_config.pop("module_name")

    new_config = RuleBasedProfilerConfig(**new_deserialized_config)

    assert id(old_config) != id(new_config)
    assert all(
        old_config[attr] == new_config[attr] for attr in ("config_version", "name")
    )
def test_rule_based_profiler_from_commented_map():
    data = {
        "name": "my_RBP",
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "config_version": 1.0,
        "variables": {"foo": "bar"},
        "rules": {
            "rule_1": {
                "domain_builder": {"class_name": "DomainBuilder"},
                "parameter_builders": [
                    {"class_name": "ParameterBuilder", "name": "my_parameter"}
                ],
                "expectation_configuration_builders": [
                    {
                        "class_name": "ExpectationConfigurationBuilder",
                        "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    }
                ],
            },
        },
    }
    commented_map = CommentedMap(data)
    config = RuleBasedProfilerConfig.from_commented_map(commented_map)
    assert all(hasattr(config, k) for k in data)
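The CommentedMap in the test above is what ruamel.yaml's round-trip loader produces, so an equivalent profiler config can arrive directly from YAML. A minimal sketch of that path, assuming the usual import locations (the YAML text is reconstructed from the test's data dict and does not appear verbatim in the source):

from ruamel.yaml import YAML

from great_expectations.rule_based_profiler.config import RuleBasedProfilerConfig

yaml_config = """\
name: my_RBP
class_name: RuleBasedProfiler
module_name: great_expectations.rule_based_profiler
config_version: 1.0
variables:
  foo: bar
rules:
  rule_1:
    domain_builder:
      class_name: DomainBuilder
    parameter_builders:
      - class_name: ParameterBuilder
        name: my_parameter
    expectation_configuration_builders:
      - class_name: ExpectationConfigurationBuilder
        expectation_type: expect_column_pair_values_A_to_be_greater_than_B
"""

# The round-trip ("rt") loader returns a CommentedMap, matching the test fixture.
commented_map = YAML(typ="rt").load(yaml_config)
config = RuleBasedProfilerConfig.from_commented_map(commented_map)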
def test_add_profiler(
    empty_data_context: DataContext,
    profiler_config_with_placeholder_args: RuleBasedProfilerConfig,
):
    args = profiler_config_with_placeholder_args.to_json_dict()
    for attr in ("class_name", "module_name"):
        args.pop(attr, None)

    profiler = empty_data_context.add_profiler(**args)
    assert isinstance(profiler, RuleBasedProfiler)
def test_resolve_config_using_acceptable_arguments_with_runtime_overrides_with_batch_requests(
    profiler_with_placeholder_args: RuleBasedProfiler,
) -> None:
    runtime_override_rule: dict = {
        "domain_builder": {
            "class_name": "TableDomainBuilder",
            "module_name": "great_expectations.rule_based_profiler.domain_builder",
        },
        "parameter_builders": [
            {
                "class_name": "MetricMultiBatchParameterBuilder",
                "module_name": "great_expectations.rule_based_profiler.parameter_builder",
                "metric_name": "my_other_metric",
                "name": "my_additional_parameter",
            },
        ],
        "expectation_configuration_builders": [
            {
                "class_name": "DefaultExpectationConfigurationBuilder",
                "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                "expectation_type": "expect_column_values_to_be_between",
                "meta": {
                    "details": {
                        "note": "Here's another rule",
                    },
                },
            },
        ],
    }

    runtime_override_rule_name: str = "rule_with_batch_request"
    runtime_override_rules: Dict[str, dict] = {
        runtime_override_rule_name: runtime_override_rule
    }

    config: RuleBasedProfilerConfig = (
        RuleBasedProfilerConfig.resolve_config_using_acceptable_arguments(
            profiler=profiler_with_placeholder_args,
            rules=runtime_override_rules,
        )
    )

    domain_builder: dict = config.rules[runtime_override_rule_name]["domain_builder"]
    assert domain_builder == {
        "class_name": "TableDomainBuilder",
        "module_name": "great_expectations.rule_based_profiler.domain_builder.table_domain_builder",
    }
def serialization_self_check(self, pretty_print: bool) -> None:
    """
    Fulfills the abstract method defined by the parent class.
    See `ConfigurationStore` for more details.
    """
    test_profiler_name = f"profiler_{''.join([random.choice(list('0123456789ABCDEF')) for _ in range(20)])}"
    test_profiler_configuration = RuleBasedProfilerConfig(
        name=test_profiler_name,
        class_name="RuleBasedProfiler",
        module_name="great_expectations.rule_based_profiler",
        config_version=1.0,
        rules={},
    )

    test_key: Union[GeCloudIdentifier, ConfigurationIdentifier]
    if self.ge_cloud_mode:
        test_key = self.key_class(
            resource_type="contract", ge_cloud_id=str(uuid.uuid4())
        )
    else:
        test_key = self.key_class(configuration_key=test_profiler_name)

    if pretty_print:
        print(f"Attempting to add a new test key {test_key} to Profiler store...")
    self.set(key=test_key, value=test_profiler_configuration)
    if pretty_print:
        print(f"\tTest key {test_key} successfully added to Profiler store.\n")
        print(
            f"Attempting to retrieve the test value associated with key {test_key} from Profiler store..."
        )

    test_value = self.get(key=test_key)
    if pretty_print:
        print(
            f"\tTest value successfully retrieved from Profiler store: {test_value}\n"
        )
        print(f"Cleaning up test key {test_key} and value from Profiler store...")

    test_value = self.remove_key(key=test_key)
    if pretty_print:
        print(
            f"\tTest key and value successfully removed from Profiler store: {test_value}\n"
        )
def test_add_profiler_with_invalid_config_raises_error(
    empty_data_context: DataContext,
    profiler_config_with_placeholder_args: RuleBasedProfilerConfig,
):
    args = profiler_config_with_placeholder_args.to_json_dict()
    for attr in ("class_name", "module_name"):
        args.pop(attr, None)

    # Setting invalid configuration to check that it is caught by DataContext wrapper method
    args["config_version"] = -1

    with pytest.raises(ValidationError) as e:
        empty_data_context.add_profiler(**args)

    assert "config_version" in str(e.value)
def test_add_profiler_with_batch_request_containing_batch_data_raises_error(
    mock_data_context: mock.MagicMock,
):
    profiler_config = RuleBasedProfilerConfig(
        name="my_profiler_config",
        class_name="RuleBasedProfiler",
        module_name="great_expectations.rule_based_profiler",
        config_version=1.0,
        rules={
            "rule_1": {
                "domain_builder": {
                    "class_name": "TableDomainBuilder",
                    "batch_request": {
                        "runtime_parameters": {
                            "batch_data": pd.DataFrame()  # Cannot be serialized in store
                        }
                    },
                },
                "parameter_builders": [
                    {
                        "class_name": "MetricMultiBatchParameterBuilder",
                        "name": "my_parameter",
                        "metric_name": "my_metric",
                    },
                ],
                "expectation_configuration_builders": [
                    {
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    },
                ],
            }
        },
    )

    with pytest.raises(InvalidConfigError) as e:
        RuleBasedProfiler.add_profiler(
            profiler_config,
            data_context=mock_data_context,
            profiler_store=mock_data_context.profiler_store,
        )

    assert "batch_data found in batch_request" in str(e.value)
def get_profiler_run_usage_statistics(
    profiler: "RuleBasedProfiler",  # noqa: F821
    variables: Optional[dict] = None,
    rules: Optional[dict] = None,
    *args,
    **kwargs,
) -> dict:
    usage_statistics_handler: Optional[
        UsageStatisticsHandler
    ] = profiler._usage_statistics_handler

    data_context_id: Optional[str] = None
    if usage_statistics_handler:
        data_context_id = usage_statistics_handler._data_context_id

    anonymizer: Optional[Anonymizer] = _anonymizers.get(data_context_id, None)
    if anonymizer is None:
        anonymizer = Anonymizer(data_context_id)
        _anonymizers[data_context_id] = anonymizer

    payload: dict = {}

    if usage_statistics_handler:
        # noinspection PyBroadException
        try:
            anonymizer = usage_statistics_handler.anonymizer

            resolved_runtime_config: "RuleBasedProfilerConfig" = (  # noqa: F821
                RuleBasedProfilerConfig.resolve_config_using_acceptable_arguments(
                    profiler=profiler,
                    variables=variables,
                    rules=rules,
                )
            )

            payload = anonymizer.anonymize(obj=resolved_runtime_config)
        except Exception as e:
            logger.debug(
                f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {e} type: {type(e)}, "
                "get_profiler_run_usage_statistics: Unable to create anonymized_profiler_run payload field"
            )

    return payload
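A payload function like this is typically wired up through the usage-statistics decorator rather than called directly. A hedged sketch of that wiring (the decorator exists in great_expectations.core.usage_statistics.usage_statistics; the event name shown is illustrative, not taken from this excerpt):

from great_expectations.core.usage_statistics.usage_statistics import (
    usage_statistics_enabled_method,
)

class RuleBasedProfiler:
    @usage_statistics_enabled_method(
        event_name="profiler.run",  # assumed event name, for illustration only
        args_payload_fn=get_profiler_run_usage_statistics,
    )
    def run(self, variables=None, rules=None):
        ...  # the decorator emits the anonymized payload built above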
class ExpectColumnStdevToBeBetween(ColumnExpectation):
    """Expect the column standard deviation to be between a minimum value and a maximum value.
    Uses sample standard deviation (normalized by N-1).

    expect_column_stdev_to_be_between is a \
    :func:`column_aggregate_expectation <great_expectations.execution_engine.MetaExecutionEngine.column_aggregate_expectation>`.

    Args:
        column (str): \
            The column name.
        min_value (float or None): \
            The minimum value for the column standard deviation.
        max_value (float or None): \
            The maximum value for the column standard deviation.
        strict_min (boolean): \
            If True, the column standard deviation must be strictly larger than min_value. Default: False.
        strict_max (boolean): \
            If True, the column standard deviation must be strictly smaller than max_value. Default: False.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        These fields in the result object are customized for this expectation:
        ::

            {
                "observed_value": (float) The true standard deviation for the column
            }

        * min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
        * If min_value is None, then max_value is treated as an upper bound.
        * If max_value is None, then min_value is treated as a lower bound.

    See Also:
        :func:`expect_column_mean_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_mean_to_be_between>`

        :func:`expect_column_median_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_median_to_be_between>`
    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column aggregate expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
    }

    metric_dependencies = ("column.standard_deviation",)
    success_keys = (
        "min_value",
        "strict_min",
        "max_value",
        "strict_max",
        "auto",
        "profiler_config",
    )

    stdev_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="stdev_range_estimator",
        metric_name="column.standard_deviation",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        stdev_range_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        # Convention: use "expectation_type" as the profiler name.
        name="expect_column_stdev_to_be_between",
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_stdev_to_be_between_rule": {
                "variables": {
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 2,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_stdev_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "min_value": f"{PARAMETER_KEY}{stdev_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{stdev_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{stdev_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "min_value": None,
        "strict_min": False,
        "max_value": None,
        "strict_max": False,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully.
        """
        super().validate_configuration(configuration)
        self.validate_metric_value_between_configuration(configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "standard deviation may have any numerical value."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params["max_value"] is not None:
                template_str = f"standard deviation must be {at_least_str} $min_value and {at_most_str} $max_value."
            elif params["min_value"] is None:
                template_str = f"standard deviation must be {at_most_str} $max_value."
            elif params["max_value"] is None:
                template_str = f"standard deviation must be {at_least_str} $min_value."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "standard deviation may have any numerical value."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params["max_value"] is not None:
                template_str = f"standard deviation must be {at_least_str} $min_value and {at_most_str} $max_value."
            elif params["min_value"] is None:
                template_str = f"standard deviation must be {at_most_str} $max_value."
            elif params["max_value"] is None:
                template_str = f"standard deviation must be {at_least_str} $min_value."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        return self._validate_metric_value_between(
            metric_name="column.standard_deviation",
            configuration=configuration,
            metrics=metrics,
            runtime_configuration=runtime_configuration,
            execution_engine=execution_engine,
        )
class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
    """Expect column entries to be between a minimum value and a maximum value (inclusive).

    expect_column_values_to_be_between is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        min_value (comparable type or None): The minimum value for a column entry.
        max_value (comparable type or None): The maximum value for a column entry.
        strict_min (boolean): If True, values must be strictly larger than min_value. Default: False.
        strict_max (boolean): If True, values must be strictly smaller than max_value. Default: False.

    Keyword Args:
        allow_cross_type_comparisons (boolean or None): If True, allow comparisons between types (e.g. integer and \
            string). Otherwise, attempting such comparisons will raise an exception.
        parse_strings_as_datetimes (boolean or None): If True, parse min_value, max_value, and all non-null column \
            values to datetimes before making comparisons.
        output_strftime_format (str or None): \
            A valid strftime format for datetime output. Only used if parse_strings_as_datetimes=True.
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        * min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
        * If min_value is None, then max_value is treated as an upper bound, and there is no minimum value checked.
        * If max_value is None, then min_value is treated as a lower bound, and there is no maximum value checked.

    See Also:
        :func:`expect_column_value_lengths_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_value_lengths_to_be_between>`
    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    map_metric = "column_values.between"
    success_keys = (
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
        "allow_cross_type_comparisons",
        "mostly",
        "parse_strings_as_datetimes",
        "auto",
        "profiler_config",
    )

    column_min_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_min_range_estimator",
        metric_name="column.min",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
    )
    column_max_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_max_range_estimator",
        metric_name="column.max",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        column_min_range_estimator_parameter_builder_config,
        column_max_range_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        # Convention: use "expectation_type" as the profiler name.
        name="expect_column_values_to_be_between",
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_values_to_be_between_rule": {
                "variables": {
                    "mostly": 1.0,
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": None,
                        "upper_bound": None,
                    },
                    "round_decimals": 1,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_values_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "min_value": f"{PARAMETER_KEY}{column_min_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{column_max_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": {
                                "column_min_range_estimator": f"{PARAMETER_KEY}{column_min_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                                "column_max_range_estimator": f"{PARAMETER_KEY}{column_max_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                            },
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,  # we expect this to be explicitly set whenever a row_condition is passed
        "mostly": 1,
        "min_value": None,
        "max_value": None,
        "strict_min": False,
        "strict_max": False,
        # tolerance=1e-9,
        "parse_strings_as_datetimes": False,
        "allow_cross_type_comparisons": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "meta": None,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully.
        """
        # Setting up a configuration
        super().validate_configuration(configuration)

        min_val = None
        max_val = None
        if "min_value" in configuration.kwargs:
            min_val = configuration.kwargs["min_value"]
        if "max_value" in configuration.kwargs:
            max_val = configuration.kwargs["max_value"]
        assert (
            min_val is not None or max_val is not None
        ), "min_value and max_value cannot both be None"

        self.validate_metric_value_between_configuration(configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "mostly_pct": {
                "schema": {"type": "string"},
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        template_str = ""
        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str += "may have any numerical value."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)

            mostly_str = ""
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                mostly_str = ", at least $mostly_pct % of the time"

            if params["min_value"] is not None and params["max_value"] is not None:
                template_str += f"values must be {at_least_str} $min_value and {at_most_str} $max_value{mostly_str}."
            elif params["min_value"] is None:
                template_str += f"values must be {at_most_str} $max_value{mostly_str}."
            elif params["max_value"] is None:
                template_str += f"values must be {at_least_str} $min_value{mostly_str}."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    # NOTE: This method is a pretty good example of good usage of `params`.
    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        template_str = ""
        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str += "may have any numerical value."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)

            mostly_str = ""
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params["mostly_pct"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                mostly_str = ", at least $mostly_pct % of the time"

            if params["min_value"] is not None and params["max_value"] is not None:
                template_str += f"values must be {at_least_str} $min_value and {at_most_str} $max_value{mostly_str}."
            elif params["min_value"] is None:
                template_str += f"values must be {at_most_str} $max_value{mostly_str}."
            elif params["max_value"] is None:
                template_str += f"values must be {at_least_str} $min_value{mostly_str}."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]
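A hedged usage sketch (validator, column name, and values are illustrative): `mostly` relaxes this map expectation so a fraction of nonconforming rows can still pass, and the strict flags flip the inclusive comparisons documented above to strict ones.

# Pass if at least 95% of non-null values fall within [0, 120].
result = validator.expect_column_values_to_be_between(
    column="age",
    min_value=0,
    max_value=120,
    mostly=0.95,
)

# With strict_min=True, a value exactly equal to min_value counts as a failure.
result = validator.expect_column_values_to_be_between(
    column="age", min_value=0, max_value=120, strict_min=True
)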
class ExpectTableColumnsToMatchSet(TableExpectation):
    """Expect the columns to match an *unordered* set.

    expect_table_columns_to_match_set is a :func:`expectation \
    <great_expectations.validator.validator.Validator.expectation>`, not a
    ``column_map_expectation`` or ``column_aggregate_expectation``.

    Args:
        column_set (list of str): \
            The column names, in any order.
        exact_match (boolean): \
            Whether the list of columns must exactly match the observed columns.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
    """

    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "table expectation"],
        "contributors": [
            "@great_expectations",
        ],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    metric_dependencies = ("table.columns",)
    success_keys = (
        "column_set",
        "exact_match",
        "auto",
        "profiler_config",
    )

    mean_table_columns_set_match_multi_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MeanTableColumnsSetMatchMultiBatchParameterBuilder",
        name="column_names_set_estimator",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=None,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        mean_table_columns_set_match_multi_batch_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        # Convention: use "expectation_type" as the profiler name.
        name="expect_table_columns_to_match_set",
        config_version=1.0,
        variables={},
        rules={
            "expect_table_columns_to_match_set": {
                "variables": {
                    "exact_match": None,
                    "success_ratio": 1.0,
                },
                "domain_builder": {
                    "class_name": "TableDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_table_columns_to_match_set",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "condition": f"{PARAMETER_KEY}{mean_table_columns_set_match_multi_batch_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}success_ratio >= {VARIABLES_KEY}success_ratio",
                        "column_set": f"{PARAMETER_KEY}{mean_table_columns_set_match_multi_batch_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}",
                        "exact_match": f"{VARIABLES_KEY}exact_match",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{mean_table_columns_set_match_multi_batch_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "column_set": None,
        "exact_match": True,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column_set",
        "exact_match",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully.
        """
        # Setting up a configuration
        super().validate_configuration(configuration)

        # Ensuring that a proper value has been provided
        try:
            assert "column_set" in configuration.kwargs, "column_set is required"
            assert (
                isinstance(configuration.kwargs["column_set"], (list, set, dict))
                or configuration.kwargs["column_set"] is None
            ), "column_set must be a list, set, or None"
            if isinstance(configuration.kwargs["column_set"], dict):
                assert (
                    "$PARAMETER" in configuration.kwargs["column_set"]
                ), 'Evaluation Parameter dict for column_set kwarg must have "$PARAMETER" key.'
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs, ["column_set", "exact_match"]
        )

        if params["column_set"] is None:
            template_str = "Must specify a set or list of columns."
        else:
            # standardize order of the set for output
            params["column_list"] = list(params["column_set"])

            column_list_template_str = ", ".join(
                [f"$column_list_{idx}" for idx in range(len(params["column_list"]))]
            )

            exact_match_str = "exactly" if params["exact_match"] is True else "at least"

            template_str = f"Must have {exact_match_str} these columns (in any order): {column_list_template_str}"

            for idx in range(len(params["column_list"])):
                params[f"column_list_{str(idx)}"] = params["column_list"][idx]

        params_with_json_schema = {
            "column_list": {
                "schema": {"type": "array"},
                "value": params.get("column_list"),
            },
            "exact_match": {
                "schema": {"type": "boolean"},
                "value": params.get("exact_match"),
            },
        }
        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs, ["column_set", "exact_match"]
        )

        if params["column_set"] is None:
            template_str = "Must specify a set or list of columns."
        else:
            # standardize order of the set for output
            params["column_list"] = list(params["column_set"])

            column_list_template_str = ", ".join(
                [f"$column_list_{idx}" for idx in range(len(params["column_list"]))]
            )

            exact_match_str = "exactly" if params["exact_match"] is True else "at least"

            template_str = f"Must have {exact_match_str} these columns (in any order): {column_list_template_str}"

            for idx in range(len(params["column_list"])):
                params[f"column_list_{str(idx)}"] = params["column_list"][idx]

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        # Obtaining columns and ordered list for sake of comparison
        expected_column_set = self.get_success_kwargs(configuration).get("column_set")
        expected_column_set = (
            set(expected_column_set) if expected_column_set is not None else set()
        )
        actual_column_list = metrics.get("table.columns")
        actual_column_set = set(actual_column_list)
        exact_match = self.get_success_kwargs(configuration).get("exact_match")

        if (
            (expected_column_set is None) and (exact_match is not True)
        ) or actual_column_set == expected_column_set:
            return {"success": True, "result": {"observed_value": actual_column_list}}
        else:
            # Convert to lists and sort to lock order for testing and output rendering
            # unexpected_list contains items from the dataset columns that are not in expected_column_set
            unexpected_list = sorted(list(actual_column_set - expected_column_set))
            # missing_list contains items from expected_column_set that are not in the dataset columns
            missing_list = sorted(list(expected_column_set - actual_column_set))
            # observed_value contains items that are in the dataset columns
            observed_value = sorted(actual_column_list)

            mismatched = {}
            if len(unexpected_list) > 0:
                mismatched["unexpected"] = unexpected_list
            if len(missing_list) > 0:
                mismatched["missing"] = missing_list

            result = {
                "observed_value": observed_value,
                "details": {"mismatched": mismatched},
            }

            return_success = {
                "success": True,
                "result": result,
            }
            return_failed = {
                "success": False,
                "result": result,
            }

            if exact_match:
                return return_failed
            else:
                # Failed if there are items in the missing list (but OK to have unexpected_list)
                if len(missing_list) > 0:
                    return return_failed
                # Passed if there are no items in the missing list
                else:
                    return return_success
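The _validate branch above gives exact_match=False a one-sided meaning: extra observed columns are tolerated, but expected columns missing from the table still fail. A hedged sketch (assumes a batch whose observed columns are exactly {"a", "b", "c"}; all names illustrative):

validator.expect_table_columns_to_match_set(column_set=["a", "b", "c"])
# success: the observed set matches exactly

validator.expect_table_columns_to_match_set(column_set=["a", "b"], exact_match=False)
# success: "c" lands in unexpected_list, which is allowed without exact_match

validator.expect_table_columns_to_match_set(column_set=["a", "b", "d"], exact_match=False)
# failure: "d" lands in missing_list, which fails even without exact_match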
class ExpectTableRowCountToBeBetween(TableExpectation):
    """Expect the number of rows to be between two values.

    expect_table_row_count_to_be_between is a :func:`expectation \
    <great_expectations.validator.validator.Validator.expectation>`, not a
    ``column_map_expectation`` or ``column_aggregate_expectation``.

    Keyword Args:
        min_value (int or None): \
            The minimum number of rows, inclusive.
        max_value (int or None): \
            The maximum number of rows, inclusive.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound, and the number of acceptable rows has \
          no minimum.
        * If max_value is None, then min_value is treated as a lower bound, and the number of acceptable rows has \
          no maximum.

    See Also:
        expect_table_row_count_to_equal
    """

    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "table expectation"],
        "contributors": [
            "@great_expectations",
        ],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    metric_dependencies = ("table.row_count",)
    success_keys = (
        "min_value",
        "max_value",
        "auto",
        "profiler_config",
    )

    table_row_count_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="table_row_count_range_estimator",
        metric_name="table.row_count",
        metric_domain_kwargs=None,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples",
        bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        table_row_count_range_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        # Convention: use "expectation_type" as the profiler name.
        name="expect_table_row_count_to_be_between",
        config_version=1.0,
        variables={},
        rules={
            "default_expect_table_row_count_to_be_between_rule": {
                "variables": {
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "num_bootstrap_samples": 9999,
                    "bootstrap_random_seed": None,
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 0,
                },
                "domain_builder": {
                    "class_name": "TableDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_table_row_count_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "min_value": f"{PARAMETER_KEY}{table_row_count_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{table_row_count_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{table_row_count_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    }
                ],
            },
        },
    )

    default_kwarg_values = {
        "min_value": None,
        "max_value": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "meta": None,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "min_value",
        "max_value",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (Optional[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation

        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully.
        """
        # Setting up a configuration
        super().validate_configuration(configuration)
        self.validate_metric_value_between_configuration(configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        # format params
        params_with_json_schema = {
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        if params["min_value"] is None and params["max_value"] is None:
            template_str = "May have any number of rows."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params["max_value"] is not None:
                template_str = f"Must have {at_least_str} $min_value and {at_most_str} $max_value rows."
            elif params["min_value"] is None:
                template_str = f"Must have {at_most_str} $max_value rows."
            elif params["max_value"] is None:
                template_str = f"Must have {at_least_str} $min_value rows."

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = (
                conditional_template_str
                + ", then "
                + template_str[0].lower()
                + template_str[1:]
            )
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        if params["min_value"] is None and params["max_value"] is None:
            template_str = "May have any number of rows."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params["max_value"] is not None:
                template_str = f"Must have {at_least_str} $min_value and {at_most_str} $max_value rows."
            elif params["min_value"] is None:
                template_str = f"Must have {at_most_str} $max_value rows."
            elif params["max_value"] is None:
                template_str = f"Must have {at_least_str} $min_value rows."

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = (
                conditional_template_str
                + ", then "
                + template_str[0].lower()
                + template_str[1:]
            )
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        return self._validate_metric_value_between(
            metric_name="table.row_count",
            configuration=configuration,
            metrics=metrics,
            runtime_configuration=runtime_configuration,
            execution_engine=execution_engine,
        )
class ExpectColumnValueLengthsToBeBetween(ColumnMapExpectation):
    """Expect column entries to be strings with length between a minimum value and a maximum value (inclusive).

    This expectation only works for string-type values. Invoking it on ints or floats will raise a TypeError.

    expect_column_value_lengths_to_be_between is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        min_value (int or None): \
            The minimum value for a column entry length.
        max_value (int or None): \
            The maximum value for a column entry length.

    Keyword Args:
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound, and the number of acceptable rows has \
          no minimum.
        * If max_value is None, then min_value is treated as a lower bound, and the number of acceptable rows has \
          no maximum.

    See Also:
        :func:`expect_column_value_lengths_to_equal \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_value_lengths_to_equal>`
    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    map_metric = "column_values.value_length.between"
    success_keys = (
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
        "mostly",
        "auto",
        "profiler_config",
    )

    column_min_length_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_min_length_range_estimator",
        metric_name="column_values.length.min",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    column_max_length_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_max_length_range_estimator",
        metric_name="column_values.length.max",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        column_min_length_range_estimator_parameter_builder_config,
        column_max_length_range_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        # Convention: use "expectation_type" as the profiler name.
        name="expect_column_value_lengths_to_be_between",
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_value_lengths_to_be_between_rule": {
                "variables": {
                    "mostly": 1.0,
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 0,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_value_lengths_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "min_value": f"{PARAMETER_KEY}{column_min_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{column_max_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": {
                                "column_min_range_estimator": f"{PARAMETER_KEY}{column_min_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                                "column_max_range_estimator": f"{PARAMETER_KEY}{column_max_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                            },
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,
        "min_value": None,
        "max_value": None,
        "strict_min": None,
        "strict_max": None,
        "mostly": 1,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "min_value",
        "max_value",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        super().validate_configuration(configuration)

        if configuration is None:
            configuration = self.configuration

        try:
            assert (
                configuration.kwargs.get("min_value") is not None
                or configuration.kwargs.get("max_value") is not None
            ), "min_value and max_value cannot both be None"
            if configuration.kwargs.get("min_value"):
                assert (
                    isinstance(configuration.kwargs["min_value"], dict)
                    or float(configuration.kwargs.get("min_value")).is_integer()
                ), "min_value and max_value must be integers"
                if isinstance(configuration.kwargs.get("min_value"), dict):
                    assert "$PARAMETER" in configuration.kwargs.get(
                        "min_value"
                    ), 'Evaluation Parameter dict for min_value kwarg must have "$PARAMETER" key.'
            if configuration.kwargs.get("max_value"):
                assert (
                    isinstance(configuration.kwargs["max_value"], dict)
                    or float(configuration.kwargs.get("max_value")).is_integer()
                ), "min_value and max_value must be integers"
                if isinstance(configuration.kwargs.get("max_value"), dict):
                    assert "$PARAMETER" in configuration.kwargs.get(
                        "max_value"
                    ), 'Evaluation Parameter dict for max_value kwarg must have "$PARAMETER" key.'
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "mostly_pct": {
                "schema": {"type": "string"},
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "values may have any length."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)

            if params["mostly"] is not None and params["mostly"] < 1.0:
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                if params["min_value"] is not None and params["max_value"] is not None:
                    template_str = f"values must be {at_least_str} $min_value and {at_most_str} $max_value characters long, at least $mostly_pct % of the time."
                elif params["min_value"] is None:
                    template_str = f"values must be {at_most_str} $max_value characters long, at least $mostly_pct % of the time."
                elif params["max_value"] is None:
                    template_str = f"values must be {at_least_str} $min_value characters long, at least $mostly_pct % of the time."
            else:
                if params["min_value"] is not None and params["max_value"] is not None:
                    template_str = f"values must always be {at_least_str} $min_value and {at_most_str} $max_value characters long."
                elif params["min_value"] is None:
                    template_str = f"values must always be {at_most_str} $max_value characters long."
                elif params["max_value"] is None:
                    template_str = f"values must always be {at_least_str} $min_value characters long."
if include_column_name: template_str = f"$column {template_str}" if params["row_condition"] is not None: ( conditional_template_str, conditional_params, ) = parse_row_condition_string_pandas_engine( params["row_condition"], with_schema=True) template_str = f"{conditional_template_str}, then {template_str}" params_with_json_schema.update(conditional_params) return (template_str, params_with_json_schema, styling) @classmethod @renderer(renderer_type="renderer.prescriptive") @render_evaluation_parameter_string def _prescriptive_renderer( cls, configuration: ExpectationConfiguration = None, result: ExpectationValidationResult = None, language: str = None, runtime_configuration: dict = None, **kwargs, ) -> List[Union[dict, str, RenderedStringTemplateContent, RenderedTableContent, RenderedBulletListContent, RenderedGraphContent, Any, ]]: runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get( "include_column_name", True) include_column_name = (include_column_name if include_column_name is not None else True) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration.kwargs, [ "column", "min_value", "max_value", "mostly", "row_condition", "condition_parser", "strict_min", "strict_max", ], ) if (params["min_value"] is None) and (params["max_value"] is None): template_str = "values may have any length." else: at_least_str, at_most_str = handle_strict_min_max(params) if params["mostly"] is not None and params["mostly"] < 1.0: params["mostly_pct"] = num_to_str(params["mostly"] * 100, precision=15, no_scientific=True) # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".") if params["min_value"] is not None and params[ "max_value"] is not None: template_str = f"values must be {at_least_str} $min_value and {at_most_str} $max_value characters long, at least $mostly_pct % of the time." elif params["min_value"] is None: template_str = f"values must be {at_most_str} $max_value characters long, at least $mostly_pct % of the time." elif params["max_value"] is None: template_str = f"values must be {at_least_str} $min_value characters long, at least $mostly_pct % of the time." else: if params["min_value"] is not None and params[ "max_value"] is not None: template_str = f"values must always be {at_least_str} $min_value and {at_most_str} $max_value characters long." elif params["min_value"] is None: template_str = f"values must always be {at_most_str} $max_value characters long." elif params["max_value"] is None: template_str = f"values must always be {at_least_str} $min_value characters long." if include_column_name: template_str = f"$column {template_str}" if params["row_condition"] is not None: ( conditional_template_str, conditional_params, ) = parse_row_condition_string_pandas_engine( params["row_condition"]) template_str = f"{conditional_template_str}, then {template_str}" params.update(conditional_params) return [ RenderedStringTemplateContent( **{ "content_block_type": "string_template", "string_template": { "template": template_str, "params": params, "styling": styling, }, }) ]
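# A minimal usage sketch for ExpectColumnValueLengthsToBeBetween, in the style of
# the test functions elsewhere in this module. The "pandas_validator" fixture name
# and the sample column are assumptions for illustration; any Validator wrapping a
# string-typed column should behave the same way.
def example_expect_column_value_lengths_to_be_between(pandas_validator) -> None:
    # Both bounds are inclusive, and with mostly=0.75 up to a quarter of the
    # values may fall outside [2, 5] while the expectation still succeeds.
    result = pandas_validator.expect_column_value_lengths_to_be_between(
        column="my_strings",
        min_value=2,
        max_value=5,
        mostly=0.75,
    )
    assert result.success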
class ExpectColumnQuantileValuesToBeBetween(ColumnExpectation):
    # noinspection PyUnresolvedReferences
    """Expect specific provided column quantiles to be between provided minimum and maximum values.

    ``quantile_ranges`` must be a dictionary with two keys:

        * ``quantiles``: (list of float) increasing ordered list of desired quantile values

        * ``value_ranges``: (list of lists): Each element in this list consists of a list with two values, a lower \
          and upper bound (inclusive) for the corresponding quantile. These values must be [min, max] ordered.

    For each provided range:

        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound only
        * If max_value is None, then min_value is treated as a lower bound only

    The length of the quantiles list and the value_ranges list must be equal.

    For example:
    ::

        # my_df.my_col = [1,2,2,3,3,3,4]
        >>> my_df.expect_column_quantile_values_to_be_between(
            "my_col",
            {
                "quantiles": [0., 0.333, 0.6667, 1.],
                "value_ranges": [[0,1], [2,3], [3,4], [4,5]]
            }
        )
        {
            "success": True,
            "result": {
                "observed_value": {
                    "quantiles": [0., 0.333, 0.6667, 1.],
                    "values": [1, 2, 3, 4],
                },
                "element_count": 7,
                "missing_count": 0,
                "missing_percent": 0.0,
                "details": {
                    "success_details": [true, true, true, true]
                }
            }
        }

    `expect_column_quantile_values_to_be_between` can be computationally intensive for large datasets.

    expect_column_quantile_values_to_be_between is a \
    :func:`column_aggregate_expectation <great_expectations.execution_engine.MetaExecutionEngine.column_aggregate_expectation>`.

    Args:
        column (str): \
            The column name.
        quantile_ranges (dictionary): \
            Quantiles and associated value ranges for the column. See above for details.
        allow_relative_error (boolean or string): \
            Whether to allow relative error in quantile communications on backends that support or require it.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
Notes: These fields in the result object are customized for this expectation: :: details.success_details See Also: :func:`expect_column_min_to_be_between \ <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_min_to_be_between>` :func:`expect_column_max_to_be_between \ <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_max_to_be_between>` :func:`expect_column_median_to_be_between \ <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_median_to_be_between>` """ # This dictionary contains metadata for display in the public gallery library_metadata = { "maturity": "production", "tags": ["core expectation", "column aggregate expectation"], "contributors": ["@great_expectations"], "requirements": [], } metric_dependencies = ("column.quantile_values", ) success_keys = ( "quantile_ranges", "allow_relative_error", "auto", "profiler_config", ) quantile_value_ranges_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig( module_name="great_expectations.rule_based_profiler.parameter_builder", class_name="NumericMetricRangeMultiBatchParameterBuilder", name="quantile_value_ranges_estimator", metric_name="column.quantile_values", metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME, metric_value_kwargs={ "quantiles": f"{VARIABLES_KEY}quantiles", "allow_relative_error": f"{VARIABLES_KEY}allow_relative_error", }, enforce_numeric_metric=True, replace_nan_with_zero=True, reduce_scalar_metric=True, false_positive_rate=f"{VARIABLES_KEY}false_positive_rate", quantile_statistic_interpolation_method= f"{VARIABLES_KEY}quantile_statistic_interpolation_method", estimator=f"{VARIABLES_KEY}estimator", num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples", bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed", truncate_values=f"{VARIABLES_KEY}truncate_values", round_decimals=f"{VARIABLES_KEY}round_decimals", evaluation_parameter_builder_configs=None, json_serialize=True, ) validation_parameter_builder_configs: List[ParameterBuilderConfig] = [ quantile_value_ranges_estimator_parameter_builder_config, ] default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig( name= "expect_column_quantile_values_to_be_between", # Convention: use "expectation_type" as profiler name. 
config_version=1.0, variables={}, rules={ "default_expect_column_quantile_values_to_be_between_rule": { "variables": { "quantiles": [ 0.25, 0.5, 0.75, ], "allow_relative_error": "linear", "false_positive_rate": 0.05, "quantile_statistic_interpolation_method": "auto", "estimator": "bootstrap", "num_bootstrap_samples": 9999, "bootstrap_random_seed": None, "truncate_values": { "lower_bound": None, "upper_bound": None, }, "round_decimals": 1, }, "domain_builder": { "class_name": "ColumnDomainBuilder", "module_name": "great_expectations.rule_based_profiler.domain_builder", }, "expectation_configuration_builders": [{ "expectation_type": "expect_column_quantile_values_to_be_between", "class_name": "DefaultExpectationConfigurationBuilder", "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder", "validation_parameter_builder_configs": validation_parameter_builder_configs, "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column", "quantile_ranges": { "quantiles": f"{VARIABLES_KEY}quantiles", "value_ranges": f"{PARAMETER_KEY}{quantile_value_ranges_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}", }, "allow_relative_error": f"{VARIABLES_KEY}allow_relative_error", "meta": { "profiler_details": f"{PARAMETER_KEY}{quantile_value_ranges_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}", }, }], }, }, ) default_kwarg_values = { "row_condition": None, "condition_parser": None, "quantile_ranges": None, "result_format": "BASIC", "allow_relative_error": False, "include_config": True, "catch_exceptions": False, "meta": None, "auto": False, "profiler_config": default_profiler_config, } args_keys = ( "column", "quantile_ranges", "allow_relative_error", ) def validate_configuration( self, configuration: Optional[ExpectationConfiguration]) -> None: super().validate_configuration(configuration) try: assert ( "quantile_ranges" in configuration.kwargs), "quantile_ranges must be provided" assert isinstance(configuration.kwargs["quantile_ranges"], dict), "quantile_ranges should be a dictionary" assert all([ True if None in x or x == sorted(x) else False for x in configuration.kwargs["quantile_ranges"]["value_ranges"] ]), "quantile_ranges must consist of ordered pairs" except AssertionError as e: raise InvalidExpectationConfigurationError(str(e)) # Ensuring actual quantiles and their value ranges match up quantile_ranges = configuration.kwargs["quantile_ranges"] quantiles = quantile_ranges["quantiles"] quantile_value_ranges = quantile_ranges["value_ranges"] if len(quantiles) != len(quantile_value_ranges): raise ValueError( "quantile_values and quantiles must have the same number of elements" ) @classmethod def _atomic_prescriptive_template( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get( "include_column_name", True) include_column_name = (include_column_name if include_column_name is not None else True) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration["kwargs"], ["column", "quantile_ranges", "row_condition", "condition_parser"], ) header_params_with_json_schema = { "column": { "schema": { "type": "string" }, "value": params.get("column") }, "mostly": { "schema": 
{ "type": "number" }, "value": params.get("mostly") }, "row_condition": { "schema": { "type": "string" }, "value": params.get("row_condition"), }, "condition_parser": { "schema": { "type": "string" }, "value": params.get("condition_parser"), }, } header_template_str = "quantiles must be within the following value ranges." if include_column_name: header_template_str = f"$column {header_template_str}" if params["row_condition"] is not None: ( conditional_template_str, conditional_params, ) = parse_row_condition_string_pandas_engine( params["row_condition"], with_schema=True) header_template_str = (conditional_template_str + ", then " + header_template_str[0].lower() + header_template_str[1:]) header_params_with_json_schema.update(conditional_params) quantile_ranges = (params.get("quantile_ranges") if params.get("quantile_ranges") else {}) quantiles = (quantile_ranges.get("quantiles") if quantile_ranges.get("quantiles") else []) value_ranges = (quantile_ranges.get("value_ranges") if quantile_ranges.get("value_ranges") else []) table_header_row = [ { "schema": { "type": "string" }, "value": "Quantile" }, { "schema": { "type": "string" }, "value": "Min Value" }, { "schema": { "type": "string" }, "value": "Max Value" }, ] table_rows = [] quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"} for quantile, value_range in zip(quantiles, value_ranges): quantile_string = quantile_strings.get(quantile, f"{quantile:3.2f}") table_rows.append([ { "value": quantile_string, "schema": { "type": "string" }, }, { "value": value_range[0] if value_range[0] is not None else "Any", "schema": { "type": "number" if value_range[0] is not None else "string" }, }, { "value": value_range[1] if value_range[1] is not None else "Any", "schema": { "type": "number" if value_range[1] is not None else "string" }, }, ]) return ( header_template_str, header_params_with_json_schema, styling, table_header_row, table_rows, ) @classmethod @renderer(renderer_type="atomic.prescriptive.summary") @render_evaluation_parameter_string def _prescriptive_summary( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): """ Rendering function that is utilized by GE Cloud Front-end """ ( header_template_str, header_params_with_json_schema, _, table_header_row, table_rows, ) = cls._atomic_prescriptive_template(configuration, result, language, runtime_configuration, **kwargs) value_obj = renderedAtomicValueSchema.load({ "header": { "schema": { "type": "StringValueType" }, "value": { "template": header_template_str, "params": header_params_with_json_schema, }, }, "header_row": table_header_row, "table": table_rows, "schema": { "type": "TableType" }, }) rendered = RenderedAtomicContent(name="atomic.prescriptive.summary", value=value_obj, value_type="TableType") return rendered @classmethod @renderer(renderer_type="renderer.prescriptive") @render_evaluation_parameter_string def _prescriptive_renderer( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get( "include_column_name", True) include_column_name = (include_column_name if include_column_name is not None else True) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration["kwargs"], ["column", "quantile_ranges", "row_condition", "condition_parser"], ) template_str = "quantiles must be within the following value ranges." 
if include_column_name: template_str = f"$column {template_str}" if params["row_condition"] is not None: ( conditional_template_str, conditional_params, ) = parse_row_condition_string_pandas_engine( params["row_condition"]) template_str = (conditional_template_str + ", then " + template_str[0].lower() + template_str[1:]) params.update(conditional_params) expectation_string_obj = { "content_block_type": "string_template", "string_template": { "template": template_str, "params": params }, } quantiles = params["quantile_ranges"]["quantiles"] value_ranges = params["quantile_ranges"]["value_ranges"] table_header_row = ["Quantile", "Min Value", "Max Value"] table_rows = [] quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"} for quantile, value_range in zip(quantiles, value_ranges): quantile_string = quantile_strings.get(quantile, f"{quantile:3.2f}") table_rows.append([ quantile_string, str(value_range[0]) if value_range[0] is not None else "Any", str(value_range[1]) if value_range[1] is not None else "Any", ]) quantile_range_table = RenderedTableContent( **{ "content_block_type": "table", "header_row": table_header_row, "table": table_rows, "styling": { "body": { "classes": [ "table", "table-sm", "table-unbordered", "col-4", "mt-2", ], }, "parent": { "styles": { "list-style-type": "none" } }, }, }) return [expectation_string_obj, quantile_range_table] @classmethod @renderer(renderer_type="renderer.diagnostic.observed_value") def _diagnostic_observed_value_renderer( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): if result.result is None or result.result.get( "observed_value") is None: return "--" quantiles = result.result.get("observed_value", {}).get("quantiles", []) value_ranges = result.result.get("observed_value", {}).get("values", []) table_header_row = ["Quantile", "Value"] table_rows = [] quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"} for idx, quantile in enumerate(quantiles): quantile_string = quantile_strings.get(quantile) table_rows.append([ quantile_string if quantile_string else f"{quantile:3.2f}", str(value_ranges[idx]), ]) return RenderedTableContent( **{ "content_block_type": "table", "header_row": table_header_row, "table": table_rows, "styling": { "body": { "classes": ["table", "table-sm", "table-unbordered", "col-4"], } }, }) @classmethod def _atomic_diagnostic_observed_value_template( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): template_string = None params_with_json_schema = None table_header_row = None table_rows = None if result.result is None or result.result.get( "observed_value") is None: template_string = "--" params_with_json_schema = {} return ( template_string, params_with_json_schema, table_header_row, table_rows, ) quantiles = result.result.get("observed_value", {}).get("quantiles", []) value_ranges = result.result.get("observed_value", {}).get("values", []) table_header_row = [ { "schema": { "type": "string" }, "value": "Quantile" }, { "schema": { "type": "string" }, "value": "Value" }, ] table_rows = [] quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"} for idx, quantile in enumerate(quantiles): quantile_string = quantile_strings.get(quantile) table_rows.append([ { "value": quantile_string if quantile_string else f"{quantile:3.2f}", "schema": { "type": "string" }, }, { "value": value_ranges[idx], "schema": { "type": "number" } }, ]) return template_string, params_with_json_schema, table_header_row, table_rows @classmethod 
@renderer(renderer_type="atomic.diagnostic.observed_value") def _atomic_diagnostic_observed_value( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): ( template_string, params_with_json_schema, table_header_row, table_rows, ) = cls._atomic_diagnostic_observed_value_template( configuration, result, language, runtime_configuration, **kwargs) if template_string is not None: value_obj = renderedAtomicValueSchema.load({ "template": template_string, "params": {}, "schema": { "type": "StringValueType" }, }) return RenderedAtomicContent( name="atomic.diagnostic.observed_value", value=value_obj, value_type="StringValueType", ) else: value_obj = renderedAtomicValueSchema.load({ "header_row": table_header_row, "table": table_rows, "schema": { "type": "TableType" }, }) return RenderedAtomicContent( name="atomic.diagnostic.observed_value", value=value_obj, value_type="TableType", ) @classmethod @renderer(renderer_type="renderer.descriptive.quantile_table") def _descriptive_quantile_table_renderer( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): assert result, "Must pass in result." table_rows = [] quantiles = result.result["observed_value"]["quantiles"] quantile_ranges = result.result["observed_value"]["values"] quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"} for idx, quantile in enumerate(quantiles): quantile_string = quantile_strings.get(quantile) table_rows.append([ { "content_block_type": "string_template", "string_template": { "template": quantile_string if quantile_string else f"{quantile:3.2f}", "tooltip": { "content": "expect_column_quantile_values_to_be_between \n expect_column_median_to_be_between" if quantile == 0.50 else "expect_column_quantile_values_to_be_between" }, }, }, quantile_ranges[idx], ]) return RenderedTableContent( **{ "content_block_type": "table", "header": RenderedStringTemplateContent( **{ "content_block_type": "string_template", "string_template": { "template": "Quantiles", "tag": "h6" }, }), "table": table_rows, "styling": { "classes": ["col-3", "mt-1", "pl-1", "pr-1"], "body": { "classes": ["table", "table-sm", "table-unbordered"], }, }, }) def get_validation_dependencies( self, configuration: Optional[ExpectationConfiguration] = None, execution_engine: Optional[ExecutionEngine] = None, runtime_configuration: Optional[dict] = None, ): all_dependencies = super().get_validation_dependencies( configuration, execution_engine, runtime_configuration) # column.quantile_values expects a "quantiles" key all_dependencies["metrics"][ "column.quantile_values"].metric_value_kwargs[ "quantiles"] = configuration.kwargs["quantile_ranges"][ "quantiles"] return all_dependencies def _validate( self, configuration: ExpectationConfiguration, metrics: Dict, runtime_configuration: dict = None, execution_engine: ExecutionEngine = None, ): quantile_vals = metrics.get("column.quantile_values") quantile_ranges = configuration.kwargs.get("quantile_ranges") quantiles = quantile_ranges["quantiles"] quantile_value_ranges = quantile_ranges["value_ranges"] # We explicitly allow "None" to be interpreted as +/- infinity comparison_quantile_ranges = [[ -np.inf if lower_bound is None else lower_bound, np.inf if upper_bound is None else upper_bound, ] for (lower_bound, upper_bound) in quantile_value_ranges] success_details = [ range_[0] <= quantile_vals[idx] <= range_[1] for idx, range_ in enumerate(comparison_quantile_ranges) ] return { "success": np.all(success_details), "result": { "observed_value": { 
"quantiles": quantiles, "values": quantile_vals }, "details": { "success_details": success_details }, }, }
class ExpectColumnValuesToBeInSet(ColumnMapExpectation):
    """Expect each column value to be in a given set.

    For example:
    ::

        # my_df.my_col = [1,2,2,3,3,3]
        >>> my_df.expect_column_values_to_be_in_set(
            "my_col",
            [2,3]
        )
        {
            "success": false,
            "result": {
                "unexpected_count": 1,
                "unexpected_percent": 16.66666666666666666,
                "unexpected_percent_nonmissing": 16.66666666666666666,
                "partial_unexpected_list": [
                    1
                ],
            },
        }

    expect_column_values_to_be_in_set is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        value_set (set-like): \
            A set of objects used for comparison.

    Keyword Args:
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.
        parse_strings_as_datetimes (boolean or None): \
            If True, values provided in value_set will be parsed as datetimes before making comparisons.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    See Also:
        :func:`expect_column_values_to_not_be_in_set \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_values_to_not_be_in_set>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    map_metric = "column_values.in_set"
    args_keys = (
        "column",
        "value_set",
    )
    success_keys = (
        "value_set",
        "mostly",
        "parse_strings_as_datetimes",
        "auto",
        "profiler_config",
    )

    value_set_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="ValueSetMultiBatchParameterBuilder",
        name="value_set_estimator",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=None,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        value_set_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_values_to_be_in_set",  # Convention: use "expectation_type" as profiler name.
config_version=1.0, variables={}, rules={ "default_expect_column_values_to_be_in_set_rule": { "variables": { "mostly": 1.0, }, "domain_builder": { "class_name": "ColumnDomainBuilder", "module_name": "great_expectations.rule_based_profiler.domain_builder", }, "expectation_configuration_builders": [ { "expectation_type": "expect_column_values_to_be_in_set", "class_name": "DefaultExpectationConfigurationBuilder", "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder", "validation_parameter_builder_configs": validation_parameter_builder_configs, "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column", "value_set": f"{PARAMETER_KEY}{value_set_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}", "mostly": f"{VARIABLES_KEY}mostly", "meta": { "profiler_details": f"{PARAMETER_KEY}{value_set_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}", }, }, ], }, }, ) default_kwarg_values = { "value_set": [], "parse_strings_as_datetimes": False, "auto": False, "profiler_config": default_profiler_config, } @classmethod def _atomic_prescriptive_template( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get( "include_column_name", True) include_column_name = (include_column_name if include_column_name is not None else True) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration.kwargs, [ "column", "value_set", "mostly", "parse_strings_as_datetimes", "row_condition", "condition_parser", ], ) params_with_json_schema = { "column": { "schema": { "type": "string" }, "value": params.get("column") }, "value_set": { "schema": { "type": "array" }, "value": params.get("value_set"), }, "mostly": { "schema": { "type": "number" }, "value": params.get("mostly") }, "mostly_pct": { "schema": { "type": "string" }, "value": params.get("mostly_pct"), }, "parse_strings_as_datetimes": { "schema": { "type": "boolean" }, "value": params.get("parse_strings_as_datetimes"), }, "row_condition": { "schema": { "type": "string" }, "value": params.get("row_condition"), }, "condition_parser": { "schema": { "type": "string" }, "value": params.get("condition_parser"), }, } if params["value_set"] is None or len(params["value_set"]) == 0: values_string = "[ ]" else: for i, v in enumerate(params["value_set"]): params[f"v__{str(i)}"] = v values_string = " ".join( [f"$v__{str(i)}" for i, v in enumerate(params["value_set"])]) template_str = f"values must belong to this set: {values_string}" if params["mostly"] is not None and params["mostly"] < 1.0: params_with_json_schema["mostly_pct"]["value"] = num_to_str( params["mostly"] * 100, precision=15, no_scientific=True) # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".") template_str += ", at least $mostly_pct % of the time." else: template_str += "." if params.get("parse_strings_as_datetimes"): template_str += " Values should be parsed as datetimes." 
if include_column_name: template_str = f"$column {template_str}" if params["row_condition"] is not None: ( conditional_template_str, conditional_params, ) = parse_row_condition_string_pandas_engine( params["row_condition"], with_schema=True) template_str = f"{conditional_template_str}, then {template_str}" params_with_json_schema.update(conditional_params) params_with_json_schema = add_values_with_json_schema_from_list_in_params( params=params, params_with_json_schema=params_with_json_schema, param_key_with_list="value_set", ) return (template_str, params_with_json_schema, styling) @classmethod @renderer(renderer_type="renderer.prescriptive") @render_evaluation_parameter_string def _prescriptive_renderer( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get( "include_column_name", True) include_column_name = (include_column_name if include_column_name is not None else True) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration.kwargs, [ "column", "value_set", "mostly", "parse_strings_as_datetimes", "row_condition", "condition_parser", ], ) if params["value_set"] is None or len(params["value_set"]) == 0: values_string = "[ ]" else: for i, v in enumerate(params["value_set"]): params[f"v__{str(i)}"] = v values_string = " ".join( [f"$v__{str(i)}" for i, v in enumerate(params["value_set"])]) template_str = f"values must belong to this set: {values_string}" if params["mostly"] is not None and params["mostly"] < 1.0: params["mostly_pct"] = num_to_str(params["mostly"] * 100, precision=15, no_scientific=True) # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".") template_str += ", at least $mostly_pct % of the time." else: template_str += "." if params.get("parse_strings_as_datetimes"): template_str += " Values should be parsed as datetimes." if include_column_name: template_str = f"$column {template_str}" if params["row_condition"] is not None: ( conditional_template_str, conditional_params, ) = parse_row_condition_string_pandas_engine( params["row_condition"]) template_str = f"{conditional_template_str}, then {template_str}" params.update(conditional_params) return [ RenderedStringTemplateContent( **{ "content_block_type": "string_template", "string_template": { "template": template_str, "params": params, "styling": styling, }, }) ] @classmethod @renderer(renderer_type="renderer.descriptive.example_values_block") def _descriptive_example_values_block_renderer( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): assert result, "Must pass in result." 
if "partial_unexpected_counts" in result.result: partial_unexpected_counts = result.result[ "partial_unexpected_counts"] values = [str(v["value"]) for v in partial_unexpected_counts] elif "partial_unexpected_list" in result.result: values = [ str(item) for item in result.result["partial_unexpected_list"] ] else: return classes = ["col-3", "mt-1", "pl-1", "pr-1"] if any(len(value) > 80 for value in values): content_block_type = "bullet_list" content_block_class = RenderedBulletListContent else: content_block_type = "value_list" content_block_class = ValueListContent new_block = content_block_class( **{ "content_block_type": content_block_type, "header": RenderedStringTemplateContent( **{ "content_block_type": "string_template", "string_template": { "template": "Example Values", "tooltip": { "content": "expect_column_values_to_be_in_set" }, "tag": "h6", }, }), content_block_type: [{ "content_block_type": "string_template", "string_template": { "template": "$value", "params": { "value": value }, "styling": { "default": { "classes": ["badge", "badge-info"] if content_block_type == "value_list" else [], "styles": { "word-break": "break-all" }, }, }, }, } for value in values], "styling": { "classes": classes, }, }) return new_block def validate_configuration( self, configuration: Optional[ExpectationConfiguration]) -> None: super().validate_configuration(configuration) # supports extensibility by allowing value_set to not be provided in config but captured via child-class default_kwarg_values, e.g. parameterized expectations value_set = configuration.kwargs.get( "value_set") or self.default_kwarg_values.get("value_set") try: assert ("value_set" in configuration.kwargs or value_set), "value_set is required" assert isinstance( value_set, (list, set, dict)), "value_set must be a list, set, or dict" if isinstance(value_set, dict): assert ( "$PARAMETER" in value_set ), 'Evaluation Parameter dict for value_set kwarg must have "$PARAMETER" key.' except AssertionError as e: raise InvalidExpectationConfigurationError(str(e))