def profiler_config_with_placeholder_args_multiple_rules_custom_values(
    profiler_config_with_placeholder_args_custom_values: RuleBasedProfilerConfig,
) -> RuleBasedProfilerConfig:
    """Return a two-rule profiler config derived from the custom-values config.

    The returned config reuses the name, config_version and variables of the
    incoming config and carries its rules plus one additional rule stored under
    the key "rule_2".

    Args:
        profiler_config_with_placeholder_args_custom_values: Base config whose
            attributes and existing rules are carried over.

    Returns:
        A new RuleBasedProfilerConfig containing the base rules and "rule_2".
    """
    base_config = profiler_config_with_placeholder_args_custom_values

    additional_rule: dict = {
        "domain_builder": {"class_name": "MyAdditionalCustomDomainBuilder"},
        "parameter_builders": [
            {
                "class_name": "MyAdditionalCustomParameterBuilder",
                "metric_name": "yet_another_metric",
                "name": "yet_another_parameter",
            }
        ],
        "expectation_configuration_builders": [
            {
                "class_name": "MyAdditionalCustomExpectationConfigurationBuilder",
                "expectation_type": "expect_additional_custom_expectation",
                "meta": {"details": {"note": "Here's another rule"}},
            }
        ],
    }

    # Shallow-copy the rules mapping so that adding "rule_2" does not leak
    # back into the config object that was passed in.
    rules: dict = dict(base_config.rules)
    rules["rule_2"] = additional_rule

    return RuleBasedProfilerConfig(
        name=base_config.name,
        config_version=base_config.config_version,
        rules=rules,
        variables=base_config.variables,
    )
def profiler_config_with_placeholder_args_multiple_rules(
    profiler_config_with_placeholder_args: RuleBasedProfilerConfig,
) -> RuleBasedProfilerConfig:
    """Return a two-rule profiler config derived from the placeholder-args config.

    The returned config reuses the name, config_version and variables of the
    incoming config and carries its rules plus one additional rule stored under
    the key "rule_2".

    Args:
        profiler_config_with_placeholder_args: Base config whose attributes and
            existing rules are carried over.

    Returns:
        A new RuleBasedProfilerConfig containing the base rules and "rule_2".
    """
    base_config = profiler_config_with_placeholder_args

    additional_rule: dict = {
        "domain_builder": {"class_name": "TableDomainBuilder"},
        "parameter_builders": [
            {
                "class_name": "MetricMultiBatchParameterBuilder",
                "metric_name": "my_other_metric",
                "name": "my_additional_parameter",
            }
        ],
        "expectation_configuration_builders": [
            {
                "class_name": "DefaultExpectationConfigurationBuilder",
                "expectation_type": "expect_column_values_to_be_between",
                "meta": {"details": {"note": "Here's another rule"}},
            }
        ],
    }

    # Shallow-copy the rules mapping so that adding "rule_2" does not leak
    # back into the config object that was passed in.
    rules: dict = dict(base_config.rules)
    rules["rule_2"] = additional_rule

    return RuleBasedProfilerConfig(
        name=base_config.name,
        config_version=base_config.config_version,
        rules=rules,
        variables=base_config.variables,
    )
def profiler_config_with_placeholder_args_custom_values() -> RuleBasedProfilerConfig:
    """Build a single-rule profiler config that names custom builder classes.

    Returns:
        A RuleBasedProfilerConfig called "my_profiler" (config_version 1.0) with
        one rule, "rule_1", and one variable, "my_variable".
    """
    domain_builder_config = {"class_name": "MyCustomDomainBuilder"}
    expectation_configuration_builder_config = {
        "class_name": "MyCustomExpectationConfigurationBuilder",
        "expectation_type": "expect_custom_expectation",
        "meta": {"details": {"note": "My custom config"}},
    }
    parameter_builder_config = {
        "class_name": "MyCustomParameterBuilder",
        "metric_name": "my_metric",
        "name": "my_parameter",
    }

    # Key order inside the rule mirrors the order the config was written in.
    rule_1 = {
        "domain_builder": domain_builder_config,
        "expectation_configuration_builders": [
            expectation_configuration_builder_config
        ],
        "parameter_builders": [parameter_builder_config],
    }

    return RuleBasedProfilerConfig(
        name="my_profiler",
        config_version=1.0,
        rules={"rule_1": rule_1},
        variables={"my_variable": "my_value"},
    )
def add_profiler(
    config: RuleBasedProfilerConfig,
    data_context: "DataContext",  # noqa: F821
    profiler_store: ProfilerStore,
    ge_cloud_id: Optional[str] = None,
) -> "RuleBasedProfiler":
    """Instantiate a profiler from its config and persist the config to a store.

    Args:
        config: Profiler configuration to instantiate and to save.
        data_context: Passed to the new profiler through the runtime environment.
        profiler_store: Store that receives the config.
        ge_cloud_id: When truthy, the config is stored under a GeCloudIdentifier
            built from this id; otherwise it is stored under a
            ConfigurationIdentifier keyed by ``config.name``.

    Returns:
        The newly instantiated profiler.

    Raises:
        ge_exceptions.InvalidConfigError: If the batch-request validity check on
            the config does not pass.
    """
    batch_requests_are_valid = (
        RuleBasedProfiler._check_validity_of_batch_requests_in_config(config=config)
    )
    if not batch_requests_are_valid:
        raise ge_exceptions.InvalidConfigError(
            f'batch_data found in batch_request cannot be saved to ProfilerStore "{profiler_store.store_name}"'
        )

    # Chetan - 20220204 - DataContext to be removed once it can be decoupled from RBP
    serialized_config = config.to_json_dict()
    runtime_environment = {"data_context": data_context}
    class_defaults = {
        "module_name": "great_expectations.rule_based_profiler",
        "class_name": "RuleBasedProfiler",
    }
    new_profiler: "RuleBasedProfiler" = instantiate_class_from_config(
        config=serialized_config,
        runtime_environment=runtime_environment,
        config_defaults=class_defaults,
    )

    # Pick the identifier type that matches where the config is being stored.
    key: Union[GeCloudIdentifier, ConfigurationIdentifier] = (
        GeCloudIdentifier(resource_type="contract", ge_cloud_id=ge_cloud_id)
        if ge_cloud_id
        else ConfigurationIdentifier(configuration_key=config.name)
    )
    profiler_store.set(key=key, value=config)

    return new_profiler
def __init__(
    self,
    name: str,
    config_version: float,
    variables: Optional[Dict[str, Any]] = None,
    rules: Optional[Dict[str, Dict[str, Any]]] = None,
    data_context: Optional["DataContext"] = None,  # noqa: F821
):
    """
    Create a new Profiler using configured rules.

    The arguments are bundled into a RuleBasedProfilerConfig, which is handed to
    the parent initializer together with the data context. For a rule or an item
    in a rule configuration, the following are instantiated if available: a
    domain builder, a parameter builder, and a configuration builder. These will
    be used to define profiler computation patterns.

    Args:
        name: The name of the RBP instance
        config_version: The version of the RBP (currently only 1.0 is supported)
        variables: Any variables to be substituted within the rules
        rules: A set of dictionaries, each of which contains its own domain_builder, parameter_builders, and
            expectation_configuration_builders configuration components
        data_context: DataContext object that defines a full runtime environment (data access, etc.)
    """
    super().__init__(
        profiler_config=RuleBasedProfilerConfig(
            name=name,
            config_version=config_version,
            variables=variables,
            rules=rules,
        ),
        data_context=data_context,
    )
class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
    """Expect the number of unique values to be between a minimum value and a maximum value.

    expect_column_unique_value_count_to_be_between is a
    :func:`column_aggregate_expectation <great_expectations.execution_engine.MetaExecutionEngine.column_aggregate_expectation>`.

    Args:
        column (str):
            The column name.
        min_value (int or None):
            The minimum number of unique values allowed.
        max_value (int or None):
            The maximum number of unique values allowed.

    Other Parameters:
        result_format (str or None):
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean):
            If True, then include the expectation config as part of the result object.
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None):
            If True, then catch exceptions and include them as part of the result object.
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None):
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        These fields in the result object are customized for this expectation:
        ::

            {
                "observed_value": (int) The number of unique values in the column
            }

        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound
        * If max_value is None, then min_value is treated as a lower bound

    See Also:
        :func:`expect_column_proportion_of_unique_values_to_be_between
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_proportion_of_unique_values_to_be_between>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column aggregate expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    # Setting necessary computation metric dependencies and defining kwargs, as well as assigning kwargs default values
    metric_dependencies = ("column.distinct_values.count",)
    success_keys = (
        "min_value",
        "max_value",
        "auto",
        "profiler_config",
    )

    # Parameter builder that estimates the [min, max] range of the distinct-value
    # count; its tuning knobs are read from the rule's variables.
    column_unique_values_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_unique_values_range_estimator",
        metric_name="column.distinct_values.count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        column_unique_values_range_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_unique_value_count_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_unique_values_to_be_between_rule": {
                "variables": {
                    "mostly": 1.0,
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 0,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_unique_value_count_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "min_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    # Default values
    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,
        "min_value": None,
        "max_value": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "min_value",
        "max_value",
    )

    # A Column Aggregate Metric Decorator for the Unique Value Count

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (OPTIONAL[ExpectationConfiguration]):
                An optional Expectation Configuration entry that will be used to configure the expectation
        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """
        super().validate_configuration(configuration)
        self.validate_metric_value_between_configuration(configuration=configuration)

    @staticmethod
    def _unique_value_count_template(params, at_least_str, at_most_str):
        """Build the prescriptive sentence shared by both prescriptive renderers.

        Args:
            params: Renderer params; read for "min_value", "max_value" and "mostly".
            at_least_str: Wording placed in front of ``$min_value``.
            at_most_str: Wording placed in front of ``$max_value``.

        Returns:
            A ``(template_str, mostly_pct)`` tuple. ``mostly_pct`` is the formatted
            percentage when at least one bound is set and ``mostly`` is below 1.0,
            otherwise None.
        """
        min_value = params["min_value"]
        max_value = params["max_value"]

        if min_value is None and max_value is None:
            return "may have any number of unique values.", None

        if min_value is None:
            bounds_str = f"{at_most_str} $max_value"
        elif max_value is None:
            bounds_str = f"{at_least_str} $min_value"
        else:
            bounds_str = f"{at_least_str} $min_value and {at_most_str} $max_value"

        mostly = params["mostly"]
        if mostly is not None and mostly < 1.0:
            mostly_pct = num_to_str(mostly * 100, precision=15, no_scientific=True)
            return (
                f"must have {bounds_str} unique values, at least $mostly_pct % of the time.",
                mostly_pct,
            )

        return f"must have {bounds_str} unique values.", None

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Return ``(template_str, params_with_json_schema, styling)`` for this expectation.

        Each entry of ``params_with_json_schema`` pairs a parameter value with a
        JSON-schema type description.
        """
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name")
        if include_column_name is None:
            # A missing or explicit None setting means "show the column name".
            include_column_name = True
        styling = runtime_configuration.get("styling")

        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "mostly_pct": {
                "schema": {"type": "string"},
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        at_least_str, at_most_str = handle_strict_min_max(params)
        template_str, mostly_pct = cls._unique_value_count_template(
            params, at_least_str, at_most_str
        )
        if mostly_pct is not None:
            params_with_json_schema["mostly_pct"]["value"] = mostly_pct

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation as a one-element list of string-template content."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name")
        if include_column_name is None:
            # A missing or explicit None setting means "show the column name".
            include_column_name = True
        styling = runtime_configuration.get("styling")

        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        at_least_str, at_most_str = handle_strict_min_max(params)
        template_str, mostly_pct = cls._unique_value_count_template(
            params, at_least_str, at_most_str
        )
        if mostly_pct is not None:
            params["mostly_pct"] = mostly_pct

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]

    @classmethod
    @renderer(
        renderer_type="renderer.descriptive.column_properties_table.distinct_count_row"
    )
    def _descriptive_column_properties_table_distinct_count_row_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Return a two-cell row: the "Distinct (n)" label and the observed count.

        A falsy observed value (including 0) is shown as "--".
        """
        assert result, "Must pass in result."
        observed_value = result.result["observed_value"]
        template_string_object = RenderedStringTemplateContent(
            **{
                "content_block_type": "string_template",
                "string_template": {
                    "template": "Distinct (n)",
                    "tooltip": {
                        "content": "expect_column_unique_value_count_to_be_between"
                    },
                },
            }
        )
        if not observed_value:
            return [template_string_object, "--"]
        else:
            return [template_string_object, observed_value]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ):
        """Delegate to the metric-value-between check for "column.distinct_values.count"."""
        return self._validate_metric_value_between(
            metric_name="column.distinct_values.count",
            configuration=configuration,
            metrics=metrics,
            runtime_configuration=runtime_configuration,
            execution_engine=execution_engine,
        )
class ExpectColumnValuesToMatchStrftimeFormat(ColumnMapExpectation):
    """Expect column entries to be strings representing a date or time with a given format.

    expect_column_values_to_match_strftime_format is a
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str):
            The column name.
        strftime_format (str):
            A strftime format string to use for matching

    Keyword Args:
        mostly (None or a float between 0 and 1):
            Return `"success": True` if at least mostly fraction of values match the expectation.
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None):
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean):
            If True, then include the expectation config as part of the result object.
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None):
            If True, then catch exceptions and include them as part of the result object.
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None):
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    """

    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": [
            "@great_expectations",
        ],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    map_metric = "column_values.match_strftime_format"
    success_keys = (
        "strftime_format",
        "mostly",
        "auto",
        "profiler_config",
    )

    # Parameter builder whose value is substituted for "strftime_format" in the
    # default profiler rule below.
    date_format_string_parameter_builder_config: ParameterBuilderConfig = (
        ParameterBuilderConfig(
            module_name="great_expectations.rule_based_profiler.parameter_builder",
            class_name="SimpleDateFormatStringParameterBuilder",
            name="date_format_string_parameter_builder",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
        )
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        date_format_string_parameter_builder_config
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_values_to_match_strftime_format",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_values_to_match_strftime_format_rule": {
                "variables": {
                    "mostly": 1.0,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_values_to_match_strftime_format",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "strftime_format": f"{PARAMETER_KEY}{date_format_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{date_format_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,  # we expect this to be explicitly set whenever a row_condition is passed
        "mostly": 1,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": True,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "strftime_format",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """Check that the configuration carries a usable ``strftime_format`` kwarg.

        Args:
            configuration: Configuration to validate; when None, ``self.configuration``
                is inspected instead.

        Raises:
            AssertionError: If "strftime_format" is absent from the kwargs.
            ValueError: If a non-dict format cannot round-trip the current time
                through ``strftime`` / ``strptime``.
            InvalidExpectationConfigurationError: If a dict-valued format lacks
                the "$PARAMETER" key.
        """
        super().validate_configuration(configuration)
        if configuration is None:
            configuration = self.configuration

        assert "strftime_format" in configuration.kwargs, "strftime_format is required"

        strftime_format = configuration.kwargs["strftime_format"]

        try:
            if isinstance(strftime_format, dict):
                assert (
                    "$PARAMETER" in strftime_format
                ), 'Evaluation Parameter dict for strftime_format kwarg must have "$PARAMETER" key.'
            else:
                # Format "now" and parse it back: a format string that cannot
                # survive the round trip raises ValueError here.
                datetime.strptime(
                    datetime.strftime(datetime.now(), strftime_format),
                    strftime_format,
                )
        except ValueError as e:
            # Chain explicitly so the underlying parsing error stays attached.
            raise ValueError(
                f"Unable to use provided strftime_format. {str(e)}"
            ) from e
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e)) from e

    @staticmethod
    def _strftime_format_template(params):
        """Build the prescriptive sentence shared by both prescriptive renderers.

        Args:
            params: Renderer params; read for "strftime_format" and "mostly".

        Returns:
            A ``(template_str, mostly_pct)`` tuple. ``mostly_pct`` is the formatted
            percentage when a format is present and ``mostly`` is below 1.0,
            otherwise None.
        """
        if not params.get("strftime_format"):
            return "values must match a strftime format but none was specified.", None

        template_str = (
            "values must match the following strftime format: $strftime_format"
        )
        mostly = params["mostly"]
        if mostly is not None and mostly < 1.0:
            mostly_pct = num_to_str(mostly * 100, precision=15, no_scientific=True)
            return template_str + ", at least $mostly_pct % of the time.", mostly_pct

        return template_str + ".", None

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Return ``(template_str, params_with_json_schema, styling)`` for this expectation.

        Each entry of ``params_with_json_schema`` pairs a parameter value with a
        JSON-schema type description.
        """
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name")
        if include_column_name is None:
            # A missing or explicit None setting means "show the column name".
            include_column_name = True
        styling = runtime_configuration.get("styling")

        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "strftime_format",
                "mostly",
                "row_condition",
                "condition_parser",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "strftime_format": {
                "schema": {"type": "string"},
                "value": params.get("strftime_format"),
            },
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "mostly_pct": {
                "schema": {"type": "string"},
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
        }

        template_str, mostly_pct = cls._strftime_format_template(params)
        if mostly_pct is not None:
            params_with_json_schema["mostly_pct"]["value"] = mostly_pct

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation as a one-element list of string-template content."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name")
        if include_column_name is None:
            # A missing or explicit None setting means "show the column name".
            include_column_name = True
        styling = runtime_configuration.get("styling")

        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "strftime_format",
                "mostly",
                "row_condition",
                "condition_parser",
            ],
        )

        template_str, mostly_pct = cls._strftime_format_template(params)
        if mostly_pct is not None:
            params["mostly_pct"] = mostly_pct

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]
class ExpectColumnValuesToMatchRegex(ColumnMapExpectation): """Expect column entries to be strings that match a given regular expression. Valid matches can be found \ anywhere in the string, for example "[at]+" will identify the following strings as expected: "cat", "hat", \ "aa", "a", and "t", and the following strings as unexpected: "fish", "dog". expect_column_values_to_match_regex is a \ :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine .column_map_expectation>`. Args: column (str): \ The column name. regex (str): \ The regular expression the column entries should match. Keyword Args: mostly (None or a float between 0 and 1): \ Return `"success": True` if at least mostly fraction of values match the expectation. \ For more detail, see :ref:`mostly`. Other Parameters: result_format (str or None): \ Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. For more detail, see :ref:`result_format <result_format>`. include_config (boolean): \ If True, then include the expectation config as part of the result object. \ For more detail, see :ref:`include_config`. catch_exceptions (boolean or None): \ If True, then catch exceptions and include them as part of the result object. \ For more detail, see :ref:`catch_exceptions`. meta (dict or None): \ A JSON-serializable dictionary (nesting allowed) that will be included in the output without \ modification. For more detail, see :ref:`meta`. Returns: An ExpectationSuiteValidationResult Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`. 
See Also: :func:`expect_column_values_to_not_match_regex \ <great_expectations.execution_engine.execution_engine.ExecutionEngine .expect_column_values_to_not_match_regex>` :func:`expect_column_values_to_match_regex_list \ <great_expectations.execution_engine.execution_engine.ExecutionEngine .expect_column_values_to_match_regex_list>` """ library_metadata = { "maturity": "production", "tags": ["core expectation", "column map expectation"], "contributors": [ "@great_expectations", ], "requirements": [], "has_full_test_suite": True, "manually_reviewed_code": False, } map_metric = "column_values.match_regex" success_keys = ( "regex", "mostly", "auto", "profiler_config", ) regex_pattern_string_parameter_builder_config: ParameterBuilderConfig = ( ParameterBuilderConfig( module_name= "great_expectations.rule_based_profiler.parameter_builder", class_name="RegexPatternStringParameterBuilder", name="regex_pattern_string_parameter_builder", metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME, metric_value_kwargs=None, evaluation_parameter_builder_configs=None, )) validation_parameter_builder_configs: List[ParameterBuilderConfig] = [ regex_pattern_string_parameter_builder_config ] default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig( name= "expect_column_values_to_match_regex", # Convention: use "expectation_type" as profiler name. 
config_version=1.0, variables={}, rules={ "default_expect_column_values_to_match_regex_rule": { "variables": { "mostly": 1.0, }, "domain_builder": { "class_name": "ColumnDomainBuilder", "module_name": "great_expectations.rule_based_profiler.domain_builder", }, "expectation_configuration_builders": [ { "expectation_type": "expect_column_values_to_match_regex", "class_name": "DefaultExpectationConfigurationBuilder", "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder", "validation_parameter_builder_configs": validation_parameter_builder_configs, "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column", "regex": f"{PARAMETER_KEY}{regex_pattern_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}", "mostly": f"{VARIABLES_KEY}mostly", "meta": { "profiler_details": f"{PARAMETER_KEY}{regex_pattern_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}", }, }, ], }, }, ) default_kwarg_values = { "row_condition": None, "condition_parser": None, # we expect this to be explicitly set whenever a row_condition is passed "mostly": 1, "result_format": "BASIC", "include_config": True, "catch_exceptions": True, "regex": "(?s).*", "auto": False, "profiler_config": default_profiler_config, } args_keys = ( "column", "regex", ) def validate_configuration( self, configuration: Optional[ExpectationConfiguration]) -> None: super().validate_configuration(configuration) if configuration is None: configuration = self.configuration # supports extensibility by allowing value_set to not be provided in config but captured via child-class default_kwarg_values, e.g. 
parameterized expectations regex = configuration.kwargs.get( "regex") or self.default_kwarg_values.get("regex") try: assert "regex" in configuration.kwargs or regex, "regex is required" assert isinstance(regex, (str, dict)), "regex must be a string" if isinstance(regex, dict): assert ( "$PARAMETER" in regex ), 'Evaluation Parameter dict for regex kwarg must have "$PARAMETER" key.' except AssertionError as e: raise InvalidExpectationConfigurationError(str(e)) @classmethod @renderer(renderer_type="renderer.question") def _question_renderer(cls, configuration, result=None, language=None, runtime_configuration=None): column = configuration.kwargs.get("column") mostly = configuration.kwargs.get("mostly") regex = configuration.kwargs.get("regex") return f'Do at least {mostly * 100}% of values in column "{column}" match the regular expression {regex}?' @classmethod @renderer(renderer_type="renderer.answer") def _answer_renderer(cls, configuration=None, result=None, language=None, runtime_configuration=None): column = result.expectation_config.kwargs.get("column") mostly = result.expectation_config.kwargs.get("mostly") regex = result.expectation_config.kwargs.get("regex") if result.success: return f'At least {mostly * 100}% of values in column "{column}" match the regular expression {regex}.' else: return f'Less than {mostly * 100}% of values in column "{column}" match the regular expression {regex}.' 
@classmethod
def _atomic_prescriptive_template(
    cls,
    configuration=None,
    result=None,
    language=None,
    runtime_configuration=None,
    **kwargs,
):
    """Build the prescriptive template, schema-annotated params, and styling.

    Args:
        configuration: Expectation configuration; its ``kwargs`` are read for
            ``column``, ``regex``, ``mostly``, ``row_condition`` and
            ``condition_parser`` (missing keys are filled with ``None``).
        result: Unused here; accepted for signature compatibility.
        language: Unused here; accepted for signature compatibility.
        runtime_configuration: Optional dict. ``include_column_name``
            (treated as ``True`` when absent or ``None``) controls the
            ``$column`` prefix; ``styling`` is passed through unchanged.
        **kwargs: Ignored.

    Returns:
        A 3-tuple ``(template_str, params_with_json_schema, styling)`` where
        each entry of ``params_with_json_schema`` is a dict with ``schema``
        and ``value`` keys.
    """
    runtime_configuration = runtime_configuration or {}
    include_column_name = runtime_configuration.get("include_column_name", True)
    if include_column_name is None:
        include_column_name = True
    styling = runtime_configuration.get("styling")

    params = substitute_none_for_missing(
        configuration.kwargs,
        ["column", "regex", "mostly", "row_condition", "condition_parser"],
    )
    params_with_json_schema = {
        "column": {
            "schema": {"type": "string"},
            "value": params.get("column"),
        },
        "mostly": {
            "schema": {"type": "number"},
            "value": params.get("mostly"),
        },
        "mostly_pct": {
            "schema": {"type": "string"},
            "value": params.get("mostly_pct"),
        },
        "regex": {
            "schema": {"type": "string"},
            "value": params.get("regex"),
        },
        "row_condition": {
            "schema": {"type": "string"},
            "value": params.get("row_condition"),
        },
        "condition_parser": {
            "schema": {"type": "string"},
            "value": params.get("condition_parser"),
        },
    }

    if not params.get("regex"):
        template_str = (
            "values must match a regular expression but none was specified."
        )
    else:
        template_str = "values must match this regular expression: $regex"
        mostly = params["mostly"]
        # Only a real number can be compared with 1.0; a non-numeric value
        # (e.g. a dict or string placeholder) would raise TypeError on `<`,
        # so it is rendered without the percentage clause instead.
        if isinstance(mostly, (int, float)) and mostly < 1.0:
            params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                mostly * 100, precision=15, no_scientific=True
            )
            template_str += ", at least $mostly_pct % of the time."
        else:
            template_str += "."

    if include_column_name:
        template_str = f"$column {template_str}"

    if params["row_condition"] is not None:
        (
            conditional_template_str,
            conditional_params,
        ) = parse_row_condition_string_pandas_engine(
            params["row_condition"], with_schema=True
        )
        template_str = f"{conditional_template_str}, then {template_str}"
        params_with_json_schema.update(conditional_params)

    return (template_str, params_with_json_schema, styling)


@classmethod
@renderer(renderer_type="renderer.prescriptive")
@render_evaluation_parameter_string
def _prescriptive_renderer(
    cls,
    configuration=None,
    result=None,
    language=None,
    runtime_configuration=None,
    **kwargs,
):
    """Render the prescriptive description of this expectation.

    Args:
        configuration: Expectation configuration; its ``kwargs`` are read for
            ``column``, ``regex``, ``mostly``, ``row_condition`` and
            ``condition_parser`` (missing keys are filled with ``None``).
        result: Unused here; accepted for signature compatibility.
        language: Unused here; accepted for signature compatibility.
        runtime_configuration: Optional dict. ``include_column_name``
            (treated as ``True`` when absent or ``None``) controls the
            ``$column`` prefix; ``styling`` is passed through unchanged.
        **kwargs: Ignored.

    Returns:
        A single-element list holding a ``RenderedStringTemplateContent``
        whose ``string_template`` carries the template, params and styling.
    """
    runtime_configuration = runtime_configuration or {}
    include_column_name = runtime_configuration.get("include_column_name", True)
    if include_column_name is None:
        include_column_name = True
    styling = runtime_configuration.get("styling")

    params = substitute_none_for_missing(
        configuration.kwargs,
        ["column", "regex", "mostly", "row_condition", "condition_parser"],
    )

    if not params.get("regex"):
        template_str = (
            "values must match a regular expression but none was specified."
        )
    else:
        template_str = "values must match this regular expression: $regex"
        mostly = params["mostly"]
        # Only a real number can be compared with 1.0; a non-numeric value
        # (e.g. a dict or string placeholder) would raise TypeError on `<`,
        # so it is rendered without the percentage clause instead.
        if isinstance(mostly, (int, float)) and mostly < 1.0:
            params["mostly_pct"] = num_to_str(
                mostly * 100, precision=15, no_scientific=True
            )
            template_str += ", at least $mostly_pct % of the time."
        else:
            template_str += "."

    if include_column_name:
        template_str = f"$column {template_str}"

    if params["row_condition"] is not None:
        (
            conditional_template_str,
            conditional_params,
        ) = parse_row_condition_string_pandas_engine(params["row_condition"])
        template_str = f"{conditional_template_str}, then {template_str}"
        params.update(conditional_params)

    return [
        RenderedStringTemplateContent(
            **{
                "content_block_type": "string_template",
                "string_template": {
                    "template": template_str,
                    "params": params,
                    "styling": styling,
                },
            }
        )
    ]