def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_included(
    alice_columnar_table_single_batch_context,
):
    """With a null condition, the builder must still run its validation parameter
    builders and resolve "$parameter.my_min_user_id.value[0]" into the built
    expectation configuration's "min_value" kwarg.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    # Single-batch request against the canned "alice" datasource.
    alice_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    user_id_domain_kwargs: dict = {"column": "user_id"}
    user_id_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=user_id_domain_kwargs,
        rule_name="my_rule",
    )

    # Start from an empty parameter container for the domain; the validation
    # parameter builder declared below is expected to populate it.
    empty_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain: Dict[str, ParameterContainer] = {
        user_id_domain.id: empty_container,
    }

    # Fully-qualified reference to the first value produced by "my_min_user_id".
    min_user_id_reference: str = "$parameter.my_min_user_id.value[0]"
    upper_bound: int = 999999999999

    min_user_id_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=user_id_domain_kwargs,
    )
    validation_builder_configs: Optional[List[ParameterBuilderConfig]] = [
        min_user_id_builder_config,
    ]

    configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=None,  # the null-condition path under test
        min_value=min_user_id_reference,
        max_value=upper_bound,
        validation_parameter_builder_configs=validation_builder_configs,
        data_context=context,
    )

    built_configuration: Optional[ExpectationConfiguration] = (
        configuration_builder.build_expectation_configuration(
            domain=user_id_domain,
            parameters=parameters_by_domain,
            batch_request=alice_batch_request,
        )
    )

    # 397433 — presumably the minimum user_id in the Alice fixture batch.
    assert built_configuration.kwargs["min_value"] == 397433
class ExpectColumnStdevToBeBetween(ColumnExpectation):
    """Expect the column standard deviation to be between a minimum value and a maximum value.

    Uses sample standard deviation (normalized by N-1).

    expect_column_stdev_to_be_between is a \
    :func:`column_aggregate_expectation <great_expectations.execution_engine.MetaExecutionEngine.column_aggregate_expectation>`.

    Args:
        column (str): \
            The column name.
        min_value (float or None): \
            The minimum value for the column standard deviation.
        max_value (float or None): \
            The maximum value for the column standard deviation.
        strict_min (boolean):
            If True, the column standard deviation must be strictly larger than min_value, default=False
        strict_max (boolean):
            If True, the column standard deviation must be strictly smaller than max_value, default=False

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. \
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        These fields in the result object are customized for this expectation:
        ::

            {
                "observed_value": (float) The true standard deviation for the column
            }

        * min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
        * If min_value is None, then max_value is treated as an upper bound
        * If max_value is None, then min_value is treated as a lower bound

    See Also:
        :func:`expect_column_mean_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_mean_to_be_between>`

        :func:`expect_column_median_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_median_to_be_between>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column aggregate expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
    }

    # Metric resolved by the validation graph before `_validate` runs.
    metric_dependencies = ("column.standard_deviation",)

    # Kwargs that participate in determining expectation success.
    success_keys = (
        "min_value",
        "strict_min",
        "max_value",
        "strict_max",
        "auto",
        "profiler_config",
    )

    # Parameter builder that estimates a [min, max] range for the column's
    # standard deviation across batches.  Its `name` is referenced by the
    # fully-qualified parameter names in `default_profiler_config` below, and
    # most of its arguments are deferred to profiler variables (VARIABLES_KEY
    # prefixed strings) so they can be tuned per-rule at runtime.
    stdev_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="stdev_range_estimator",
        metric_name="column.standard_deviation",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
    )

    # Builders executed to validate/compute parameters referenced by the
    # expectation configuration builder in the default profiler rule.
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        stdev_range_estimator_parameter_builder_config,
    ]

    # Default Rule-Based Profiler configuration used when "auto" mode is
    # engaged: a single rule that estimates the stdev range and emits this
    # expectation with min/max bound to the estimator's [0]/[1] values.
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_stdev_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_stdev_to_be_between_rule": {
                "variables": {
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    # stdev is non-negative, hence the zero lower bound.
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 2,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_stdev_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        # Lower/upper endpoints of the estimated stdev range.
                        "min_value": f"{PARAMETER_KEY}{stdev_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{stdev_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{stdev_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    # Defaults applied for any success/runtime kwarg the user omits.
    default_kwarg_values = {
        "min_value": None,
        "strict_min": False,
        "max_value": None,
        "strict_max": False,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }

    # Positional-argument order accepted by the expectation's factory method.
    args_keys = (
        "column",
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
    )

    def validate_configuration(
            self, configuration: Optional[ExpectationConfiguration]) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (OPTIONAL[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation
        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """
        super().validate_configuration(configuration)
        # Shared min/max sanity checks (types, ordering) from the base class.
        self.validate_metric_value_between_configuration(
            configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build the (template, params-with-schema, styling) triple for atomic prescriptive rendering."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        # Normalize kwargs so every rendered key is present (missing -> None).
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        # JSON-schema-annotated copy of the params, as required by the atomic renderer API.
        params_with_json_schema = {
            "column": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("column")
            },
            "min_value": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("max_value"),
            },
            "row_condition": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {
                    "type": "boolean"
                },
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {
                    "type": "boolean"
                },
                "value": params.get("strict_max"),
            },
        }
        # Phrase the sentence according to which bounds are present.
        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "standard deviation may have any numerical value."
        else:
            # "at least"/"greater than" etc., depending on strict_min/strict_max.
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params[
                    "max_value"] is not None:
                template_str = f"standard deviation must be {at_least_str} $min_value and {at_most_str} $max_value."
            elif params["min_value"] is None:
                template_str = f"standard deviation must be {at_most_str} $max_value."
            elif params["max_value"] is None:
                template_str = f"standard deviation must be {at_least_str} $min_value."
        if include_column_name:
            template_str = f"$column {template_str}"
        # Prefix with the row condition clause ("if <cond>, then ..."), if any.
        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True)
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)
        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation as a human-readable string-template content block."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        # Normalize kwargs so every rendered key is present (missing -> None).
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        # Phrase the sentence according to which bounds are present.
        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "standard deviation may have any numerical value."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params[
                    "max_value"] is not None:
                template_str = f"standard deviation must be {at_least_str} $min_value and {at_most_str} $max_value."
            elif params["min_value"] is None:
                template_str = f"standard deviation must be {at_most_str} $max_value."
            elif params["max_value"] is None:
                template_str = f"standard deviation must be {at_least_str} $min_value."
        if include_column_name:
            template_str = f"$column {template_str}"
        # Prefix with the row condition clause ("if <cond>, then ..."), if any.
        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)
        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                })
        ]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        """Delegate success evaluation to the shared metric-between-bounds check."""
        return self._validate_metric_value_between(
            metric_name="column.standard_deviation",
            configuration=configuration,
            metrics=metrics,
            runtime_configuration=runtime_configuration,
            execution_engine=execution_engine,
        )
class ExpectColumnValuesToBeBetween(ColumnMapExpectation):
    """Expect column entries to be between a minimum value and a maximum value (inclusive).

    expect_column_values_to_be_between is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        min_value (comparable type or None): The minimum value for a column entry.
        max_value (comparable type or None): The maximum value for a column entry.
        strict_min (boolean): If True, values must be strictly larger than min_value, default=False
        strict_max (boolean): If True, values must be strictly smaller than max_value, default=False

    Keyword Args:
        allow_cross_type_comparisons (boolean or None) : If True, allow comparisons between types (e.g. integer and\
            string). Otherwise, attempting such comparisons will raise an exception.
        parse_strings_as_datetimes (boolean or None) : If True, parse min_value, max_value, and all non-null column\
            values to datetimes before making comparisons.
        output_strftime_format (str or None): \
            A valid strftime format for datetime output. Only used if parse_strings_as_datetimes=True.
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification.
            For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        * min_value and max_value are both inclusive unless strict_min or strict_max are set to True.
        * If min_value is None, then max_value is treated as an upper bound, and there is no minimum value checked.
        * If max_value is None, then min_value is treated as a lower bound, and there is no maximum value checked.

    See Also:
        :func:`expect_column_value_lengths_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_value_lengths_to_be_between>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    # Map metric evaluated per row by the execution engine.
    map_metric = "column_values.between"

    # Kwargs that participate in determining expectation success.
    success_keys = (
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
        "allow_cross_type_comparisons",
        "mostly",
        "parse_strings_as_datetimes",
        "auto",
        "profiler_config",
    )

    # Parameter builder that estimates a range for the column minimum across
    # batches; referenced by `default_profiler_config` via its `name`.  Most
    # arguments are deferred to profiler variables (VARIABLES_KEY prefixes).
    column_min_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_min_range_estimator",
        metric_name="column.min",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
    )

    # Same as above, but estimating a range for the column maximum.
    column_max_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_max_range_estimator",
        metric_name="column.max",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
    )

    # Builders executed to validate/compute parameters referenced by the
    # expectation configuration builder in the default profiler rule.
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        column_min_range_estimator_parameter_builder_config,
        column_max_range_estimator_parameter_builder_config,
    ]

    # Default Rule-Based Profiler configuration used when "auto" mode is
    # engaged: estimates column min/max ranges and binds this expectation's
    # min_value to the min-estimator's lower endpoint and max_value to the
    # max-estimator's upper endpoint.
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_values_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_values_to_be_between_rule": {
                "variables": {
                    "mostly": 1.0,
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": None,
                        "upper_bound": None,
                    },
                    "round_decimals": 1,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_values_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        # Lower endpoint of the estimated column-min range.
                        "min_value": f"{PARAMETER_KEY}{column_min_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        # Upper endpoint of the estimated column-max range.
                        "max_value": f"{PARAMETER_KEY}{column_max_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": {
                                "column_min_range_estimator": f"{PARAMETER_KEY}{column_min_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                                "column_max_range_estimator": f"{PARAMETER_KEY}{column_max_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                            },
                        },
                    },
                ],
            },
        },
    )

    # Defaults applied for any success/runtime kwarg the user omits.
    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,  # we expect this to be explicitly set whenever a row_condition is passed
        "mostly": 1,
        "min_value": None,
        "max_value": None,
        "strict_min": False,
        "strict_max": False,
        "parse_strings_as_datetimes": False,
        "allow_cross_type_comparisons": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "meta": None,
        "auto": False,
        "profiler_config": default_profiler_config,
    }

    # Positional-argument order accepted by the expectation's factory method.
    args_keys = (
        "column",
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
    )

    def validate_configuration(
            self, configuration: Optional[ExpectationConfiguration]) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (OPTIONAL[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation
        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """
        # Setting up a configuration
        super().validate_configuration(configuration)
        min_val = configuration.kwargs.get("min_value")
        max_val = configuration.kwargs.get("max_value")
        # Raise the configuration error callers are documented to catch.  A
        # bare `assert` here would be stripped under `python -O` and would
        # surface as AssertionError rather than
        # InvalidExpectationConfigurationError; this mirrors the pattern used
        # by other expectations in this module (e.g.
        # ExpectTableColumnsToMatchSet.validate_configuration).
        if min_val is None and max_val is None:
            raise InvalidExpectationConfigurationError(
                "min_value and max_value cannot both be None")
        # Shared min/max sanity checks (types, ordering) from the base class.
        self.validate_metric_value_between_configuration(
            configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build the (template, params-with-schema, styling) triple for atomic prescriptive rendering."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        # Normalize kwargs so every rendered key is present (missing -> None).
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        # JSON-schema-annotated copy of the params, as required by the atomic renderer API.
        params_with_json_schema = {
            "column": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("column")
            },
            "min_value": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("max_value"),
            },
            "mostly": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("mostly")
            },
            "mostly_pct": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {
                    "type": "boolean"
                },
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {
                    "type": "boolean"
                },
                "value": params.get("strict_max"),
            },
        }
        template_str = ""
        # Phrase the sentence according to which bounds are present.
        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str += "may have any numerical value."
        else:
            # "at least"/"greater than" etc., depending on strict_min/strict_max.
            at_least_str, at_most_str = handle_strict_min_max(params)
            mostly_str = ""
            if params["mostly"] is not None and params["mostly"] < 1.0:
                # Render the mostly fraction as a percentage string.
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True)
                mostly_str = ", at least $mostly_pct % of the time"
            if params["min_value"] is not None and params[
                    "max_value"] is not None:
                template_str += f"values must be {at_least_str} $min_value and {at_most_str} $max_value{mostly_str}."
            elif params["min_value"] is None:
                template_str += f"values must be {at_most_str} $max_value{mostly_str}."
            elif params["max_value"] is None:
                template_str += f"values must be {at_least_str} $min_value{mostly_str}."
        if include_column_name:
            template_str = f"$column {template_str}"
        # Prefix with the row condition clause ("if <cond>, then ..."), if any.
        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True)
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)
        return (template_str, params_with_json_schema, styling)

    # NOTE: This method is a pretty good example of good usage of `params`.
    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation as a human-readable string-template content block."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        # Normalize kwargs so every rendered key is present (missing -> None).
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        template_str = ""
        # Phrase the sentence according to which bounds are present.
        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str += "may have any numerical value."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            mostly_str = ""
            if params["mostly"] is not None and params["mostly"] < 1.0:
                # Render the mostly fraction as a percentage string.
                params["mostly_pct"] = num_to_str(params["mostly"] * 100,
                                                  precision=15,
                                                  no_scientific=True)
                mostly_str = ", at least $mostly_pct % of the time"
            if params["min_value"] is not None and params[
                    "max_value"] is not None:
                template_str += f"values must be {at_least_str} $min_value and {at_most_str} $max_value{mostly_str}."
            elif params["min_value"] is None:
                template_str += f"values must be {at_most_str} $max_value{mostly_str}."
            elif params["max_value"] is None:
                template_str += f"values must be {at_least_str} $min_value{mostly_str}."
        if include_column_name:
            template_str = f"$column {template_str}"
        # Prefix with the row condition clause ("if <cond>, then ..."), if any.
        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)
        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                })
        ]
class ExpectTableColumnsToMatchSet(TableExpectation): """Expect the columns to match an *unordered* set. expect_table_columns_to_match_set is a :func:`expectation \ <great_expectations.validator.validator.Validator.expectation>`, not a ``column_map_expectation`` or ``column_aggregate_expectation``. Args: column_set (list of str): \ The column names, in any order. exact_match (boolean): \ Whether the list of columns must exactly match the observed columns. Other Parameters: result_format (str or None): \ Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`. For more detail, see :ref:`result_format <result_format>`. include_config (boolean): \ If True, then include the expectation config as part of the result object. \ For more detail, see :ref:`include_config`. catch_exceptions (boolean or None): \ If True, then catch exceptions and include them as part of the result object. \ For more detail, see :ref:`catch_exceptions`. meta (dict or None): \ A JSON-serializable dictionary (nesting allowed) that will be included in the output without \ modification. For more detail, see :ref:`meta`. Returns: An ExpectationSuiteValidationResult Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`. 
""" library_metadata = { "maturity": "production", "tags": ["core expectation", "table expectation"], "contributors": [ "@great_expectations", ], "requirements": [], "has_full_test_suite": True, "manually_reviewed_code": True, } metric_dependencies = ("table.columns",) success_keys = ( "column_set", "exact_match", "auto", "profiler_config", ) mean_table_columns_set_match_multi_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig( module_name="great_expectations.rule_based_profiler.parameter_builder", class_name="MeanTableColumnsSetMatchMultiBatchParameterBuilder", name="column_names_set_estimator", metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME, metric_value_kwargs=None, evaluation_parameter_builder_configs=None, ) validation_parameter_builder_configs: List[ParameterBuilderConfig] = [ mean_table_columns_set_match_multi_batch_parameter_builder_config, ] default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig( name="expect_table_columns_to_match_set", # Convention: use "expectation_type" as profiler name. 
config_version=1.0, variables={}, rules={ "expect_table_columns_to_match_set": { "variables": { "exact_match": None, "success_ratio": 1.0, }, "domain_builder": { "class_name": "TableDomainBuilder", "module_name": "great_expectations.rule_based_profiler.domain_builder", }, "expectation_configuration_builders": [ { "expectation_type": "expect_table_columns_to_match_set", "class_name": "DefaultExpectationConfigurationBuilder", "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder", "validation_parameter_builder_configs": validation_parameter_builder_configs, "condition": f"{PARAMETER_KEY}{mean_table_columns_set_match_multi_batch_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}success_ratio >= {VARIABLES_KEY}success_ratio", "column_set": f"{PARAMETER_KEY}{mean_table_columns_set_match_multi_batch_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}", "exact_match": f"{VARIABLES_KEY}exact_match", "meta": { "profiler_details": f"{PARAMETER_KEY}{mean_table_columns_set_match_multi_batch_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}", }, }, ], }, }, ) default_kwarg_values = { "column_set": None, "exact_match": True, "result_format": "BASIC", "include_config": True, "catch_exceptions": False, "auto": False, "profiler_config": default_profiler_config, } args_keys = ( "column_set", "exact_match", ) def validate_configuration( self, configuration: Optional[ExpectationConfiguration] ) -> None: """ Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that necessary configuration arguments have been provided for the validation of the expectation. 
Args: configuration (OPTIONAL[ExpectationConfiguration]): \ An optional Expectation Configuration entry that will be used to configure the expectation Returns: None. Raises InvalidExpectationConfigurationError if the config is not validated successfully """ # Setting up a configuration super().validate_configuration(configuration) # Ensuring that a proper value has been provided try: assert "column_set" in configuration.kwargs, "column_set is required" assert ( isinstance(configuration.kwargs["column_set"], (list, set, dict)) or configuration.kwargs["column_set"] is None ), "column_set must be a list, set, or None" if isinstance(configuration.kwargs["column_set"], dict): assert ( "$PARAMETER" in configuration.kwargs["column_set"] ), 'Evaluation Parameter dict for column_set kwarg must have "$PARAMETER" key.' except AssertionError as e: raise InvalidExpectationConfigurationError(str(e)) @classmethod def _atomic_prescriptive_template( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get("include_column_name", True) include_column_name = ( include_column_name if include_column_name is not None else True ) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration.kwargs, ["column_set", "exact_match"] ) if params["column_set"] is None: template_str = "Must specify a set or list of columns." 
else: # standardize order of the set for output params["column_list"] = list(params["column_set"]) column_list_template_str = ", ".join( [f"$column_list_{idx}" for idx in range(len(params["column_list"]))] ) exact_match_str = "exactly" if params["exact_match"] is True else "at least" template_str = f"Must have {exact_match_str} these columns (in any order): {column_list_template_str}" for idx in range(len(params["column_list"])): params[f"column_list_{str(idx)}"] = params["column_list"][idx] params_with_json_schema = { "column_list": { "schema": {"type": "array"}, "value": params.get("column_list"), }, "exact_match": { "schema": {"type": "boolean"}, "value": params.get("exact_match"), }, } return (template_str, params_with_json_schema, styling) @classmethod @renderer(renderer_type="renderer.prescriptive") @render_evaluation_parameter_string def _prescriptive_renderer( cls, configuration=None, result=None, language=None, runtime_configuration=None, **kwargs, ): runtime_configuration = runtime_configuration or {} include_column_name = runtime_configuration.get("include_column_name", True) include_column_name = ( include_column_name if include_column_name is not None else True ) styling = runtime_configuration.get("styling") params = substitute_none_for_missing( configuration.kwargs, ["column_set", "exact_match"] ) if params["column_set"] is None: template_str = "Must specify a set or list of columns." 
else: # standardize order of the set for output params["column_list"] = list(params["column_set"]) column_list_template_str = ", ".join( [f"$column_list_{idx}" for idx in range(len(params["column_list"]))] ) exact_match_str = "exactly" if params["exact_match"] is True else "at least" template_str = f"Must have {exact_match_str} these columns (in any order): {column_list_template_str}" for idx in range(len(params["column_list"])): params[f"column_list_{str(idx)}"] = params["column_list"][idx] return [ RenderedStringTemplateContent( **{ "content_block_type": "string_template", "string_template": { "template": template_str, "params": params, "styling": styling, }, } ) ] def _validate( self, configuration: ExpectationConfiguration, metrics: Dict, runtime_configuration: dict = None, execution_engine: ExecutionEngine = None, ): # Obtaining columns and ordered list for sake of comparison expected_column_set = self.get_success_kwargs(configuration).get("column_set") expected_column_set = ( set(expected_column_set) if expected_column_set is not None else set() ) actual_column_list = metrics.get("table.columns") actual_column_set = set(actual_column_list) exact_match = self.get_success_kwargs(configuration).get("exact_match") if ( (expected_column_set is None) and (exact_match is not True) ) or actual_column_set == expected_column_set: return {"success": True, "result": {"observed_value": actual_column_list}} else: # Convert to lists and sort to lock order for testing and output rendering # unexpected_list contains items from the dataset columns that are not in expected_column_set unexpected_list = sorted(list(actual_column_set - expected_column_set)) # missing_list contains items from expected_column_set that are not in the dataset columns missing_list = sorted(list(expected_column_set - actual_column_set)) # observed_value contains items that are in the dataset columns observed_value = sorted(actual_column_list) mismatched = {} if len(unexpected_list) > 0: 
mismatched["unexpected"] = unexpected_list if len(missing_list) > 0: mismatched["missing"] = missing_list result = { "observed_value": observed_value, "details": {"mismatched": mismatched}, } return_success = { "success": True, "result": result, } return_failed = { "success": False, "result": result, } if exact_match: return return_failed else: # Failed if there are items in the missing list (but OK to have unexpected_list) if len(missing_list) > 0: return return_failed # Passed if there are no items in the missing list else: return return_success
class ExpectTableRowCountToBeBetween(TableExpectation):
    """Expect the number of rows to be between two values.

    expect_table_row_count_to_be_between is a :func:`expectation \
    <great_expectations.validator.validator.Validator.expectation>`, not a
    ``column_map_expectation`` or ``column_aggregate_expectation``.

    Keyword Args:
        min_value (int or None): \
            The minimum number of rows, inclusive.
        max_value (int or None): \
            The maximum number of rows, inclusive.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound, and the number of acceptable rows has \
          no minimum.
        * If max_value is None, then min_value is treated as a lower bound, and the number of acceptable rows has \
          no maximum.

    See Also:
        expect_table_row_count_to_equal
    """

    # Metadata displayed in the public Expectation gallery.
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "table expectation"],
        "contributors": [
            "@great_expectations",
        ],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    # The single metric this expectation needs resolved before validation.
    metric_dependencies = ("table.row_count",)

    # Kwargs that determine success/failure (as opposed to formatting kwargs).
    success_keys = (
        "min_value",
        "max_value",
        "auto",
        "profiler_config",
    )

    # ParameterBuilder config used by the default profiler: estimates a
    # [min, max] range for "table.row_count" across multiple batches.  All
    # f-string values are deferred references resolved from profiler
    # "variables" at profiler run time (not at class-definition time).
    table_row_count_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="table_row_count_range_estimator",
        metric_name="table.row_count",
        metric_domain_kwargs=None,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples",
        bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        table_row_count_range_estimator_parameter_builder_config,
    ]
    # Default Rule-Based Profiler configuration used when "auto=True":
    # a single rule over the table domain whose emitted expectation's
    # min_value/max_value are wired (via fully-qualified parameter names)
    # to the range estimator declared above.
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_table_row_count_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_table_row_count_to_be_between_rule": {
                "variables": {
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "num_bootstrap_samples": 9999,
                    "bootstrap_random_seed": None,
                    # Row counts cannot be negative, hence lower_bound of 0.
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 0,
                },
                "domain_builder": {
                    "class_name": "TableDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_table_row_count_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        # "[0]"/"[1]" index into the estimator's two-element [min, max] value.
                        "min_value": f"{PARAMETER_KEY}{table_row_count_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{table_row_count_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{table_row_count_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    }
                ],
            },
        },
    )

    # Defaults applied when the user omits a kwarg.
    default_kwarg_values = {
        "min_value": None,
        "max_value": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "meta": None,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    # Positional-argument ordering for configuration construction.
    args_keys = (
        "min_value",
        "max_value",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (OPTIONAL[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation
        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """
        # Setting up a configuration
        super().validate_configuration(configuration)
        # Delegates min/max kwarg consistency checks to the shared helper on the base class.
        self.validate_metric_value_between_configuration(configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build the (template string, params-with-JSON-schema, styling) triple for atomic prescriptive rendering."""
        runtime_configuration = runtime_configuration or {}
        # NOTE(review): include_column_name is computed but not referenced below —
        # presumably kept for renderer-signature symmetry; confirm before removing.
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        # format params
        params_with_json_schema = {
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        # Choose phrasing based on which bounds are present; handle_strict_min_max
        # supplies "at least"/"greater than"-style wording for strict bounds.
        if params["min_value"] is None and params["max_value"] is None:
            template_str = "May have any number of rows."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params["max_value"] is not None:
                template_str = f"Must have {at_least_str} $min_value and {at_most_str} $max_value rows."
            elif params["min_value"] is None:
                template_str = f"Must have {at_most_str} $max_value rows."
            elif params["max_value"] is None:
                template_str = f"Must have {at_least_str} $min_value rows."

        # Prefix a row-condition clause ("if <condition>, then must have ...") when one is configured.
        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = (
                conditional_template_str
                + ", then "
                + template_str[0].lower()
                + template_str[1:]
            )
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation as a list of RenderedStringTemplateContent blocks (legacy prescriptive renderer)."""
        runtime_configuration = runtime_configuration or {}
        # NOTE(review): include_column_name unused here as well — mirrors _atomic_prescriptive_template.
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "min_value",
                "max_value",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        # Same phrasing logic as _atomic_prescriptive_template (kept in sync manually).
        if params["min_value"] is None and params["max_value"] is None:
            template_str = "May have any number of rows."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)
            if params["min_value"] is not None and params["max_value"] is not None:
                template_str = f"Must have {at_least_str} $min_value and {at_most_str} $max_value rows."
            elif params["min_value"] is None:
                template_str = f"Must have {at_most_str} $max_value rows."
            elif params["max_value"] is None:
                template_str = f"Must have {at_least_str} $min_value rows."

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = (
                conditional_template_str
                + ", then "
                + template_str[0].lower()
                + template_str[1:]
            )
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        """Validate by delegating the range check on "table.row_count" to the shared base-class helper."""
        return self._validate_metric_value_between(
            metric_name="table.row_count",
            configuration=configuration,
            metrics=metrics,
            runtime_configuration=runtime_configuration,
            execution_engine=execution_engine,
        )
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_check_serialized_keys_with_evaluation_parameter_builder_configs(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Serialization of MeanUnexpectedMapMetricMultiBatchParameterBuilder must expose exactly its formal property attributes."""
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # Dependency #1: total row count per batch.
    total_count_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_total_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
    )
    # Dependency #2: null count per batch.
    null_count_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_null_count",
        metric_name="column_values.nonnull.unexpected_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
    )
    dependency_configs: Optional[List[ParameterBuilderConfig]] = [
        total_count_builder_config,
        null_count_builder_config,
    ]

    builder: ParameterBuilder = MeanUnexpectedMapMetricMultiBatchParameterBuilder(
        name="my_pickup_datetime_count_values_unique_mean_unexpected_map_metric",
        map_metric_name="column_values.nonnull",
        total_count_parameter_builder_name="my_total_count",
        null_count_parameter_builder_name="my_null_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=dependency_configs,
        data_context=context,
    )

    # "evaluation_parameter_builder_configs" is not one of the "ParameterBuilder"
    # formal property attributes, yet it must still appear among the serialized keys.
    expected_keys = {
        "class_name",
        "module_name",
        "name",
        "map_metric_name",
        "total_count_parameter_builder_name",
        "null_count_parameter_builder_name",
        "metric_domain_kwargs",
        "metric_value_kwargs",
        "evaluation_parameter_builder_configs",
    }
    assert set(builder.to_json_dict().keys()) == expected_keys
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_datetime_dependencies_evaluated_mixed(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Mixed-dependency scenario: one dependency ("my_total_count") is evaluated ahead of time
    by calling its builder directly, while the other ("my_null_count") is supplied as an
    evaluation ParameterBuilderConfig and resolved by the mean-unexpected builder itself.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # Dependency #1: instantiated directly (not as a config) so it can be run up front.
    my_total_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_total_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )
    # Dependency #2: declared only as a config, to be evaluated by the dependent builder.
    my_null_count_metric_multi_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_null_count",
        metric_name="column_values.nonnull.unexpected_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
    )
    evaluation_parameter_builder_configs: Optional[List[ParameterBuilderConfig]] = [
        my_null_count_metric_multi_batch_parameter_builder_config,
    ]
    mean_unexpected_map_metric_multi_batch_parameter_builder: ParameterBuilder = (
        MeanUnexpectedMapMetricMultiBatchParameterBuilder(
            name="my_pickup_datetime_count_values_unique_mean_unexpected_map_metric",
            map_metric_name="column_values.nonnull",
            total_count_parameter_builder_name="my_total_count",
            null_count_parameter_builder_name="my_null_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=evaluation_parameter_builder_configs,
            data_context=data_context,
        )
    )

    metric_domain_kwargs: dict = {"column": "pickup_datetime"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )

    variables: Optional[ParameterContainer] = None

    # Shared parameter store, keyed by domain id, that both builders write into / read from.
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    # Order matters: "my_total_count" must be populated in "parameters" before the
    # mean-unexpected builder runs, since the latter only resolves "my_null_count" itself.
    my_total_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    mean_unexpected_map_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    # Expected mean unexpected-value fraction for the deterministic "bobby" fixture data.
    expected_parameter_value: float = 3.89e-3

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=mean_unexpected_map_metric_multi_batch_parameter_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # Loosened absolute tolerance (half of the module-level ATOL) for this float comparison.
    rtol: float = RTOL
    atol: float = 5.0e-1 * ATOL

    np.testing.assert_allclose(
        actual=parameter_node.value,
        desired=expected_parameter_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {parameter_node.value} differs from expected value of {expected_parameter_value} by more than {atol + rtol * abs(parameter_node.value)} tolerance.",
    )
def _build_table_rule() -> Rule:
    """
    This method builds "Rule" object focused on emitting "ExpectationConfiguration" objects for table "Domain" type.

    Returns:
        Rule named "table_rule" that emits "expect_table_row_count_to_be_between" configurations
        whose min/max bounds come from a multi-batch row-count range estimator.
    """
    # Step-1: Instantiate "TableDomainBuilder" object.
    table_domain_builder: TableDomainBuilder = TableDomainBuilder(
        data_context=None,
    )

    # Step-2: Declare "ParameterBuilder" for every metric of interest.
    table_row_count_metric_multi_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_table_row_count_metric_multi_batch_parameter_builder(
        json_serialize=True
    )

    # Step-3: Declare "ParameterBuilder" for every "validation" need in "ExpectationConfigurationBuilder" objects.
    table_row_count_range_parameter_builder_for_validations: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.build_numeric_metric_range_multi_batch_parameter_builder(
        metric_name="table.row_count",
        metric_value_kwargs=None,
        json_serialize=True,
    )

    validation_parameter_builder_configs: Optional[List[ParameterBuilderConfig]]

    # Step-4: Pass "validation" "ParameterBuilderConfig" objects to every "DefaultExpectationConfigurationBuilder", responsible for emitting "ExpectationConfiguration" (with specified "expectation_type").
    # The builder instance is round-tripped through to_json_dict() to obtain its config form.
    validation_parameter_builder_configs = [
        ParameterBuilderConfig(
            **table_row_count_range_parameter_builder_for_validations.to_json_dict(),
        ),
    ]
    # min_value/max_value index "[0]"/"[1]" of the estimator's [min, max] value; resolved at profiler run time.
    expect_table_row_count_to_be_between_expectation_configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_table_row_count_to_be_between",
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        min_value=f"{table_row_count_range_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
        max_value=f"{table_row_count_range_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
        meta={
            "profiler_details": f"{table_row_count_range_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
        },
    )

    # Step-5: Instantiate and return "Rule" object, comprised of "variables", "domain_builder", "parameter_builders", and "expectation_configuration_builders" components.
    variables: dict = {
        "false_positive_rate": 0.05,
        "quantile_statistic_interpolation_method": "auto",
        "estimator": "bootstrap",
        "n_resamples": 9999,
        "random_seed": None,
        "include_estimator_samples_histogram_in_details": False,
        # Row counts are non-negative, hence lower_bound of 0.
        "truncate_values": {
            "lower_bound": 0,
            "upper_bound": None,
        },
        "round_decimals": 0,
    }
    parameter_builders: List[ParameterBuilder] = [
        table_row_count_metric_multi_batch_parameter_builder_for_metrics,
    ]
    expectation_configuration_builders: List[ExpectationConfigurationBuilder] = [
        expect_table_row_count_to_be_between_expectation_configuration_builder,
    ]
    rule: Rule = Rule(
        name="table_rule",
        variables=variables,
        domain_builder=table_domain_builder,
        parameter_builders=parameter_builders,
        expectation_configuration_builders=expectation_configuration_builders,
    )
    return rule
def _build_categorical_columns_rule() -> Rule:
    """
    This method builds "Rule" object focused on emitting "ExpectationConfiguration" objects for categorical columns.

    Returns:
        Rule named "categorical_columns_rule" that emits
        "expect_column_unique_value_count_to_be_between" configurations for low-cardinality columns,
        with min/max bounds estimated from the multi-batch distinct-values-count range.
    """
    # Step-1: Instantiate "CategoricalColumnDomainBuilder" for selecting columns containing "FEW" discrete values.
    categorical_column_type_domain_builder: CategoricalColumnDomainBuilder = (
        CategoricalColumnDomainBuilder(
            include_column_names=None,
            exclude_column_names=None,
            include_column_name_suffixes=None,
            exclude_column_name_suffixes=None,
            semantic_type_filter_module_name=None,
            semantic_type_filter_class_name=None,
            include_semantic_types=None,
            exclude_semantic_types=None,
            allowed_semantic_types_passthrough=None,
            # Only the cardinality filter is active; all other selection filters are disabled (None).
            cardinality_limit_mode=CardinalityLimitMode.REL_100,
            max_unique_values=None,
            max_proportion_unique=None,
            data_context=None,
        )
    )

    # Step-2: Declare "ParameterBuilder" for every metric of interest.
    column_distinct_values_count_metric_multi_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_column_distinct_values_count_metric_multi_batch_parameter_builder(
        json_serialize=True
    )

    # Step-3: Declare "ParameterBuilder" for every "validation" need in "ExpectationConfigurationBuilder" objects.
    column_distinct_values_count_range_parameter_builder_for_validations: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.build_numeric_metric_range_multi_batch_parameter_builder(
        metric_name="column.distinct_values.count",
        metric_value_kwargs=None,
        json_serialize=True,
    )

    validation_parameter_builder_configs: Optional[List[ParameterBuilderConfig]]

    # Step-4: Pass "validation" "ParameterBuilderConfig" objects to every "DefaultExpectationConfigurationBuilder", responsible for emitting "ExpectationConfiguration" (with specified "expectation_type").
    # The builder instance is round-tripped through to_json_dict() to obtain its config form.
    validation_parameter_builder_configs = [
        ParameterBuilderConfig(
            **column_distinct_values_count_range_parameter_builder_for_validations.to_json_dict(),
        ),
    ]
    # "column" resolves to the domain's column name; min_value/max_value index "[0]"/"[1]"
    # of the estimator's [min, max] value; strict_min/strict_max come from rule variables.
    expect_column_unique_value_count_to_be_between_expectation_configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_unique_value_count_to_be_between",
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        column=f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
        min_value=f"{column_distinct_values_count_range_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
        max_value=f"{column_distinct_values_count_range_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
        strict_min=f"{VARIABLES_KEY}strict_min",
        strict_max=f"{VARIABLES_KEY}strict_max",
        meta={
            "profiler_details": f"{column_distinct_values_count_range_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
        },
    )

    # Step-5: Instantiate and return "Rule" object, comprised of "variables", "domain_builder", "parameter_builders", and "expectation_configuration_builders" components.
    variables: dict = {
        "mostly": 1.0,
        "strict_min": False,
        "strict_max": False,
        "false_positive_rate": 0.05,
        "quantile_statistic_interpolation_method": "auto",
        "estimator": "bootstrap",
        "n_resamples": 9999,
        "random_seed": None,
        "include_estimator_samples_histogram_in_details": False,
        # Counts are non-negative, hence lower_bound of 0.0.
        "truncate_values": {
            "lower_bound": 0.0,
            "upper_bound": None,
        },
        "round_decimals": 1,
    }
    parameter_builders: List[ParameterBuilder] = [
        column_distinct_values_count_metric_multi_batch_parameter_builder_for_metrics,
    ]
    expectation_configuration_builders: List[ExpectationConfigurationBuilder] = [
        expect_column_unique_value_count_to_be_between_expectation_configuration_builder,
    ]
    rule: Rule = Rule(
        name="categorical_columns_rule",
        variables=variables,
        domain_builder=categorical_column_type_domain_builder,
        parameter_builders=parameter_builders,
        expectation_configuration_builders=expectation_configuration_builders,
    )
    return rule
    def __init__(
        self,
        name: str,
        bucketize_data: Union[str, bool] = True,
        evaluation_parameter_builder_configs: Optional[
            List[ParameterBuilderConfig]
        ] = None,
        data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    ) -> None:
        """
        Args:
            name: the name of this parameter -- this is user-specified parameter name (from configuration);
            it is not the fully-qualified parameter name; a fully-qualified parameter name must start with "$parameter."
            and may contain one or more subsequent parts (e.g., "$parameter.<my_param_from_config>.<metric_name>").
            bucketize_data: If True (default), then data is continuous (non-categorical); hence, must bucketize it.
            evaluation_parameter_builder_configs: ParameterBuilder configurations, executing and making whose respective
            ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite.
            These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder".
            data_context: BaseDataContext associated with this ParameterBuilder
        """
        # Dependency #1: single-batch "column.partition" metric (histogram bin edges;
        # "bins"/"allow_relative_error" are metric_value_kwargs passed through to the metric).
        self._column_partition_metric_single_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
            module_name="great_expectations.rule_based_profiler.parameter_builder",
            class_name="MetricSingleBatchParameterBuilder",
            name="column_partition_metric_single_batch_parameter_builder",
            metric_name="column.partition",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs={
                "bins": "auto",
                "allow_relative_error": False,
            },
            enforce_numeric_metric=False,
            replace_nan_with_zero=False,
            reduce_scalar_metric=False,
            evaluation_parameter_builder_configs=None,
        )
        # Dependency #2: single-batch "column.value_counts" metric, sorted by value.
        self._column_value_counts_metric_single_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
            module_name="great_expectations.rule_based_profiler.parameter_builder",
            class_name="MetricSingleBatchParameterBuilder",
            name="column_value_counts_metric_single_batch_parameter_builder",
            metric_name="column.value_counts",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs={
                "sort": "value",
            },
            enforce_numeric_metric=False,
            replace_nan_with_zero=False,
            reduce_scalar_metric=False,
            evaluation_parameter_builder_configs=None,
        )
        # Dependency #3: single-batch non-null count for the column.
        self._column_values_nonnull_count_metric_single_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
            module_name="great_expectations.rule_based_profiler.parameter_builder",
            class_name="MetricSingleBatchParameterBuilder",
            name="column_values_nonnull_count_metric_single_batch_parameter_builder",
            metric_name="column_values.nonnull.count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            enforce_numeric_metric=False,
            replace_nan_with_zero=False,
            reduce_scalar_metric=False,
            evaluation_parameter_builder_configs=None,
        )

        # When the caller supplies no dependency configs, default to all three declared above.
        if evaluation_parameter_builder_configs is None:
            evaluation_parameter_builder_configs = [
                self._column_partition_metric_single_batch_parameter_builder_config,
                self._column_value_counts_metric_single_batch_parameter_builder_config,
                self._column_values_nonnull_count_metric_single_batch_parameter_builder_config,
            ]

        # metric_name=None: this builder computes no metric of its own -- it aggregates
        # the outputs of its evaluation dependency builders.
        super().__init__(
            name=name,
            metric_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            enforce_numeric_metric=False,
            replace_nan_with_zero=False,
            reduce_scalar_metric=False,
            evaluation_parameter_builder_configs=evaluation_parameter_builder_configs,
            data_context=data_context,
        )

        self._bucketize_data = bucketize_data
class ExpectColumnValueLengthsToBeBetween(ColumnMapExpectation):
    """Expect column entries to be strings with length between a minimum value and a maximum value (inclusive).

    This expectation only works for string-type values. Invoking it on ints or floats will raise a TypeError.

    expect_column_value_lengths_to_be_between is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        min_value (int or None): \
            The minimum value for a column entry length.
        max_value (int or None): \
            The maximum value for a column entry length.

    Keyword Args:
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound, and the number of acceptable rows has \
          no minimum.
        * If max_value is None, then min_value is treated as a lower bound, and the number of acceptable rows has \
          no maximum.

    See Also:
        :func:`expect_column_value_lengths_to_equal \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_value_lengths_to_equal>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    map_metric = "column_values.value_length.between"
    # Keyword arguments that determine expectation success.
    success_keys = (
        "min_value",
        "max_value",
        "strict_min",
        "strict_max",
        "mostly",
        "auto",
        "profiler_config",
    )

    # Multi-batch range estimator for the minimum observed value length (used by the
    # default rule-based profiler below to auto-derive "min_value").
    column_min_length_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_min_length_range_estimator",
        metric_name="column_values.length.min",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    # Multi-batch range estimator for the maximum observed value length (auto-derives
    # "max_value" in the default profiler rule).
    column_max_length_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_max_length_range_estimator",
        metric_name="column_values.length.max",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        column_min_length_range_estimator_parameter_builder_config,
        column_max_length_range_estimator_parameter_builder_config,
    ]
    # Default profiler invoked when the expectation is run with auto=True.
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_value_lengths_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_value_lengths_to_be_between_rule": {
                "variables": {
                    "mostly": 1.0,
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 0,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_value_lengths_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        # Lower bound of the min-length range / upper bound of the max-length range.
                        "min_value": f"{PARAMETER_KEY}{column_min_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{column_max_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": {
                                "column_min_range_estimator": f"{PARAMETER_KEY}{column_min_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                                "column_max_range_estimator": f"{PARAMETER_KEY}{column_max_length_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                            },
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,
        "min_value": None,
        "max_value": None,
        "strict_min": None,
        "strict_max": None,
        "mostly": 1,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "min_value",
        "max_value",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """Validate that min_value/max_value are usable: not both None, each either an
        integer-valued number or an Evaluation Parameter dict carrying "$PARAMETER".

        Raises:
            InvalidExpectationConfigurationError: if any of the above checks fail.
        """
        super().validate_configuration(configuration)
        if configuration is None:
            configuration = self.configuration
        try:
            assert (
                configuration.kwargs.get("min_value") is not None
                or configuration.kwargs.get("max_value") is not None
            ), "min_value and max_value cannot both be None"
            if configuration.kwargs.get("min_value"):
                assert (
                    isinstance(configuration.kwargs["min_value"], dict)
                    or float(configuration.kwargs.get("min_value")).is_integer()
                ), "min_value and max_value must be integers"
                if isinstance(configuration.kwargs.get("min_value"), dict):
                    assert "$PARAMETER" in configuration.kwargs.get(
                        "min_value"
                    ), 'Evaluation Parameter dict for min_value kwarg must have "$PARAMETER" key.'
            if configuration.kwargs.get("max_value"):
                assert (
                    isinstance(configuration.kwargs["max_value"], dict)
                    or float(configuration.kwargs.get("max_value")).is_integer()
                ), "min_value and max_value must be integers"
                if isinstance(configuration.kwargs.get("max_value"), dict):
                    assert "$PARAMETER" in configuration.kwargs.get(
                        "max_value"
                    ), 'Evaluation Parameter dict for max_value kwarg must have "$PARAMETER" key.'
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build the (template_str, params_with_json_schema, styling) triple used by
        atomic prescriptive rendering."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "mostly_pct": {
                "schema": {"type": "string"},
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "values may have any length."
        else:
            # at_least_str/at_most_str reflect strict_min/strict_max phrasing.
            at_least_str, at_most_str = handle_strict_min_max(params)

            if params["mostly"] is not None and params["mostly"] < 1.0:
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                if params["min_value"] is not None and params["max_value"] is not None:
                    template_str = f"values must be {at_least_str} $min_value and {at_most_str} $max_value characters long, at least $mostly_pct % of the time."
                elif params["min_value"] is None:
                    template_str = f"values must be {at_most_str} $max_value characters long, at least $mostly_pct % of the time."
                elif params["max_value"] is None:
                    template_str = f"values must be {at_least_str} $min_value characters long, at least $mostly_pct % of the time."
            else:
                if params["min_value"] is not None and params["max_value"] is not None:
                    template_str = f"values must always be {at_least_str} $min_value and {at_most_str} $max_value characters long."
                elif params["min_value"] is None:
                    template_str = f"values must always be {at_most_str} $max_value characters long."
                elif params["max_value"] is None:
                    template_str = f"values must always be {at_least_str} $min_value characters long."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration: ExpectationConfiguration = None,
        result: ExpectationValidationResult = None,
        language: str = None,
        runtime_configuration: dict = None,
        **kwargs,
    ) -> List[
        Union[
            dict,
            str,
            RenderedStringTemplateContent,
            RenderedTableContent,
            RenderedBulletListContent,
            RenderedGraphContent,
            Any,
        ]
    ]:
        """Render the expectation as a single string-template content block for Data Docs."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "values may have any length."
        else:
            at_least_str, at_most_str = handle_strict_min_max(params)

            if params["mostly"] is not None and params["mostly"] < 1.0:
                params["mostly_pct"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                if params["min_value"] is not None and params["max_value"] is not None:
                    template_str = f"values must be {at_least_str} $min_value and {at_most_str} $max_value characters long, at least $mostly_pct % of the time."
                elif params["min_value"] is None:
                    template_str = f"values must be {at_most_str} $max_value characters long, at least $mostly_pct % of the time."
                elif params["max_value"] is None:
                    template_str = f"values must be {at_least_str} $min_value characters long, at least $mostly_pct % of the time."
            else:
                if params["min_value"] is not None and params["max_value"] is not None:
                    template_str = f"values must always be {at_least_str} $min_value and {at_most_str} $max_value characters long."
                elif params["min_value"] is None:
                    template_str = f"values must always be {at_most_str} $max_value characters long."
                elif params["max_value"] is None:
                    template_str = f"values must always be {at_least_str} $min_value characters long."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]
class ExpectColumnQuantileValuesToBeBetween(ColumnExpectation):
    # noinspection PyUnresolvedReferences
    """Expect specific provided column quantiles to be between provided minimum and maximum values.

    ``quantile_ranges`` must be a dictionary with two keys:

        * ``quantiles``: (list of float) increasing ordered list of desired quantile values

        * ``value_ranges``: (list of lists): Each element in this list consists of a list with two values, a lower \
          and upper bound (inclusive) for the corresponding quantile. These values must be [min, max] ordered.

    For each provided range:

        * min_value and max_value are both inclusive.
        * If min_value is None, then max_value is treated as an upper bound only
        * If max_value is None, then min_value is treated as a lower bound only

    The length of the quantiles list and quantile_values list must be equal.

    For example:
    ::

        # my_df.my_col = [1,2,2,3,3,3,4]
        >>> my_df.expect_column_quantile_values_to_be_between(
            "my_col",
            {
                "quantiles": [0., 0.333, 0.6667, 1.],
                "value_ranges": [[0,1], [2,3], [3,4], [4,5]]
            }
        )
        {
            "success": True,
            "result": {
                "observed_value": {
                    "quantiles": [0., 0.333, 0.6667, 1.],
                    "values": [1, 2, 3, 4],
                }
                "element_count": 7,
                "missing_count": 0,
                "missing_percent": 0.0,
                "details": {
                    "success_details": [true, true, true, true]
                }
            }
        }

    `expect_column_quantile_values_to_be_between` can be computationally intensive for large datasets.

    expect_column_quantile_values_to_be_between is a \
    :func:`column_aggregate_expectation <great_expectations.execution_engine.MetaExecutionEngine.column_aggregate_expectation>`.

    Args:
        column (str): \
            The column name.
        quantile_ranges (dictionary): \
            Quantiles and associated value ranges for the column. See above for details.
        allow_relative_error (boolean or string): \
            Whether to allow relative error in quantile computations on backends that support or require it.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    Notes:
        These fields in the result object are customized for this expectation:
        ::

            details.success_details

    See Also:
        :func:`expect_column_min_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_min_to_be_between>`

        :func:`expect_column_max_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_max_to_be_between>`

        :func:`expect_column_median_to_be_between \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine.expect_column_median_to_be_between>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column aggregate expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
    }

    metric_dependencies = ("column.quantile_values",)
    # Keyword arguments that determine expectation success.
    success_keys = (
        "quantile_ranges",
        "allow_relative_error",
        "auto",
        "profiler_config",
    )

    # Multi-batch range estimator that auto-derives value_ranges for the requested
    # quantiles in the default rule-based profiler below.
    quantile_value_ranges_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="quantile_value_ranges_estimator",
        metric_name="column.quantile_values",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs={
            "quantiles": f"{VARIABLES_KEY}quantiles",
            "allow_relative_error": f"{VARIABLES_KEY}allow_relative_error",
        },
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        num_bootstrap_samples=f"{VARIABLES_KEY}num_bootstrap_samples",
        bootstrap_random_seed=f"{VARIABLES_KEY}bootstrap_random_seed",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        quantile_value_ranges_estimator_parameter_builder_config,
    ]
    # Default profiler invoked when the expectation is run with auto=True.
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_quantile_values_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_quantile_values_to_be_between_rule": {
                "variables": {
                    "quantiles": [
                        0.25,
                        0.5,
                        0.75,
                    ],
                    "allow_relative_error": "linear",
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "num_bootstrap_samples": 9999,
                    "bootstrap_random_seed": None,
                    "truncate_values": {
                        "lower_bound": None,
                        "upper_bound": None,
                    },
                    "round_decimals": 1,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_quantile_values_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "quantile_ranges": {
                            "quantiles": f"{VARIABLES_KEY}quantiles",
                            "value_ranges": f"{PARAMETER_KEY}{quantile_value_ranges_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}",
                        },
                        "allow_relative_error": f"{VARIABLES_KEY}allow_relative_error",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{quantile_value_ranges_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    }
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,
        "quantile_ranges": None,
        "result_format": "BASIC",
        "allow_relative_error": False,
        "include_config": True,
        "catch_exceptions": False,
        "meta": None,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "quantile_ranges",
        "allow_relative_error",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """Validate that quantile_ranges is present, is a dict, contains [min, max]
        ordered pairs, and has as many value_ranges as quantiles.

        Raises:
            InvalidExpectationConfigurationError: if structural checks fail.
            ValueError: if quantiles and value_ranges differ in length.
        """
        super().validate_configuration(configuration)
        try:
            assert (
                "quantile_ranges" in configuration.kwargs
            ), "quantile_ranges must be provided"
            assert isinstance(
                configuration.kwargs["quantile_ranges"], dict
            ), "quantile_ranges should be a dictionary"
            # A pair containing None is always acceptable; otherwise it must be sorted.
            assert all(
                [
                    True if None in x or x == sorted(x) else False
                    for x in configuration.kwargs["quantile_ranges"]["value_ranges"]
                ]
            ), "quantile_ranges must consist of ordered pairs"
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

        # Ensuring actual quantiles and their value ranges match up
        quantile_ranges = configuration.kwargs["quantile_ranges"]
        quantiles = quantile_ranges["quantiles"]
        quantile_value_ranges = quantile_ranges["value_ranges"]
        if len(quantiles) != len(quantile_value_ranges):
            raise ValueError(
                "quantile_values and quantiles must have the same number of elements"
            )

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build header template/params plus table header and rows for atomic
        prescriptive rendering."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration["kwargs"],
            ["column", "quantile_ranges", "row_condition", "condition_parser"],
        )
        header_params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
        }

        header_template_str = "quantiles must be within the following value ranges."

        if include_column_name:
            header_template_str = f"$column {header_template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            header_template_str = (
                conditional_template_str
                + ", then "
                + header_template_str[0].lower()
                + header_template_str[1:]
            )
            header_params_with_json_schema.update(conditional_params)

        quantile_ranges = (
            params.get("quantile_ranges") if params.get("quantile_ranges") else {}
        )
        quantiles = (
            quantile_ranges.get("quantiles") if quantile_ranges.get("quantiles") else []
        )
        value_ranges = (
            quantile_ranges.get("value_ranges")
            if quantile_ranges.get("value_ranges")
            else []
        )

        table_header_row = [
            {"schema": {"type": "string"}, "value": "Quantile"},
            {"schema": {"type": "string"}, "value": "Min Value"},
            {"schema": {"type": "string"}, "value": "Max Value"},
        ]
        table_rows = []

        # Friendly labels for common quantiles; others are formatted numerically.
        quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"}

        for quantile, value_range in zip(quantiles, value_ranges):
            quantile_string = quantile_strings.get(quantile, f"{quantile:3.2f}")
            table_rows.append(
                [
                    {
                        "value": quantile_string,
                        "schema": {"type": "string"},
                    },
                    {
                        "value": value_range[0] if value_range[0] is not None else "Any",
                        "schema": {
                            "type": "number" if value_range[0] is not None else "string"
                        },
                    },
                    {
                        "value": value_range[1] if value_range[1] is not None else "Any",
                        "schema": {
                            "type": "number" if value_range[1] is not None else "string"
                        },
                    },
                ]
            )

        return (
            header_template_str,
            header_params_with_json_schema,
            styling,
            table_header_row,
            table_rows,
        )

    @classmethod
    @renderer(renderer_type="atomic.prescriptive.summary")
    @render_evaluation_parameter_string
    def _prescriptive_summary(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """
        Rendering function that is utilized by GE Cloud Front-end
        """
        (
            header_template_str,
            header_params_with_json_schema,
            _,
            table_header_row,
            table_rows,
        ) = cls._atomic_prescriptive_template(
            configuration, result, language, runtime_configuration, **kwargs
        )
        value_obj = renderedAtomicValueSchema.load(
            {
                "header": {
                    "schema": {"type": "StringValueType"},
                    "value": {
                        "template": header_template_str,
                        "params": header_params_with_json_schema,
                    },
                },
                "header_row": table_header_row,
                "table": table_rows,
                "schema": {"type": "TableType"},
            }
        )
        rendered = RenderedAtomicContent(
            name="atomic.prescriptive.summary",
            value=value_obj,
            value_type="TableType",
        )
        return rendered

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation for Data Docs as a header string plus a table of
        quantile value ranges."""
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration["kwargs"],
            ["column", "quantile_ranges", "row_condition", "condition_parser"],
        )
        template_str = "quantiles must be within the following value ranges."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = (
                conditional_template_str
                + ", then "
                + template_str[0].lower()
                + template_str[1:]
            )
            params.update(conditional_params)

        expectation_string_obj = {
            "content_block_type": "string_template",
            "string_template": {"template": template_str, "params": params},
        }

        quantiles = params["quantile_ranges"]["quantiles"]
        value_ranges = params["quantile_ranges"]["value_ranges"]

        table_header_row = ["Quantile", "Min Value", "Max Value"]
        table_rows = []

        # Friendly labels for common quantiles; others are formatted numerically.
        quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"}

        for quantile, value_range in zip(quantiles, value_ranges):
            quantile_string = quantile_strings.get(quantile, f"{quantile:3.2f}")
            table_rows.append(
                [
                    quantile_string,
                    str(value_range[0]) if value_range[0] is not None else "Any",
                    str(value_range[1]) if value_range[1] is not None else "Any",
                ]
            )

        quantile_range_table = RenderedTableContent(
            **{
                "content_block_type": "table",
                "header_row": table_header_row,
                "table": table_rows,
                "styling": {
                    "body": {
                        "classes": [
                            "table",
                            "table-sm",
                            "table-unbordered",
                            "col-4",
                            "mt-2",
                        ],
                    },
                    "parent": {"styles": {"list-style-type": "none"}},
                },
            }
        )

        return [expectation_string_obj, quantile_range_table]

    @classmethod
    @renderer(renderer_type="renderer.diagnostic.observed_value")
    def _diagnostic_observed_value_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the observed quantile values as a table; "--" when no observed value
        is available."""
        if result.result is None or result.result.get("observed_value") is None:
            return "--"

        quantiles = result.result.get("observed_value", {}).get("quantiles", [])
        value_ranges = result.result.get("observed_value", {}).get("values", [])

        table_header_row = ["Quantile", "Value"]
        table_rows = []

        quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"}

        for idx, quantile in enumerate(quantiles):
            quantile_string = quantile_strings.get(quantile)
            table_rows.append(
                [
                    quantile_string if quantile_string else f"{quantile:3.2f}",
                    str(value_ranges[idx]),
                ]
            )

        return RenderedTableContent(
            **{
                "content_block_type": "table",
                "header_row": table_header_row,
                "table": table_rows,
                "styling": {
                    "body": {
                        "classes": ["table", "table-sm", "table-unbordered", "col-4"],
                    }
                },
            }
        )

    @classmethod
    def _atomic_diagnostic_observed_value_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build (template_string, params, table_header_row, table_rows) for the atomic
        diagnostic observed-value renderer; template_string is set only for the "--"
        (no observed value) case."""
        template_string = None
        params_with_json_schema = None
        table_header_row = None
        table_rows = None

        if result.result is None or result.result.get("observed_value") is None:
            template_string = "--"
            params_with_json_schema = {}
            return (
                template_string,
                params_with_json_schema,
                table_header_row,
                table_rows,
            )

        quantiles = result.result.get("observed_value", {}).get("quantiles", [])
        value_ranges = result.result.get("observed_value", {}).get("values", [])

        table_header_row = [
            {"schema": {"type": "string"}, "value": "Quantile"},
            {"schema": {"type": "string"}, "value": "Value"},
        ]
        table_rows = []

        quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"}
        for idx, quantile in enumerate(quantiles):
            quantile_string = quantile_strings.get(quantile)
            table_rows.append(
                [
                    {
                        "value": quantile_string
                        if quantile_string
                        else f"{quantile:3.2f}",
                        "schema": {"type": "string"},
                    },
                    {"value": value_ranges[idx], "schema": {"type": "number"}},
                ]
            )

        return template_string, params_with_json_schema, table_header_row, table_rows

    @classmethod
    @renderer(renderer_type="atomic.diagnostic.observed_value")
    def _atomic_diagnostic_observed_value(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Wrap the diagnostic observed-value template output in RenderedAtomicContent
        (string form for "--", table form otherwise)."""
        (
            template_string,
            params_with_json_schema,
            table_header_row,
            table_rows,
        ) = cls._atomic_diagnostic_observed_value_template(
            configuration, result, language, runtime_configuration, **kwargs
        )
        if template_string is not None:
            value_obj = renderedAtomicValueSchema.load(
                {
                    "template": template_string,
                    "params": {},
                    "schema": {"type": "StringValueType"},
                }
            )
            return RenderedAtomicContent(
                name="atomic.diagnostic.observed_value",
                value=value_obj,
                value_type="StringValueType",
            )
        else:
            value_obj = renderedAtomicValueSchema.load(
                {
                    "header_row": table_header_row,
                    "table": table_rows,
                    "schema": {"type": "TableType"},
                }
            )
            return RenderedAtomicContent(
                name="atomic.diagnostic.observed_value",
                value=value_obj,
                value_type="TableType",
            )

    @classmethod
    @renderer(renderer_type="renderer.descriptive.quantile_table")
    def _descriptive_quantile_table_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render a descriptive "Quantiles" table from the validation result."""
        assert result, "Must pass in result."
        table_rows = []
        quantiles = result.result["observed_value"]["quantiles"]
        quantile_ranges = result.result["observed_value"]["values"]

        quantile_strings = {0.25: "Q1", 0.75: "Q3", 0.50: "Median"}

        for idx, quantile in enumerate(quantiles):
            quantile_string = quantile_strings.get(quantile)
            table_rows.append(
                [
                    {
                        "content_block_type": "string_template",
                        "string_template": {
                            "template": quantile_string
                            if quantile_string
                            else f"{quantile:3.2f}",
                            "tooltip": {
                                "content": "expect_column_quantile_values_to_be_between \n expect_column_median_to_be_between"
                                if quantile == 0.50
                                else "expect_column_quantile_values_to_be_between"
                            },
                        },
                    },
                    quantile_ranges[idx],
                ]
            )

        return RenderedTableContent(
            **{
                "content_block_type": "table",
                "header": RenderedStringTemplateContent(
                    **{
                        "content_block_type": "string_template",
                        "string_template": {"template": "Quantiles", "tag": "h6"},
                    }
                ),
                "table": table_rows,
                "styling": {
                    "classes": ["col-3", "mt-1", "pl-1", "pr-1"],
                    "body": {
                        "classes": ["table", "table-sm", "table-unbordered"],
                    },
                },
            }
        )

    def get_validation_dependencies(
        self,
        configuration: Optional[ExpectationConfiguration] = None,
        execution_engine: Optional[ExecutionEngine] = None,
        runtime_configuration: Optional[dict] = None,
    ):
        """Inject the requested quantiles into the "column.quantile_values" metric
        configuration before validation."""
        all_dependencies = super().get_validation_dependencies(
            configuration, execution_engine, runtime_configuration
        )
        # column.quantile_values expects a "quantiles" key
        all_dependencies["metrics"]["column.quantile_values"].metric_value_kwargs[
            "quantiles"
        ] = configuration.kwargs["quantile_ranges"]["quantiles"]
        return all_dependencies

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        """Check each observed quantile value against its configured [min, max] range
        and report per-quantile success details."""
        quantile_vals = metrics.get("column.quantile_values")
        quantile_ranges = configuration.kwargs.get("quantile_ranges")
        quantiles = quantile_ranges["quantiles"]
        quantile_value_ranges = quantile_ranges["value_ranges"]

        # We explicitly allow "None" to be interpreted as +/- infinity
        comparison_quantile_ranges = [
            [
                -np.inf if lower_bound is None else lower_bound,
                np.inf if upper_bound is None else upper_bound,
            ]
            for (lower_bound, upper_bound) in quantile_value_ranges
        ]
        success_details = [
            range_[0] <= quantile_vals[idx] <= range_[1]
            for idx, range_ in enumerate(comparison_quantile_ranges)
        ]

        return {
            "success": np.all(success_details),
            "result": {
                "observed_value": {"quantiles": quantiles, "values": quantile_vals},
                "details": {"success_details": success_details},
            },
        }
class ExpectColumnValuesToBeInSet(ColumnMapExpectation):
    """Expect each column value to be in a given set.

    For example:
    ::

        # my_df.my_col = [1,2,2,3,3,3]
        >>> my_df.expect_column_values_to_be_in_set(
            "my_col",
            [2,3]
        )
        {
          "success": false
          "result": {
            "unexpected_count": 1
            "unexpected_percent": 16.66666666666666666,
            "unexpected_percent_nonmissing": 16.66666666666666666,
            "partial_unexpected_list": [
              1
            ],
          },
        }

    expect_column_values_to_be_in_set is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        value_set (set-like): \
            A set of objects used for comparison.

    Keyword Args:
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.
        parse_strings_as_datetimes (boolean or None) : If True values provided in value_set will be parsed as \
            datetimes before making comparisons.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    See Also:
        :func:`expect_column_values_to_not_be_in_set \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_values_to_not_be_in_set>`

    """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    # Metric backing this column-map expectation.
    map_metric = "column_values.in_set"
    # Positional argument order for the shorthand call signature.
    args_keys = (
        "column",
        "value_set",
    )
    # Kwargs that participate in evaluating success.
    success_keys = (
        "value_set",
        "mostly",
        "parse_strings_as_datetimes",
        "auto",
        "profiler_config",
    )

    # Parameter builder that estimates the value set across batches when the
    # expectation is run in "auto" mode.
    value_set_estimator_parameter_builder_config: ParameterBuilderConfig = (
        ParameterBuilderConfig(
            module_name=
            "great_expectations.rule_based_profiler.parameter_builder",
            class_name="ValueSetMultiBatchParameterBuilder",
            name="value_set_estimator",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
        ))
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        value_set_estimator_parameter_builder_config,
    ]
    # Default Rule-Based Profiler used to auto-generate this expectation.
    # The f-string values below are fully-qualified parameter references
    # resolved by the profiler at runtime — do not edit casually.
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name=
        "expect_column_values_to_be_in_set",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_values_to_be_in_set_rule": {
                "variables": {
                    "mostly": 1.0,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name":
                    "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type":
                        "expect_column_values_to_be_in_set",
                        "class_name":
                        "DefaultExpectationConfigurationBuilder",
                        "module_name":
                        "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs":
                        validation_parameter_builder_configs,
                        "column":
                        f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "value_set":
                        f"{PARAMETER_KEY}{value_set_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}",
                        "mostly":
                        f"{VARIABLES_KEY}mostly",
                        "meta": {
                            "profiler_details":
                            f"{PARAMETER_KEY}{value_set_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    # Defaults applied when a kwarg is absent from the configuration.
    default_kwarg_values = {
        "value_set": [],
        "parse_strings_as_datetimes": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Build the atomic prescriptive template string and JSON-schema params.

        Returns a ``(template_str, params_with_json_schema, styling)`` tuple
        describing "values must belong to this set: ..." with optional
        mostly-percentage, datetime-parsing, and row-condition clauses.
        """
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "value_set",
                "mostly",
                "parse_strings_as_datetimes",
                "row_condition",
                "condition_parser",
            ],
        )
        # Each param is paired with a JSON-schema type for the atomic renderer.
        params_with_json_schema = {
            "column": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("column")
            },
            "value_set": {
                "schema": {
                    "type": "array"
                },
                "value": params.get("value_set"),
            },
            "mostly": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("mostly")
            },
            "mostly_pct": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("mostly_pct"),
            },
            "parse_strings_as_datetimes": {
                "schema": {
                    "type": "boolean"
                },
                "value": params.get("parse_strings_as_datetimes"),
            },
            "row_condition": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("condition_parser"),
            },
        }

        if params["value_set"] is None or len(params["value_set"]) == 0:
            values_string = "[ ]"
        else:
            # Expose each set member as a $v__<i> template parameter.
            for i, v in enumerate(params["value_set"]):
                params[f"v__{str(i)}"] = v

            values_string = " ".join(
                [f"$v__{str(i)}" for i, v in enumerate(params["value_set"])])

        template_str = f"values must belong to this set: {values_string}"

        if params["mostly"] is not None and params["mostly"] < 1.0:
            params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                params["mostly"] * 100, precision=15, no_scientific=True)
            # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
            template_str += ", at least $mostly_pct % of the time."
        else:
            template_str += "."

        if params.get("parse_strings_as_datetimes"):
            template_str += " Values should be parsed as datetimes."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True)
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        params_with_json_schema = add_values_with_json_schema_from_list_in_params(
            params=params,
            params_with_json_schema=params_with_json_schema,
            param_key_with_list="value_set",
        )
        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render the expectation as a prescriptive sentence.

        Same template logic as :meth:`_atomic_prescriptive_template`, but
        without JSON schemas, returning a list with one
        :class:`RenderedStringTemplateContent`.
        """
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        # Treat an explicit None the same as the default (True).
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "value_set",
                "mostly",
                "parse_strings_as_datetimes",
                "row_condition",
                "condition_parser",
            ],
        )

        if params["value_set"] is None or len(params["value_set"]) == 0:
            values_string = "[ ]"
        else:
            # Expose each set member as a $v__<i> template parameter.
            for i, v in enumerate(params["value_set"]):
                params[f"v__{str(i)}"] = v

            values_string = " ".join(
                [f"$v__{str(i)}" for i, v in enumerate(params["value_set"])])

        template_str = f"values must belong to this set: {values_string}"

        if params["mostly"] is not None and params["mostly"] < 1.0:
            params["mostly_pct"] = num_to_str(params["mostly"] * 100,
                                              precision=15,
                                              no_scientific=True)
            # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
            template_str += ", at least $mostly_pct % of the time."
        else:
            template_str += "."

        if params.get("parse_strings_as_datetimes"):
            template_str += " Values should be parsed as datetimes."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                })
        ]

    @classmethod
    @renderer(renderer_type="renderer.descriptive.example_values_block")
    def _descriptive_example_values_block_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        """Render example unexpected values from a validation result.

        Prefers ``partial_unexpected_counts`` over
        ``partial_unexpected_list``; returns None when neither is present.
        Long values (>80 chars) use a bullet list instead of a badge list.
        """
        assert result, "Must pass in result."
        if "partial_unexpected_counts" in result.result:
            partial_unexpected_counts = result.result[
                "partial_unexpected_counts"]
            values = [str(v["value"]) for v in partial_unexpected_counts]
        elif "partial_unexpected_list" in result.result:
            values = [
                str(item) for item in result.result["partial_unexpected_list"]
            ]
        else:
            # Nothing to show — no unexpected-value samples in the result.
            return

        classes = ["col-3", "mt-1", "pl-1", "pr-1"]

        if any(len(value) > 80 for value in values):
            content_block_type = "bullet_list"
            content_block_class = RenderedBulletListContent
        else:
            content_block_type = "value_list"
            content_block_class = ValueListContent

        new_block = content_block_class(
            **{
                "content_block_type": content_block_type,
                "header": RenderedStringTemplateContent(
                    **{
                        "content_block_type": "string_template",
                        "string_template": {
                            "template": "Example Values",
                            "tooltip": {
                                "content": "expect_column_values_to_be_in_set"
                            },
                            "tag": "h6",
                        },
                    }),
                content_block_type: [{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": "$value",
                        "params": {
                            "value": value
                        },
                        "styling": {
                            "default": {
                                # Badges only make sense for the value-list
                                # presentation.
                                "classes": ["badge", "badge-info"]
                                if content_block_type == "value_list" else [],
                                "styles": {
                                    "word-break": "break-all"
                                },
                            },
                        },
                    },
                } for value in values],
                "styling": {
                    "classes": classes,
                },
            })

        return new_block

    def validate_configuration(
            self, configuration: Optional[ExpectationConfiguration]) -> None:
        """Validate that a usable ``value_set`` is configured.

        Raises:
            InvalidExpectationConfigurationError: if ``value_set`` is missing
                (and has no class-level default), is not a list/set/dict, or
                is a dict without the ``"$PARAMETER"`` key.
        """
        super().validate_configuration(configuration)
        # supports extensibility by allowing value_set to not be provided in config but captured via child-class default_kwarg_values, e.g. parameterized expectations
        value_set = configuration.kwargs.get(
            "value_set") or self.default_kwarg_values.get("value_set")
        try:
            assert ("value_set" in configuration.kwargs
                    or value_set), "value_set is required"
            assert isinstance(
                value_set,
                (list, set, dict)), "value_set must be a list, set, or dict"
            if isinstance(value_set, dict):
                assert (
                    "$PARAMETER" in value_set
                ), 'Evaluation Parameter dict for value_set kwarg must have "$PARAMETER" key.'
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))
def build_map_metric_rule(
    rule_name: str,
    expectation_type: str,
    map_metric_name: str,
    include_column_names: Optional[Union[str, Optional[List[str]]]] = None,
    exclude_column_names: Optional[Union[str, Optional[List[str]]]] = None,
    include_column_name_suffixes: Optional[Union[str, Iterable,
                                                 List[str]]] = None,
    exclude_column_name_suffixes: Optional[Union[str, Iterable,
                                                 List[str]]] = None,
    semantic_type_filter_module_name: Optional[str] = None,
    semantic_type_filter_class_name: Optional[str] = None,
    include_semantic_types: Optional[Union[
        str, SemanticDomainTypes, List[Union[str,
                                             SemanticDomainTypes]]]] = None,
    exclude_semantic_types: Optional[Union[
        str, SemanticDomainTypes, List[Union[str,
                                             SemanticDomainTypes]]]] = None,
    max_unexpected_values: Union[str, int] = 0,
    max_unexpected_ratio: Optional[Union[str, float]] = None,
    min_max_unexpected_values_proportion: Union[str, float] = 9.75e-1,
) -> Rule:
    """
    This method builds "Rule" object focused on emitting "ExpectationConfiguration" objects for any "map" style metric.

    Args:
        rule_name: name given to the returned "Rule" object.
        expectation_type: the expectation type the rule's configuration
            builder will emit (e.g. "expect_column_values_to_not_be_null").
        map_metric_name: the "map" style metric the rule is based on
            (e.g. "column_values.nonnull").
        include_column_names / exclude_column_names,
        include_column_name_suffixes / exclude_column_name_suffixes,
        semantic_type_filter_*, include_semantic_types /
        exclude_semantic_types: directives forwarded verbatim to
            "MapMetricColumnDomainBuilder" for column selection.
        max_unexpected_values / max_unexpected_ratio /
        min_max_unexpected_values_proportion: tolerance directives forwarded
            to the domain builder.

    Returns:
        A fully wired "Rule" (variables, domain builder, parameter builders,
        and one "DefaultExpectationConfigurationBuilder").
    """
    # Step-1: Instantiate "MapMetricColumnDomainBuilder" for specified "map_metric_name" (subject to directives).

    map_metric_column_domain_builder: MapMetricColumnDomainBuilder = (
        MapMetricColumnDomainBuilder(
            map_metric_name=map_metric_name,
            include_column_names=include_column_names,
            exclude_column_names=exclude_column_names,
            include_column_name_suffixes=include_column_name_suffixes,
            exclude_column_name_suffixes=exclude_column_name_suffixes,
            semantic_type_filter_module_name=semantic_type_filter_module_name,
            semantic_type_filter_class_name=semantic_type_filter_class_name,
            include_semantic_types=include_semantic_types,
            exclude_semantic_types=exclude_semantic_types,
            max_unexpected_values=max_unexpected_values,
            max_unexpected_ratio=max_unexpected_ratio,
            min_max_unexpected_values_proportion=
            min_max_unexpected_values_proportion,
            data_context=None,
        ))

    # Step-2: Declare "ParameterBuilder" for every metric of interest.

    # These three builders serialize their metric values (json_serialize=True)
    # because their output is recorded directly in the rule's results.
    column_values_unique_unexpected_count_metric_multi_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_column_values_unique_unexpected_count_metric_multi_batch_parameter_builder(
        json_serialize=True)
    column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder(
        json_serialize=True)
    column_values_null_unexpected_count_metric_multi_batch_parameter_builder_for_metrics: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_column_values_null_unexpected_count_metric_multi_batch_parameter_builder(
        json_serialize=True)

    # Step-3: Set up "MeanUnexpectedMapMetricMultiBatchParameterBuilder" to compute "condition" for emitting "ExpectationConfiguration" (based on "Domain" data).

    # Evaluation-only dependencies are not serialized (json_serialize=False);
    # they feed the mean-unexpected computation below by name.
    total_count_metric_multi_batch_parameter_builder_for_evaluations: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_table_row_count_metric_multi_batch_parameter_builder(
        json_serialize=False)
    column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder_for_evaluations: ParameterBuilder = DataAssistant.commonly_used_parameter_builders.get_column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder(
        json_serialize=False)
    evaluation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]] = [
            ParameterBuilderConfig(
                **
                total_count_metric_multi_batch_parameter_builder_for_evaluations
                .to_json_dict()),
            ParameterBuilderConfig(
                **
                column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder_for_evaluations
                .to_json_dict()),
        ]
    column_values_attribute_mean_unexpected_value_multi_batch_parameter_builder_for_validations: MeanUnexpectedMapMetricMultiBatchParameterBuilder = MeanUnexpectedMapMetricMultiBatchParameterBuilder(
        name=f"{map_metric_name}.unexpected_value",
        map_metric_name=map_metric_name,
        # Wire the evaluation builders in by their registered names.
        total_count_parameter_builder_name=
        total_count_metric_multi_batch_parameter_builder_for_evaluations.name,
        null_count_parameter_builder_name=
        column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder_for_evaluations
        .name,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=
        evaluation_parameter_builder_configs,
        json_serialize=True,
        data_context=None,
    )

    # Step-4: Pass "MeanUnexpectedMapMetricMultiBatchParameterBuilder" as "validation" "ParameterBuilder" for "DefaultExpectationConfigurationBuilder", responsible for emitting "ExpectationConfiguration" (with specified "expectation_type").

    validation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]] = [
            ParameterBuilderConfig(
                **
                column_values_attribute_mean_unexpected_value_multi_batch_parameter_builder_for_validations
                .to_json_dict()),
        ]
    expect_column_values_to_be_attribute_expectation_configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type=expectation_type,
        validation_parameter_builder_configs=
        validation_parameter_builder_configs,
        column=
        f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
        # Emit the expectation only when the mean unexpected fraction stays
        # within (1 - success_ratio); "success_ratio" comes from rule
        # variables (declared in Step-5).
        condition=
        f"{column_values_attribute_mean_unexpected_value_multi_batch_parameter_builder_for_validations.fully_qualified_parameter_name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY} <= 1.0 - {VARIABLES_KEY}success_ratio",
        meta={
            "profiler_details":
            f"{column_values_attribute_mean_unexpected_value_multi_batch_parameter_builder_for_validations.fully_qualified_parameter_name}.{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
        },
    )

    # Step-5: Instantiate and return "Rule" object, comprised of "variables", "domain_builder", "parameter_builders", and "expectation_configuration_builders" components.

    variables: dict = {
        "success_ratio": 7.5e-1,
    }
    parameter_builders: List[ParameterBuilder] = [
        column_values_unique_unexpected_count_metric_multi_batch_parameter_builder_for_metrics,
        column_values_nonnull_unexpected_count_metric_multi_batch_parameter_builder_for_metrics,
        column_values_null_unexpected_count_metric_multi_batch_parameter_builder_for_metrics,
    ]
    expectation_configuration_builders: List[ExpectationConfigurationBuilder] = [
        expect_column_values_to_be_attribute_expectation_configuration_builder,
    ]
    rule: Rule = Rule(
        name=rule_name,
        variables=variables,
        domain_builder=map_metric_column_domain_builder,
        parameter_builders=parameter_builders,
        expectation_configuration_builders=expectation_configuration_builders,
    )
    return rule
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_with_evaluation_dependency_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Exercise NumericMetricRangeMultiBatchParameterBuilder ("oneshot"
    estimator) whose metric values come from an evaluation-dependency
    ParameterBuilder ("my_column_min") rather than a direct metric_name.

    Runs twice — at 1% and 5% false-positive rates — on the deterministic
    "bobby" taxi-data fixture, checking the computed [min, max] range, the
    narrowing of the range as the false-positive rate grows, and the shape of
    the estimation histogram.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # Fixture exposes three monthly batches of taxi reports.
    batch_request: Dict[str, str] = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: Dict[str, str] = {"column": "fare_amount"}

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    # Evaluation dependency: computes column.min per batch; the range builder
    # below consumes it by name instead of computing its own metric.
    my_column_min_metric_multi_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_column_min",
        metric_name="column.min",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
    )
    evaluation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]] = [
            my_column_min_metric_multi_batch_parameter_builder_config,
        ]
    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            # metric_name=None: values come from the named evaluation builder.
            metric_name=None,
            metric_multi_batch_parameter_builder_name="my_column_min",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=1.0e-2,
            round_decimals=1,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        ))

    variables: Optional[ParameterContainer] = None

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    # "value" and "estimation_histogram" are popped before comparison because
    # they are checked separately below.
    expected_value_dict: Dict[str, Optional[str]] = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "fare_amount"
                },
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    actual_values_01: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == expected_value_dict

    actual_value_01_lower: float = actual_values_01[0]
    actual_value_01_upper: float = actual_values_01[1]
    # Expected bounds at a 1% false-positive rate (deterministic fixture).
    expected_value_01_lower: float = -51.7
    expected_value_01_upper: float = -21.0

    assert actual_value_01_lower == expected_value_01_lower
    assert actual_value_01_upper == expected_value_01_upper

    expected_estimation_histogram: np.ndarray = np.array([
        1.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result: tuple = stats.ks_2samp(data1=actual_estimation_histogram,
                                      data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1

    # Second run: same builder configuration but a 5% false-positive rate and
    # a direct metric_name (exercising the non-dependency code path).
    numeric_metric_range_parameter_builder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name="my_column_min",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=5.0e-2,
            round_decimals=1,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        ))

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        recompute_existing_parameter_values=True,
        batch_request=batch_request,
    )

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    actual_values_05 = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == expected_value_dict

    actual_value_05_lower: float = actual_values_05[0]
    actual_value_05_upper: float = actual_values_05[1]
    # Expected bounds at a 5% false-positive rate.
    expected_value_05_lower: float = -50.5
    expected_value_05_upper: float = -21.1

    assert actual_value_05_lower == expected_value_05_lower
    assert actual_value_05_upper == expected_value_05_upper

    # if false positive rate is higher, our range should be more narrow
    assert actual_value_01_lower < actual_value_05_lower
    assert actual_value_01_upper > actual_value_05_upper

    expected_estimation_histogram: np.ndarray = np.array([
        1.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result = stats.ks_2samp(data1=actual_estimation_histogram,
                               data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1