def __init__(
    self,
    expectation_type: str,
    meta: Optional[Dict[str, Any]] = None,
    condition: Optional[str] = None,
    validation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]
    ] = None,
    data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    **kwargs,
) -> None:
    """
    Args:
        expectation_type: the "expectation_type" argument of "ExpectationConfiguration" object to be emitted.
        meta: the "meta" argument of "ExpectationConfiguration" object to be emitted (an empty dictionary is
        used if omitted)
        condition: Boolean statement (expressed as string and following specified grammar), which controls whether
        or not underlying logic should be executed and thus resulting "ExpectationConfiguration" emitted
        validation_parameter_builder_configs: ParameterBuilder configurations, having whose outputs available (as
        fully-qualified parameter names) is pre-requisite for present ExpectationConfigurationBuilder instance
        These "ParameterBuilder" configurations help build kwargs needed for this "ExpectationConfigurationBuilder"
        data_context: BaseDataContext associated with this ExpectationConfigurationBuilder
        kwargs: additional arguments

    Raises:
        ProfilerExecutionError: if "meta" is not a dictionary, or if "condition" is provided (i.e., is not None)
        and is not a string.
    """
    super().__init__(
        expectation_type=expectation_type,
        validation_parameter_builder_configs=validation_parameter_builder_configs,
        data_context=data_context,
        **kwargs,
    )

    if meta is None:
        meta = {}

    # Validate before storing, so that an invalid value is never recorded on the instance.
    if not isinstance(meta, dict):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Argument "{meta}" in "{self.__class__.__name__}" must be of type "dictionary" \
(value of type "{str(type(meta))}" was encountered).
"""
        )

    # Compare against None (rather than relying on truthiness), so that falsy non-string values (e.g., 0 or [])
    # are rejected instead of being silently accepted as a condition.
    if condition is not None and not isinstance(condition, str):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Argument "{condition}" in "{self.__class__.__name__}" must be of type "string" \
(value of type "{str(type(condition))}" was encountered).
"""
        )

    self._meta = meta
    self._condition = condition

    self._validation_parameter_builder_configs = validation_parameter_builder_configs

    self._kwargs = kwargs
def validate_fully_qualified_parameter_name(fully_qualified_parameter_name: str):
    """
    Verifies that the given parameter name begins with the "$" character.

    Args:
        fully_qualified_parameter_name: candidate parameter name to be checked.

    Raises:
        ProfilerExecutionError: if the name does not start with "$".
    """
    # Guard clause: a properly prefixed name requires no further action.
    if fully_qualified_parameter_name.startswith("$"):
        return

    raise ge_exceptions.ProfilerExecutionError(
        message=f"""Unable to get value for parameter name "{fully_qualified_parameter_name}" -- parameter \
names must start with $ (e.g., "${fully_qualified_parameter_name}").
"""
    )
def _get_deterministic_estimate(
    self,
    metric_values: np.ndarray,
    domain: Domain,
    *,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
    **kwargs,
) -> Tuple[Number, Number]:
    """
    Computes quantiles of "metric_values" for the requested "false_positive_rate".

    Args:
        metric_values: values of the metric, for which quantiles are computed.
        domain: Domain used for resolving the "false_positive_rate" reference.
        variables: optional variables of "rule state" used for resolving the reference.
        parameters: optional parameters of "rule state" used for resolving the reference.
        kwargs: may carry "false_positive_rate" (literal or parameter reference); 5.0e-2 is used if it is absent.

    Returns:
        Result of "compute_quantiles()" for the given metric values and resolved "false_positive_rate".

    Raises:
        ProfilerExecutionError: if the resolved "false_positive_rate" is not within [0.0, 1.0] closed interval.
    """
    # The reference is taken from keyword arguments; the fallback is the literal 5.0e-2.
    rate_reference = kwargs.get("false_positive_rate", 5.0e-2)

    # Resolve the reference against "rule state" (i.e., variables and parameters) and validate its type.
    false_positive_rate: np.float64 = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=rate_reference,
        expected_return_type=(float, np.float64),
        variables=variables,
        parameters=parameters,
    )

    within_unit_interval = 0.0 <= false_positive_rate <= 1.0
    if not within_unit_interval:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"The confidence level for {self.__class__.__name__} is outside of [0.0, 1.0] closed interval."
        )

    quantiles = compute_quantiles(
        metric_values=metric_values,
        false_positive_rate=false_positive_rate,
    )
    return quantiles
def get_batch_ids(
    data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    batch_list: Optional[List[Batch]] = None,
    batch_request: Optional[Union[str, BatchRequestBase, dict]] = None,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional[List[str]]:
    """
    Collects identifiers of Batch objects, given either directly or through a batch request.

    Args:
        data_context: context queried for the list of Batch objects when no usable "batch_list" is given.
        batch_list: explicit Batch objects; ignored if None, empty, or consisting solely of None elements.
        batch_request: request (or reference to one) resolved through "build_batch_request()" when needed.
        domain: Domain used for resolving "batch_request".
        variables: optional variables of "rule state" used for resolving "batch_request".
        parameters: optional parameters of "rule state" used for resolving "batch_request".

    Returns:
        List of "id" attribute values of the Batch objects, or None if neither Batch objects nor
        "batch_request" are available.

    Raises:
        ProfilerExecutionError: if no batch identifiers could be retrieved.
    """
    batches_unavailable: bool = batch_list is None or all(
        element is None for element in batch_list
    )
    if batches_unavailable:
        if batch_request is None:
            return None

        effective_batch_request = build_batch_request(
            domain=domain,
            batch_request=batch_request,
            variables=variables,
            parameters=parameters,
        )
        batch_list = data_context.get_batch_list(batch_request=effective_batch_request)

    batch_ids: List[str] = []
    for batch in batch_list:
        batch_ids.append(batch.id)

    num_batch_ids: int = len(batch_ids)
    if num_batch_ids == 0:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""{__name__}.get_batch_ids() must return at least one batch_id ({num_batch_ids} were retrieved).
"""
        )

    return batch_ids
def _get_round_decimals_using_heuristics(
    self,
    metric_values: np.ndarray,
    domain: Domain,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> int:
    """
    Determines the number of decimals, to which computed values should be rounded.

    Args:
        metric_values: metric values, whose "dtype" is inspected for being an integer type.
        domain: Domain used for resolving the "round_decimals" reference.
        variables: optional variables of "rule state" used for resolving the reference.
        parameters: optional parameters of "rule state" used for resolving the reference.

    Returns:
        0 if "metric_values" has an integer dtype; otherwise, the resolved "round_decimals" directive (or
        MAX_DECIMALS if the directive resolves to None).

    Raises:
        ProfilerExecutionError: if the resolved directive is neither None nor a non-negative integer.
    """
    # Resolve the "self.round_decimals" directive against "rule state" (i.e., variables and parameters).
    round_decimals: Optional[int] = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.round_decimals,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    if round_decimals is None:
        round_decimals = MAX_DECIMALS
    elif not (isinstance(round_decimals, int) and round_decimals >= 0):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""The directive "round_decimals" for {self.__class__.__name__} can be 0 or a
positive integer, or must be omitted (or set to None).
"""
        )

    # Integer-valued metrics override the directive (checked only after the directive has been validated).
    return 0 if np.issubdtype(metric_values.dtype, np.integer) else round_decimals
def get_parameter_value_and_validate_return_type(
    domain: Optional[Domain] = None,
    parameter_reference: Optional[Union[Any, str]] = None,
    expected_return_type: Optional[Union[type, tuple]] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional[Any]:
    """
    Resolves "parameter_reference", which may be given either as an object (literal, dict, any typed object, etc.)
    or as a fully-qualified parameter name, and optionally checks the type of the resolved value.

    Args:
        domain: Domain passed to "get_parameter_value()".
        parameter_reference: object or fully-qualified parameter name to be resolved (a dict is deep-copied first).
        expected_return_type: type (or tuple of types) required of the resolved value; no check if None.
        variables: optional variables of "rule state" passed to "get_parameter_value()".
        parameters: optional parameters of "rule state" passed to "get_parameter_value()".

    Returns:
        The value obtained from "get_parameter_value()".

    Raises:
        ProfilerExecutionError: if "expected_return_type" is given and the resolved value is not of that type.
    """
    if isinstance(parameter_reference, dict):
        parameter_reference = safe_deep_copy(data=parameter_reference)

    resolved_value = get_parameter_value(
        domain=domain,
        parameter_reference=parameter_reference,
        variables=variables,
        parameters=parameters,
    )

    type_check_passes: bool = expected_return_type is None or isinstance(
        resolved_value, expected_return_type
    )
    if type_check_passes:
        return resolved_value

    raise ge_exceptions.ProfilerExecutionError(
        message=f"""Argument "{resolved_value}" must be of type "{str(expected_return_type)}" \
(value of type "{str(type(resolved_value))}" was encountered).
"""
    )
def validate_fully_qualified_parameter_name(
    fully_qualified_parameter_name: str,
) -> None:
    """
    Verifies that the given name passes "is_fully_qualified_parameter_name_literal_string_format()".

    Args:
        fully_qualified_parameter_name: candidate parameter name to be checked.

    Raises:
        ProfilerExecutionError: if the format check fails.
    """
    name_is_well_formed = is_fully_qualified_parameter_name_literal_string_format(
        fully_qualified_parameter_name=fully_qualified_parameter_name
    )
    if name_is_well_formed:
        return

    raise ge_exceptions.ProfilerExecutionError(
        message=f"""Unable to get value for parameter name "{fully_qualified_parameter_name}" -- parameter \
names must start with {FULLY_QUALIFIED_PARAMETER_NAME_DELIMITER_CHARACTER} (e.g., "{FULLY_QUALIFIED_PARAMETER_NAME_DELIMITER_CHARACTER}{fully_qualified_parameter_name}").
"""
    )
def get_validator(
    purpose: str,
    *,
    data_context: Optional["BaseDataContext"] = None,  # noqa: F821
    batch_list: Optional[List[Batch]] = None,
    batch_request: Optional[Union[str, BatchRequestBase, dict]] = None,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional["Validator"]:  # noqa: F821
    """
    Obtains a Validator (with progress bars disabled) backed by a temporary, uniquely named expectation suite.

    Args:
        purpose: label embedded in the temporary expectation suite name ("tmp.<purpose>_...").
        data_context: context used for creating the expectation suite and for obtaining the Validator.
        batch_list: explicit Batch objects; ignored if None, empty, or consisting solely of None elements.
        batch_request: request (or reference to one) resolved through "build_batch_request()" when "batch_list"
        is not usable.
        domain: Domain used for resolving "batch_request"; its "id" becomes part of the suite name, if given.
        variables: optional variables of "rule state" used for resolving "batch_request".
        parameters: optional parameters of "rule state" used for resolving "batch_request".

    Returns:
        Validator object, or None if neither usable Batch objects nor "batch_request" are available.
    """
    # An empty "batch_list" satisfies "all()" vacuously; hence, it is handled the same way as a missing one.
    use_batch_request: bool = batch_list is None or all(
        batch is None for batch in batch_list
    )
    if use_batch_request and batch_request is None:
        return None

    # Random suffix keeps temporary suite names distinct between invocations.
    suite_suffix: str = f"suite_{str(uuid.uuid4())[:8]}"
    expectation_suite_name: str
    if domain is None:
        expectation_suite_name = f"tmp.{purpose}_{suite_suffix}"
    else:
        expectation_suite_name = f"tmp.{purpose}_{domain.id}_{suite_suffix}"

    validator: "Validator"  # noqa: F821
    if use_batch_request:
        batch_request = build_batch_request(
            domain=domain,
            batch_request=batch_request,
            variables=variables,
            parameters=parameters,
        )
        validator = data_context.get_validator(
            batch_request=batch_request,
            create_expectation_suite_with_name=expectation_suite_name,
        )
    else:
        # At this point, "batch_list" is guaranteed to contain at least one Batch object that is not None.
        expectation_suite: ExpectationSuite = data_context.create_expectation_suite(
            expectation_suite_name=expectation_suite_name
        )
        validator = data_context.get_validator_using_batch_list(
            expectation_suite=expectation_suite,
            batch_list=batch_list,
        )

    # Always disabled for RBP and DataAssistants due to volume of metric calculations
    validator.show_progress_bars = False

    return validator
def _build_parameters(
    self,
    domain: Domain,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
    recompute_existing_parameter_values: bool = False,
) -> Attributes:
    """
    Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

    Args:
        domain: Domain, for which the parameter is computed.
        variables: optional variables of "rule state".
        parameters: optional parameters of "rule state".
        recompute_existing_parameter_values: passed through to "build_parameters()" of the parent class.

    Returns:
        Attributes object, containing computed parameter values and parameter computation details metadata.

    Raises:
        ProfilerExecutionError: if the number of available batch identifiers is not exactly one (this includes
        the case of no batch identifiers being available at all).
    """
    batch_ids: Optional[List[str]] = self.get_batch_ids(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    # "get_batch_ids()" may return None; report that as zero identifiers (instead of failing inside "len()").
    num_batch_ids: int = 0 if batch_ids is None else len(batch_ids)
    if num_batch_ids != 1:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Utilizing a {self.__class__.__name__} requires exactly one Batch of data to be available
({num_batch_ids} Batch identifiers found).
"""
        )

    # Compute metric value for one Batch object (expressed as list of Batch objects).
    super().build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        parameter_computation_impl=super()._build_parameters,
        json_serialize=False,
        recompute_existing_parameter_values=recompute_existing_parameter_values,
    )

    # Retrieve metric values for one Batch object (expressed as list of Batch objects).
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # Look the stored value up once; its first element is returned when a value is present.
    stored_value = parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

    return Attributes(
        {
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: None
            if stored_value is None
            else stored_value[0],
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: parameter_node[
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
            ],
        }
    )
def _get_domains(
    self,
    rule_name: str,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """Builds the single Domain that spans all effective columns.

    Args:
        rule_name: name of Rule object, for which "Domain" objects are obtained.
        variables: Optional variables to substitute when evaluating.

    Returns:
        List containing one Domain, whose "column_list" holds the effective column names and whose details
        hold the inferred semantic type of every such column.

    Raises:
        ProfilerExecutionError: if "include_column_names" or the effective column names are empty.
    """
    batch_ids: List[str] = self.get_batch_ids(variables=variables)

    validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821

    effective_column_names: List[str] = self.get_effective_column_names(
        batch_ids=batch_ids,
        validator=validator,
        variables=variables,
    )

    if not self.include_column_names or not effective_column_names:
        raise ge_exceptions.ProfilerExecutionError(
            message=f'Error: "column_list" in {self.__class__.__name__} must not be empty.'
        )

    inferred_type_by_column = (
        self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map
    )
    semantic_types_by_column_name: Dict[str, SemanticDomainTypes] = {}
    for name in effective_column_names:
        semantic_types_by_column_name[name] = inferred_type_by_column[name]

    multi_column_domain = Domain(
        domain_type=self.domain_type,
        domain_kwargs={
            "column_list": effective_column_names,
        },
        details={
            INFERRED_SEMANTIC_TYPE_KEY: semantic_types_by_column_name,
        },
        rule_name=rule_name,
    )

    return [multi_column_domain]
def get_batch_id(
    self,
    variables: Optional[ParameterContainer] = None,
) -> Optional[str]:
    """
    Returns the identifier of the one and only available Batch.

    Args:
        variables: optional variables of "rule state", passed to "_get_batch_ids()".

    Returns:
        The single batch identifier obtained from "_get_batch_ids()".

    Raises:
        ProfilerExecutionError: if the number of retrieved batch identifiers is not exactly one (this includes
        the case of "_get_batch_ids()" returning None).
    """
    batch_ids: Optional[List[str]] = self._get_batch_ids(
        variables=variables,
    )
    # Treat None as zero identifiers, so that the informative error below is raised (instead of "TypeError").
    num_batch_ids: int = 0 if batch_ids is None else len(batch_ids)
    if num_batch_ids != 1:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""{self.__class__.__name__}.get_batch_id() expected to return exactly one batch_id \
({num_batch_ids} were retrieved).
"""
        )

    return batch_ids[0]
def _get_truncate_values_using_heuristics(
    self,
    metric_values: np.ndarray,
    domain: Domain,
    *,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Dict[str, Union[Optional[int], Optional[float]]]:
    """
    Determines the "lower_bound" and "upper_bound" truncation limits.

    Args:
        metric_values: metric values, whose signs drive the default bounds.
        domain: Domain used for resolving the "truncate_values" reference.
        variables: optional variables of "rule state" used for resolving the reference.
        parameters: optional parameters of "rule state" used for resolving the reference.

    Returns:
        Dictionary with "lower_bound" and "upper_bound" keys.  A bound absent from the directive becomes 0.0 if
        all metric values lie strictly on the corresponding side of zero (beyond NP_EPSILON); else, it is None.

    Raises:
        ProfilerExecutionError: if any value of the resolved directive is neither None nor numeric.
    """
    # Resolve the "self.truncate_values" directive against "rule state" (i.e., variables and parameters).
    truncate_values: Dict[
        str, Optional[Number]
    ] = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.truncate_values,
        expected_return_type=dict,
        variables=variables,
        parameters=parameters,
    )

    boundary_checks: List[bool] = [
        boundary is None or is_numeric(value=boundary)
        for boundary in truncate_values.values()
    ]
    if not all(boundary_checks):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""The directive "truncate_values" for {self.__class__.__name__} must specify the
[lower_bound, upper_bound] closed interval, where either boundary is a numeric value (or None).
"""
        )

    bounds: Dict[str, Optional[Number]] = {
        "lower_bound": truncate_values.get("lower_bound"),
        "upper_bound": truncate_values.get("upper_bound"),
    }

    # Strictly positive metric values imply that the range cannot extend below zero.
    if bounds["lower_bound"] is None and np.all(
        np.greater(metric_values, NP_EPSILON)
    ):
        bounds["lower_bound"] = 0.0

    # Strictly negative metric values imply that the range cannot extend above zero.
    if bounds["upper_bound"] is None and np.all(
        np.less(metric_values, (-NP_EPSILON))
    ):
        bounds["upper_bound"] = 0.0

    return bounds
def __init__(
    self,
    expectation_type: str,
    meta: Optional[Dict[str, Any]] = None,
    success_on_last_run: Optional[bool] = None,
    **kwargs,
):
    """
    Args:
        expectation_type: stored as the expectation type of this object.
        meta: optional dictionary of metadata (an empty dictionary is used if omitted).
        success_on_last_run: optional flag, stored as given.
        kwargs: remaining keyword arguments, stored as the expectation kwargs.

    Raises:
        ProfilerExecutionError: if "meta" is not a dictionary.
    """
    self._expectation_type = expectation_type
    self._expectation_kwargs = kwargs

    if meta is None:
        meta = {}

    if not isinstance(meta, dict):
        # "type()" requires an argument; calling it without one would raise "TypeError" while the message is
        # being formatted, thereby hiding the intended error.
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Argument "{meta}" in "{self.__class__.__name__}" must be of type "dictionary" \
(value of type "{str(type(meta))}" was encountered).
"""
        )

    self._meta = meta
    self._success_on_last_run = success_on_last_run
def __init__(
    self,
    data_context: DataContext,
    batch_request: Optional[Union[dict, str]] = None,
):
    """
    Args:
        data_context: DataContext (required; must not be None)
        batch_request: specified in DomainBuilder configuration to get Batch objects for domain computation.

    Raises:
        ProfilerExecutionError: if "data_context" is None.
    """
    if data_context is None:
        builder_class_name: str = self.__class__.__name__
        raise ge_exceptions.ProfilerExecutionError(
            message=f"{builder_class_name} requires a data_context, but none was provided."
        )

    self._batch_request = batch_request
    self._data_context = data_context
def _get_domains(
    self,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """
    Obtains and returns domains for all columns of a table (or for configured columns, if they exist in the table).

    Note that if "self.column_names" is None, then it is set to the list of table columns as a side effect.

    Raises:
        ProfilerExecutionError: if a configured column name is not among the table columns.
    """
    batch_id: str = self.get_batch_id(variables=variables)
    validator = self.get_validator(variables=variables)

    table_columns_metric = MetricConfiguration(
        metric_name="table.columns",
        metric_domain_kwargs={
            "batch_id": batch_id,
        },
        metric_value_kwargs=None,
        metric_dependencies=None,
    )
    table_columns: List[str] = validator.get_metric(metric=table_columns_metric)

    column_name: str

    if self.column_names is None:
        # No columns were configured; hence, use every column of the table.
        self.column_names = table_columns
    else:
        for column_name in self.column_names:
            if column_name in table_columns:
                continue

            raise ge_exceptions.ProfilerExecutionError(
                message=f'Error: The column "{column_name}" in BatchData does not exist.'
            )

    domains: List[Domain] = []
    for column_name in self.column_names:
        domains.append(
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={
                    "column": column_name,
                },
            )
        )

    return domains
def __init__(
    self,
    name: str,
    metric_name: Optional[str] = None,
    metric_multi_batch_parameter_builder_name: Optional[str] = None,
    metric_domain_kwargs: Optional[Union[str, dict]] = None,
    metric_value_kwargs: Optional[Union[str, dict]] = None,
    enforce_numeric_metric: Union[str, bool] = True,
    replace_nan_with_zero: Union[str, bool] = True,
    reduce_scalar_metric: Union[str, bool] = True,
    false_positive_rate: Union[str, float] = 5.0e-2,
    quantile_statistic_interpolation_method: str = "auto",
    estimator: str = "bootstrap",
    n_resamples: Optional[Union[str, int]] = None,
    bw_method: Optional[Union[str, float, Callable]] = None,
    random_seed: Optional[Union[str, int]] = None,
    include_estimator_samples_histogram_in_details: Union[str, bool] = False,
    truncate_values: Optional[
        Union[str, Dict[str, Union[Optional[int], Optional[float]]]]
    ] = None,
    round_decimals: Optional[Union[str, int]] = None,
    evaluation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]
    ] = None,
    data_context: Optional["BaseDataContext"] = None,  # noqa: F821
) -> None:
    """
    Args:
        name: user-specified name of this parameter (from configuration).  This is not the fully-qualified
        parameter name, which must start with "$parameter." and may contain one or more subsequent parts
        (e.g., "$parameter.<my_param_from_config>.<metric_name>").
        metric_name: name of a supported and registered metric (used in MetricConfiguration).
        metric_multi_batch_parameter_builder_name: name of parameter that computes "metric_name" (for every Batch).
        metric_domain_kwargs: used in MetricConfiguration
        metric_value_kwargs: used in MetricConfiguration
        enforce_numeric_metric: used in MetricConfiguration to ensure that metric computations return numeric values
        replace_nan_with_zero: if True (default), then NaN result of metric computation is converted to 0.0 (float);
        if False, then NaN result of metric computation causes exception to be raised.
        reduce_scalar_metric: if True (default), then reduces computation of 1-dimensional metric to scalar value.
        false_positive_rate: user-configured fraction between 0 and 1, expressing desired false positive rate for
        identifying unexpected values, as judged by the upper- and lower- quantiles of the observed metric data.
        quantile_statistic_interpolation_method: Applicable only for the "bootstrap" sampling method -- value of
        (interpolation) "method" supplied to "np.quantile()" statistic, which is used for confidence intervals.
        estimator: estimation algorithm to use: "oneshot" (one observation), "bootstrap" (default), or "kde"
        (kernel density estimation).
        n_resamples: Applicable only for the "bootstrap" and "kde" sampling methods -- 9999 is used if omitted (this
        is default in "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html").
        bw_method: Applicable only for the "kde" sampling method -- "scott" is used if omitted.  Possible values
        for the estimator bandwidth method are described at:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html
        random_seed: Applicable only for the "bootstrap" and "kde" sampling methods -- if omitted (default), then
        uses "np.random.choice"; otherwise, utilizes "np.random.Generator(np.random.PCG64(random_seed))".
        include_estimator_samples_histogram_in_details: Applicable only for the "bootstrap" sampling method -- if
        True, then 10-bin histogram of bootstraps is added to "details"; otherwise, it is omitted (default).
        truncate_values: user-configured directive for whether or not to allow the computed parameter values
        (i.e., lower_bound, upper_bound) to take on values outside the specified bounds when packaged on output.
        round_decimals: user-configured non-negative integer, giving the number of decimals of the rounding
        precision of the computed parameter values (i.e., min_value, max_value) prior to packaging them on output.
        If omitted, then no rounding is performed, unless the computed value is already an integer.
        evaluation_parameter_builder_configs: ParameterBuilder configurations, executing and making whose respective
        ParameterBuilder objects' outputs available (as fully-qualified parameter names) is pre-requisite.
        These "ParameterBuilder" configurations help build parameters needed for this "ParameterBuilder".
        data_context: BaseDataContext associated with this ParameterBuilder

    Raises:
        ProfilerExecutionError: if "truncate_values" is a dictionary-like object with unrecognized keys.
    """
    super().__init__(
        name=name,
        metric_name=metric_name,
        metric_domain_kwargs=metric_domain_kwargs,
        metric_value_kwargs=metric_value_kwargs,
        enforce_numeric_metric=enforce_numeric_metric,
        replace_nan_with_zero=replace_nan_with_zero,
        reduce_scalar_metric=reduce_scalar_metric,
        evaluation_parameter_builder_configs=evaluation_parameter_builder_configs,
        data_context=data_context,
    )

    self._metric_multi_batch_parameter_builder_name = (
        metric_multi_batch_parameter_builder_name
    )

    # Estimation directives are stored as given (they may be literals or parameter references).
    self._false_positive_rate = false_positive_rate
    self._quantile_statistic_interpolation_method = (
        quantile_statistic_interpolation_method
    )
    self._estimator = estimator
    self._n_resamples = n_resamples
    self._bw_method = bw_method
    self._random_seed = random_seed
    self._include_estimator_samples_histogram_in_details = (
        include_estimator_samples_histogram_in_details
    )

    self._round_decimals = round_decimals

    if not truncate_values:
        truncate_values = {
            "lower_bound": None,
            "upper_bound": None,
        }
    elif not isinstance(truncate_values, str):
        # A string is left unchecked here; only keys of a dictionary-like directive are validated.
        truncate_values_keys: set = set(truncate_values.keys())
        keys_are_recognized: bool = (
            truncate_values_keys
            <= NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS
        )
        if not keys_are_recognized:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Unrecognized truncate_values key(s) in {self.__class__.__name__}:
"{str(truncate_values_keys - NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS)}" \
detected.
"""
            )

    self._truncate_values = truncate_values
def _build_parameters(
    self,
    parameter_container: ParameterContainer,
    domain: Domain,
    *,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
):
    """
    Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional
    details.

    The computation proceeds as follows:
    1. Obtain the Validator and the batch IDs of interest (a non-empty list of batch IDs is required).
    2. Compute the metric via "get_metrics()", and take "metric_values" and "details" from the result.
    3. Resolve and validate the "sampling_method" and "false_positive_rate" directives using "rule state".
    4. Using the configured directives and heuristics, determine the truncation bounds and the rounding precision
       (both are determined before the metric values are converted to an array of floating point numbers).
    5. Convert the metric values to a numpy array of type "np.float64".
    6. Estimate [lower_quantile, upper_quantile]: if all values are close to the first value (i.e., distribution is
       degenerate), then that value is used for both; else, the "bootstrap" estimate or "compute_quantiles()" is
       used, depending on the sampling method.
    7. Round the quantiles, clip them to the truncation bounds, and store the resulting "min_value" and "max_value"
       (together with "details") under "$parameter.<parameter_name>" by calling "build_parameter_container()".

    Raises:
        ProfilerExecutionError: if no batch identifiers are available, if "sampling_method" is not recognized, or
        if "false_positive_rate" is outside of [0.0, 1.0] closed interval.
    """
    # Arguments shared by all calls that resolve values from "rule state" (i.e., variables and parameters).
    rule_state: Dict[str, Any] = {
        "domain": domain,
        "variables": variables,
        "parameters": parameters,
    }

    validator: Validator = self.get_validator(**rule_state)

    batch_ids: Optional[List[str]] = self.get_batch_ids(**rule_state)
    if not batch_ids:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"Utilizing a {self.__class__.__name__} requires a non-empty list of batch identifiers."
        )

    metric_computation_result: Dict[
        str, Union[Union[np.ndarray, List[Union[Any, Number]]], Dict[str, Any]]
    ] = self.get_metrics(
        batch_ids=batch_ids,
        validator=validator,
        metric_name=self._metric_name,
        metric_domain_kwargs=self._metric_domain_kwargs,
        metric_value_kwargs=self._metric_value_kwargs,
        enforce_numeric_metric=self._enforce_numeric_metric,
        replace_nan_with_zero=self._replace_nan_with_zero,
        **rule_state,
    )
    metric_values: Union[
        np.ndarray, List[Union[Any, Number]]
    ] = metric_computation_result["metric_values"]
    details: Dict[str, Any] = metric_computation_result["details"]

    # Obtain sampling_method directive from rule state (i.e., variables and parameters); from instance variable otherwise.
    sampling_method: str = get_parameter_value_and_validate_return_type(
        parameter_reference=self._sampling_method,
        expected_return_type=str,
        **rule_state,
    )
    if (
        sampling_method
        not in NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES
    ):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""The directive "sampling_method" for {self.__class__.__name__} can be only one of
{NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{sampling_method}" was detected).
"""
        )

    # Obtain false_positive_rate from rule state (i.e., variables and parameters); from instance variable otherwise.
    false_positive_rate: Union[
        Any, str
    ] = get_parameter_value_and_validate_return_type(
        parameter_reference=self._false_positive_rate,
        expected_return_type=float,
        **rule_state,
    )
    if not (0.0 <= false_positive_rate <= 1.0):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"The confidence level for {self.__class__.__name__} is outside of [0.0, 1.0] closed interval."
        )

    # Both heuristics receive the metric values in their original form (i.e., prior to conversion below).
    truncate_values: Dict[str, Number] = self._get_truncate_values_using_heuristics(
        metric_values=metric_values,
        **rule_state,
    )
    lower_bound: Optional[float] = truncate_values.get("lower_bound")
    upper_bound: Optional[float] = truncate_values.get("upper_bound")

    round_decimals: int = self._get_round_decimals_using_heuristics(
        metric_values=metric_values,
        **rule_state,
    )

    metric_values = np.array(metric_values, dtype=np.float64)

    lower_quantile: Union[Number, float]
    upper_quantile: Union[Number, float]

    first_value = metric_values[0]
    if np.all(np.isclose(metric_values, first_value)):
        # Computation is unnecessary if distribution is degenerate.
        lower_quantile = upper_quantile = first_value
    elif sampling_method == "bootstrap":
        lower_quantile, upper_quantile = self._get_bootstrap_estimate(
            metric_values=metric_values,
            false_positive_rate=false_positive_rate,
            **rule_state,
        )
    else:
        lower_quantile, upper_quantile = compute_quantiles(
            metric_values=metric_values,
            false_positive_rate=false_positive_rate,
        )

    def _round_quantile(quantile: Union[Number, float]) -> Union[Number, float]:
        # Rounding to zero decimals uses one-argument "round()", which yields an integer.
        as_float: float = float(quantile)
        if round_decimals == 0:
            return round(as_float)

        return round(as_float, round_decimals)

    min_value: Union[Number, float] = _round_quantile(lower_quantile)
    max_value: Union[Number, float] = _round_quantile(upper_quantile)

    # Clip the computed range to the truncation bounds (where such bounds are defined).
    if lower_bound is not None:
        min_value = max(min_value, lower_bound)

    if upper_bound is not None:
        max_value = min(max_value, upper_bound)

    value_range: Dict[str, Any] = {
        "min_value": min_value,
        "max_value": max_value,
    }
    parameter_values: Dict[str, Any] = {
        f"$parameter.{self.parameter_name}": {
            "value": value_range,
            "details": details,
        },
    }

    build_parameter_container(
        parameter_container=parameter_container, parameter_values=parameter_values
    )
def infer_semantic_domain_type_from_table_column_type(
    self,
    column_types_dict_list: List[Dict[str, Any]],
    column_name: str,
) -> InferredSemanticDomainType:
    """
    Infers the semantic domain type of a column by looking its type up in "ProfilerTypeMapping".

    Args:
        column_types_dict_list: dictionaries with "name" and "type" keys (one dictionary per table column).
        column_name: name of the column of interest; it must match exactly one of the dictionaries.

    Returns:
        InferredSemanticDomainType object, holding the semantic type ("UNKNOWN" if the column type matches none
        of the type name groups) and details about the inference mechanism.

    Raises:
        ProfilerExecutionError: if the number of dictionaries matching "column_name" is not exactly one.
    """
    matching_columns: List[Dict[str, Any]] = [
        column_type_dict
        for column_type_dict in column_types_dict_list
        if column_name == column_type_dict["name"]
    ]
    if len(matching_columns) != 1:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Error: {len(matching_columns)} columns were found while obtaining semantic type \
information. Please ensure that the specified column name refers to exactly one column.
"""
        )

    # Comparison is case-insensitive: both sides are converted to upper case.
    column_type: str = str(matching_columns[0]["type"]).upper()

    # Ordered rules: semantic type name, followed by "ProfilerTypeMapping" attributes holding its type names.
    # Attributes are looked up lazily (by name), so that rules following the first match are never evaluated.
    lookup_rules: Tuple[Tuple[str, Tuple[str, ...]], ...] = (
        ("NUMERIC", ("INT_TYPE_NAMES", "FLOAT_TYPE_NAMES")),
        ("TEXT", ("STRING_TYPE_NAMES",)),
        ("LOGIC", ("BOOLEAN_TYPE_NAMES",)),
        ("DATETIME", ("DATETIME_TYPE_NAMES",)),
        ("BINARY", ("BINARY_TYPE_NAMES",)),
        ("CURRENCY", ("CURRENCY_TYPE_NAMES",)),
        ("IDENTIFIER", ("IDENTIFIER_TYPE_NAMES",)),
        ("MISCELLANEOUS", ("MISCELLANEOUS_TYPE_NAMES", "RECORD_TYPE_NAMES")),
    )

    semantic_type_name: str = "UNKNOWN"
    for candidate_semantic_type_name, type_names_attributes in lookup_rules:
        recognized_type_names: Set[str] = set()
        for type_names_attribute in type_names_attributes:
            recognized_type_names.update(
                type_name.upper()
                for type_name in getattr(ProfilerTypeMapping, type_names_attribute)
            )

        if column_type in recognized_type_names:
            semantic_type_name = candidate_semantic_type_name
            break

    semantic_column_type: SemanticDomainTypes = getattr(
        SemanticDomainTypes, semantic_type_name
    )

    return InferredSemanticDomainType(
        semantic_domain_type=semantic_column_type,
        details={
            "algorithm_type": "deterministic",
            "mechanism": "lookup_table",
            "source": "great_expectations.profile.base.ProfilerTypeMapping",
        },
    )
def _build_parameters(
    self,
    domain: Domain,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
    recompute_existing_parameter_values: bool = False,
) -> Attributes:
    """
    Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

    Check the percentage of values matching each candidate "strftime_format" string, and return the best fit, or
    None if no string exceeds the configured threshold.

    Args:
        domain: Domain object, for which the metrics are computed.
        variables: Part of the "rule state" available for "$variable"-style references.
        parameters: Part of the "rule state" available for "$parameter"-style references.
        recompute_existing_parameter_values: accepted as part of the signature; not used by this implementation.

    Returns:
        Attributes object, containing computed parameter values and parameter computation details metadata.

    Raises:
        ProfilerExecutionError: if the non-null count metric does not resolve to exactly one result or is empty.
    """
    metric_computation_result: MetricComputationResult = self.get_metrics(
        metric_name="column_values.nonnull.count",
        metric_domain_kwargs=self.metric_domain_kwargs,
        metric_value_kwargs=self.metric_value_kwargs,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    # This should never happen.
    if len(metric_computation_result.attributed_resolved_metrics) != 1:
        raise ge_exceptions.ProfilerExecutionError(
            message=f'Result of metric computations for {self.__class__.__name__} must be a list with exactly 1 element of type "AttributedResolvedMetrics" ({metric_computation_result.attributed_resolved_metrics} found).'
        )

    attributed_resolved_metrics: AttributedResolvedMetrics = (
        metric_computation_result.attributed_resolved_metrics[0]
    )

    metric_values: MetricValues = attributed_resolved_metrics.metric_values
    if metric_values is None:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"Result of metric computations for {self.__class__.__name__} is empty."
        )

    # Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID).
    metric_values = metric_values[:, 0]

    nonnull_count: int = sum(metric_values)

    # Obtain candidate_strings from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    candidate_strings: Union[
        List[str],
        Set[str],
    ] = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.candidate_strings,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # Gather "metric_value_kwargs" for all candidate "strftime_format" strings; every candidate extends the
    # configured "metric_value_kwargs" (if any) with its own "strftime_format" entry.
    base_metric_value_kwargs = self.metric_value_kwargs or {}
    format_string: str
    match_strftime_metric_value_kwargs_list: List[dict] = [
        {**base_metric_value_kwargs, "strftime_format": format_string}
        for format_string in candidate_strings
    ]

    # Obtain resolved metrics and metadata for all metric configurations and available Batch objects simultaneously.
    metric_computation_result = self.get_metrics(
        metric_name="column_values.match_strftime_format.unexpected_count",
        metric_domain_kwargs=self.metric_domain_kwargs,
        metric_value_kwargs=match_strftime_metric_value_kwargs_list,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    format_string_success_ratios: dict = {}

    for attributed_resolved_metrics in metric_computation_result.attributed_resolved_metrics:
        # Now obtain 1-dimensional vector of values of computed metric (each element corresponds to a Batch ID).
        metric_values = attributed_resolved_metrics.metric_values[:, 0]

        match_strftime_unexpected_count: int = sum(metric_values)

        # With no non-null values there is nothing to match; report zero success instead of dividing by zero.
        success_ratio: float
        if nonnull_count:
            success_ratio = (
                nonnull_count - match_strftime_unexpected_count
            ) / nonnull_count
        else:
            success_ratio = 0.0

        format_string_success_ratios[
            attributed_resolved_metrics.metric_attributes["strftime_format"]
        ] = success_ratio

    # Obtain threshold from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    threshold: float = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.threshold,
        expected_return_type=float,
        variables=variables,
        parameters=parameters,
    )

    # get best-matching datetime string that matches greater than threshold
    best_format_string: str
    best_ratio: float
    (
        best_format_string,
        best_ratio,
    ) = ParameterBuilder._get_best_candidate_above_threshold(
        format_string_success_ratios, threshold
    )

    # dict of sorted datetime and ratios for all evaluated candidates
    sorted_format_strings_and_ratios: dict = (
        ParameterBuilder._get_sorted_candidates_and_ratios(
            format_string_success_ratios
        )
    )

    return Attributes(
        {
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: best_format_string,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: {
                "success_ratio": best_ratio,
                "candidate_strings": sorted_format_strings_and_ratios,
            },
        }
    )
def _sanitize_metric_computation(
    self,
    metric_name: str,
    attributed_resolved_metrics: AttributedResolvedMetrics,
    enforce_numeric_metric: Union[str, bool] = False,
    replace_nan_with_zero: Union[str, bool] = False,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> AttributedResolvedMetrics:
    """
    This method conditions (or "sanitizes") data samples in the format "N x R^m", where "N" (most significant
    dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the multi-dimensional
    metric, whose values are being estimated.  The "conditioning" operations are:
    1. If "enforce_numeric_metric" flag is set, raise an error if a non-numeric value is found in sample vectors.
    2. Further, if a NaN is encountered in a sample vectors and "replace_nan_with_zero" is True, then replace those
    NaN values with the 0.0 floating point number; if "replace_nan_with_zero" is False, then raise an error.

    If only "replace_nan_with_zero" is set, then non-numeric values are kept unchanged and NaN replacement is
    applied to numeric values only.  If neither flag is set, then the argument is returned untouched.

    The elements of every multi-dimensional metric value are visited in row-major order and stored as flat list.

    :param metric_name: Name of metric (used in error message about NaN values).
    :param attributed_resolved_metrics: Metric computation results, updated in place and returned.
    :param enforce_numeric_metric: Flag (or reference to it) controlling whether or not values must be numeric.
    :param replace_nan_with_zero: Flag (or reference to it) controlling how NaN values should be handled.
    :param domain: Domain object scoping "$variable"/"$parameter"-style references.
    :param variables: Part of the "rule state" available for "$variable"-style references.
    :param parameters: Part of the "rule state" available for "$parameter"-style references.
    :return: The same AttributedResolvedMetrics object that was passed in.
    :raises ProfilerExecutionError: if numeric values are enforced and a non-numeric value is found.
    :raises ValueError: if a NaN value is found and "replace_nan_with_zero" is False.
    """
    # Obtain enforce_numeric_metric from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    enforce_numeric_metric = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=enforce_numeric_metric,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )

    # Obtain replace_nan_with_zero from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    replace_nan_with_zero = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=replace_nan_with_zero,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )

    if not (enforce_numeric_metric or replace_nan_with_zero):
        return attributed_resolved_metrics

    metric_values_by_batch_id: Dict[str, MetricValues] = {}

    batch_id: str
    metric_values: MetricValues
    for (
        batch_id,
        metric_values,
    ) in attributed_resolved_metrics.conditioned_attributed_metric_values.items():
        batch_metric_values: MetricValues = []

        # Generate all permutations of indexes for accessing every element of the multi-dimensional metric.
        metric_value_idx: tuple
        for metric_value_idx in itertools.product(
            *(range(dimension_size) for dimension_size in metric_values.shape)
        ):
            metric_value: MetricValue = metric_values[metric_value_idx]

            is_numeric: bool
            if enforce_numeric_metric:
                if not np.issubdtype(metric_value.dtype, np.number):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(metric_value.dtype)}" was computed).
"""
                    )

                is_numeric = True
            else:
                # Non-numeric values are permitted here; NaN check is meaningful (and safe) only for numeric ones.
                value_dtype = getattr(metric_value, "dtype", None)
                if value_dtype is not None:
                    is_numeric = bool(np.issubdtype(value_dtype, np.number))
                else:
                    is_numeric = isinstance(metric_value, (int, float, complex))

            if is_numeric and np.isnan(metric_value):
                if not replace_nan_with_zero:
                    raise ValueError(
                        f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
"""
                    )

                batch_metric_values.append(0.0)
            else:
                batch_metric_values.append(metric_value)

        metric_values_by_batch_id[batch_id] = batch_metric_values

    attributed_resolved_metrics.metric_values_by_batch_id = metric_values_by_batch_id

    return attributed_resolved_metrics
def get_metrics(
    self,
    metric_name: str,
    metric_domain_kwargs: Optional[
        Union[Union[str, dict], List[Union[str, dict]]]
    ] = None,
    metric_value_kwargs: Optional[
        Union[Union[str, dict], List[Union[str, dict]]]
    ] = None,
    enforce_numeric_metric: Union[str, bool] = False,
    replace_nan_with_zero: Union[str, bool] = False,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> MetricComputationResult:
    """
    General multi-batch metric computation facility.

    Computes specified metric (can be multi-dimensional, numeric, non-numeric, or mixed) and conditions (or
    "sanitizes") result according to two criteria: enforcing metric output to be numeric and handling NaN values.

    Metrics, corresponding to multiple "MetricConfiguration" directives, are computed together, rather than
    individually.  As a strategy, since "metric_domain_kwargs" changes depending on "batch_id",
    "metric_value_kwargs" serves as identifying entity (through "AttributedResolvedMetrics") for accessing resolved
    metrics (computation results).  All "MetricConfiguration" directives are generated by combining each
    "metric_value_kwargs" with "metric_domain_kwargs" for all "batch_ids" (where every "metric_domain_kwargs"
    represents separate "batch_id").  Then, all "MetricConfiguration" objects, collected into list as container,
    are resolved simultaneously.

    :param metric_name: Name of metric of interest, being computed.
    :param metric_domain_kwargs: Metric Domain Kwargs is an essential parameter of the MetricConfiguration object.
    :param metric_value_kwargs: Metric Value Kwargs is an essential parameter of the MetricConfiguration object
    (single value or list of values, if same metric must be computed for multiple arguments).
    :param enforce_numeric_metric: Flag controlling whether or not metric output must be numerically-valued.
    :param replace_nan_with_zero: Directive controlling how NaN metric values, if encountered, should be handled.
    :param domain: Domain object scoping "$variable"/"$parameter"-style references in configuration and runtime.
    :param variables: Part of the "rule state" available for "$variable"-style references.
    :param parameters: Part of the "rule state" available for "$parameter"-style references.
    :return: MetricComputationResult object, containing both: data samples in the format "N x R^m", where "N" (most
    significant dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the
    multi-dimensional metric, whose values are being estimated, and details (to be used for metadata purposes).
    :raises ProfilerExecutionError: if "metric_name" is empty or if no Batch identifiers are available.
    """
    if not metric_name:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Utilizing "{self.__class__.__name__}.get_metrics()" requires valid "metric_name" to be \
specified (empty "metric_name" value detected)."""
        )

    batch_ids: Optional[List[str]] = self.get_batch_ids(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    if not batch_ids:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"Utilizing a {self.__class__.__name__} requires a non-empty list of Batch identifiers."
        )

    # Step-1: Gather "metric_domain_kwargs" (corresponding to "batch_ids").

    domain_kwargs: dict = build_metric_domain_kwargs(
        batch_id=None,
        metric_domain_kwargs=metric_domain_kwargs,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    batch_id: str

    metric_domain_kwargs = [
        copy.deepcopy(
            build_metric_domain_kwargs(
                batch_id=batch_id,
                metric_domain_kwargs=copy.deepcopy(domain_kwargs),
                domain=domain,
                variables=variables,
                parameters=parameters,
            )
        )
        for batch_id in batch_ids
    ]

    # Step-2: Gather "metric_value_kwargs" (caller may require same metric computed for multiple arguments).

    if not isinstance(metric_value_kwargs, list):
        metric_value_kwargs = [metric_value_kwargs]

    value_kwargs_cursor: dict
    metric_value_kwargs = [
        # Obtain value kwargs from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=value_kwargs_cursor,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        for value_kwargs_cursor in metric_value_kwargs
    ]

    # Step-3: Generate "MetricConfiguration" directives for all "metric_domain_kwargs"/"metric_value_kwargs" pairs.

    domain_kwargs_cursor: dict
    metrics_to_resolve: List[MetricConfiguration] = [
        MetricConfiguration(
            metric_name=metric_name,
            metric_domain_kwargs=domain_kwargs_cursor,
            metric_value_kwargs=value_kwargs_cursor,
            metric_dependencies=None,
        )
        for value_kwargs_cursor in metric_value_kwargs
        for domain_kwargs_cursor in metric_domain_kwargs
    ]

    # Step-4: Sort "MetricConfiguration" directives by "metric_value_kwargs_id" and "batch_id" (in that order).
    # This precise sort order enables pairing every metric value with its respective "batch_id" (e.g., for display).

    metrics_to_resolve = sorted(
        metrics_to_resolve,
        key=lambda metric_configuration_element: (
            metric_configuration_element.metric_value_kwargs_id,
            metric_configuration_element.metric_domain_kwargs["batch_id"],
        ),
    )

    # Step-5: Resolve all metrics in one operation simultaneously.

    # The Validator object used for metric calculation purposes.
    validator: "Validator" = self.get_validator(  # noqa: F821
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    resolved_metrics: Dict[Tuple[str, str, str], Any] = validator.compute_metrics(
        metric_configurations=metrics_to_resolve
    )

    # Step-6: Map resolved metrics to their attributes for identification and recovery by receiver.  Directives are
    # visited in their sorted order, so that values are attached in the same order as the sorting above prescribes;
    # directives, for which no resolved metric is available, are reported and contribute no value.

    attributed_resolved_metrics_map: Dict[str, AttributedResolvedMetrics] = {}

    metric_configuration: MetricConfiguration
    attributed_resolved_metrics: AttributedResolvedMetrics
    for metric_configuration in metrics_to_resolve:
        attributed_resolved_metrics = attributed_resolved_metrics_map.get(
            metric_configuration.metric_value_kwargs_id
        )
        if attributed_resolved_metrics is None:
            attributed_resolved_metrics = AttributedResolvedMetrics(
                metric_attributes=metric_configuration.metric_value_kwargs,
                metric_values_by_batch_id=None,
            )
            attributed_resolved_metrics_map[
                metric_configuration.metric_value_kwargs_id
            ] = attributed_resolved_metrics

        if metric_configuration.id not in resolved_metrics:
            logger.warning(
                f"{metric_configuration.id[0]} was not found in the resolved Metrics for ParameterBuilder."
            )
            continue

        attributed_resolved_metrics.add_resolved_metric(
            batch_id=metric_configuration.metric_domain_kwargs["batch_id"],
            value=resolved_metrics[metric_configuration.id],
        )

    resolved_metric_value: Any
    for attributed_resolved_metrics in attributed_resolved_metrics_map.values():
        # Step-7: Convert scalar metric values to vectors to enable uniformity of processing in subsequent operations.
        if (
            isinstance(attributed_resolved_metrics.metric_values, np.ndarray)
            and attributed_resolved_metrics.metric_values.ndim == 1
        ):
            attributed_resolved_metrics.metric_values_by_batch_id = {
                batch_id: [resolved_metric_value]
                for batch_id, resolved_metric_value in attributed_resolved_metrics.attributed_metric_values.items()
            }

        # Step-8: Apply numeric/hygiene flags (e.g., "enforce_numeric_metric", "replace_nan_with_zero") to results.
        # The object is updated in place, hence the return value is not needed here.
        self._sanitize_metric_computation(
            metric_name=metric_name,
            attributed_resolved_metrics=attributed_resolved_metrics,
            enforce_numeric_metric=enforce_numeric_metric,
            replace_nan_with_zero=replace_nan_with_zero,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

    # Step-9: Build and return result to receiver (apply simplifications to cases of single "metric_value_kwargs").
    return MetricComputationResult(
        attributed_resolved_metrics=list(attributed_resolved_metrics_map.values()),
        details={
            "metric_configuration": {
                "metric_name": metric_name,
                "domain_kwargs": domain_kwargs,
                "metric_value_kwargs": metric_value_kwargs[0]
                if len(metric_value_kwargs) == 1
                else metric_value_kwargs,
                "metric_dependencies": None,
            },
            "num_batches": len(batch_ids),
        },
    )
def _build_parameters(
    self,
    parameter_container: ParameterContainer,
    domain: Domain,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> None:
    """
    Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and optional
    details.

    The estimated value range is stored into "parameter_container" under "$parameter.<name>" (with the key
    "value_range" inside "value", accompanied by "details"); nothing is returned.

    The algorithm operates according to the following steps:
    1. Obtain batch IDs of interest using DataContext and BatchRequest (unless passed explicitly as argument). Note
    that this specific BatchRequest was specified as part of configuration for the present ParameterBuilder class.
    2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters).
    3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in
       order to have access to all Batch objects, on each of which the specified metric_name will be computed.
    4. Perform metric computations and obtain the result in the array-like form (one metric value per each Batch).
    5. Using the configured directives and heuristics, determine whether or not the ranges should be clipped.
    6. Using the configured directives and heuristics, determine if return values should be rounded to an integer.
    7. Convert the multi-dimensional metric computation results to a numpy array (for further computations).
    Steps 8 -- 10 are for the "oneshot" sampling method only (the "bootstrap" method achieves same automatically):
    8. Compute the mean and the standard deviation of the metric (aggregated over all the gathered Batch objects).
    9. Compute number of standard deviations (as floating point) needed (around the mean) to achieve the specified
       false_positive_rate (note that false_positive_rate of 0.0 would result in infinite number of standard
       deviations, hence it is "nudged" by small quantity "epsilon" above 0.0 if false_positive_rate of 0.0 appears
       as argument).
       (Please refer to "https://en.wikipedia.org/wiki/Normal_distribution" and references therein for background.)
    10. Compute the "band" around the mean as the min_value and max_value (to be used in ExpectationConfiguration).
    11. Return [low, high] for the desired metric as estimated by the specified sampling method.
    12. Set up the arguments and call build_parameter_container() to store the parameter as part of "rule state".

    :param parameter_container: ParameterContainer object, into which the computed parameter is stored.
    :param domain: Domain object, for which the metric is computed.
    :param variables: Part of the "rule state" available for "$variable"-style references.
    :param parameters: Part of the "rule state" available for "$parameter"-style references.
    :raises ProfilerExecutionError: if the "sampling_method" directive is not one of the recognized names.
    """
    metric_computation_result: MetricComputationResult = self.get_metrics(
        metric_name=self.metric_name,
        metric_domain_kwargs=self.metric_domain_kwargs,
        metric_value_kwargs=self.metric_value_kwargs,
        enforce_numeric_metric=self.enforce_numeric_metric,
        replace_nan_with_zero=self.replace_nan_with_zero,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    metric_values: np.ndarray = metric_computation_result.metric_values
    details: MetricComputationDetails = metric_computation_result.details

    # Obtain sampling_method directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    sampling_method: str = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.sampling_method,
        expected_return_type=str,
        variables=variables,
        parameters=parameters,
    )
    if (
        sampling_method
        not in NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES
    ):
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""The directive "sampling_method" for {self.__class__.__name__} can be only one of {NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{sampling_method}" was detected).
"""
        )

    # Select the estimator and the keyword arguments that are forwarded to it.
    estimator: Callable
    estimator_kwargs: dict
    if sampling_method == "bootstrap":
        estimator = self._get_bootstrap_estimate
        estimator_kwargs = {
            "false_positive_rate": self.false_positive_rate,
            "num_bootstrap_samples": self.num_bootstrap_samples,
        }
    else:
        # Every other recognized sampling method uses the deterministic estimate.
        estimator = self._get_deterministic_estimate
        estimator_kwargs = {
            "false_positive_rate": self.false_positive_rate,
        }

    metric_value_range: np.ndarray = self._estimate_metric_value_range(
        metric_values=metric_values,
        estimator=estimator,
        domain=domain,
        variables=variables,
        parameters=parameters,
        **estimator_kwargs,
    )

    parameter_values: Dict[str, Any] = {
        f"$parameter.{self.name}": {
            "value": {
                "value_range": metric_value_range,
            },
            "details": details,
        },
    }

    build_parameter_container(
        parameter_container=parameter_container, parameter_values=parameter_values
    )
def __init__(
    self,
    name: str,
    metric_name: str,
    metric_domain_kwargs: Optional[Union[str, dict]] = None,
    metric_value_kwargs: Optional[Union[str, dict]] = None,
    sampling_method: str = "bootstrap",
    enforce_numeric_metric: Union[str, bool] = True,
    replace_nan_with_zero: Union[str, bool] = True,
    reduce_scalar_metric: Union[str, bool] = True,
    false_positive_rate: Union[str, float] = 5.0e-2,
    num_bootstrap_samples: Optional[Union[str, int]] = None,
    round_decimals: Optional[Union[str, int]] = None,
    truncate_values: Optional[
        Union[str, Dict[str, Union[Optional[int], Optional[float]]]]
    ] = None,
    data_context: Optional["DataContext"] = None,  # noqa: F821
    batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest, dict]] = None,
):
    """
    Store the configuration of this ParameterBuilder (directives may be given as values or as string references).

    Args:
        name: the name of this parameter -- this is user-specified parameter name (from configuration);
        it is not the fully-qualified parameter name; a fully-qualified parameter name must start with "$parameter."
        and may contain one or more subsequent parts (e.g., "$parameter.<my_param_from_config>.<metric_name>").
        metric_name: the name of a metric used in MetricConfiguration (must be a supported and registered metric)
        metric_domain_kwargs: used in MetricConfiguration
        metric_value_kwargs: used in MetricConfiguration
        sampling_method: choice of the sampling algorithm: "oneshot" (one observation) or "bootstrap" (default)
        enforce_numeric_metric: used in MetricConfiguration to ensure that metric computations return numeric values
        replace_nan_with_zero: if False, then if the computed metric gives NaN, then exception is raised; otherwise,
        if True (default), then if the computed metric gives NaN, then it is converted to the 0.0 (float) value.
        reduce_scalar_metric: if True (default), then reduces computation of 1-dimensional metric to scalar value.
        false_positive_rate: user-configured fraction between 0 and 1 expressing desired false positive rate for
        identifying unexpected values as judged by the upper- and lower- quantiles of the observed metric data.
        num_bootstrap_samples: Applicable only for the "bootstrap" sampling method -- if omitted (default), then
        9999 is used (default in "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html").
        round_decimals: user-configured non-negative integer indicating the number of decimals of the
        rounding precision of the computed parameter values (i.e., min_value, max_value) prior to packaging them on
        output.  If omitted, then no rounding is performed, unless the computed value is already an integer.
        truncate_values: user-configured directive for whether or not to allow the computed parameter values
        (i.e., lower_bound, upper_bound) to take on values outside the specified bounds when packaged on output.
        If empty or omitted, both bounds are set to None; a string value is stored as is (without key validation).
        data_context: DataContext
        batch_request: specified in ParameterBuilder configuration to get Batch objects for parameter computation.

    Raises:
        ProfilerExecutionError: if "truncate_values" is a dictionary-like object containing unrecognized keys.
    """
    super().__init__(
        name=name,
        data_context=data_context,
        batch_request=batch_request,
    )

    # Settle the "truncate_values" directive first: default it, accept a string as is, or validate the keys.
    if not truncate_values:
        truncate_values = {
            "lower_bound": None,
            "upper_bound": None,
        }
    elif not isinstance(truncate_values, str):
        supplied_keys: set = set(truncate_values.keys())
        if (
            not supplied_keys
            <= NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS
        ):
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Unrecognized truncate_values key(s) in {self.__class__.__name__}: "{str(supplied_keys - NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_TRUNCATE_DISTRIBUTION_KEYS)}" \
detected.
"""
            )

    self._metric_name = metric_name
    self._metric_domain_kwargs = metric_domain_kwargs
    self._metric_value_kwargs = metric_value_kwargs

    self._sampling_method = sampling_method

    self._enforce_numeric_metric = enforce_numeric_metric
    self._replace_nan_with_zero = replace_nan_with_zero

    self._reduce_scalar_metric = reduce_scalar_metric

    self._false_positive_rate = false_positive_rate

    self._num_bootstrap_samples = num_bootstrap_samples

    self._round_decimals = round_decimals

    self._truncate_values = truncate_values
def get_effective_column_names(
    self,
    batch_ids: Optional[List[str]] = None,
    validator: Optional["Validator"] = None,  # noqa: F821
    variables: Optional[ParameterContainer] = None,
) -> List[str]:
    """
    Determine which table columns remain after applying all configured column filtering directives.

    The filters are applied in this order: explicit include/exclude column names, include/exclude column name
    suffixes, and include/exclude semantic types (as reported by the configured "SemanticTypeFilter" class).
    As a side effect, the instantiated "SemanticTypeFilter" object is stored in "self._semantic_type_filter".

    Args:
        batch_ids: Batch identifiers; obtained by calling "get_batch_ids()" if omitted.  The last identifier is the
        one used for computing the "table.columns" metric.
        validator: Validator for computing the "table.columns" metric; obtained by calling "get_validator()" if
        omitted.
        variables: Part of the "rule state" available for "$variable"-style references in the directives.

    Returns:
        List of column names that pass all filters.

    Raises:
        ProfilerExecutionError: if no Batch identifiers are available or if a selected column is not in the table.
        ValueError: if a column name suffixes directive is neither a string nor an iterable.
    """

    def resolve_directive(parameter_reference):
        # Directives are obtained from "rule state" (here, variables only); from instance variable otherwise.
        return get_parameter_value_and_validate_return_type(
            domain=None,
            parameter_reference=parameter_reference,
            expected_return_type=None,
            variables=variables,
            parameters=None,
        )

    def to_suffix_tuple(suffixes, directive_name: str) -> tuple:
        # Build the suffix tuple exactly once, so that one-shot iterables are not exhausted by the first column.
        if isinstance(suffixes, str):
            return (suffixes,)

        if not isinstance(suffixes, (Iterable, list)):
            raise ValueError(
                f"Unrecognized {directive_name} directive -- must be a list or a string."
            )

        return tuple(suffixes)

    include_column_names: Optional[List[str]] = resolve_directive(
        self.include_column_names
    )
    exclude_column_names: Optional[List[str]] = resolve_directive(
        self.exclude_column_names
    )

    if batch_ids is None:
        batch_ids = self.get_batch_ids(variables=variables)

    # The last Batch identifier is needed below; fail with descriptive error instead of an indexing error.
    if not batch_ids:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"Utilizing a {self.__class__.__name__} requires a non-empty list of Batch identifiers."
        )

    if validator is None:
        validator = self.get_validator(variables=variables)

    table_columns: List[str] = validator.get_metric(
        metric=MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={
                "batch_id": batch_ids[-1],  # active_batch_id
            },
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
    )

    # Explicitly included columns take precedence; otherwise, all table columns are the starting point.
    effective_column_names: List[str] = include_column_names or table_columns

    if exclude_column_names is None:
        exclude_column_names = []

    column_name: str

    effective_column_names = [
        column_name
        for column_name in effective_column_names
        if column_name not in exclude_column_names
    ]

    for column_name in effective_column_names:
        if column_name not in table_columns:
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Error: The column "{column_name}" in BatchData does not exist.'
            )

    include_column_name_suffixes: Optional[
        Union[str, Iterable, List[str]]
    ] = resolve_directive(self.include_column_name_suffixes)
    exclude_column_name_suffixes: Optional[
        Union[str, Iterable, List[str]]
    ] = resolve_directive(self.exclude_column_name_suffixes)

    if include_column_name_suffixes:
        include_suffixes: tuple = to_suffix_tuple(
            include_column_name_suffixes, "include_column_name_suffixes"
        )
        effective_column_names = [
            column_name
            for column_name in effective_column_names
            if column_name.endswith(include_suffixes)
        ]

    if exclude_column_name_suffixes:
        exclude_suffixes: tuple = to_suffix_tuple(
            exclude_column_name_suffixes, "exclude_column_name_suffixes"
        )
        effective_column_names = [
            column_name
            for column_name in effective_column_names
            if not column_name.endswith(exclude_suffixes)
        ]

    semantic_type_filter_module_name: Optional[str] = resolve_directive(
        self.semantic_type_filter_module_name
    )
    if semantic_type_filter_module_name is None:
        semantic_type_filter_module_name = "great_expectations.rule_based_profiler.helpers.simple_semantic_type_filter"

    semantic_type_filter_class_name: Optional[str] = resolve_directive(
        self.semantic_type_filter_class_name
    )
    if semantic_type_filter_class_name is None:
        semantic_type_filter_class_name = "SimpleSemanticTypeFilter"

    # The filter receives the columns that survived the name-based filtering above.
    semantic_type_filter: SemanticTypeFilter = instantiate_class_from_config(
        config={
            "module_name": semantic_type_filter_module_name,
            "class_name": semantic_type_filter_class_name,
        },
        runtime_environment={
            "batch_ids": batch_ids,
            "validator": validator,
            "column_names": effective_column_names,
        },
        config_defaults={},
    )
    self._semantic_type_filter = semantic_type_filter

    include_semantic_types: Optional[
        Union[str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]]
    ] = resolve_directive(self.include_semantic_types)
    include_semantic_types = (
        self.semantic_type_filter.parse_semantic_domain_type_argument(
            semantic_types=include_semantic_types
        )
    )

    exclude_semantic_types: Optional[
        Union[str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]]
    ] = resolve_directive(self.exclude_semantic_types)
    exclude_semantic_types = (
        self.semantic_type_filter.parse_semantic_domain_type_argument(
            semantic_types=exclude_semantic_types
        )
    )

    if include_semantic_types:
        effective_column_names = [
            column_name
            for column_name in effective_column_names
            if self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
                column_name
            ]
            in include_semantic_types
        ]

    if exclude_semantic_types:
        effective_column_names = [
            column_name
            for column_name in effective_column_names
            if self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
                column_name
            ]
            not in exclude_semantic_types
        ]

    return effective_column_names
def get_metrics(
    self,
    metric_name: str,
    metric_domain_kwargs: Optional[Union[str, dict]] = None,
    metric_value_kwargs: Optional[Union[str, dict]] = None,
    enforce_numeric_metric: Union[str, bool] = False,
    replace_nan_with_zero: Union[str, bool] = False,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> MetricComputationResult:
    """
    General multi-batch metric computation facility.

    Computes specified metric (can be multi-dimensional, numeric, non-numeric, or mixed) and conditions (or
    "sanitizes") result according to two criteria: enforcing metric output to be numeric and handling NaN values.

    :param metric_name: Name of metric of interest, being computed.
    :param metric_domain_kwargs: Metric Domain Kwargs is an essential parameter of the MetricConfiguration object.
    :param metric_value_kwargs: Metric Value Kwargs is an essential parameter of the MetricConfiguration object.
    :param enforce_numeric_metric: Flag controlling whether or not metric output must be numerically-valued.
    :param replace_nan_with_zero: Directive controlling how NaN metric values, if encountered, should be handled.
    :param domain: Domain object scoping "$variable"/"$parameter"-style references in configuration and runtime.
    :param variables: Part of the "rule state" available for "$variable"-style references.
    :param parameters: Part of the "rule state" available for "$parameter"-style references.
    :return: MetricComputationResult object, containing both: data samples in the format "N x R^m", where "N" (most
    significant dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the
    multi-dimensional metric, whose values are being estimated, and details (to be used for metadata purposes).
    :raises ge_exceptions.ProfilerExecutionError: if "get_batch_ids()" yields no batch identifiers.
    """
    # IDs of Batch objects, on each of which the metric is computed.
    batch_ids: Optional[List[str]] = self.get_batch_ids(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    if not batch_ids:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"Utilizing a {self.__class__.__name__} requires a non-empty list of batch identifiers."
        )

    domain_kwargs = build_metric_domain_kwargs(
        batch_id=None,
        metric_domain_kwargs=metric_domain_kwargs,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    # The working copy receives a "batch_id" per iteration below; "domain_kwargs" itself is left untouched, because
    # it is reported in the "details" of the result.
    metric_domain_kwargs: dict = copy.deepcopy(domain_kwargs)

    # Obtain value kwargs from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    metric_value_kwargs = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=metric_value_kwargs,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    metric_values: MetricValues = []

    # The Validator object used for metric calculation purposes.
    validator: "Validator" = self.get_validator(  # noqa: F821
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    metric_value: MetricValue
    batch_id: str
    for batch_id in batch_ids:
        metric_domain_kwargs["batch_id"] = batch_id
        metric_value = validator.get_metric(
            metric=MetricConfiguration(
                metric_name=metric_name,
                metric_domain_kwargs=metric_domain_kwargs,
                metric_value_kwargs=metric_value_kwargs,
                metric_dependencies=None,
            )
        )
        # Wrap scalars, so that every Batch contributes one row to the stacked array.
        if np.isscalar(metric_value):
            metric_value = [metric_value]

        metric_values.append(metric_value)

    # Use the array handed back by the sanitizer, instead of relying on it having mutated its argument in place.
    metric_values = self._sanitize_metric_computation(
        metric_name=metric_name,
        metric_values=np.array(metric_values),
        enforce_numeric_metric=enforce_numeric_metric,
        replace_nan_with_zero=replace_nan_with_zero,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    return MetricComputationResult(
        metric_values=metric_values,
        details={
            "metric_configuration": {
                "metric_name": metric_name,
                "domain_kwargs": domain_kwargs,
                "metric_value_kwargs": metric_value_kwargs,
                "metric_dependencies": None,
            },
            "num_batches": len(metric_values),
        },
    )
def _build_parameters(
    self,
    domain: Domain,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
    recompute_existing_parameter_values: bool = False,
) -> Attributes:
    """
    Estimates the value range of the configured metric and returns it, together with metadata, as Attributes.

    Sequence of operations:
    1. Resolve "false_positive_rate"; reject values outside of [0, 1] and, with a warning, move values lying
       within NP_EPSILON of either bound to NP_EPSILON or to (1.0 - NP_EPSILON), respectively.
    2. Locate per-Batch metric values: if "metric_multi_batch_parameter_builder_name" is set, reference them as
       RAW_PARAMETER_KEY followed by that (resolved) name; otherwise, run the parent class computation and
       reference them through "raw_fully_qualified_parameter_name".
    3. Resolve and validate the "estimator" and "quantile_statistic_interpolation_method" directives.
    4. Choose rounding precision (0 for integer semantic domain type; heuristics otherwise) and translate "auto"
       interpolation into "nearest" (rounding precision of 0) or "linear" (any other precision).
    5. Select the estimator function with its keyword arguments and estimate the value range.
    6. Deep-copy the metadata of the metric values node and, if so directed, add the estimation histogram to it.

    Args:
        domain: Domain object scoping parameter references.
        variables: Part of the "rule state" available for "$variable"-style references.
        parameters: Part of the "rule state" available for "$parameter"-style references.
        recompute_existing_parameter_values: forwarded to the parent class "build_parameters()" call.

    Returns:
        Attributes object, containing computed parameter values and parameter computation details metadata.

    Raises:
        ge_exceptions.ProfilerExecutionError: if "false_positive_rate" is outside of [0, 1], or if either the
        "estimator" or the "quantile_statistic_interpolation_method" directive is not among recognized values.
    """
    # Obtain false_positive_rate from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    false_positive_rate: np.float64 = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.false_positive_rate,
        expected_return_type=(float, np.float64),
        variables=variables,
        parameters=parameters,
    )
    # The negated chained comparison is intentional: it also rejects NaN.
    if not (0.0 <= false_positive_rate <= 1.0):
        raise ge_exceptions.ProfilerExecutionError(
            f"""false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1], but {false_positive_rate} was provided."""
        )

    if false_positive_rate <= NP_EPSILON:
        warnings.warn(
            f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 0. A false_positive_rate of {NP_EPSILON} has been selected instead."""
        )
        false_positive_rate = NP_EPSILON
    elif false_positive_rate >= (1.0 - NP_EPSILON):
        warnings.warn(
            f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 1. A false_positive_rate of {1.0-NP_EPSILON} has been selected instead."""
        )
        false_positive_rate = np.float64(1.0 - NP_EPSILON)

    parameter_reference: str
    if not self.metric_multi_batch_parameter_builder_name:
        # No upstream ParameterBuilder was designated: compute metric value for each Batch object here.
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )
        parameter_reference = self.raw_fully_qualified_parameter_name
    else:
        # Obtain metric_multi_batch_parameter_builder_name from "rule state"; from instance variable otherwise.
        metric_multi_batch_parameter_builder_name: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.metric_multi_batch_parameter_builder_name,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )
        parameter_reference = f"{RAW_PARAMETER_KEY}{metric_multi_batch_parameter_builder_name}"

    # Retrieve the node holding metric values (and their metadata) for all Batch objects.
    metric_values_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=parameter_reference,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    metric_values: MetricValues = metric_values_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

    # Obtain estimator directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    estimator: str = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.estimator,
        expected_return_type=str,
        variables=variables,
        parameters=parameters,
    )
    recognized_estimators = NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES
    if estimator not in recognized_estimators:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""The directive "estimator" for {self.__class__.__name__} can be only one of {NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{estimator}" was detected).
"""
        )

    # Obtain quantile_statistic_interpolation_method directive from "rule state"; from instance variable otherwise.
    quantile_statistic_interpolation_method: str = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.quantile_statistic_interpolation_method,
        expected_return_type=str,
        variables=variables,
        parameters=parameters,
    )
    recognized_interpolation_methods = (
        NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS
    )
    if quantile_statistic_interpolation_method not in recognized_interpolation_methods:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""The directive "quantile_statistic_interpolation_method" for {self.__class__.__name__} can \
be only one of {NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS} \
("{quantile_statistic_interpolation_method}" was detected).
"""
        )

    round_decimals: int = (
        0
        if integer_semantic_domain_type(domain=domain)
        else self._get_round_decimals_using_heuristics(
            metric_values=metric_values,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
    )

    if quantile_statistic_interpolation_method == "auto":
        quantile_statistic_interpolation_method = "nearest" if round_decimals == 0 else "linear"

    # Keyword arguments shared by all estimators; the sampling-based ones add their own directives below.
    estimator_kwargs: dict = {
        "false_positive_rate": false_positive_rate,
        "quantile_statistic_interpolation_method": quantile_statistic_interpolation_method,
    }
    estimator_func: Callable
    if estimator == "bootstrap":
        estimator_func = self._get_bootstrap_estimate
        estimator_kwargs["n_resamples"] = self.n_resamples
        estimator_kwargs["random_seed"] = self.random_seed
    elif estimator == "kde":
        estimator_func = self._get_kde_estimate
        estimator_kwargs["n_resamples"] = self.n_resamples
        estimator_kwargs["bw_method"] = self.bw_method
        estimator_kwargs["random_seed"] = self.random_seed
    else:
        estimator_func = self._get_deterministic_estimate

    estimation_result: NumericRangeEstimationResult = self._estimate_metric_value_range(
        metric_values=metric_values,
        estimator_func=estimator_func,
        round_decimals=round_decimals,
        domain=domain,
        variables=variables,
        parameters=parameters,
        **estimator_kwargs,
    )
    value_range: np.ndarray = estimation_result.value_range

    # Work on a copy, so that metadata stored with the metric values node is not altered by the addition below.
    details: Dict[str, Any] = copy.deepcopy(metric_values_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY])

    # Obtain include_estimator_samples_histogram_in_details from "rule state"; from instance variable otherwise.
    include_estimator_samples_histogram_in_details: bool = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=self.include_estimator_samples_histogram_in_details,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )
    if include_estimator_samples_histogram_in_details:
        details["estimation_histogram"] = estimation_result.estimation_histogram

    return Attributes(
        {
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: value_range,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
        }
    )
def _get_domains(
    self,
    rule_name: str,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """Build the single column-pair Domain from the two effective column names.

    Args:
        rule_name: name of Rule object, for which "Domain" objects are obtained.
        variables: Optional variables to substitute when evaluating.

    Returns:
        One-element list, whose Domain maps "column_A" and "column_B" to the two effective column names (taken in
        sorted order) and records the inferred semantic type of each column in its details.

    Raises:
        ge_exceptions.ProfilerExecutionError: if the number of effective column names is not exactly two.
    """
    selected_column_names: List[str] = self.get_effective_column_names(
        batch_ids=self.get_batch_ids(variables=variables),
        validator=self.get_validator(variables=variables),
        variables=variables,
    )

    if not selected_column_names or len(selected_column_names) != 2:
        raise ge_exceptions.ProfilerExecutionError(
            message=f"""Error: Columns specified for {self.__class__.__name__} in sorted order must correspond to \
"column_A" and "column_B" (in this exact order).
"""
        )

    # Sorting decides which of the two columns becomes "column_A" and which becomes "column_B".
    column_a, column_b = sorted(selected_column_names)

    inferred_semantic_types: Dict[str, SemanticDomainTypes] = {}
    for name in (column_a, column_b):
        inferred_semantic_types[name] = (
            self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[name]
        )

    column_pair_domain = Domain(
        domain_type=self.domain_type,
        domain_kwargs={
            "column_A": column_a,
            "column_B": column_b,
        },
        details={
            INFERRED_SEMANTIC_TYPE_KEY: inferred_semantic_types,
        },
        rule_name=rule_name,
    )
    return [column_pair_domain]
def get_effective_column_names(
    self,
    batch_ids: Optional[List[str]] = None,
    validator: Optional["Validator"] = None,  # noqa: F821
    variables: Optional[ParameterContainer] = None,
) -> List[str]:
    """
    Determines names of columns to be included as part of returned "Domain" objects.

    Directives are applied in this order:
    1. Start from "include_column_names", if any are configured; from all table columns otherwise.
    2. Drop "exclude_column_names"; every remaining name must exist among the table columns.
    3. Keep only names ending with one of "include_column_name_suffixes" (if configured).
    4. Drop names ending with one of "exclude_column_name_suffixes" (if configured).
    5. Instantiate the semantic type filter for the remaining names and store it on this object.
    6. Keep only names, whose inferred semantic type is among "include_semantic_types" (if configured).
    7. Drop names, whose inferred semantic type is among "exclude_semantic_types" (if configured).

    Args:
        batch_ids: Batch identifiers; obtained through "get_batch_ids()" if not supplied.
        validator: Validator used for computing "table.columns"; obtained through "get_validator()" if not supplied.
        variables: Part of the "rule state" available for "$variable"-style references.

    Returns:
        List of column names remaining after all directives have been applied.

    Raises:
        ge_exceptions.ProfilerExecutionError: if a selected column name is not among the table columns.
    """
    include_column_names: List[str] = cast(
        List[str],
        self._resolve_list_type_property(
            property_name="include_column_names",
            property_value_type=list,
            variables=variables,
        ),
    )

    if batch_ids is None:
        batch_ids = self.get_batch_ids(variables=variables)

    if validator is None:
        validator = self.get_validator(variables=variables)

    table_columns: List[str] = validator.get_metric(
        metric=MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={
                "batch_id": batch_ids[-1],  # active_batch_id
            },
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
    )

    selected_column_names: List[str] = include_column_names or table_columns

    exclude_column_names: List[str] = cast(
        List[str],
        self._resolve_list_type_property(
            property_name="exclude_column_names",
            property_value_type=list,
            variables=variables,
        ),
    )
    selected_column_names = [name for name in selected_column_names if name not in exclude_column_names]

    column_name: str
    for column_name in selected_column_names:
        if column_name not in table_columns:
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Error: The column "{column_name}" in BatchData does not exist.'
            )

    include_column_name_suffixes: List[str] = cast(
        List[str],
        self._resolve_list_type_property(
            property_name="include_column_name_suffixes",
            property_value_type=(str, Iterable, list),
            variables=variables,
        ),
    )
    if include_column_name_suffixes:
        # "str.endswith()" accepts a tuple of alternatives.
        accepted_suffixes = tuple(include_column_name_suffixes)
        selected_column_names = [name for name in selected_column_names if name.endswith(accepted_suffixes)]

    exclude_column_name_suffixes: List[str] = cast(
        List[str],
        self._resolve_list_type_property(
            property_name="exclude_column_name_suffixes",
            property_value_type=(str, Iterable, list),
            variables=variables,
        ),
    )
    if exclude_column_name_suffixes:
        rejected_suffixes = tuple(exclude_column_name_suffixes)
        selected_column_names = [name for name in selected_column_names if not name.endswith(rejected_suffixes)]

    # Obtain semantic_type_filter_module_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    semantic_type_filter_module_name: Optional[str] = get_parameter_value_and_validate_return_type(
        domain=None,
        parameter_reference=self.semantic_type_filter_module_name,
        expected_return_type=None,
        variables=variables,
        parameters=None,
    )
    if semantic_type_filter_module_name is None:
        semantic_type_filter_module_name = (
            "great_expectations.rule_based_profiler.helpers.simple_semantic_type_filter"
        )

    # Obtain semantic_type_filter_class_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    semantic_type_filter_class_name: Optional[str] = get_parameter_value_and_validate_return_type(
        domain=None,
        parameter_reference=self.semantic_type_filter_class_name,
        expected_return_type=None,
        variables=variables,
        parameters=None,
    )
    if semantic_type_filter_class_name is None:
        semantic_type_filter_class_name = "SimpleSemanticTypeFilter"

    # The filter is given the column names that survived the name-based directives above.
    self._semantic_type_filter = instantiate_class_from_config(
        config={
            "module_name": semantic_type_filter_module_name,
            "class_name": semantic_type_filter_class_name,
        },
        runtime_environment={
            "batch_ids": batch_ids,
            "validator": validator,
            "column_names": selected_column_names,
        },
        config_defaults={},
    )

    include_semantic_types: List[Union[str, SemanticDomainTypes]] = cast(
        List[Union[str, SemanticDomainTypes]],
        self._resolve_list_type_property(
            property_name="include_semantic_types",
            property_value_type=(str, SemanticDomainTypes, list),
            variables=variables,
        ),
    )
    include_semantic_types = self.semantic_type_filter.parse_semantic_domain_type_argument(
        semantic_types=include_semantic_types
    )
    if include_semantic_types:
        selected_column_names = [
            name
            for name in selected_column_names
            if self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[name]
            in include_semantic_types
        ]

    exclude_semantic_types: List[Union[str, SemanticDomainTypes]] = cast(
        List[Union[str, SemanticDomainTypes]],
        self._resolve_list_type_property(
            property_name="exclude_semantic_types",
            property_value_type=(str, SemanticDomainTypes, list),
            variables=variables,
        ),
    )
    exclude_semantic_types = self.semantic_type_filter.parse_semantic_domain_type_argument(
        semantic_types=exclude_semantic_types
    )
    if exclude_semantic_types:
        selected_column_names = [
            name
            for name in selected_column_names
            if self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[name]
            not in exclude_semantic_types
        ]

    return selected_column_names
def _sanitize_metric_computation(
    self,
    metric_name: str,
    metric_values: np.ndarray,
    enforce_numeric_metric: Union[str, bool] = False,
    replace_nan_with_zero: Union[str, bool] = False,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> np.ndarray:
    """
    This method conditions (or "sanitizes") data samples in the format "N x R^m", where "N" (most significant
    dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the multi-dimensional
    metric, whose values are being estimated.  The "conditioning" operations are:
    1. If "enforce_numeric_metric" flag is set, raise an error if a non-numeric value is found in sample vectors.
    2. Further, if a NaN is encountered in a sample vector and "replace_nan_with_zero" is True, then replace those
       NaN values with the 0.0 floating point number; if "replace_nan_with_zero" is False, then raise an error.

    Both operations are carried out only if "enforce_numeric_metric" resolves to True.  NaN replacement is done
    in place; hence, the returned array is the same object as the "metric_values" argument.

    :param metric_name: Name of the metric (used in the NaN error message).
    :param metric_values: Array of data samples, whose outer-most dimension enumerates measurements.
    :param enforce_numeric_metric: Flag (or reference to it) requiring sample vectors to be of numeric dtype.
    :param replace_nan_with_zero: Flag (or reference to it) selecting NaN replacement over raising an error.
    :param domain: Domain object scoping "$variable"/"$parameter"-style references.
    :param variables: Part of the "rule state" available for "$variable"-style references.
    :param parameters: Part of the "rule state" available for "$parameter"-style references.
    :return: The "metric_values" array, with NaN values replaced by 0.0 where applicable.
    :raises ge_exceptions.ProfilerExecutionError: if numeric metric is enforced and a sample vector is non-numeric.
    :raises ValueError: if numeric metric is enforced, NaN is found, and "replace_nan_with_zero" is False.
    """
    # Obtain enforce_numeric_metric from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    enforce_numeric_metric = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=enforce_numeric_metric,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )

    # Obtain replace_nan_with_zero from "rule state" (i.e., variables and parameters); from instance variable otherwise.
    replace_nan_with_zero = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=replace_nan_with_zero,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )

    # All checks below are conditional on numeric enforcement; without it, there is nothing to do.
    if not enforce_numeric_metric:
        return metric_values

    # Outer-most dimension is data samples (e.g., one per Batch); the rest are dimensions of the actual metric.
    metric_value_shape: tuple = metric_values.shape[1:]

    metric_value_idx: tuple
    metric_value_vector: np.ndarray
    # Visit every element position of the multi-dimensional metric (a single empty index, if metric is a scalar).
    for metric_value_idx in np.ndindex(metric_value_shape):
        # Prefixing multi-dimensional index with "(slice(None),)" is equivalent to "[:,]" access; this is basic
        # indexing, so the "N"-element-long vector of samples is a view into "metric_values", not a copy.
        metric_value_vector = metric_values[(slice(None),) + metric_value_idx]

        if not np.issubdtype(metric_value_vector.dtype, np.number):
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(metric_value_vector.dtype)}" was computed).
"""
            )

        if np.any(np.isnan(metric_value_vector)):
            if not replace_nan_with_zero:
                raise ValueError(
                    f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
"""
                )

            # "copy=False" writes through the view into "metric_values"; with "copy=True", the replacement would
            # go into a temporary array and be lost.
            # NOTE: "np.nan_to_num()" also maps infinities to large finite numbers.
            np.nan_to_num(metric_value_vector, copy=False, nan=0.0)

    return metric_values
def get_metrics(
    self,
    batch_ids: List[str],
    validator: Validator,
    metric_name: str,
    metric_domain_kwargs: Optional[Union[str, dict]] = None,
    metric_value_kwargs: Optional[Union[str, dict]] = None,
    enforce_numeric_metric: Optional[Union[str, bool]] = False,
    replace_nan_with_zero: Optional[Union[str, bool]] = False,
    domain: Optional[Domain] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Dict[str, Union[Union[np.ndarray, List[Union[Any, Number]]], Dict[str, Any]]]:
    """
    Computes the specified metric once per Batch and returns the values together with descriptive details.

    If "enforce_numeric_metric" resolves to True, every metric value must be numeric; a NaN value is then either
    replaced with 0.0 (when "replace_nan_with_zero" resolves to True) or causes an error.

    :param batch_ids: IDs of Batch objects, on each of which the metric is computed.
    :param validator: Validator object used for metric calculation purposes.
    :param metric_name: Name of metric of interest, being computed.
    :param metric_domain_kwargs: Domain kwargs (or reference to them) for the MetricConfiguration object.
    :param metric_value_kwargs: Value kwargs (or reference to them) for the MetricConfiguration object.
    :param enforce_numeric_metric: Flag (or reference to it) requiring metric values to be numeric.
    :param replace_nan_with_zero: Flag (or reference to it) selecting NaN replacement over raising an error.
    :param domain: Domain object scoping "$variable"/"$parameter"-style references.
    :param variables: Part of the "rule state" available for "$variable"-style references.
    :param parameters: Part of the "rule state" available for "$parameter"-style references.
    :return: Dictionary with "metric_values" (one entry per Batch) and "details" (metric configuration, as well as
    the number of Batch objects processed).
    :raises ge_exceptions.ProfilerExecutionError: if numeric metric is enforced and a value is not numeric.
    :raises ValueError: if numeric metric is enforced, a value is NaN, and "replace_nan_with_zero" is False.
    """
    domain_kwargs = build_metric_domain_kwargs(
        batch_id=None,
        metric_domain_kwargs=metric_domain_kwargs,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    # Per-Batch working copy; "domain_kwargs" itself is reported in "details" without any "batch_id".
    metric_domain_kwargs: dict = copy.deepcopy(domain_kwargs)

    # Resolve the remaining directives from rule state (i.e., variables and parameters); the values passed in are
    # used otherwise.
    metric_value_kwargs = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=metric_value_kwargs,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    enforce_numeric_metric = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=enforce_numeric_metric,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )
    replace_nan_with_zero = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=replace_nan_with_zero,
        expected_return_type=bool,
        variables=variables,
        parameters=parameters,
    )

    collected_values: List[Union[Any, Number]] = []

    metric_value: Union[Any, Number]
    batch_id: str
    for batch_id in batch_ids:
        metric_domain_kwargs["batch_id"] = batch_id
        metric_value = validator.get_metric(
            metric=MetricConfiguration(
                metric_name=metric_name,
                metric_domain_kwargs=metric_domain_kwargs,
                metric_value_kwargs=metric_value_kwargs,
                metric_dependencies=None,
            )
        )

        if enforce_numeric_metric:
            if not is_numeric(value=metric_value):
                raise ge_exceptions.ProfilerExecutionError(
                    message=f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(type(metric_value))}" was computed).
"""
                )

            if np.isnan(metric_value):
                if replace_nan_with_zero:
                    metric_value = 0.0
                else:
                    raise ValueError(
                        f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
"""
                    )

        collected_values.append(metric_value)

    metric_configuration: Dict[str, Any] = {
        "metric_name": metric_name,
        "domain_kwargs": domain_kwargs,
        "metric_value_kwargs": metric_value_kwargs,
        "metric_dependencies": None,
    }
    return {
        "metric_values": collected_values,
        "details": {
            "metric_configuration": metric_configuration,
            "num_batches": len(collected_values),
        },
    }