예제 #1
0
    def _get_kde_estimate(
        metric_values: np.ndarray,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        **kwargs,
    ) -> NumericRangeEstimationResult:
        """
        Resolves the KDE estimator directives and delegates to "compute_kde_quantiles_point_estimate()".

        The "n_resamples", "bw_method", and "random_seed" entries of "kwargs" are resolved against the "rule state"
        (i.e., variables and parameters).  When "n_resamples" or "bw_method" resolves to None, the corresponding
        "DEFAULT_KDE_*" constant of NumericMetricRangeMultiBatchParameterBuilder is used; "random_seed" is passed
        through as resolved (possibly None).

        Args:
            metric_values: Metric values, forwarded unchanged to the estimator.
            domain: Domain object, used for resolving parameter references.
            variables: Optional "variables" part of the "rule state".
            parameters: Optional "parameters" part of the "rule state".
            kwargs: Estimator directives; "false_positive_rate" defaults to 5.0e-2 if absent, while
                "quantile_statistic_interpolation_method" defaults to None if absent.

        Returns:
            Result of "compute_kde_quantiles_point_estimate()" for resolved directives.
        """
        # Number of resamples: "rule state" override first, class-level default as fallback.
        resolved_n_resamples: Optional[int] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=kwargs.get("n_resamples"),
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        n_resamples: int = (
            NumericMetricRangeMultiBatchParameterBuilder.DEFAULT_KDE_NUM_RESAMPLES
            if resolved_n_resamples is None
            else resolved_n_resamples
        )

        # Bandwidth method: "rule state" override first, class-level default as fallback.
        resolved_bw_method: Optional[Union[str, float, Callable]] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=kwargs.get("bw_method"),
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        bw_method: Union[str, float, Callable] = (
            NumericMetricRangeMultiBatchParameterBuilder.DEFAULT_KDE_BW_METHOD
            if resolved_bw_method is None
            else resolved_bw_method
        )

        # Random seed has no fallback: None is handed to the estimator as is.
        random_seed: Optional[int] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=kwargs.get("random_seed"),
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        return compute_kde_quantiles_point_estimate(
            metric_values=metric_values,
            false_positive_rate=kwargs.get("false_positive_rate", 5.0e-2),
            quantile_statistic_interpolation_method=kwargs.get("quantile_statistic_interpolation_method"),
            n_resamples=n_resamples,
            bw_method=bw_method,
            random_seed=random_seed,
        )
예제 #2
0
    def _build_expectation_configuration(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> Optional[ExpectationConfiguration]:
        """
        Returns either an ExpectationConfiguration object or None, depending on evaluation of condition.

        Every value in "self.kwargs" as well as "self.meta" is resolved against the "rule state" (i.e., variables
        and parameters) before the condition (if any) is evaluated.

        Args:
            domain: Domain object, used for resolving parameter references.
            variables: Optional "variables" part of the "rule state".
            parameters: Optional "parameters" part of the "rule state".

        Returns:
            ExpectationConfiguration if no condition is configured or the condition evaluates as true; None otherwise.
        """
        expectation_kwargs: Dict[str, Any] = {}
        kwarg_name: str
        kwarg_reference: str
        for kwarg_name, kwarg_reference in self.kwargs.items():
            expectation_kwargs[kwarg_name] = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=kwarg_reference,
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            )

        meta: Dict[str, Any] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.meta,
            expected_return_type=dict,
            variables=variables,
            parameters=parameters,
        )

        if self._condition:
            condition_holds: bool = self._evaluate_condition(
                parsed_condition=self._parse_condition(),
                domain=domain,
                variables=variables,
                parameters=parameters,
            )
            # A configured condition that does not hold suppresses the expectation entirely.
            if not condition_holds:
                return None

        return ExpectationConfiguration(
            expectation_type=self._expectation_type,
            kwargs=expectation_kwargs,
            meta=meta,
        )
예제 #3
0
    def _resolve_list_type_property(
        self,
        property_name: str,
        property_value_type: Union[type, Tuple[type, ...]],
        variables: Optional[ParameterContainer] = None,
    ) -> List[type]:
        """
        Reads the attribute named "property_name" from this object and resolves each of its elements.

        A missing attribute or a None value is treated as an empty list; a string is wrapped into a one-element list.

        Args:
            property_name: Name of the attribute of this object to read.
            property_value_type: Type (or tuple of types) that a non-string, non-None attribute value must satisfy.
            variables: Optional "variables" part of the "rule state", used for resolving each element.

        Returns:
            List containing the resolved value of every element of the property.

        Raises:
            ValueError: if the attribute value is neither None, nor a string, nor of "property_value_type".
        """
        raw_value = getattr(self, property_name, [])
        if raw_value is None:
            raw_value = []
        elif isinstance(raw_value, str):
            raw_value = [raw_value]
        elif not isinstance(raw_value, property_value_type):
            raise ValueError(
                f'Unrecognized "{property_name}" directive -- must be "{property_value_type}" (or string).'
            )

        resolved_values: list = []
        for element in raw_value:
            # No Domain and no parameters are supplied here, so only "variables" are available for resolution.
            resolved_values.append(
                get_parameter_value_and_validate_return_type(
                    domain=None,
                    parameter_reference=element,
                    expected_return_type=None,
                    variables=variables,
                    parameters=None,
                )
            )

        return resolved_values
예제 #4
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds the union of "table.columns" names over all Batch objects, along with a per-Batch match ratio.

        The parent class computes the per-Batch metric values; these are then read back from the "rule state" under
        "self.raw_fully_qualified_parameter_name" and combined.

        Returns:
            Attributes object, whose value is the set union of column names across all Batch objects and whose
            metadata holds "success_ratio": the mean, over Batch objects, of 1 if the column names set of the Batch
            equals the union and 0 otherwise.
        """
        # Have the parent class compute "table.columns" for every Batch object.
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Read the per-Batch "table.columns" values back from the "rule state".
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.raw_fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        per_batch_column_names: MetricValues = parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        per_batch_column_sets: List[Set[str]] = [
            set(batch_column_names) for batch_column_names in per_batch_column_names
        ]
        all_column_names: Set[str] = set().union(*per_batch_column_sets)

        # Each Batch scores 1 when its column names are exactly the union, 0 otherwise.
        match_flags: List[int] = [
            int(batch_column_set == all_column_names) for batch_column_set in per_batch_column_sets
        ]
        success_ratio: np.float64 = np.mean(np.asarray(match_flags))

        return Attributes({
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: all_column_names,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: {
                "success_ratio": success_ratio,
            },
        })
예제 #5
0
    def _get_round_decimals_using_heuristics(
        self,
        metric_values: np.ndarray,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> int:
        # Obtain round_decimals directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        round_decimals: Optional[
            int] = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.round_decimals,
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            )
        if round_decimals is None:
            round_decimals = MAX_DECIMALS
        else:
            if not isinstance(round_decimals, int) or (round_decimals < 0):
                raise ge_exceptions.ProfilerExecutionError(
                    message=
                    f"""The directive "round_decimals" for {self.__class__.__name__} can be 0 or a
positive integer, or must be omitted (or set to None).
""")

        if np.issubdtype(metric_values.dtype, np.integer):
            round_decimals = 0

        return round_decimals
예제 #6
0
    def _get_bootstrap_estimate(
        metric_values: np.ndarray,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        **kwargs,
    ) -> NumericRangeEstimationResult:
        """
        Resolves the bootstrap estimator directives and delegates to "compute_bootstrap_quantiles_point_estimate()".

        The "num_bootstrap_samples" and "bootstrap_random_seed" entries of "kwargs" are resolved against the
        "rule state" (i.e., variables and parameters).  When "num_bootstrap_samples" resolves to None,
        "NumericMetricRangeMultiBatchParameterBuilder.DEFAULT_BOOTSTRAP_NUM_RESAMPLES" is used; the random seed is
        passed through as resolved (possibly None).

        Args:
            metric_values: Metric values, forwarded unchanged to the estimator.
            domain: Domain object, used for resolving parameter references.
            variables: Optional "variables" part of the "rule state".
            parameters: Optional "parameters" part of the "rule state".
            kwargs: Estimator directives; "false_positive_rate" defaults to 5.0e-2 if absent, while
                "quantile_statistic_interpolation_method" defaults to None if absent.

        Returns:
            Result of "compute_bootstrap_quantiles_point_estimate()" for resolved directives.
        """
        # Number of bootstrap samples: "rule state" override first, class-level default as fallback.
        num_bootstrap_samples: Optional[int] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=kwargs.get("num_bootstrap_samples"),
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        n_resamples: int = (
            NumericMetricRangeMultiBatchParameterBuilder.DEFAULT_BOOTSTRAP_NUM_RESAMPLES
            if num_bootstrap_samples is None
            else num_bootstrap_samples
        )

        # Random seed has no fallback: None is handed to the estimator as is.
        random_seed: Optional[int] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=kwargs.get("bootstrap_random_seed"),
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        false_positive_rate: np.float64 = kwargs.get("false_positive_rate", 5.0e-2)
        interpolation_method: str = kwargs.get("quantile_statistic_interpolation_method")

        return compute_bootstrap_quantiles_point_estimate(
            metric_values=metric_values,
            false_positive_rate=false_positive_rate,
            n_resamples=n_resamples,
            random_seed=random_seed,
            quantile_statistic_interpolation_method=interpolation_method,
        )
예제 #7
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.
        """
        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        num_batch_ids: int = len(batch_ids)
        if num_batch_ids != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""Utilizing a {self.__class__.__name__} requires exactly one Batch of data to be available
({num_batch_ids} Batch identifiers found).
"""
            )

        # Compute metric value for one Batch object (expressed as list of Batch objects).
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            json_serialize=False,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve metric values for one Batch object (expressed as list of Batch objects).
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: None
                if parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY] is None
                else parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY][0],
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: parameter_node[
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                ],
            }
        )
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds the collection of unique values across all Batch objects in the given Domain.

        The parent class computes the per-Batch metric values; the attributed values are then read back from the
        "rule state" and reduced by "_get_unique_values_from_nested_collection_of_sets()".

        Returns:
            Attributes object, containing unique values over all Batch objects and the metadata of the parameter
            node computed by the parent class.
        """
        # Have the parent class build the unique values for every Batch object.
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Read the per-Batch results back from the "rule state".
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        attributed_metric_values = parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_ATTRIBUTED_VALUE_KEY]
        metric_values: MetricValues = AttributedResolvedMetrics.get_metric_values_from_attributed_metric_values(
            attributed_metric_values=attributed_metric_values
        )

        # Collapse the per-Batch collections into a single collection of unique values.
        unique_values = _get_unique_values_from_nested_collection_of_sets(collection=metric_values)
        details = parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY]

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: unique_values,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
            }
        )
예제 #9
0
    def _get_truncate_values_using_heuristics(
        self,
        metric_values: np.ndarray,
        domain: Domain,
        *,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> Dict[str, Union[Optional[int], Optional[float]]]:
        # Obtain truncate_values directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        truncate_values: Dict[
            str, Optional[Number]
        ] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.truncate_values,
            expected_return_type=dict,
            variables=variables,
            parameters=parameters,
        )

        distribution_boundary: Optional[Union[int, float]]
        if not all(
            [
                (
                    distribution_boundary is None
                    or is_numeric(value=distribution_boundary)
                )
                for distribution_boundary in truncate_values.values()
            ]
        ):
            raise ge_exceptions.ProfilerExecutionError(
                message=f"""The directive "truncate_values" for {self.__class__.__name__} must specify the
[lower_bound, upper_bound] closed interval, where either boundary is a numeric value (or None).
"""
            )

        lower_bound: Optional[Number] = truncate_values.get("lower_bound")
        upper_bound: Optional[Number] = truncate_values.get("upper_bound")

        if lower_bound is None and np.all(np.greater(metric_values, NP_EPSILON)):
            lower_bound = 0.0

        if upper_bound is None and np.all(np.less(metric_values, (-NP_EPSILON))):
            upper_bound = 0.0

        return {
            "lower_bound": lower_bound,
            "upper_bound": upper_bound,
        }
예제 #10
0
 def _traverse_and_substitute(node: Any) -> None:
     if isinstance(node, dict):
         for key, val in node.copy().items():
             if isinstance(val,
                           str) and val.startswith(VARIABLES_PREFIX):
                 node[
                     key] = get_parameter_value_and_validate_return_type(
                         domain=None,
                         parameter_reference=val,
                         variables=variables_container,
                         parameters=None,
                     )
             _traverse_and_substitute(node=val)
     elif isinstance(node, list):
         for val in node:
             _traverse_and_substitute(node=val)
예제 #11
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds the parameter value for one Batch object by unwrapping the list computed by the parent class.

        Returns:
            Attributes object, whose value is the first element of the value computed by the parent class (or None
            if that value is None), and whose metadata is the metadata of the same parameter node.
        """
        # The parent class computes the metric as a list over Batch objects (not JSON-serialized).
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            json_serialize=False,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Read the computed parameter node back from the "rule state".
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        batch_values = parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
        single_value = None if batch_values is None else batch_values[0]
        details = parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY]

        return Attributes({
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: single_value,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
        })
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Computes the configured metric and packages its values, attributed values, and computation details.

        Returns:
            Attributes object.  If exactly one attributed resolved metric is obtained, the value is its conditioned
            metric values (reduced to the first column if "reduce_scalar_metric" is set and the values form a NumPy
            array having more than one dimension and a second dimension of size 1), and the attributed value is its
            conditioned attributed metric values.  Otherwise, both the value and the attributed value are the
            sequence of attributed resolved metrics itself.  The metadata is always the metric computation details.
        """
        metric_computation_result: MetricComputationResult = self.get_metrics(
            metric_name=self.metric_name,
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=self.metric_value_kwargs,
            enforce_numeric_metric=self.enforce_numeric_metric,
            replace_nan_with_zero=self.replace_nan_with_zero,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        details: MetricComputationDetails = metric_computation_result.details

        # Resolve "reduce_scalar_metric" against the "rule state" (i.e., variables and parameters).
        reduce_scalar_metric: bool = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.reduce_scalar_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        attributed_metrics = metric_computation_result.attributed_resolved_metrics
        if len(attributed_metrics) != 1:
            # Not a single metric: hand the attributed resolved metrics back as they are.
            return Attributes({
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: attributed_metrics,
                FULLY_QUALIFIED_PARAMETER_NAME_ATTRIBUTED_VALUE_KEY: attributed_metrics,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
            })

        sole_metric = attributed_metrics[0]
        metric_values = sole_metric.conditioned_metric_values

        # As a simplification, apply reduction to scalar in case of one-dimensional metric (for convenience).
        is_single_column_array: bool = (
            isinstance(metric_values, np.ndarray)
            and metric_values.ndim > 1
            and metric_values.shape[1] == 1
        )
        if reduce_scalar_metric and is_single_column_array:
            metric_values = metric_values[:, 0]

        return Attributes({
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: metric_values,
            FULLY_QUALIFIED_PARAMETER_NAME_ATTRIBUTED_VALUE_KEY: sole_metric.conditioned_attributed_metric_values,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
        })
def test_partition_parameter_builder_alice_continuous(
    alice_columnar_table_single_batch_context,
):
    """
    Builds a bucketized partition for the "user_id" column of the single-Batch "alice" table and checks that the
    resulting parameter node holds the expected "column.histogram" bins, weights, and computation details.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = dict(
        datasource_name="alice_columnar_table_single_batch_datasource",
        data_connector_name="alice_columnar_table_single_batch_data_connector",
        data_asset_name="alice_columnar_table_single_batch_data_asset",
    )

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "user_id"},
        rule_name="my_rule",
    )

    # Start from an empty "rule state" for this Domain.
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}
    variables: Optional[ParameterContainer] = None

    assert parameter_container.parameter_nodes is None

    partition_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    actual_parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=partition_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    expected_metric_configuration: dict = {
        "metric_name": "column.histogram",
        "domain_kwargs": {"column": "user_id"},
        "metric_value_kwargs": {"bins": [397433.0, 4942918.5, 9488404.0]},
        "metric_dependencies": None,
    }
    expected_parameter_value: dict = {
        "value": {
            "bins": [397433.0, 4942918.5, 9488404.0],
            "weights": [0.6666666666666666, 0.3333333333333333],
            "tail_weights": [0.0, 0.0],
        },
        "details": {
            "metric_configuration": expected_metric_configuration,
            "num_batches": 1,
        },
    }

    assert actual_parameter_node == expected_parameter_value
def test_partition_parameter_builder_alice_continuous_changed_to_categorical(
    alice_columnar_table_single_batch_context, ):
    """
    Build a partition for the "event_ts" column with "bucketize_data=True" and check the stored parameter node.

    The expected result is a "column.value_counts" partition (distinct values with their weights), not a histogram.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Single-Batch request against the "alice" fixture datasource.
    batch_request: dict = dict(
        datasource_name="alice_columnar_table_single_batch_datasource",
        data_connector_name="alice_columnar_table_single_batch_data_connector",
        data_asset_name="alice_columnar_table_single_batch_data_asset",
    )

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "event_ts"},
        rule_name="my_rule",
    )

    # Rule state starts out empty: no variables, and an unpopulated container registered under the domain id.
    variables: Optional[ParameterContainer] = None
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}
    assert container.parameter_nodes is None

    partition_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    # Three distinct timestamps, each carrying an equal share of the weight.
    one_third: float = 0.3333333333333333
    expected_value: dict = {
        "values": [
            "2004-10-19 10:23:54",
            "2004-10-19 10:23:55",
            "2004-10-19 11:05:20",
        ],
        "weights": [one_third, one_third, one_third],
    }
    expected_details: dict = {
        "metric_configuration": {
            "metric_name": "column.value_counts",
            "domain_kwargs": {"column": "event_ts"},
            "metric_value_kwargs": {"sort": "value"},
            "metric_dependencies": None,
        },
        "num_batches": 1,
    }
    expected_parameter_value: dict = {
        "value": expected_value,
        "details": expected_details,
    }

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=partition_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert actual_node == expected_parameter_value
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Computes the mean of the per-Batch ratios of "unexpected_count" to the non-null value count.

        Total counts (and, when a null count parameter builder name is configured, null counts) are read from the
        parameter nodes referenced by the configured builder names; the "unexpected_count" values are produced by
        delegating to the parent class computation and then read back from this builder's own parameter node.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.
        """

        def _lookup(reference, expected_return_type=None):
            # Every "rule state" lookup in this method shares the same domain, variables, and parameters.
            return get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=reference,
                expected_return_type=expected_return_type,
                variables=variables,
                parameters=parameters,
            )

        # Total counts: resolve the builder name first, then fetch the node stored under its qualified name.
        total_count_parameter_builder_name: str = _lookup(
            self.total_count_parameter_builder_name,
            expected_return_type=str,
        )
        total_count_node: ParameterNode = _lookup(
            f"{PARAMETER_KEY}{total_count_parameter_builder_name}")
        total_counts: MetricValues = total_count_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        # Null counts are optional: the builder name may resolve to None.
        null_count_parameter_builder_name: Optional[str] = _lookup(
            self.null_count_parameter_builder_name)

        batch_ids: Optional[List[str]] = self.get_batch_ids(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        num_batches: int = len(batch_ids)

        null_counts: MetricValues
        if null_count_parameter_builder_name is not None:
            null_count_node: ParameterNode = _lookup(
                f"{PARAMETER_KEY}{null_count_parameter_builder_name}")
            null_counts = null_count_node[
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
        else:
            # Without a null count source, use one zero per Batch ID.
            null_counts = np.zeros(shape=(num_batches, ))

        nonnull_counts: np.ndarray = total_counts - null_counts

        # Compute "unexpected_count" corresponding to "map_metric_name" (given as argument to this "ParameterBuilder").
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            json_serialize=None,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve the "unexpected_count" node that the parent computation stored under this builder's name.
        unexpected_count_node: ParameterNode = _lookup(
            self.fully_qualified_parameter_name)
        unexpected_counts: MetricValues = unexpected_count_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        mean_unexpected_count_ratio: np.float64 = np.mean(
            unexpected_counts / nonnull_counts)

        result: dict = {
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY:
            mean_unexpected_count_ratio,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY:
            unexpected_count_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY],
        }
        return Attributes(result)
예제 #16
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Scores every candidate "strftime_format" string against the column values and reports the best match.

        For each candidate, the success ratio is the share of non-null values (summed over all Batch objects) that
        "column_values.match_strftime_format.unexpected_count" does not count as unexpected.  Choosing the winner
        relative to the configured threshold is delegated to "ParameterBuilder._get_best_candidate_above_threshold".

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.
        """
        metric_computation_result: MetricComputationResult = self.get_metrics(
            metric_name="column_values.nonnull.count",
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=self.metric_value_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        # This should never happen.
        if len(metric_computation_result.attributed_resolved_metrics) != 1:
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Result of metric computations for {self.__class__.__name__} must be a list with exactly 1 element of type "AttributedResolvedMetrics" ({metric_computation_result.attributed_resolved_metrics} found).'
            )

        nonnull_metric_values: MetricValues = (
            metric_computation_result.attributed_resolved_metrics[0].metric_values
        )
        if nonnull_metric_values is None:
            raise ge_exceptions.ProfilerExecutionError(
                message=f"Result of metric computations for {self.__class__.__name__} is empty."
            )

        # Column 0 is the 1-dimensional vector of metric values (one element per Batch ID); add them all up.
        nonnull_count: int = sum(nonnull_metric_values[:, 0])

        # Obtain candidate_strings from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        candidate_strings: Union[
            List[str],
            Set[str],
        ] = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.candidate_strings,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # One "metric_value_kwargs" dictionary per candidate: the configured kwargs (if any) plus the candidate itself.
        format_string: str
        strftime_kwargs_per_candidate: List[dict] = [
            {**(self.metric_value_kwargs or {}), "strftime_format": format_string}
            for format_string in candidate_strings
        ]

        # Obtain resolved metrics and metadata for all metric configurations and available Batch objects simultaneously.
        metric_computation_result = self.get_metrics(
            metric_name="column_values.match_strftime_format.unexpected_count",
            metric_domain_kwargs=self.metric_domain_kwargs,
            metric_value_kwargs=strftime_kwargs_per_candidate,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        format_string_success_ratios: dict = {}
        candidate_metrics: AttributedResolvedMetrics
        for candidate_metrics in metric_computation_result.attributed_resolved_metrics:
            # Again, column 0 carries one value per Batch ID.
            unexpected_count: int = sum(candidate_metrics.metric_values[:, 0])
            format_string_success_ratios[
                candidate_metrics.metric_attributes["strftime_format"]
            ] = (nonnull_count - unexpected_count) / nonnull_count

        # Obtain threshold from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        threshold: float = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.threshold,
            expected_return_type=float,
            variables=variables,
            parameters=parameters,
        )

        # Best candidate relative to the threshold, as decided by the shared "ParameterBuilder" helper.
        best_format_string: str
        best_ratio: float
        best_format_string, best_ratio = (
            ParameterBuilder._get_best_candidate_above_threshold(
                format_string_success_ratios, threshold
            )
        )

        # Ratios for all evaluated candidates, ordered by the shared "ParameterBuilder" helper.
        sorted_format_strings_and_ratios: dict = (
            ParameterBuilder._get_sorted_candidates_and_ratios(
                format_string_success_ratios
            )
        )

        details: dict = {
            "success_ratio": best_ratio,
            "candidate_strings": sorted_format_strings_and_ratios,
        }
        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: best_format_string,
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
            }
        )
예제 #17
0
def test_value_set_multi_batch_parameter_builder_alice_single_batch_numeric(
    alice_columnar_table_single_batch_context, ):
    """
    Build the value set of the numeric "event_type" column from a single Batch and check the value and details.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = dict(
        datasource_name="alice_columnar_table_single_batch_datasource",
        data_connector_name="alice_columnar_table_single_batch_data_connector",
        data_asset_name="alice_columnar_table_single_batch_data_asset",
    )

    # The same domain kwargs object is handed to both the Domain and the ParameterBuilder.
    column_kwargs: dict = {"column": "event_type"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    value_set_builder: ValueSetMultiBatchParameterBuilder = ValueSetMultiBatchParameterBuilder(
        name="my_event_type_value_set",
        metric_domain_kwargs=column_kwargs,
        data_context=data_context,
    )

    # Nothing has been computed yet.
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    value_set_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    # After the build, the container is either still unpopulated or holds exactly one node.
    populated_nodes = container.parameter_nodes
    assert populated_nodes is None or len(populated_nodes) == 1

    expected_value_set: List[int] = [19, 22, 73]
    expected_details: dict = {
        "metric_configuration": {
            "domain_kwargs": {"column": "event_type"},
            "metric_name": "column.distinct_values",
            "metric_value_kwargs": None,
            "metric_dependencies": None,
        },
        "num_batches": 1,
    }

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference="$parameter.my_event_type_value_set",
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # Sort the computed values before comparing them with the (already sorted) expectation.
    assert sorted(actual_node.value) == expected_value_set
    assert actual_node.details == expected_details
예제 #18
0
    def get_effective_column_names(
        self,
        batch_ids: Optional[List[str]] = None,
        validator: Optional["Validator"] = None,  # noqa: F821
        variables: Optional[ParameterContainer] = None,
    ) -> List[str]:
        """
        Determines the column names that remain after all configured include/exclude directives are applied.

        Directives are applied in this order: explicit "include_column_names" (falling back to all table columns),
        "exclude_column_names", an existence check against the table columns, column name suffix inclusion and
        exclusion, and finally semantic type inclusion and exclusion.  As a side effect, the instantiated semantic
        type filter is stored on "self._semantic_type_filter".

        Args:
            batch_ids: Batch IDs to use; obtained from "get_batch_ids()" when omitted.
            validator: Validator to use; obtained from "get_validator()" when omitted.
            variables: Optional variables used when resolving configured properties.

        Returns:
            List of column names that pass every filter.

        Raises:
            ProfilerExecutionError: if a selected column name is not among the "table.columns" metric values.
        """

        def _resolve_list(property_name: str, property_value_type) -> list:
            # All list-type properties are resolved against the same "variables".
            return self._resolve_list_type_property(
                property_name=property_name,
                property_value_type=property_value_type,
                variables=variables,
            )

        def _resolve_scalar(reference):
            # Domain-level lookups have neither a Domain nor parameters available, only variables.
            return get_parameter_value_and_validate_return_type(
                domain=None,
                parameter_reference=reference,
                expected_return_type=None,
                variables=variables,
                parameters=None,
            )

        include_column_names: List[str] = _resolve_list(
            "include_column_names", list)

        if batch_ids is None:
            batch_ids = self.get_batch_ids(variables=variables)

        if validator is None:
            validator = self.get_validator(variables=variables)

        # The last Batch ID is used as the active Batch ID.
        table_columns_metric: MetricConfiguration = MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={
                "batch_id": batch_ids[-1],
            },
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
        table_columns: List[str] = validator.get_metric(
            metric=table_columns_metric)

        exclude_column_names: List[str] = _resolve_list(
            "exclude_column_names", list)

        # Start from the explicit inclusion list (or every table column) and drop explicit exclusions.
        column_name: str
        effective_column_names: List[str] = [
            column_name
            for column_name in (include_column_names or table_columns)
            if column_name not in exclude_column_names
        ]

        # Every surviving name must actually be a column of the table.
        for column_name in effective_column_names:
            if column_name in table_columns:
                continue

            raise ge_exceptions.ProfilerExecutionError(
                message=
                f'Error: The column "{column_name}" in BatchData does not exist.'
            )

        candidate: str

        include_column_name_suffixes: List[str] = _resolve_list(
            "include_column_name_suffixes", (str, Iterable, list))
        if include_column_name_suffixes:
            effective_column_names = [
                candidate for candidate in effective_column_names
                if candidate.endswith(tuple(include_column_name_suffixes))
            ]

        exclude_column_name_suffixes: List[str] = _resolve_list(
            "exclude_column_name_suffixes", (str, Iterable, list))
        if exclude_column_name_suffixes:
            effective_column_names = [
                candidate for candidate in effective_column_names
                if not candidate.endswith(tuple(exclude_column_name_suffixes))
            ]

        # Obtain semantic type filter module and class names from variables; fall back to the simple filter otherwise.
        semantic_type_filter_module_name: Optional[str] = _resolve_scalar(
            self.semantic_type_filter_module_name)
        if semantic_type_filter_module_name is None:
            semantic_type_filter_module_name = "great_expectations.rule_based_profiler.helpers.simple_semantic_type_filter"

        semantic_type_filter_class_name: Optional[str] = _resolve_scalar(
            self.semantic_type_filter_class_name)
        if semantic_type_filter_class_name is None:
            semantic_type_filter_class_name = "SimpleSemanticTypeFilter"

        # The filter is given the columns that survived the name-based directives above.
        self._semantic_type_filter = instantiate_class_from_config(
            config={
                "module_name": semantic_type_filter_module_name,
                "class_name": semantic_type_filter_class_name,
            },
            runtime_environment={
                "batch_ids": batch_ids,
                "validator": validator,
                "column_names": effective_column_names,
            },
            config_defaults={},
        )

        include_semantic_types: List[Union[str, SemanticDomainTypes]] = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=_resolve_list(
                    "include_semantic_types",
                    (str, SemanticDomainTypes, list),
                )))
        if include_semantic_types:
            effective_column_names = [
                candidate for candidate in effective_column_names
                if self.semantic_type_filter.
                table_column_name_to_inferred_semantic_domain_type_map[
                    candidate] in include_semantic_types
            ]

        exclude_semantic_types: List[Union[str, SemanticDomainTypes]] = (
            self.semantic_type_filter.parse_semantic_domain_type_argument(
                semantic_types=_resolve_list(
                    "exclude_semantic_types",
                    (str, SemanticDomainTypes, list),
                )))
        if exclude_semantic_types:
            effective_column_names = [
                candidate for candidate in effective_column_names
                if self.semantic_type_filter.
                table_column_name_to_inferred_semantic_domain_type_map[
                    candidate] not in exclude_semantic_types
            ]

        return effective_column_names
예제 #19
0
def test_value_set_multi_batch_parameter_builder_alice_single_batch_string(
    alice_columnar_table_single_batch_context, ):
    """
    What does this test and why?
    This tests that non-numeric columns are handled appropriately: the value set of the string-valued "user_agent"
    column is built from a single Batch and compared with the expected value and details.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = dict(
        datasource_name="alice_columnar_table_single_batch_datasource",
        data_connector_name="alice_columnar_table_single_batch_data_connector",
        data_asset_name="alice_columnar_table_single_batch_data_asset",
    )

    # The same domain kwargs object is handed to both the Domain and the ParameterBuilder.
    column_kwargs: dict = {"column": "user_agent"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    value_set_builder: ValueSetMultiBatchParameterBuilder = ValueSetMultiBatchParameterBuilder(
        name="my_user_agent_value_set",
        metric_domain_kwargs=column_kwargs,
        data_context=data_context,
    )

    # Nothing has been computed yet.
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    value_set_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    # After the build, the container is either still unpopulated or holds exactly one node.
    populated_nodes = container.parameter_nodes
    assert populated_nodes is None or len(populated_nodes) == 1

    expected_value_set: List[str] = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    ]
    expected_details: dict = {
        "metric_configuration": {
            "domain_kwargs": {"column": "user_agent"},
            "metric_name": "column.distinct_values",
            "metric_value_kwargs": None,
            "metric_dependencies": None,
        },
        "num_batches": 1,
    }

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference="$parameter.my_user_agent_value_set",
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # Sort the computed values before comparing them with the expectation.
    assert sorted(actual_node.value) == expected_value_set
    assert actual_node.details == expected_details
예제 #20
0
    def _sanitize_metric_computation(
        self,
        metric_name: str,
        attributed_resolved_metrics: AttributedResolvedMetrics,
        enforce_numeric_metric: Union[str, bool] = False,
        replace_nan_with_zero: Union[str, bool] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> AttributedResolvedMetrics:
        """
        This method conditions (or "sanitizes") data samples in the format "N x R^m", where "N" (most significant
        dimension) is the number of measurements (e.g., one per Batch of data), while "R^m" is the multi-dimensional
        metric, whose values are being estimated.  The "conditioning" operations are:
        1. If "enforce_numeric_metric" flag is set, raise an error if a non-numeric value is found in sample vectors.
        2. Further, if a NaN is encountered in a sample vector and "replace_nan_with_zero" is True, then replace those
        NaN values with the 0.0 floating point number; if "replace_nan_with_zero" is False, then raise an error.

        Note that the NaN handling of step 2 is only reached when "enforce_numeric_metric" is set; if neither flag is
        set, the input is returned untouched.

        Args:
            metric_name: name of the metric; only used in the NaN error message.
            attributed_resolved_metrics: object whose "metric_values" are inspected (and possibly updated).
            enforce_numeric_metric: flag, or a reference resolved to a bool from variables and parameters.
            replace_nan_with_zero: flag, or a reference resolved to a bool from variables and parameters.
            domain: Domain used when resolving the two flags.
            variables: variables used when resolving the two flags.
            parameters: parameters used when resolving the two flags.

        Returns:
            The same "attributed_resolved_metrics" object that was passed in.

        Raises:
            ProfilerExecutionError: if "enforce_numeric_metric" is set and a sample vector has a non-numeric dtype.
            ValueError: if a sample vector contains NaN and "replace_nan_with_zero" is not set.
        """
        # Obtain enforce_numeric_metric from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        enforce_numeric_metric = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=enforce_numeric_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Obtain replace_nan_with_zero from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        replace_nan_with_zero = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=replace_nan_with_zero,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Nothing to check or replace: hand the input back as is.
        if not (enforce_numeric_metric or replace_nan_with_zero):
            return attributed_resolved_metrics

        metric_values: MetricValues = attributed_resolved_metrics.metric_values

        # Outer-most dimension is data samples (e.g., one per Batch); the rest are dimensions of the actual metric.
        metric_value_shape: tuple = metric_values.shape[1:]

        # Generate all permutations of indexes for accessing every element of the multi-dimensional metric.
        metric_value_shape_idx: int
        axes: List[np.ndarray] = [
            np.indices(dimensions=(metric_value_shape_idx, ))[0]
            for metric_value_shape_idx in metric_value_shape
        ]
        metric_value_indices: List[tuple] = list(
            itertools.product(*tuple(axes)))

        # Generate all permutations of indexes for accessing estimates of every element of the multi-dimensional metric.
        # Prefixing multi-dimensional index with "(slice(None, None, None),)" is equivalent to "[:,]" access.
        metric_value_idx: tuple
        metric_value_vector_indices: List[tuple] = [
            (slice(None, None, None), ) + metric_value_idx
            for metric_value_idx in metric_value_indices
        ]

        # Traverse indices of sample vectors corresponding to every element of multi-dimensional metric.
        metric_value_vector: np.ndarray
        batch_id: str
        resolved_metric_value: Any
        for metric_value_idx in metric_value_vector_indices:
            # Obtain "N"-element-long vector of samples for each element of multi-dimensional metric.
            metric_value_vector = cast(np.ndarray,
                                       metric_values)[metric_value_idx]
            # NOTE(review): both checks below run only under "enforce_numeric_metric"; with only
            # "replace_nan_with_zero" set, this loop changes nothing -- confirm that this is intended.
            if enforce_numeric_metric:
                if not np.issubdtype(metric_value_vector.dtype, np.number):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=
                        f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(metric_value_vector.dtype)}" was computed).
""")

                if np.any(np.isnan(metric_value_vector)):
                    if not replace_nan_with_zero:
                        raise ValueError(
                            f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
""")

                    # NOTE(review): every batch_id key is mapped to the same "N"-element sample vector (with NaN
                    # replaced by 0.0), and "resolved_metric_value" is never used; for a multi-dimensional metric,
                    # a later iteration overwrites this assignment.  Presumably a per-Batch value was intended --
                    # TODO confirm against the "AttributedResolvedMetrics" contract.
                    attributed_resolved_metrics.metric_values_by_batch_id = {
                        batch_id: np.nan_to_num(metric_value_vector,
                                                copy=False,
                                                nan=0.0)
                        for batch_id, resolved_metric_value in
                        attributed_resolved_metrics.attributed_metric_values.
                        items()
                    }

        return attributed_resolved_metrics
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return domains matching the selected cardinality_limit_mode.

        Columns whose inferred semantic type is listed in "allowed_semantic_types_passthrough" skip the cardinality
        check and are appended after the columns that meet the cardinality limit.

        Args:
            rule_name: name of Rule object, for which "Domain" objects are obtained.
            variables: Optional variables to substitute when evaluating.

        Returns:
            List of domains that match the desired cardinality.
        """

        def _from_variables(reference):
            # Domain-level lookups have neither a Domain nor parameters available, only variables.
            return get_parameter_value_and_validate_return_type(
                domain=None,
                parameter_reference=reference,
                expected_return_type=None,
                variables=variables,
                parameters=None,
            )

        batch_ids: List[str] = self.get_batch_ids(variables=variables)

        validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821

        effective_column_names: List[str] = self.get_effective_column_names(
            batch_ids=batch_ids,
            validator=validator,
            variables=variables,
        )

        # Resolve the three cardinality settings, validate the combination, and build the checker from them.
        cardinality_limit_mode: Optional[
            Union[str, CardinalityLimitMode, dict]
        ] = _from_variables(self.cardinality_limit_mode)
        max_unique_values: Optional[int] = _from_variables(self.max_unique_values)
        max_proportion_unique: Optional[float] = _from_variables(
            self.max_proportion_unique
        )

        cardinality_settings: dict = {
            "cardinality_limit_mode": cardinality_limit_mode,
            "max_unique_values": max_unique_values,
            "max_proportion_unique": max_proportion_unique,
        }
        validate_input_parameters(**cardinality_settings)
        self._cardinality_checker = CardinalityChecker(**cardinality_settings)

        # Semantic types whose columns bypass the cardinality check altogether.
        allowed_semantic_types_passthrough: Union[
            str, SemanticDomainTypes, List[Union[str, SemanticDomainTypes]]
        ] = self.semantic_type_filter.parse_semantic_domain_type_argument(
            semantic_types=_from_variables(self.allowed_semantic_types_passthrough)
        )

        # Split the effective columns into pass-through columns and columns subject to the cardinality check.
        column_name: str
        allowed_column_names_passthrough: List[str] = []
        column_names_to_check: List[str] = []
        for column_name in effective_column_names:
            if (
                self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
                    column_name
                ]
                in allowed_semantic_types_passthrough
            ):
                allowed_column_names_passthrough.append(column_name)
            else:
                column_names_to_check.append(column_name)

        metrics_for_cardinality_check: Dict[
            str, List[MetricConfiguration]
        ] = self._generate_metric_configurations_to_check_cardinality(
            batch_ids=batch_ids, column_names=column_names_to_check
        )

        candidate_column_names: List[str] = self._column_names_meeting_cardinality_limit(
            validator=validator,
            metrics_for_cardinality_check=metrics_for_cardinality_check,
        )
        candidate_column_names.extend(allowed_column_names_passthrough)

        return build_domains_from_column_names(
            rule_name=rule_name,
            column_names=candidate_column_names,
            domain_type=self.domain_type,
            table_column_name_to_inferred_semantic_domain_type_map=self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map,
        )
def test_execution_mean_table_columns_set_match_multi_batch_parameter_builder(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Verify the parameter node produced by MeanTableColumnsSetMatchMultiBatchParameterBuilder.

    The builder is run for a TABLE domain against the "my_reports" data asset, and the
    stored parameter node is compared with the expected column names (ignoring order)
    and the expected "success_ratio" detail.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # Column names expected in the stored parameter value; kept as a set because
    # the comparison below is order-insensitive.
    expected_column_names: set = {
        "DOLocationID",
        "PULocationID",
        "RatecodeID",
        "VendorID",
        "congestion_surcharge",
        "dropoff_datetime",
        "extra",
        "fare_amount",
        "improvement_surcharge",
        "mta_tax",
        "passenger_count",
        "payment_type",
        "pickup_datetime",
        "store_and_fwd_flag",
        "tip_amount",
        "tolls_amount",
        "total_amount",
        "trip_distance",
    }
    expected_parameter_value: dict = {
        "value": expected_column_names,
        "details": {"success_ratio": 1.0},
    }

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    builder: ParameterBuilder = MeanTableColumnsSetMatchMultiBatchParameterBuilder(
        name="my_mean_table_columns_set_match_multi_batch_parameter_builder",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        domain_kwargs=None,
        rule_name="my_rule",
    )

    # "Rule state": no variables, and an empty parameter container for this domain.
    variables: Optional[ParameterContainer] = None
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # Compare sizes first so that duplicates are not hidden by the set conversion below.
    actual_count: int = len(parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY])
    expected_count: int = len(
        expected_parameter_value[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
    )
    assert actual_count == expected_count

    # Convert the stored value to a set so that the equality check ignores ordering.
    parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY] = set(
        parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
    )
    assert parameter_node == expected_parameter_value
예제 #23
0
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds ParameterContainer object that holds ParameterNode objects with attribute name-value pairs and details.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.

         The algorithm operates according to the following steps:
         1. Obtain batch IDs of interest using BaseDataContext and BatchRequest (unless passed explicitly as argument).
         2. Set up metric_domain_kwargs and metric_value_kwargs (using configuration and/or variables and parameters).
         3. Instantiate the Validator object corresponding to BatchRequest (with a temporary expectation_suite_name) in
            order to have access to all Batch objects, on each of which the specified metric_name will be computed.
         4. Perform metric computations and obtain the result in the array-like form (one metric value per each Batch).
         5. Using the configured directives and heuristics, determine whether or not the ranges should be clipped.
         6. Using the configured directives and heuristics, determine if return values should be rounded to an integer.
         7. Convert the multi-dimensional metric computation results to a numpy array (for further computations).
         8. Compute [low, high] for the desired metric using the chosen estimator method.
         9. Return [low, high] for the desired metric as estimated by the specified sampling method.
        10. Set up the arguments and call build_parameter_container() to store the parameter as part of "rule state".
        """
        # Obtain false_positive_rate from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        false_positive_rate: np.float64 = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.false_positive_rate,
            expected_return_type=(float, np.float64),
            variables=variables,
            parameters=parameters,
        )

        if not (0.0 <= false_positive_rate <= 1.0):
            raise ge_exceptions.ProfilerExecutionError(
                f"""false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1],
but {false_positive_rate} was provided.""")
        elif false_positive_rate <= NP_EPSILON:
            warnings.warn(
                f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead.""")
            false_positive_rate = NP_EPSILON
        elif false_positive_rate >= (1.0 - NP_EPSILON):
            warnings.warn(
                f"""You have chosen a false_positive_rate of {false_positive_rate}, which is too close to 1.
A false_positive_rate of {1.0-NP_EPSILON} has been selected instead.""")
            false_positive_rate = np.float64(1.0 - NP_EPSILON)

        parameter_reference: str
        if self.metric_multi_batch_parameter_builder_name:
            # Obtain metric_multi_batch_parameter_builder_name from "rule state" (i.e., variables and parameters); from instance variable otherwise.
            metric_multi_batch_parameter_builder_name: str = (
                get_parameter_value_and_validate_return_type(
                    domain=domain,
                    parameter_reference=self.
                    metric_multi_batch_parameter_builder_name,
                    expected_return_type=str,
                    variables=variables,
                    parameters=parameters,
                ))
            parameter_reference = (
                f"{RAW_PARAMETER_KEY}{metric_multi_batch_parameter_builder_name}"
            )
        else:
            # Compute metric value for each Batch object.
            super().build_parameters(
                domain=domain,
                variables=variables,
                parameters=parameters,
                parameter_computation_impl=super()._build_parameters,
                recompute_existing_parameter_values=
                recompute_existing_parameter_values,
            )
            parameter_reference = self.raw_fully_qualified_parameter_name

        # Retrieve metric values for all Batch objects.
        parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=parameter_reference,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        metric_values: MetricValues = parameter_node[
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        # Obtain estimator directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        estimator: str = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.estimator,
            expected_return_type=str,
            variables=variables,
            parameters=parameters,
        )
        if (estimator not in NumericMetricRangeMultiBatchParameterBuilder.
                RECOGNIZED_SAMPLING_METHOD_NAMES):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "estimator" for {self.__class__.__name__} can be only one of
{NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_SAMPLING_METHOD_NAMES} ("{estimator}" was detected).
""")

        round_decimals: int

        # Obtain quantile_statistic_interpolation_method directive from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        quantile_statistic_interpolation_method: str = (
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.
                quantile_statistic_interpolation_method,
                expected_return_type=str,
                variables=variables,
                parameters=parameters,
            ))
        if (quantile_statistic_interpolation_method
                not in NumericMetricRangeMultiBatchParameterBuilder.
                RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS):
            raise ge_exceptions.ProfilerExecutionError(
                message=
                f"""The directive "quantile_statistic_interpolation_method" for {self.__class__.__name__} can \
be only one of {NumericMetricRangeMultiBatchParameterBuilder.RECOGNIZED_QUANTILE_STATISTIC_INTERPOLATION_METHODS} \
("{quantile_statistic_interpolation_method}" was detected).
""")

        if integer_semantic_domain_type(domain=domain):
            round_decimals = 0
        else:
            round_decimals = self._get_round_decimals_using_heuristics(
                metric_values=metric_values,
                domain=domain,
                variables=variables,
                parameters=parameters,
            )

        if quantile_statistic_interpolation_method == "auto":
            if round_decimals == 0:
                quantile_statistic_interpolation_method = "nearest"
            else:
                quantile_statistic_interpolation_method = "linear"

        estimator_func: Callable
        estimator_kwargs: dict
        if estimator == "bootstrap":
            estimator_func = self._get_bootstrap_estimate
            estimator_kwargs = {
                "false_positive_rate": false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
                "n_resamples": self.n_resamples,
                "random_seed": self.random_seed,
            }
        elif estimator == "kde":
            estimator_func = self._get_kde_estimate
            estimator_kwargs = {
                "false_positive_rate": false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
                "n_resamples": self.n_resamples,
                "bw_method": self.bw_method,
                "random_seed": self.random_seed,
            }
        else:
            estimator_func = self._get_deterministic_estimate
            estimator_kwargs = {
                "false_positive_rate":
                false_positive_rate,
                "quantile_statistic_interpolation_method":
                quantile_statistic_interpolation_method,
            }

        numeric_range_estimation_result: NumericRangeEstimationResult = (
            self._estimate_metric_value_range(
                metric_values=metric_values,
                estimator_func=estimator_func,
                round_decimals=round_decimals,
                domain=domain,
                variables=variables,
                parameters=parameters,
                **estimator_kwargs,
            ))

        value_range: np.ndarray = numeric_range_estimation_result.value_range
        details: Dict[str, Any] = copy.deepcopy(
            parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY])

        # Obtain include_estimator_samples_histogram_in_details from "rule state" (i.e., variables and parameters); from instance variable otherwise.
        include_estimator_samples_histogram_in_details: bool = (
            get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=self.
                include_estimator_samples_histogram_in_details,
                expected_return_type=bool,
                variables=variables,
                parameters=parameters,
            ))

        if include_estimator_samples_histogram_in_details:
            details[
                "estimation_histogram"] = numeric_range_estimation_result.estimation_histogram

        return Attributes({
            FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: value_range,
            FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: details,
        })
예제 #24
0
    def _substitute_parameters_and_variables(
        self,
        term_list: Union[str, ParseResults],
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> ParseResults:
        """Replace every "$"-prefixed reference in a parsed condition with its value.

        The term list is walked in place. A string term that starts with "$" is replaced by the value it
        resolves to; a nested ParseResults term is processed recursively, so references are substituted at
        any depth of grouping. All other terms are left as they are.

        Example:
            For the condition
                "($variables.max_user_id>0 & $variables.answer==42) | $parameter.my_min_user_id.value[0]<0"
            self._parse_condition yields the term list:
                [[
                   [
                     ['$variables.max_user_id', '>', 0],
                     '&',
                     ['$variables.answer', '==', 42]
                   ],
                   '|',
                   ['$parameter.my_min_user_id.value[0]', '<', 0]
                ]]

            After substitution, the same structure holds the resolved values:
                [[
                   [
                     [999999999999, '>', 0],
                     '&',
                     [42, '==', 42]
                   ],
                   '|',
                   [397433, '<', 0]
                ]]

        Args:
            term_list (Union[str, ParseResults]): the ParseResults object returned from self._parse_condition
            domain (Domain): The domain of the ExpectationConfiguration
            variables (Optional[ParameterContainer]): The variables set for this ExpectationConfiguration
            parameters (Optional[Dict[str, ParameterContainer]]): The parameters set for this ExpectationConfiguration

        Returns:
            ParseResults: the term_list that was passed in, with its parameter and variable references
                          substituted in place.
        """
        # The same "rule state" is needed both for resolving references and for recursing into groups.
        rule_state: dict = {
            "domain": domain,
            "variables": variables,
            "parameters": parameters,
        }

        for position, term in enumerate(term_list):
            if isinstance(term, str) and term.startswith("$"):
                # A reference: overwrite it with the value it resolves to.
                term_list[position] = get_parameter_value_and_validate_return_type(
                    parameter_reference=term,
                    expected_return_type=None,
                    **rule_state,
                )
            elif isinstance(term, ParseResults):
                # A nested group: substitute inside it (the group is modified in place).
                self._substitute_parameters_and_variables(term_list=term, **rule_state)

        return term_list
예제 #25
0
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Run SimpleDateFormatStringParameterBuilder on "pickup_datetime" with two candidate formats.

    Checks the builder's stored configuration, then verifies both the selected format string
    and the per-candidate details recorded in the parameter container.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # Inputs for the builder.
    date_only_format: str = "%Y-%m-%d"
    date_time_format: str = "%Y-%m-%d %H:%M:%S"
    metric_domain_kwargs: dict = {"column": "pickup_datetime"}
    candidate_strings: list[str] = [date_only_format, date_time_format]
    threshold: float = 0.9
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_strings=candidate_strings,
        threshold=threshold,
        data_context=context,
    )

    # The builder is expected to hold the candidates as a set and the threshold unchanged.
    assert builder._candidate_strings == set(candidate_strings)
    assert builder._threshold == 0.9

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    # Nothing has been computed yet.
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    assert container.parameter_nodes is None or len(container.parameter_nodes) == 1

    # The selected format string.
    value_reference: str = (
        "$parameter.my_simple_date_format_string_parameter_builder.value"
    )
    actual_value: ParameterNode = get_parameter_value_and_validate_return_type(
        parameter_reference=value_reference,
        expected_return_type=str,
        domain=domain,
        parameters=parameters,
    )
    assert actual_value == date_time_format

    # The details recorded for the computation.
    details_reference: str = (
        "$parameter.my_simple_date_format_string_parameter_builder.details"
    )
    expected_details: dict = {
        "success_ratio": 1.0,
        "candidate_strings": {
            date_only_format: 0.0,
            date_time_format: 1.0,
        },
    }
    actual_details: dict = get_parameter_value_and_validate_return_type(
        parameter_reference=details_reference,
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )
    assert actual_details == expected_details
예제 #26
0
def test_value_set_multi_batch_parameter_builder_bobby_string(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Run ValueSetMultiBatchParameterBuilder on the string column "store_and_fwd_flag".

    Verifies the (sorted) value set and the details stored in the resulting parameter node.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # Expected outcome of the computation.
    expected_value_set: List[str] = ["N", "Y"]
    expected_details: dict = {
        "metric_configuration": {
            "metric_name": "column.distinct_values",
            "domain_kwargs": {"column": "store_and_fwd_flag"},
            "metric_value_kwargs": None,
            "metric_dependencies": None,
        },
        "num_batches": 3,
    }

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # The builder receives its domain kwargs as a reference string rather than as a literal dictionary.
    builder_domain_kwargs_reference: str = "$domain.domain_kwargs"
    builder: ValueSetMultiBatchParameterBuilder = ValueSetMultiBatchParameterBuilder(
        name="my_store_and_fwd_flag_value_set",
        metric_domain_kwargs=builder_domain_kwargs_reference,
        data_context=context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "store_and_fwd_flag"},
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    # Nothing has been computed yet.
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    assert container.parameter_nodes is None or len(container.parameter_nodes) == 1

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference="$parameter.my_store_and_fwd_flag_value_set",
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    # The stored value is sorted before the comparison, so its original ordering does not matter.
    assert sorted(parameter_node.value) == expected_value_set
    assert parameter_node.details == expected_details
예제 #27
0
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Run SimpleDateFormatStringParameterBuilder on "event_ts" with its default settings.

    Verifies that the builder reports DEFAULT_CANDIDATE_STRINGS and a threshold of 1.0, and that the
    stored parameter node names "%Y-%m-%d %H:%M:%S" as the only candidate with a ratio of 1.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    # Expected outcome: one matching format; every other listed candidate has a ratio of 0.0.
    matching_format: str = "%Y-%m-%d %H:%M:%S"
    non_matching_formats: list = [
        "%y/%m/%d %H:%M:%S",
        "%y/%m/%d",
        "%y-%m-%d %H:%M:%S,%f %z",
        "%y-%m-%d %H:%M:%S,%f",
        "%y-%m-%d %H:%M:%S",
        "%y-%m-%d",
        "%y%m%d %H:%M:%S",
        "%m/%d/%y*%H:%M:%S",
        "%m/%d/%y %H:%M:%S %z",
        "%m/%d/%Y*%H:%M:%S*%f",
        "%m/%d/%Y*%H:%M:%S",
        "%m/%d/%Y %H:%M:%S %z",
        "%m/%d/%Y %H:%M:%S %p:%f",
        "%m/%d/%Y %H:%M:%S %p",
        "%m/%d/%Y",
        "%m-%d-%Y",
        "%m%d_%H:%M:%S.%f",
        "%m%d_%H:%M:%S",
        "%d/%m/%Y",
        "%d/%b/%Y:%H:%M:%S %z",
        "%d/%b/%Y:%H:%M:%S",
        "%d/%b/%Y %H:%M:%S",
        "%d/%b %H:%M:%S,%f",
        "%d-%m-%Y",
        "%d-%b-%Y %H:%M:%S.%f",
        "%d-%b-%Y %H:%M:%S",
        "%d %b %Y %H:%M:%S*%f",
        "%d %b %Y %H:%M:%S",
        "%b %d, %Y %H:%M:%S %p",
        "%b %d %Y %H:%M:%S",
        "%b %d %H:%M:%S %z %Y",
        "%b %d %H:%M:%S %z",
        "%b %d %H:%M:%S %Y",
        "%b %d %H:%M:%S",
        "%Y/%m/%d*%H:%M:%S",
        "%Y/%m/%d",
        "%Y-%m-%dT%z",
        "%Y-%m-%d*%H:%M:%S:%f",
        "%Y-%m-%d*%H:%M:%S",
        "%Y-%m-%d'T'%H:%M:%S.%f'%z'",
        "%Y-%m-%d'T'%H:%M:%S.%f",
        "%Y-%m-%d'T'%H:%M:%S'%z'",
        "%Y-%m-%d'T'%H:%M:%S%z",
        "%Y-%m-%d'T'%H:%M:%S",
        "%Y-%m-%d %H:%M:%S.%f%z",
        "%Y-%m-%d %H:%M:%S.%f",
        "%Y-%m-%d %H:%M:%S,%f%z",
        "%Y-%m-%d %H:%M:%S,%f",
        "%Y-%m-%d %H:%M:%S%z",
        "%Y-%m-%d %H:%M:%S %z",
        "%Y-%m-%d",
        "%Y%m%d %H:%M:%S.%f",
        "%Y %b %d %H:%M:%S.%f*%Z",
        "%Y %b %d %H:%M:%S.%f %Z",
        "%Y %b %d %H:%M:%S.%f",
        "%H:%M:%S.%f",
        "%H:%M:%S,%f",
        "%H:%M:%S",
    ]
    expected_candidate_ratios: dict = {matching_format: 1.0}
    expected_candidate_ratios.update(dict.fromkeys(non_matching_formats, 0.0))
    expected_value: dict = {
        "value": matching_format,
        "details": {
            "success_ratio": 1.0,
            "candidate_strings": expected_candidate_ratios,
        },
    }

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs = {"column": "event_ts"}

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=metric_domain_kwargs,
        data_context=context,
    )

    # No candidates or threshold were supplied, so the defaults are expected.
    assert builder.candidate_strings == DEFAULT_CANDIDATE_STRINGS
    assert builder._threshold == 1.0

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    # Nothing has been computed yet.
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_date_format",
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )

    assert parameter_node == expected_value
    def _build_parameters(
        self,
        domain: Domain,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
        recompute_existing_parameter_values: bool = False,
    ) -> Attributes:
        """
        Builds a partition object (categorical or bucketized) for the column and returns it with details.

        The data is treated as categorical when bucketize_data resolves to False, when no bins are available, or
        when the bins are not strictly increasing.  In that case the partition object holds "values" and "weights"
        derived from the value counts.  Otherwise, "column.histogram" is computed over the bins, and the partition
        object holds "bins", "weights", and "tail_weights".  In both cases, weights are counts divided by the
        non-null count.

        Args:
            domain: Domain for which the parameter is built.
            variables: Optional "rule state" variables used when resolving directives and parameters.
            parameters: Optional "rule state" parameters used when resolving directives and parameters.
            recompute_existing_parameter_values: Passed through to the parent class when the histogram is computed.

        Returns:
            Attributes object, containing computed parameter values and parameter computation details metadata.
        """
        # Resolve bucketize_data from "rule state" (i.e., variables and parameters) or the instance variable.
        bucketize_data = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.bucketize_data,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Look up the "column.partition" result produced by the corresponding single-batch builder.
        partition_reference: str = f"{RAW_PARAMETER_KEY}{self._column_partition_metric_single_batch_parameter_builder_config.name}"
        partition_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=partition_reference,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )
        bins: MetricValue = partition_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

        # Short-circuit evaluation ensures that the bins are inspected only if bucketizing was requested
        # and bins are actually available.
        is_categorical: bool = (
            not bucketize_data
            or bins is None
            or not np.all(np.diff(bins) > 0.0)
        )

        # Look up the "column_values.nonnull.count" result; it is the denominator for all weights.
        nonnull_count_reference: str = f"{RAW_PARAMETER_KEY}{self._column_values_nonnull_count_metric_single_batch_parameter_builder_config.name}"
        nonnull_count_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=nonnull_count_reference,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        weights: list

        if is_categorical:
            # Look up the "column.value_counts" result produced by the corresponding single-batch builder.
            value_counts_reference: str = f"{RAW_PARAMETER_KEY}{self._column_value_counts_metric_single_batch_parameter_builder_config.name}"
            value_counts_node: ParameterNode = get_parameter_value_and_validate_return_type(
                domain=domain,
                parameter_reference=value_counts_reference,
                expected_return_type=None,
                variables=variables,
                parameters=parameters,
            )
            value_counts = value_counts_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]

            # The index of the value counts supplies the distinct values; the counts supply the weights.
            values: list = list(value_counts.index)
            weights = list(
                np.asarray(value_counts)
                / nonnull_count_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
            )

            return Attributes(
                {
                    FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: {
                        "values": values,
                        "weights": weights,
                    },
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: value_counts_node[
                        FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                    ],
                }
            )

        # Bucketized path: configure this builder to compute the histogram over the partition bins.
        self.metric_name = "column.histogram"
        self.metric_value_kwargs = {"bins": tuple(bins)}

        # Compute metric value for one Batch object.
        super().build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            parameter_computation_impl=super()._build_parameters,
            recompute_existing_parameter_values=recompute_existing_parameter_values,
        )

        # Retrieve metric values for one Batch object.
        histogram_node: ParameterNode = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=self.raw_fully_qualified_parameter_name,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # At this point the partition, the histogram using said partition, and the non-null count are available.
        bins = list(bins)
        weights = list(
            np.asarray(histogram_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY])
            / nonnull_count_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
        )
        # Whatever weight the bins do not account for is split evenly between the two tails.
        tail_weights: float = (1.0 - sum(weights)) / 2.0

        return Attributes(
            {
                FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY: {
                    "bins": bins,
                    "weights": weights,
                    "tail_weights": [tail_weights, tail_weights],
                },
                FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY: histogram_node[
                    FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY
                ],
            }
        )
예제 #29
0
def test_regex_two_candidates(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Exercise "RegexPatternStringParameterBuilder" with two candidate regexes.

    The builder is run for column "b" against the single fixture Batch.  The test
    then checks the stored "value" (the one-digit regex is expected) and the
    stored "details" (per-regex ratios together with the overall success ratio).
    """
    # Wire the mocked DataContext so that it hands back the fixture Batch.
    mock_data_context.get_batch_list.return_value = [batch_fixture]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch_fixture]
    )

    # The same domain kwargs object configures both the builder and the Domain.
    column_domain_kwargs: dict = {"column": "b"}

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_domain_kwargs,
        candidate_regexes=[r"^\d{1}$", r"^\d{3}$"],
        data_context=mock_data_context,
    )
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_domain_kwargs,
        rule_name="my_rule",
    )

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    # Nothing has been computed before the builder runs.
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain_id,
        batch_list=[batch_fixture],
    )

    # Check the "value" entry written by the builder.
    actual_value = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_regex_pattern_string_parameter_builder.value",
        domain=column_domain,
        parameters=parameters_by_domain_id,
    )
    assert actual_value == "^\\d{1}$"

    # Check the "details" entry written by the builder.
    actual_details: dict = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_regex_pattern_string_parameter_builder.details",
        expected_return_type=dict,
        domain=column_domain,
        parameters=parameters_by_domain_id,
    )
    assert actual_details == {
        "evaluated_regexes": {"^\\d{1}$": 1.0, "^\\d{3}$": 0.0},
        "success_ratio": 1.0,
    }
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_numeric_dependencies_evaluated_separately(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check the mean unexpected-map-metric value when its dependencies are built explicitly.

    The "my_total_count" and "my_null_count" "MetricMultiBatchParameterBuilder"
    objects are run first against the shared "parameters" mapping; the
    "MeanUnexpectedMapMetricMultiBatchParameterBuilder" refers to them only by
    name and is run afterwards.  The value stored for the "passenger_count"
    column domain is expected to equal 0.0 within the configured tolerances.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # Dependency #1: total row count.
    my_total_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_total_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )
    # Dependency #2: unexpected count of the "column_values.nonnull" map metric.
    my_null_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_null_count",
        metric_name="column_values.nonnull.unexpected_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )

    # Builder under test; it references the two dependencies by name only
    # ("evaluation_parameter_builder_configs" is None).
    mean_unexpected_map_metric_multi_batch_parameter_builder: ParameterBuilder = (
        MeanUnexpectedMapMetricMultiBatchParameterBuilder(
            name="my_passenger_count_values_not_null_mean_unexpected_map_metric",
            map_metric_name="column_values.nonnull",
            total_count_parameter_builder_name="my_total_count",
            null_count_parameter_builder_name="my_null_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )

    metric_domain_kwargs: dict = {"column": "passenger_count"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )

    variables: Optional[ParameterContainer] = None

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    # Run the dependencies first, each writing into the shared "parameters".
    my_total_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    my_null_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    mean_unexpected_map_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    expected_parameter_value: float = 0.0

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=mean_unexpected_map_metric_multi_batch_parameter_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    rtol: float = RTOL
    atol: float = 5.0e-1 * ATOL
    # "assert_allclose" bounds the difference by atol + rtol * abs(desired), so the
    # tolerance quoted in the failure message is derived from the expected value.
    tolerance: float = atol + rtol * abs(expected_parameter_value)
    np.testing.assert_allclose(
        actual=parameter_node.value,
        desired=expected_parameter_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {parameter_node.value} differs from expected value of {expected_parameter_value} by more than {tolerance} tolerance.",
    )