Exemplo n.º 1
0
def test_simple_date_format_parameter_builder_zero_batch_id_error(
    alice_columnar_table_single_batch_context, ):
    """Check that building parameters without any batch request fails.

    Neither the builder nor the ``build_parameters`` call receives a
    ``batch_request``; the test expects a ``ProfilerExecutionError`` whose
    text equals the message asserted at the end.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    builder: ParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        rule_name="my_rule",
    )
    # Parameters are keyed by Domain ID; the container itself starts with no nodes.
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: ParameterContainer(parameter_nodes=None),
    }

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        builder.build_parameters(
            domain=column_domain,
            parameters=parameters_by_domain_id,
        )

    expected_message: str = (
        "Utilizing a SimpleDateFormatStringParameterBuilder requires a non-empty list of Batch identifiers."
    )
    assert str(exc_info.value) == expected_message
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_included(
    alice_columnar_table_single_batch_context, ):
    """Build an expectation whose ``min_value`` is resolved from a validation parameter builder.

    The "my_min_user_id" parameter builder config (metric "column.min" on
    "user_id") is supplied as a validation dependency, ``condition`` is None,
    and the resulting configuration is expected to carry ``min_value`` 397433.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    column_kwargs: dict = {"column": "user_id"}

    user_id_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        user_id_domain.id: ParameterContainer(parameter_nodes=None),
    }

    # Configuration of the parameter builder that the "min_value" reference below points to.
    min_user_id_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
    )

    configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=None,
        min_value="$parameter.my_min_user_id.value[0]",
        max_value=999999999999,
        validation_parameter_builder_configs=[min_user_id_config],
        data_context=context,
    )

    built_configuration: Optional[ExpectationConfiguration] = (
        configuration_builder.build_expectation_configuration(
            domain=user_id_domain,
            parameters=parameters_by_domain_id,
            batch_request={
                "datasource_name": "alice_columnar_table_single_batch_datasource",
                "data_connector_name": "alice_columnar_table_single_batch_data_connector",
                "data_asset_name": "alice_columnar_table_single_batch_data_asset",
            },
        )
    )

    assert built_configuration.kwargs["min_value"] == 397433
Exemplo n.º 3
0
def single_part_name_parameter_container():
    """Return a ParameterContainer with a single "mean" node whose "mean" entry is 5.0."""
    # NOTE(review): helper is defined elsewhere; presumably skips on unsupported Python versions -- confirm.
    skip_if_python_below_minimum_version()

    mean_node = ParameterNode({"mean": 5.0})
    return ParameterContainer(parameter_nodes={"mean": mean_node})
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_very_small(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """Expect a UserWarning when ``false_positive_rate`` is smaller than NP_EPSILON.

    The warning text matched below states that NP_EPSILON is selected in
    place of the too-small rate handed to the builder.
    """
    context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # a commonly used defect rate in quality control that equates to 3.4 defects per million opportunities
    six_sigma_false_positive_rate: float = 3.4 / 1000000.0
    assert six_sigma_false_positive_rate > NP_EPSILON

    # what if user tries a false positive rate smaller than NP_EPSILON (by an order of magnitude in this case)?
    smaller_than_np_epsilon_false_positive_rate: float = NP_EPSILON / 10

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=smaller_than_np_epsilon_false_positive_rate,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }

    assert container.parameter_nodes is None

    # The message contains regex metacharacters, so escape it before using it as a match pattern.
    raw_warning: str = f"""You have chosen a false_positive_rate of {smaller_than_np_epsilon_false_positive_rate}, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead."""
    expected_warning: str = re.escape(raw_warning)

    with pytest.warns(UserWarning, match=expected_warning):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters_by_domain_id,
            # BatchRequest yielding three batches
            batch_request={
                "datasource_name": "taxi_pandas",
                "data_connector_name": "monthly",
                "data_asset_name": "my_reports",
            },
        )
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """Verify date-format detection on "pickup_datetime" with custom candidate strings.

    The builder receives two candidate formats and a threshold of 0.9; the
    test expects "%Y-%m-%d %H:%M:%S" to be stored with a success ratio of 1.0.
    """
    context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    column_kwargs: dict = {"column": "pickup_datetime"}
    custom_candidates = {
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    }

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_strings=custom_candidates,
        threshold=0.9,
        data_context=context,
        batch_request={
            "datasource_name": "taxi_pandas",
            "data_connector_name": "monthly",
            "data_asset_name": "my_reports",
        },
    )

    # The supplied candidates are stored on the instance and differ from CANDIDATE_STRINGS.
    assert builder.CANDIDATE_STRINGS != custom_candidates
    assert builder._candidate_strings == custom_candidates
    assert builder._threshold == 0.9

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
    )

    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=column_domain)

    assert len(container.parameter_nodes) == 1

    expected: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {
            "success_ratio": 1.0
        },
    }
    actual = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_simple_date_format_string_parameter_builder",
        domain=column_domain,
        parameters={column_domain.id: container},
    )
    assert actual == expected
def test_build_parameter_container(
    parameters_with_different_depth_level_values,
    multi_part_name_parameter_container,
):
    """Populate an empty container from the fixture values and compare it to the expected container."""
    built_container: ParameterContainer = ParameterContainer(parameter_nodes=None)

    # build_parameter_container fills the container that is passed in; its return value is not used here.
    build_parameter_container(
        parameter_container=built_container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    assert built_container == multi_part_name_parameter_container
Exemplo n.º 7
0
def single_part_name_parameter_container():
    """Return a ParameterContainer with a single "mean" node whose "mean" entry is 5.0."""
    mean_node = ParameterNode({"mean": 5.0})
    nodes = {"mean": mean_node}
    return ParameterContainer(parameter_nodes=nodes)
def test_get_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    """Check the names reported for a domain's parameters together with variables.

    Ordering is ignored: both the count and the sorted contents of the
    returned list must match the expected names.
    """
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
        rule_name="my_rule",
    )

    # Convert variables argument to ParameterContainer
    variables_container: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )

    expected_names: List[str] = [
        "$variables",
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format",
        "$parameter.date_strings.yyyy_mm_dd_date_format",
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format",
        "$parameter.date_strings.mm_yyyy_dd_date_format",
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds",
        "$parameter.date_strings.tolerances.max_num_conversion_attempts",
        "$parameter.tolerances.mostly",
        "$parameter.tolerances.financial.usd",
        "$parameter.monthly_taxi_fairs.mean_values",
        "$parameter.daily_taxi_fairs.mean_values",
        "$parameter.weekly_taxi_fairs.mean_values",
        "$mean",
    ]

    actual_names: List[str] = get_fully_qualified_parameter_names(
        domain=column_domain,
        variables=variables_container,
        parameters={column_domain.id: container},
    )

    assert len(actual_names) == len(expected_names)
    assert sorted(actual_names) == sorted(expected_names)
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_negative(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """Expect a ProfilerExecutionError when ``false_positive_rate`` is -0.05.

    The error text matched below names the allowed [0, 1] interval and echoes
    the rejected value.
    """
    context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=-0.05,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }

    assert container.parameter_nodes is None

    # The message contains regex metacharacters, so escape it before using it as a match pattern.
    raw_error: str = """false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1],
but -0.05 was provided."""
    expected_error: str = re.escape(raw_error)

    with pytest.raises(ge_exceptions.ProfilerExecutionError, match=expected_error):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters_by_domain_id,
            # BatchRequest yielding three batches
            batch_request={
                "datasource_name": "taxi_pandas",
                "data_connector_name": "monthly",
                "data_asset_name": "my_reports",
            },
        )
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context, ):
    """Verify date-format detection on "event_ts" when no candidate strings are supplied.

    The test checks that CANDIDATE_STRINGS equals DEFAULT_CANDIDATE_STRINGS,
    that ``candidate_strings`` is None and the threshold is 1.0, and that
    "%Y-%m-%d %H:%M:%S" is stored with a success ratio of 1.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    column_kwargs = {"column": "event_ts"}

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
        batch_request={
            "datasource_name": "alice_columnar_table_single_batch_datasource",
            "data_connector_name": "alice_columnar_table_single_batch_data_connector",
            "data_asset_name": "alice_columnar_table_single_batch_data_asset",
        },
    )

    assert builder.CANDIDATE_STRINGS == DEFAULT_CANDIDATE_STRINGS
    assert builder.candidate_strings is None
    assert builder._threshold == 1.0

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
    )

    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=column_domain)

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    expected: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {
            "success_ratio": 1.0
        },
    }
    actual = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_date_format",
        domain=column_domain,
        parameters={column_domain.id: container},
    )
    assert actual == expected
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_zero(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """Expect a UserWarning when ``false_positive_rate`` is exactly 0.0.

    The warning text matched below states that NP_EPSILON is selected instead
    of the supplied rate.
    """
    context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=0.0,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }

    assert container.parameter_nodes is None

    # The message contains regex metacharacters, so escape it before using it as a match pattern.
    raw_warning: str = f"""You have chosen a false_positive_rate of 0.0, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead."""
    expected_warning: str = re.escape(raw_warning)

    with pytest.warns(UserWarning, match=expected_warning):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters_by_domain_id,
            # BatchRequest yielding three batches
            batch_request={
                "datasource_name": "taxi_pandas",
                "data_connector_name": "monthly",
                "data_asset_name": "my_reports",
            },
        )
Exemplo n.º 12
0
def variables_multi_part_name_parameter_container():
    """Return a ParameterContainer exposing ``$variables.false_positive_threshold`` set to 1.0e-2.

    The container's "variables" key maps to a root node, which in turn holds a
    "variables" entry containing the leaf node with the threshold value.
    """
    leaf_node: ParameterNode = ParameterNode({"false_positive_threshold": 1.0e-2})

    # $variables.false_positive_threshold
    root_node: ParameterNode = ParameterNode({"variables": leaf_node})

    return ParameterContainer(parameter_nodes={"variables": root_node})
Exemplo n.º 13
0
    def initialize_parameter_container_for_domain(
        self,
        domain: Domain,
        overwrite: bool = True,
    ) -> None:
        if not overwrite and domain.id in self.parameters:
            raise ge_exceptions.ProfilerConfigurationError(
                f"""Error: ParameterContainer for Domain\n{domain}\nalready exists.  In order to overwrite it, either \
pass "overwrite=True" or call "RuleState.remove_parameter_container_from_domain()" with Domain having ID equal to \
"{domain.id}" as argument first.
""")

        parameter_container: ParameterContainer = ParameterContainer(
            parameter_nodes=None)
        self._parameters[domain.id] = parameter_container
Exemplo n.º 14
0
def variables_multi_part_name_parameter_container():
    """Return a ParameterContainer exposing ``$variables.false_positive_threshold`` set to 1.0e-2.

    The container's "variables" key maps to a root node, which in turn holds a
    "variables" entry containing the leaf node with the threshold value.
    """
    # NOTE(review): helper is defined elsewhere; presumably skips on unsupported Python versions -- confirm.
    skip_if_python_below_minimum_version()

    leaf_node: ParameterNode = ParameterNode({"false_positive_threshold": 1.0e-2})

    # $variables.false_positive_threshold
    root_node: ParameterNode = ParameterNode({"variables": leaf_node})

    return ParameterContainer(parameter_nodes={"variables": root_node})
def test_simple_date_format_parameter_builder_zero_batch_id_error():
    """Check that ``_build_parameters`` fails when the builder was given only a name.

    No data context or batch request is supplied; the test expects a
    ``ProfilerExecutionError`` whose text equals the message asserted at the end.
    """
    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(domain_type=MetricDomainTypes.COLUMN)

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        builder._build_parameters(
            parameter_container=container,
            domain=column_domain,
        )

    expected_message: str = (
        "Utilizing a SimpleDateFormatStringParameterBuilder requires a non-empty list of batch identifiers."
    )
    assert str(exc_info.value) == expected_message
Exemplo n.º 16
0
def test_regex_wrong_domain(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Expect a ProfilerExecutionError when the domain points at column "c".

    The mocked data context is wired to return the fixture batch and a
    Pandas-backed Validator; the test asserts the exact "empty result" message
    on the raised error.
    """
    the_batch: Batch = batch_fixture

    # Stub the two data-context calls so they hand back the fixture batch.
    mock_data_context.get_batch_list.return_value = [the_batch]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(), batches=[the_batch]
    )
    context: DataContext = mock_data_context

    # column : c does not exist
    column_kwargs: dict = {"column": "c"}

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=[r"^\d{1}$"],
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: ParameterContainer(parameter_nodes=None),
    }

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        builder.build_parameters(
            domain=column_domain,
            parameters=parameters_by_domain_id,
            batch_list=[the_batch],
        )

    expected_message: str = (
        "Result of metric computations for RegexPatternStringParameterBuilder is empty."
    )
    assert exc_info.value.message == expected_message
Exemplo n.º 17
0
    def generate(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[ExpectationConfiguration]:
        """
        Produce ExpectationConfiguration objects for every Domain returned by the domain builder.

        For each Domain, a new ParameterContainer is stored under the Domain ID and passed to every ParameterBuilder;
        afterwards each ExpectationConfigurationBuilder is asked for one configuration for that Domain.

        :param variables: optional ParameterContainer forwarded unchanged to the domain builder, the parameter builders,
        and the expectation configuration builders
        :return: List with one entry per ExpectationConfigurationBuilder call, in Domain order
        """
        results: List[ExpectationConfiguration] = []

        current_domain: Domain
        for current_domain in self._domain_builder.get_domains(variables=variables):
            # Each Domain starts with its own empty container, registered before any builder runs.
            container: ParameterContainer = ParameterContainer(parameter_nodes=None)
            self._parameters[current_domain.id] = container

            builder: ParameterBuilder
            for builder in self._parameter_builders:
                builder.build_parameters(
                    parameter_container=container,
                    domain=current_domain,
                    variables=variables,
                    parameters=self.parameters,
                )

            # Collect one configuration per configured builder, preserving builder order.
            results.extend(
                configuration_builder.build_expectation_configuration(
                    domain=current_domain,
                    variables=variables,
                    parameters=self.parameters,
                )
                for configuration_builder in self._expectation_configuration_builders
            )

        return results
def test_partition_parameter_builder_alice_continuous(
    alice_columnar_table_single_batch_context, ):
    """Verify the bucketized partition computed for column "user_id".

    With ``bucketize_data=True`` the test expects bins, weights, and tail
    weights together with the "column.histogram" metric configuration and a
    batch count of 1.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    column_kwargs: dict = {"column": "user_id"}
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    assert container.parameter_nodes is None

    no_variables: Optional[ParameterContainer] = None
    partition_builder.build_parameters(
        domain=column_domain,
        variables=no_variables,
        parameters=parameters_by_domain_id,
        batch_request={
            "datasource_name": "alice_columnar_table_single_batch_datasource",
            "data_connector_name": "alice_columnar_table_single_batch_data_connector",
            "data_asset_name": "alice_columnar_table_single_batch_data_asset",
        },
    )

    expected_bins = [397433.0, 4942918.5, 9488404.0]
    expected: dict = {
        "value": {
            "bins": list(expected_bins),
            "weights": [0.6666666666666666, 0.3333333333333333],
            "tail_weights": [0.0, 0.0],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.histogram",
                "domain_kwargs": {
                    "column": "user_id"
                },
                "metric_value_kwargs": {
                    "bins": list(expected_bins)
                },
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    # Look the stored node up through the builder's own serialized parameter name.
    actual: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=column_domain,
        parameter_reference=partition_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=no_variables,
        parameters=parameters_by_domain_id,
    )

    assert actual == expected
Exemplo n.º 19
0
def test_regex_pattern_string_parameter_builder_bobby_no_match(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check the stored result when the only supplied candidate regex will not match "VendorID".

    The expected "evaluated_regexes" mapping asserted below lists a different
    set of patterns than the single candidate passed to the builder.
    """
    context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    column_kwargs: dict = {"column": "VendorID"}

    unmatched_candidates: Set[str] = {
        r"^\d{3}$",  # won't match
    }

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=unmatched_candidates,
        threshold=0.9,
        data_context=context,
    )
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain_id,
        batch_request={
            "datasource_name": "taxi_pandas",
            "data_connector_name": "monthly",
            "data_asset_name": "my_reports",
            "data_connector_query": {"index": -1},
        },
    )

    expected_evaluated_regexes: dict = {
        r"\d+": 1.0,
        r"-?\d+": 1.0,
        r"-?\d+(\.\d*)?": 1.0,
        r"[A-Za-z0-9\.,;:!?()\"'%\-]+": 1.0,
        r"^\s+": 0.0,
        r"\s+$": 0.0,
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)": 0.0,
        r"<\/?(?:p|a|b|img)(?: \/)?>": 0.0,
        r"(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}": 0.0,
        r"(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}": 1.0,
        r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ": 0.0,
    }
    expected: dict = {
        "value": "(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}",
        "details": {
            "evaluated_regexes": expected_evaluated_regexes,
            "success_ratio": 1.0,
        },
    }

    actual = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=column_domain,
        parameters=parameters_by_domain_id,
    )
    assert actual == expected
Exemplo n.º 20
0
def test_regex_pattern_string_parameter_builder_bobby_multiple_matches(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check which regex is selected when several candidates exceed the threshold on "VendorID".

    Three candidates are supplied with a threshold of 0.9; the test expects
    ``^\\d{1}$`` as the stored value together with the per-candidate ratios
    listed in "evaluated_regexes" and a success ratio of 1.0.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    metric_domain_kwargs: dict = {"column": "VendorID"}

    candidate_regexes: List[str] = [
        r"^\d{1}$",  # will match
        r"^[12]{1}$",  # will match 0.9941111111 of the time
        r"^\d{4}$",  # won't match
    ]
    threshold: float = 0.9

    regex_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex_pattern_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            threshold=threshold,
            data_context=data_context,
        )
    )

    # The supplied candidates are exposed on the instance and differ from CANDIDATE_REGEX.
    assert regex_parameter.CANDIDATE_REGEX != candidate_regexes
    assert regex_parameter.candidate_regexes == candidate_regexes
    assert regex_parameter.threshold == 0.9

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    regex_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    expected_value: dict = {
        "value": r"^\d{1}$",
        "details": {
            "evaluated_regexes": {
                r"^\d{1}$": 1.0,
                r"^[12]{1}$": 0.9941111111111111,
                r"^\d{4}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    results = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )
    assert results is not None
    # Compare the regex directly: sorted() on two strings only compares their characters as
    # multisets, so any rearrangement of the expected pattern would have passed.
    assert results["value"] == expected_value["value"]
    assert results["details"] == expected_value["details"]
Exemplo n.º 21
0
def test_regex_pattern_string_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Verify regex detection on column "id" with three candidate patterns.

    The test expects the five-group ``\\S{8}-\\S{4}-\\S{4}-\\S{4}-\\S{12}``
    pattern to be stored with ratio 1.0 and both digit-only candidates to be
    reported with ratio 0.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    column_kwargs = {"column": "id"}
    candidates: List[str] = [
        r"^\d{1}$",
        r"^\d{2}$",
        r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
    ]

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=candidates,
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain_id,
        batch_request={
            "datasource_name": "alice_columnar_table_single_batch_datasource",
            "data_connector_name": "alice_columnar_table_single_batch_data_connector",
            "data_asset_name": "alice_columnar_table_single_batch_data_asset",
        },
    )

    expected: dict = {
        "value": r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
        "details": {
            "evaluated_regexes": {
                r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$": 1.0,
                r"^\d{1}$": 0.0,
                r"^\d{2}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    actual = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=column_domain,
        parameters=parameters_by_domain_id,
    )
    assert actual == expected
def test_execution_mean_table_columns_set_match_multi_batch_parameter_builder(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check the columns-set-match parameter computed over the "bobby" taxi batches.

    A ``MeanTableColumnsSetMatchMultiBatchParameterBuilder`` is run for a
    table-level domain; the stored parameter node must then hold the expected
    set of taxi column names together with a ``success_ratio`` of 1.0.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    taxi_batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    expected_columns: set = {
        "VendorID",
        "pickup_datetime",
        "total_amount",
        "congestion_surcharge",
        "dropoff_datetime",
        "mta_tax",
        "store_and_fwd_flag",
        "tip_amount",
        "trip_distance",
        "payment_type",
        "DOLocationID",
        "improvement_surcharge",
        "extra",
        "tolls_amount",
        "RatecodeID",
        "passenger_count",
        "PULocationID",
        "fare_amount",
    }
    expected_node: dict = {
        "value": expected_columns,
        "details": {"success_ratio": 1.0},
    }

    builder: ParameterBuilder = MeanTableColumnsSetMatchMultiBatchParameterBuilder(
        name="my_mean_table_columns_set_match_multi_batch_parameter_builder",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        domain_kwargs=None,
        rule_name="my_rule",
    )
    no_variables: Optional[ParameterContainer] = None
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    containers_by_domain: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }

    builder.build_parameters(
        domain=table_domain,
        variables=no_variables,
        parameters=containers_by_domain,
        batch_request=taxi_batch_request,
    )

    node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=table_domain,
        parameter_reference=builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=no_variables,
        parameters=containers_by_domain,
    )

    # The lengths are compared first: after the conversion to a set below,
    # duplicated column names in the computed value could no longer be detected.
    actual_columns = node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
    assert len(actual_columns) == len(
        expected_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]
    )

    # Element order is irrelevant for this check, so compare as a set.
    node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY] = set(actual_columns)
    assert node == expected_node
Exemplo n.º 23
0
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Infer the date format of the ``event_ts`` column with default candidates.

    The builder is created without explicit candidate strings or threshold, so
    it must expose ``DEFAULT_CANDIDATE_STRINGS`` and a threshold of 1.0.  After
    building, the single parameter node must report "%Y-%m-%d %H:%M:%S" with a
    ratio of 1.0 and every other candidate with a ratio of 0.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    alice_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "event_ts"}

    # Expected outcome: one winning format, all remaining candidates at 0.0.
    winning_format = "%Y-%m-%d %H:%M:%S"
    non_matching_formats = (
        "%y/%m/%d %H:%M:%S",
        "%y/%m/%d",
        "%y-%m-%d %H:%M:%S,%f %z",
        "%y-%m-%d %H:%M:%S,%f",
        "%y-%m-%d %H:%M:%S",
        "%y-%m-%d",
        "%y%m%d %H:%M:%S",
        "%m/%d/%y*%H:%M:%S",
        "%m/%d/%y %H:%M:%S %z",
        "%m/%d/%Y*%H:%M:%S*%f",
        "%m/%d/%Y*%H:%M:%S",
        "%m/%d/%Y %H:%M:%S %z",
        "%m/%d/%Y %H:%M:%S %p:%f",
        "%m/%d/%Y %H:%M:%S %p",
        "%m/%d/%Y",
        "%m-%d-%Y",
        "%m%d_%H:%M:%S.%f",
        "%m%d_%H:%M:%S",
        "%d/%m/%Y",
        "%d/%b/%Y:%H:%M:%S %z",
        "%d/%b/%Y:%H:%M:%S",
        "%d/%b/%Y %H:%M:%S",
        "%d/%b %H:%M:%S,%f",
        "%d-%m-%Y",
        "%d-%b-%Y %H:%M:%S.%f",
        "%d-%b-%Y %H:%M:%S",
        "%d %b %Y %H:%M:%S*%f",
        "%d %b %Y %H:%M:%S",
        "%b %d, %Y %H:%M:%S %p",
        "%b %d %Y %H:%M:%S",
        "%b %d %H:%M:%S %z %Y",
        "%b %d %H:%M:%S %z",
        "%b %d %H:%M:%S %Y",
        "%b %d %H:%M:%S",
        "%Y/%m/%d*%H:%M:%S",
        "%Y/%m/%d",
        "%Y-%m-%dT%z",
        "%Y-%m-%d*%H:%M:%S:%f",
        "%Y-%m-%d*%H:%M:%S",
        "%Y-%m-%d'T'%H:%M:%S.%f'%z'",
        "%Y-%m-%d'T'%H:%M:%S.%f",
        "%Y-%m-%d'T'%H:%M:%S'%z'",
        "%Y-%m-%d'T'%H:%M:%S%z",
        "%Y-%m-%d'T'%H:%M:%S",
        "%Y-%m-%d %H:%M:%S.%f%z",
        "%Y-%m-%d %H:%M:%S.%f",
        "%Y-%m-%d %H:%M:%S,%f%z",
        "%Y-%m-%d %H:%M:%S,%f",
        "%Y-%m-%d %H:%M:%S%z",
        "%Y-%m-%d %H:%M:%S %z",
        "%Y-%m-%d",
        "%Y%m%d %H:%M:%S.%f",
        "%Y %b %d %H:%M:%S.%f*%Z",
        "%Y %b %d %H:%M:%S.%f %Z",
        "%Y %b %d %H:%M:%S.%f",
        "%H:%M:%S.%f",
        "%H:%M:%S,%f",
        "%H:%M:%S",
    )
    expected_ratios: dict = {winning_format: 1.0}
    expected_ratios.update(dict.fromkeys(non_matching_formats, 0.0))
    expected_node: dict = {
        "value": winning_format,
        "details": {
            "success_ratio": 1.0,
            "candidate_strings": expected_ratios,
        },
    }

    builder: SimpleDateFormatStringParameterBuilder = (
        SimpleDateFormatStringParameterBuilder(
            name="my_date_format",
            metric_domain_kwargs=column_kwargs,
            data_context=context,
        )
    )

    # No candidates/threshold were passed, so the defaults must be in effect.
    assert builder.candidate_strings == DEFAULT_CANDIDATE_STRINGS
    assert builder._threshold == 1.0

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    containers_by_domain: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    # Nothing has been computed yet.
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=containers_by_domain,
        batch_request=alice_batch_request,
    )

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    node: ParameterNode = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_date_format",
        expected_return_type=dict,
        domain=column_domain,
        parameters=containers_by_domain,
    )

    assert node == expected_node
Exemplo n.º 24
0
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Estimate the ``table.row_count`` range with the "kde" estimator.

    The computed range is compared with [6180, 10277] using a 1% relative
    tolerance, and the returned estimation histogram is compared with a
    reference histogram through a two-sample Kolmogorov-Smirnov test.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    taxi_batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # Node content expected once "value" has been blanked out and
    # "estimation_histogram" has been removed from the details (see below).
    expected_node: dict = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        estimator="kde",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=1.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=context,
    )

    no_variables: Optional[ParameterContainer] = None

    table_domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    containers_by_domain: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }

    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=table_domain,
        variables=no_variables,
        parameters=containers_by_domain,
        batch_request=taxi_batch_request,
    )

    built_nodes: Optional[Dict[str, ParameterNode]] = container.parameter_nodes or {}
    assert len(built_nodes) == 1

    node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.row_count_range",
        domain=table_domain,
        parameters=containers_by_domain,
    )

    # The numeric parts are checked separately with tolerances, so take them out
    # of the node before comparing the remainder for exact equality.
    actual_value: np.ndarray = node.pop("value")
    node["value"] = None
    observed_histogram: np.ndarray = node.details.pop("estimation_histogram")

    assert node == expected_node

    expected_value: np.ndarray = np.array([6180, 10277])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-2
    atol: float = 0

    # kde results should be stable +/- 1%
    np.testing.assert_allclose(
        actual=actual_value,
        desired=expected_value,
        rtol=rtol,
        atol=atol,
        err_msg=
        f"Actual value of {actual_value} differs from expected value of {expected_value} by more than {atol + rtol * abs(expected_value)} tolerance.",
    )

    reference_histogram: np.ndarray = np.array(
        [13.0, 155.0, 719.0, 1546.0, 2221.0, 2570.0, 1946.0, 683.0, 137.0, 9.0]
    )

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_outcome: tuple = stats.ks_2samp(
        data1=observed_histogram,
        data2=reference_histogram,
    )
    assert ks_outcome[1] > 9.5e-1
Exemplo n.º 25
0
def test_regex_two_candidates(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Pick the matching regex out of two candidates for column "b".

    The mocked data context is wired to hand back the fixture batch and a
    Pandas-backed ``Validator`` for it.  After building, the ``value`` entry
    must be the one-digit pattern and the ``details`` entry must list a ratio
    of 1.0 for it and 0.0 for the three-digit pattern.
    """
    fixture_batch: Batch = batch_fixture

    # Make the mock return the fixture batch and a validator built on it.
    mock_data_context.get_batch_list.return_value = [fixture_batch]
    pandas_validator = Validator(
        execution_engine=PandasExecutionEngine(), batches=[fixture_batch]
    )
    mock_data_context.get_validator_using_batch_list.return_value = pandas_validator
    context: DataContext = mock_data_context

    column_kwargs: dict = {"column": "b"}
    regexes_to_try: List[str] = [r"^\d{1}$", r"^\d{3}$"]

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=regexes_to_try,
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    containers_by_domain: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=containers_by_domain,
        batch_list=[fixture_batch],
    )

    # --- "value": the selected regex -------------------------------------
    value_reference: str = "$parameter.my_regex_pattern_string_parameter_builder.value"
    selected_regex = get_parameter_value_and_validate_return_type(
        parameter_reference=value_reference,
        domain=column_domain,
        parameters=containers_by_domain,
    )
    assert selected_regex == "^\\d{1}$"

    # --- "details": per-candidate ratios ---------------------------------
    details_reference: str = (
        "$parameter.my_regex_pattern_string_parameter_builder.details"
    )
    expected_details: dict = {
        "evaluated_regexes": {"^\\d{1}$": 1.0, "^\\d{3}$": 0.0},
        "success_ratio": 1.0,
    }
    details: dict = get_parameter_value_and_validate_return_type(
        parameter_reference=details_reference,
        expected_return_type=dict,
        domain=column_domain,
        parameters=containers_by_domain,
    )

    assert details == expected_details
def test_partition_parameter_builder_alice_continuous_changed_to_categorical(
    alice_columnar_table_single_batch_context,
):
    """Run ``PartitionParameterBuilder`` with ``bucketize_data=True`` on ``event_ts``.

    Although bucketizing is requested, the expected parameter node is based on
    the ``column.value_counts`` metric: three timestamp strings, each with a
    weight of one third.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    alice_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    column_kwargs: dict = {"column": "event_ts"}
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    containers_by_domain: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    assert container.parameter_nodes is None

    no_variables: Optional[ParameterContainer] = None
    builder.build_parameters(
        domain=column_domain,
        variables=no_variables,
        parameters=containers_by_domain,
        batch_request=alice_batch_request,
    )

    expected_partition: dict = {
        "values": [
            "2004-10-19 10:23:54",
            "2004-10-19 10:23:55",
            "2004-10-19 11:05:20",
        ],
        "weights": [0.3333333333333333] * 3,
    }
    expected_details: dict = {
        "metric_configuration": {
            "metric_name": "column.value_counts",
            "domain_kwargs": {"column": "event_ts"},
            "metric_value_kwargs": {"sort": "value"},
            "metric_dependencies": None,
        },
        "num_batches": 1,
    }
    expected_node: dict = {
        "value": expected_partition,
        "details": expected_details,
    }

    node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=column_domain,
        parameter_reference=builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=no_variables,
        parameters=containers_by_domain,
    )

    assert node == expected_node
Exemplo n.º 27
0
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Estimate the ``column.min`` range of ``fare_amount`` with the "oneshot" estimator.

    The range is computed twice for the same domain and parameter name: first
    with ``false_positive_rate=1.0e-2``, then (forcing recomputation) with
    ``false_positive_rate=5.0e-2``.  Both results are checked against fixed
    bounds, the second range must lie strictly inside the first one, and both
    estimation histograms are compared with the same reference histogram.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    # Node content expected for BOTH runs once "value" has been blanked out and
    # "estimation_histogram" has been removed from the details.
    expected_value_dict: dict = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {"column": "fare_amount"},
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    # Reference histogram shared by both runs.
    expected_estimation_histogram: np.ndarray = np.array(
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0]
    )

    def assert_histogram_matches_reference(actual_histogram: np.ndarray) -> None:
        """Assert no significant difference between reference (null hypothesis) and actual histograms."""
        ks_result: tuple = stats.ks_2samp(
            data1=actual_histogram,
            data2=expected_estimation_histogram,
        )
        p_value: float = ks_result[1]
        assert p_value > 9.5e-1

    variables: Optional[ParameterContainer] = None

    # ---- First run: false positive rate of 1% ---------------------------
    builder_01: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="column_min_range",
        metric_name="column.min",
        metric_domain_kwargs=metric_domain_kwargs,
        estimator="oneshot",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=1.0e-2,
        round_decimals=1,
        json_serialize=False,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    builder_01.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(parameter_nodes) == 1

    parameter_node_01: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
    )

    # Numeric parts are verified separately; strip them before the exact comparison.
    actual_values_01: np.ndarray = parameter_node_01.pop("value")
    parameter_node_01["value"] = None
    actual_estimation_histogram_01: np.ndarray = parameter_node_01.details.pop(
        "estimation_histogram"
    )

    assert parameter_node_01 == expected_value_dict

    actual_value_01_lower: float = actual_values_01[0]
    actual_value_01_upper: float = actual_values_01[1]
    expected_value_01_lower: float = -51.7
    expected_value_01_upper: float = -21.0

    assert actual_value_01_lower == expected_value_01_lower
    assert actual_value_01_upper == expected_value_01_upper

    assert_histogram_matches_reference(actual_estimation_histogram_01)

    # ---- Second run: false positive rate of 5% --------------------------
    builder_05: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="column_min_range",
        metric_name="column.min",
        metric_domain_kwargs=metric_domain_kwargs,
        estimator="oneshot",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=5.0e-2,
        round_decimals=1,
        json_serialize=False,
        data_context=data_context,
    )

    # The parameter name is reused in the same container, hence the explicit
    # request to recompute the existing value.
    builder_05.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        recompute_existing_parameter_values=True,
        batch_request=batch_request,
    )

    parameter_node_05: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
    )

    actual_values_05: np.ndarray = parameter_node_05.pop("value")
    parameter_node_05["value"] = None
    actual_estimation_histogram_05: np.ndarray = parameter_node_05.details.pop(
        "estimation_histogram"
    )

    assert parameter_node_05 == expected_value_dict

    actual_value_05_lower: float = actual_values_05[0]
    actual_value_05_upper: float = actual_values_05[1]
    expected_value_05_lower: float = -50.5
    expected_value_05_upper: float = -21.1

    assert actual_value_05_lower == expected_value_05_lower
    assert actual_value_05_upper == expected_value_05_upper

    # if false positive rate is higher, our range should be more narrow
    assert actual_value_01_lower < actual_value_05_lower
    assert actual_value_01_upper > actual_value_05_upper

    assert_histogram_matches_reference(actual_estimation_histogram_05)
Exemplo n.º 28
0
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Infer the date format of ``pickup_datetime`` from two explicit candidates.

    The builder is given the candidates "%Y-%m-%d" and "%Y-%m-%d %H:%M:%S" and a
    threshold of 0.9.  After building, the ``value`` entry must be the
    date-and-time format and the ``details`` entry must report a ratio of 1.0
    for it and 0.0 for the date-only format.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    metric_domain_kwargs: dict = {"column": "pickup_datetime"}
    # ``typing.List`` keeps this annotation consistent with the rest of the file.
    candidate_strings: List[str] = [
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    ]
    threshold: float = 0.9
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    date_format_string_parameter: SimpleDateFormatStringParameterBuilder = (
        SimpleDateFormatStringParameterBuilder(
            name="my_simple_date_format_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_strings=candidate_strings,
            threshold=threshold,
            data_context=data_context,
        )
    )

    # The constructor arguments must be reflected on the builder.
    assert date_format_string_parameter._candidate_strings == set(candidate_strings)
    assert date_format_string_parameter._threshold == 0.9

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    date_format_string_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    # Exactly one node must exist after building.  Falling back to an empty dict
    # makes a missing (None) node collection fail here instead of passing
    # vacuously.
    parameter_nodes: Dict[str, ParameterNode] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_simple_date_format_string_parameter_builder.value"
    )
    expected_value: str = "%Y-%m-%d %H:%M:%S"

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        parameter_reference=fully_qualified_parameter_name_for_value,
        expected_return_type=str,
        domain=domain,
        parameters=parameters,
    )

    assert parameter_node == expected_value

    fully_qualified_parameter_name_for_meta: str = (
        "$parameter.my_simple_date_format_string_parameter_builder.details"
    )
    expected_meta: dict = {
        "success_ratio": 1.0,
        "candidate_strings": {
            "%Y-%m-%d": 0.0,
            "%Y-%m-%d %H:%M:%S": 1.0,
        },
    }
    meta: dict = get_parameter_value_and_validate_return_type(
        parameter_reference=fully_qualified_parameter_name_for_meta,
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )
    assert meta == expected_meta
Exemplo n.º 29
0
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Verify that passing ``bw_method`` to the "kde" estimator changes the range.

    The ``column.min`` range is computed once without ``bw_method`` and once
    with ``bw_method=0.5``, each time into a fresh parameter container; the
    lower bounds of the two results must differ.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    taxi_batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    column_kwargs: dict = {"column": "fare_amount"}
    range_reference: str = "$parameter.column_min_range"
    no_variables: Optional[ParameterContainer] = None

    # ---- Run 1: estimator's default bandwidth method --------------------
    default_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="column_min_range",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        estimator="kde",
        false_positive_rate=5.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=context,
    )

    default_domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    default_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    default_parameters: Dict[str, ParameterContainer] = {
        default_domain.id: default_container,
    }

    assert default_container.parameter_nodes is None

    default_builder.build_parameters(
        domain=default_domain,
        variables=no_variables,
        parameters=default_parameters,
        batch_request=taxi_batch_request,
    )

    default_nodes: Optional[Dict[str, ParameterNode]] = (
        default_container.parameter_nodes or {}
    )
    assert len(default_nodes) == 1

    default_node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=range_reference,
        domain=default_domain,
        parameters=default_parameters,
    )
    default_bw_method_value: np.ndarray = default_node.pop("value")

    # ---- Run 2: explicit bw_method, fresh domain and container ----------
    tuned_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="column_min_range",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        estimator="kde",
        bw_method=0.5,
        false_positive_rate=5.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=context,
    )

    tuned_domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    tuned_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    tuned_parameters: Dict[str, ParameterContainer] = {
        tuned_domain.id: tuned_container,
    }

    assert tuned_container.parameter_nodes is None

    tuned_builder.build_parameters(
        domain=tuned_domain,
        variables=no_variables,
        parameters=tuned_parameters,
        batch_request=taxi_batch_request,
    )

    tuned_nodes: Optional[Dict[str, ParameterNode]] = (
        tuned_container.parameter_nodes or {}
    )
    assert len(tuned_nodes) == 1

    tuned_node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=range_reference,
        domain=tuned_domain,
        parameters=tuned_parameters,
    )
    other_bw_method_value: np.ndarray = tuned_node.pop("value")

    # Only the lower bound is compared.
    assert default_bw_method_value[0] != other_bw_method_value[0]
def test_get_parameter_values_for_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    """Check the full name-to-value mapping resolved for a domain.

    A ParameterContainer is populated from the fixture, a second container is
    built for three variables, and the mapping returned by
    get_parameter_values_for_fully_qualified_parameter_names() is compared
    for equality with a hand-written expectation.

    NOTE(review): the expected "$parameter..." and "$mean" entries presumably
    mirror the fixture's contents; the fixture is defined outside this test,
    so confirm against it when changing any value here.
    """
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
        rule_name="my_rule",
    )

    # The variables are wrapped in their own ParameterContainer.
    variables_container: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )

    # The populated container is registered under the domain's id.
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    # fmt: off
    weekday_names: tuple = (
        "sunday", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    )
    # One row per expected weekly record; columns follow "weekday_names".
    weekly_rows: list = [
        (71.43, 74.35, 42.3, 42.3, 82.2, 78.78, 91.39),
        (81.43, 84.35, 52.3, 43.3, 22.2, 98.78, 81.39),
        (61.43, 34.35, 82.3, 72.3, 22.2, 38.78, 51.39),
        (51.43, 64.35, 72.3, 82.3, 22.2, 98.78, 31.39),
        (72.43, 77.35, 46.3, 47.3, 88.2, 79.78, 93.39),
        (72.43, 73.35, 41.3, 49.3, 80.2, 78.78, 93.39),
        (74.43, 78.35, 49.3, 43.3, 88.2, 72.78, 97.39),
        (73.43, 72.35, 40.3, 40.3, 89.2, 77.78, 90.39),
        (72.43, 73.35, 45.3, 44.3, 89.2, 77.78, 96.39),
        (75.43, 74.25, 42.33, 42.23, 82.21, 78.76, 91.37),
        (71.43, 74.37, 42.3, 42.32, 82.23, 78.77, 91.49),
        (71.63, 74.37, 42.2, 42.1, 82.29, 78.79, 91.39),
        (71.42, 74.33, 42.33, 42.34, 82.25, 78.77, 91.69),
        (71.44, 72.35, 42.33, 42.31, 82.29, 78.68, 91.49),
        (71.44, 74.32, 42.32, 42.32, 82.29, 78.77, 91.49),
        (71.44, 74.33, 42.21, 42.31, 82.27, 78.74, 91.49),
        (71.33, 74.25, 42.31, 42.03, 82.02, 78.08, 91.38),
        (71.41, 74.31, 42.39, 42.93, 82.92, 78.75, 91.49),
        (72.43, 73.35, 42.3, 32.3, 52.2, 88.78, 81.39),
        (71.43, 74.35, 32.3, 92.3, 72.2, 74.78, 51.39),
        (72.43, 64.35, 52.3, 42.39, 82.28, 78.77, 91.36),
        (81.43, 94.35, 62.3, 52.3, 92.2, 88.78, 51.39),
        (21.43, 34.35, 42.34, 62.3, 52.2, 98.78, 81.39),
        (71.33, 74.25, 42.13, 42.93, 82.82, 78.78, 91.39),
        (72.43, 73.35, 44.3, 45.3, 86.2, 77.78, 98.39),
        (79.43, 78.35, 47.3, 46.3, 85.2, 74.78, 93.39),
        (71.42, 74.31, 42.0, 42.1, 82.23, 65.78, 91.26),
        (91.43, 84.35, 42.37, 42.36, 82.25, 78.74, 91.32),
        (71.33, 74.45, 42.35, 42.36, 82.27, 26.78, 71.39),
        (71.53, 73.35, 43.32, 42.23, 82.32, 78.18, 91.49),
        (71.53, 74.25, 52.3, 52.3, 81.23, 78.78, 78.39),
    ]
    weekly_mean_values: list = [
        dict(zip(weekday_names, weekly_row)) for weekly_row in weekly_rows
    ]

    monthly_mean_values: list = [
        2.3, 9.8, 42.3, 8.1, 38.5, 53.7, 71.43, 16.34,
        49.43, 74.35, 51.98, 46.42, 20.01, 69.44, 65.32, 8.83,
        55.79, 82.2, 36.93, 83.78, 31.13, 76.93, 67.67, 25.12,
        58.04, 79.78, 90.91, 15.26, 61.65, 78.78, 12.99,
    ]

    expected: Dict[str, ParameterNode] = {
        "$variables": {"my_int": 9, "my_float": 3.38, "my_string": "hello"},
        "$parameter.weekly_taxi_fairs.mean_values": {
            "value": weekly_mean_values,
            "details": {"confidence": "high"},
        },
        "$parameter.tolerances.mostly": 0.91,
        "$parameter.tolerances.financial.usd": 1.0,
        "$parameter.monthly_taxi_fairs.mean_values": {
            "value": monthly_mean_values,
            "details": {"confidence": "low"},
        },
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format": {
            "value": "%Y-%m-%d %H:%M:%S %Z",
            "details": {"confidence": 0.78},
        },
        "$parameter.date_strings.yyyy_mm_dd_date_format": {
            "value": "%Y-%m-%d",
            "details": {"confidence": 0.78},
        },
        "$parameter.date_strings.tolerances.max_num_conversion_attempts": 5,
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds": 100,
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format": {
            "value": "%m-%Y-%d %H:%M:%S %Z",
            "details": {"confidence": 0.78},
        },
        "$parameter.date_strings.mm_yyyy_dd_date_format": {
            "value": "%m-%Y-%d",
            "details": {"confidence": 0.78},
        },
        "$parameter.daily_taxi_fairs.mean_values": {
            "value": {
                "sunday": 71.43,
                "monday": 74.35,
                "tuesday": 42.3,
                "wednesday": 42.3,
                "thursday": 82.2,
                "friday": 78.78,
                "saturday": 91.39,
            },
            "details": {"confidence": "medium"},
        },
        "$mean": 0.65,
    }
    # fmt: on

    actual: Dict[str, ParameterNode] = (
        get_parameter_values_for_fully_qualified_parameter_names(
            domain=column_domain,
            variables=variables_container,
            parameters=parameters_by_domain_id,
        )
    )
    assert actual == expected