def test_simple_date_format_parameter_builder_zero_batch_id_error(
    alice_columnar_table_single_batch_context,
):
    """Building date-format parameters without batch information must fail.

    The builder receives only a name and a data context, and
    ``build_parameters`` is called with neither a batch request nor a batch
    list.  The test expects a ``ProfilerExecutionError`` whose text equals the
    message asserted below.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    builder: ParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        rule_name="my_rule",
    )
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: ParameterContainer(parameter_nodes=None),
    }

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        builder.build_parameters(
            domain=column_domain,
            parameters=parameters_by_domain_id,
        )

    expected_message: str = "Utilizing a SimpleDateFormatStringParameterBuilder requires a non-empty list of Batch identifiers."
    assert str(exc_info.value) == expected_message
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_included(
    alice_columnar_table_single_batch_context,
):
    """Resolve ``min_value`` through a validation parameter builder config.

    ``min_value`` is given as the parameter reference
    ``$parameter.my_min_user_id.value[0]``.  The only source for that
    parameter in this test is the ``column.min`` builder config passed as
    ``validation_parameter_builder_configs``.  The built configuration must
    report ``min_value == 397433``.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs: dict = {"column": "user_id"}

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    min_user_id_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
    )

    configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=None,
        min_value="$parameter.my_min_user_id.value[0]",
        max_value=999999999999,
        validation_parameter_builder_configs=[min_user_id_config],
        data_context=context,
    )

    built_configuration: Optional[
        ExpectationConfiguration
    ] = configuration_builder.build_expectation_configuration(
        domain=column_domain,
        parameters=parameters_by_domain_id,
        batch_request=batch_request,
    )

    assert built_configuration.kwargs["min_value"] == 397433
def single_part_name_parameter_container():
    """Return a ParameterContainer with a single ``mean`` node holding 5.0.

    ``skip_if_python_below_minimum_version()`` is invoked first.
    """
    skip_if_python_below_minimum_version()

    mean_node = ParameterNode({"mean": 5.0})
    return ParameterContainer(parameter_nodes={"mean": mean_node})
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_very_small(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """A false positive rate below ``NP_EPSILON`` must trigger a UserWarning.

    The builder is configured with ``NP_EPSILON / 10`` and the bootstrap
    estimator.  ``build_parameters`` is expected to warn with the exact
    message below, which names ``NP_EPSILON`` as the substituted rate.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # Sanity check: the common "six sigma" defect rate (3.4 defects per million opportunities) is above NP_EPSILON.
    six_sigma_rate: float = 3.4 / 1000000.0
    assert six_sigma_rate > NP_EPSILON

    # The rate under test is an order of magnitude smaller than NP_EPSILON.
    tiny_false_positive_rate: float = NP_EPSILON / 10

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=tiny_false_positive_rate,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }
    assert container.parameter_nodes is None

    # The message is matched verbatim (re.escape), including the line break after "0. ".
    expected_warning: str = re.escape(
        f"You have chosen a false_positive_rate of {tiny_false_positive_rate}, which is too close to 0. \n"
        f"A false_positive_rate of {NP_EPSILON} has been selected instead."
    )

    with pytest.warns(UserWarning, match=expected_warning):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters_by_domain_id,
            batch_request=batch_request,
        )
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Detect the date format of ``pickup_datetime`` from two custom candidates.

    With a threshold of 0.9 and the candidates ``%Y-%m-%d`` and
    ``%Y-%m-%d %H:%M:%S``, the stored parameter must be
    ``%Y-%m-%d %H:%M:%S`` with a success ratio of 1.0.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    column_kwargs: dict = {"column": "pickup_datetime"}
    custom_candidates: set[str] = {
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    }
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_strings=custom_candidates,
        threshold=0.9,
        data_context=context,
        batch_request=batch_request,
    )

    # The custom candidates must replace (not equal) the class-level defaults.
    assert builder.CANDIDATE_STRINGS != custom_candidates
    assert builder._candidate_strings == custom_candidates
    assert builder._threshold == 0.9

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
    )

    assert container.parameter_nodes is None
    builder._build_parameters(parameter_container=container, domain=column_domain)
    assert len(container.parameter_nodes) == 1

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_simple_date_format_string_parameter_builder",
        domain=column_domain,
        parameters={column_domain.id: container},
    )
    expected_value: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {"success_ratio": 1.0},
    }
    assert actual_value == expected_value
def test_build_parameter_container(
    parameters_with_different_depth_level_values,
    multi_part_name_parameter_container,
):
    """Populating an empty container from the values fixture must equal the expected container fixture."""
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)

    build_parameter_container(
        parameter_container=container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    assert container == multi_part_name_parameter_container
def single_part_name_parameter_container():
    """Return a ParameterContainer with a single ``mean`` node holding 5.0."""
    mean_node = ParameterNode({"mean": 5.0})
    return ParameterContainer(parameter_nodes={"mean": mean_node})
def test_get_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    """The reported fully-qualified names must match the expected list, ignoring order.

    Parameters come from the ``parameters_with_different_depth_level_values``
    fixture.  Variables are built from three literal configs (int, float,
    string).
    """
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
        rule_name="my_rule",
    )

    # Convert variables argument to ParameterContainer
    variables_container: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }

    expected_names: List[str] = [
        "$variables",
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format",
        "$parameter.date_strings.yyyy_mm_dd_date_format",
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format",
        "$parameter.date_strings.mm_yyyy_dd_date_format",
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds",
        "$parameter.date_strings.tolerances.max_num_conversion_attempts",
        "$parameter.tolerances.mostly",
        "$parameter.tolerances.financial.usd",
        "$parameter.monthly_taxi_fairs.mean_values",
        "$parameter.daily_taxi_fairs.mean_values",
        "$parameter.weekly_taxi_fairs.mean_values",
        "$mean",
    ]

    actual_names: List[str] = get_fully_qualified_parameter_names(
        domain=column_domain,
        variables=variables_container,
        parameters=parameters_by_domain_id,
    )

    # Check the count first, then the order-insensitive contents.
    assert len(actual_names) == len(expected_names)
    assert sorted(actual_names) == sorted(expected_names)
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_negative(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """A negative false positive rate (-0.05) must raise ``ProfilerExecutionError``.

    The error text is matched verbatim against the message below.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=-0.05,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }
    assert container.parameter_nodes is None

    expected_error: str = re.escape(
        "false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1], but -0.05 was provided."
    )

    with pytest.raises(ge_exceptions.ProfilerExecutionError, match=expected_error):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters_by_domain_id,
            batch_request=batch_request,
        )
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Detect the date format of ``event_ts`` using the default candidates.

    No candidate strings or threshold are supplied, so the builder must report
    ``candidate_strings is None`` and a threshold of 1.0.  The stored
    parameter must be ``%Y-%m-%d %H:%M:%S`` with a success ratio of 1.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "event_ts"}

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
        batch_request=batch_request,
    )

    assert builder.CANDIDATE_STRINGS == DEFAULT_CANDIDATE_STRINGS
    assert builder.candidate_strings is None
    assert builder._threshold == 1.0

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
    )

    assert container.parameter_nodes is None
    builder._build_parameters(parameter_container=container, domain=column_domain)

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_date_format",
        domain=column_domain,
        parameters={column_domain.id: container},
    )
    expected_value: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {"success_ratio": 1.0},
    }
    assert actual_value == expected_value
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_zero(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """A false positive rate of exactly 0.0 must trigger a UserWarning.

    The warning text is expected to name ``NP_EPSILON`` as the rate selected
    instead.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=0.0,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }
    assert container.parameter_nodes is None

    # NOTE(review): the message is matched verbatim (re.escape), so its whitespace must equal
    # exactly what the builder emits -- confirm against the warning's source.
    expected_warning: str = re.escape(
        f"You have chosen a false_positive_rate of 0.0, which is too close to 0. "
        f"A false_positive_rate of {NP_EPSILON} has been selected instead."
    )

    with pytest.warns(UserWarning, match=expected_warning):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters_by_domain_id,
            batch_request=batch_request,
        )
def variables_multi_part_name_parameter_container():
    """Return a ParameterContainer whose ``variables`` root holds ``false_positive_threshold`` (1.0e-2)."""
    threshold_node: ParameterNode = ParameterNode({"false_positive_threshold": 1.0e-2})
    # $variables.false_positive_threshold
    root_node: ParameterNode = ParameterNode({"variables": threshold_node})
    return ParameterContainer(parameter_nodes={"variables": root_node})
def initialize_parameter_container_for_domain(
    self,
    domain: Domain,
    overwrite: bool = True,
) -> None:
    """Register a fresh, empty ParameterContainer under ``domain.id``.

    Args:
        domain: Domain whose ``id`` is used as the key of the new container.
        overwrite: When False, an already registered container for this Domain
            is treated as an error instead of being replaced.

    Raises:
        ge_exceptions.ProfilerConfigurationError: if ``overwrite`` is False and
            ``domain.id`` is already present in ``self.parameters``.
    """
    if not overwrite and domain.id in self.parameters:
        # The message is built from adjacent string literals rather than backslash continuations inside one
        # literal, so no stray "\ " sequences can end up in the text shown to the user.
        raise ge_exceptions.ProfilerConfigurationError(
            f"Error: ParameterContainer for Domain\n{domain}\nalready exists. In order to overwrite it, either "
            'pass "overwrite=True" or call "RuleState.remove_parameter_container_from_domain()" with Domain having ID equal to '
            f'"{domain.id}" as argument first. '
        )

    self._parameters[domain.id] = ParameterContainer(parameter_nodes=None)
def variables_multi_part_name_parameter_container():
    """Return a ParameterContainer whose ``variables`` root holds ``false_positive_threshold`` (1.0e-2).

    ``skip_if_python_below_minimum_version()`` is invoked first.
    """
    skip_if_python_below_minimum_version()

    threshold_node: ParameterNode = ParameterNode({"false_positive_threshold": 1.0e-2})
    # $variables.false_positive_threshold
    root_node: ParameterNode = ParameterNode({"variables": threshold_node})
    return ParameterContainer(parameter_nodes={"variables": root_node})
def test_simple_date_format_parameter_builder_zero_batch_id_error():
    """Calling ``_build_parameters`` on a builder given only a name must fail.

    The test expects a ``ProfilerExecutionError`` whose text equals the
    message asserted below.
    """
    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(domain_type=MetricDomainTypes.COLUMN)

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        builder._build_parameters(parameter_container=container, domain=column_domain)

    expected_message: str = "Utilizing a SimpleDateFormatStringParameterBuilder requires a non-empty list of batch identifiers."
    assert str(exc_info.value) == expected_message
def test_regex_wrong_domain(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Requesting a column the test treats as nonexistent (``c``) must raise an "empty result" error.

    The mocked data context returns the fixture batch and a pandas-backed
    ``Validator`` over it.
    """
    batch: Batch = batch_fixture

    # Wire the mock so that batch and validator lookups return the fixture batch.
    mock_data_context.get_batch_list.return_value = [batch]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch]
    )
    context: DataContext = mock_data_context

    # column : c does not exist
    column_kwargs: dict = {"column": "c"}
    single_digit_regexes: List[str] = [r"^\d{1}$"]

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=single_digit_regexes,
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: ParameterContainer(parameter_nodes=None),
    }

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        builder.build_parameters(
            domain=column_domain,
            parameters=parameters_by_domain_id,
            batch_list=[batch],
        )

    expected_message: str = "Result of metric computations for RegexPatternStringParameterBuilder is empty."
    assert exc_info.value.message == expected_message
def generate(
    self,
    variables: Optional[ParameterContainer] = None,
) -> List[ExpectationConfiguration]:
    """
    Produces Expectation Configurations for every Domain returned by the configured DomainBuilder.

    For each Domain, a fresh ParameterContainer is registered under the Domain's ID, every configured
    ParameterBuilder is run against it, and then each ExpectationConfigurationBuilder contributes one entry.

    :param variables: optional ParameterContainer forwarded to the domain, parameter, and expectation builders
    :return: List of Expectation Configurations, ordered by Domain and then by ExpectationConfigurationBuilder
    """
    collected: List[ExpectationConfiguration] = []

    current_domain: Domain
    for current_domain in self._domain_builder.get_domains(variables=variables):
        container: ParameterContainer = ParameterContainer(parameter_nodes=None)
        self._parameters[current_domain.id] = container

        # All parameters for this Domain are built before any expectation configuration is requested.
        builder: ParameterBuilder
        for builder in self._parameter_builders:
            builder.build_parameters(
                parameter_container=container,
                domain=current_domain,
                variables=variables,
                parameters=self.parameters,
            )

        collected.extend(
            configuration_builder.build_expectation_configuration(
                domain=current_domain,
                variables=variables,
                parameters=self.parameters,
            )
            for configuration_builder in self._expectation_configuration_builders
        )

    return collected
def test_partition_parameter_builder_alice_continuous(
    alice_columnar_table_single_batch_context,
):
    """Partition ``user_id`` with ``bucketize_data=True`` and check bins, weights and details.

    The stored parameter must equal the expected dictionary below, which
    describes a ``column.histogram`` metric over a single batch.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    column_kwargs: dict = {"column": "user_id"}
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }
    assert container.parameter_nodes is None

    no_variables: Optional[ParameterContainer] = None
    partition_builder.build_parameters(
        domain=column_domain,
        variables=no_variables,
        parameters=parameters_by_domain_id,
        batch_request=batch_request,
    )

    expected_node: dict = {
        "value": {
            "bins": [397433.0, 4942918.5, 9488404.0],
            "weights": [0.6666666666666666, 0.3333333333333333],
            "tail_weights": [0.0, 0.0],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.histogram",
                "domain_kwargs": {"column": "user_id"},
                "metric_value_kwargs": {"bins": [397433.0, 4942918.5, 9488404.0]},
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=column_domain,
        parameter_reference=partition_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=no_variables,
        parameters=parameters_by_domain_id,
    )

    assert actual_node == expected_node
def test_regex_pattern_string_parameter_builder_bobby_no_match(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check the stored parameter when the only supplied candidate regex is marked "won't match".

    The builder is given a single candidate (``^\\d{3}$``) and a threshold of
    0.9 for the ``VendorID`` column of the last batch.  The expected parameter
    lists success ratios for a different set of regexes than the one supplied.
    NOTE(review): these are presumably the builder's default candidates used
    as a fallback -- confirm against the builder implementation.
    """
    data_context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }
    metric_domain_kwargs: dict = {"column": "VendorID"}
    candidate_regexes: Set[str] = {
        r"^\d{3}$",  # won't match
    }
    threshold: float = 0.9

    regex_parameter: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        threshold=threshold,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    regex_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    expected_value: dict = {
        "value": "(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}",
        "details": {
            "evaluated_regexes": {
                r"\d+": 1.0,
                r"-?\d+": 1.0,
                r"-?\d+(\.\d*)?": 1.0,
                r"[A-Za-z0-9\.,;:!?()\"'%\-]+": 1.0,
                r"^\s+": 0.0,
                r"\s+$": 0.0,
                r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)": 0.0,
                r"<\/?(?:p|a|b|img)(?: \/)?>": 0.0,
                r"(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}": 0.0,
                # This key must stay on one physical line: it is a single-quoted raw string and has to equal
                # the string used for "value" above.
                r"(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}": 1.0,
                r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
        == expected_value
    )
def test_regex_pattern_string_parameter_builder_bobby_multiple_matches(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check which regex is stored when more than one candidate clears the 0.9 threshold.

    Three candidates are evaluated against ``VendorID`` of the last batch.  The
    expected parameter value is ``^\\d{1}$`` (ratio 1.0), with the per-regex
    ratios recorded under ``details``.
    """
    data_context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }
    metric_domain_kwargs: dict = {"column": "VendorID"}
    candidate_regexes: List[str] = [
        r"^\d{1}$",  # will match
        r"^[12]{1}$",  # will match 0.9941111111 of the time
        r"^\d{4}$",  # won't match
    ]
    threshold: float = 0.9

    regex_parameter: RegexPatternStringParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        threshold=threshold,
        data_context=data_context,
    )

    assert regex_parameter.CANDIDATE_REGEX != candidate_regexes
    assert regex_parameter.candidate_regexes == candidate_regexes
    assert regex_parameter.threshold == 0.9

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    regex_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    expected_value: dict = {
        "value": r"^\d{1}$",
        "details": {
            "evaluated_regexes": {
                r"^\d{1}$": 1.0,
                r"^[12]{1}$": 0.9941111111111111,
                r"^\d{4}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    results = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    assert results is not None
    # Compare the regex itself: sorting two strings only compares their characters as a multiset,
    # which would also accept any permutation of the expected pattern.
    assert results["value"] == expected_value["value"]
    assert results["details"] == expected_value["details"]
def test_regex_pattern_string_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Evaluate three candidate regexes against the ``id`` column.

    The expected parameter value is the five-group, dash-separated pattern
    with a ratio of 1.0.  Both digit-only candidates are expected at 0.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "id"}
    regex_candidates: List[str] = [
        r"^\d{1}$",
        r"^\d{2}$",
        r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
    ]

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=regex_candidates,
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain_id,
        batch_request=batch_request,
    )

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=column_domain,
        parameters=parameters_by_domain_id,
    )
    expected_value: dict = {
        "value": r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
        "details": {
            "evaluated_regexes": {
                r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$": 1.0,
                r"^\d{1}$": 0.0,
                r"^\d{2}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }
    assert actual_value == expected_value
def test_execution_mean_table_columns_set_match_multi_batch_parameter_builder(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check the column set and success ratio stored by the columns-set-match builder.

    The stored value is compared as a set after its length has been checked
    against the expected set of eighteen column names.
    """
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    columns_builder: ParameterBuilder = MeanTableColumnsSetMatchMultiBatchParameterBuilder(
        name="my_mean_table_columns_set_match_multi_batch_parameter_builder",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        domain_kwargs=None,
        rule_name="my_rule",
    )
    no_variables: Optional[ParameterContainer] = None
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        table_domain.id: container,
    }

    expected_node: dict = {
        "value": {
            "VendorID",
            "pickup_datetime",
            "total_amount",
            "congestion_surcharge",
            "dropoff_datetime",
            "mta_tax",
            "store_and_fwd_flag",
            "tip_amount",
            "trip_distance",
            "payment_type",
            "DOLocationID",
            "improvement_surcharge",
            "extra",
            "tolls_amount",
            "RatecodeID",
            "passenger_count",
            "PULocationID",
            "fare_amount",
        },
        "details": {
            "success_ratio": 1.0,
        },
    }

    columns_builder.build_parameters(
        domain=table_domain,
        variables=no_variables,
        parameters=parameters_by_domain_id,
        batch_request=batch_request,
    )

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=table_domain,
        parameter_reference=columns_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=no_variables,
        parameters=parameters_by_domain_id,
    )

    value_key = FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY

    # Length is checked before the set conversion so duplicates in the stored value are not hidden.
    assert len(actual_node[value_key]) == len(expected_node[value_key])

    actual_node[value_key] = set(actual_node[value_key])
    assert actual_node == expected_node
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Detect the date format of ``event_ts`` and check every candidate's success ratio.

    No candidate strings are supplied, so the builder must expose
    ``DEFAULT_CANDIDATE_STRINGS`` and a threshold of 1.0.  The expected
    parameter is ``%Y-%m-%d %H:%M:%S``, with that candidate at ratio 1.0 and
    every other listed candidate at 0.0.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "event_ts"}

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
    )

    assert builder.candidate_strings == DEFAULT_CANDIDATE_STRINGS
    assert builder._threshold == 1.0

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters_by_domain_id: Dict[str, ParameterContainer] = {
        column_domain.id: container,
    }
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain_id,
        batch_request=batch_request,
    )

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    expected_node: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {
            "success_ratio": 1.0,
            "candidate_strings": {
                "%Y-%m-%d %H:%M:%S": 1.0,
                "%y/%m/%d %H:%M:%S": 0.0,
                "%y/%m/%d": 0.0,
                "%y-%m-%d %H:%M:%S,%f %z": 0.0,
                "%y-%m-%d %H:%M:%S,%f": 0.0,
                "%y-%m-%d %H:%M:%S": 0.0,
                "%y-%m-%d": 0.0,
                "%y%m%d %H:%M:%S": 0.0,
                "%m/%d/%y*%H:%M:%S": 0.0,
                "%m/%d/%y %H:%M:%S %z": 0.0,
                "%m/%d/%Y*%H:%M:%S*%f": 0.0,
                "%m/%d/%Y*%H:%M:%S": 0.0,
                "%m/%d/%Y %H:%M:%S %z": 0.0,
                "%m/%d/%Y %H:%M:%S %p:%f": 0.0,
                "%m/%d/%Y %H:%M:%S %p": 0.0,
                "%m/%d/%Y": 0.0,
                "%m-%d-%Y": 0.0,
                "%m%d_%H:%M:%S.%f": 0.0,
                "%m%d_%H:%M:%S": 0.0,
                "%d/%m/%Y": 0.0,
                "%d/%b/%Y:%H:%M:%S %z": 0.0,
                "%d/%b/%Y:%H:%M:%S": 0.0,
                "%d/%b/%Y %H:%M:%S": 0.0,
                "%d/%b %H:%M:%S,%f": 0.0,
                "%d-%m-%Y": 0.0,
                "%d-%b-%Y %H:%M:%S.%f": 0.0,
                "%d-%b-%Y %H:%M:%S": 0.0,
                "%d %b %Y %H:%M:%S*%f": 0.0,
                "%d %b %Y %H:%M:%S": 0.0,
                "%b %d, %Y %H:%M:%S %p": 0.0,
                "%b %d %Y %H:%M:%S": 0.0,
                "%b %d %H:%M:%S %z %Y": 0.0,
                "%b %d %H:%M:%S %z": 0.0,
                "%b %d %H:%M:%S %Y": 0.0,
                "%b %d %H:%M:%S": 0.0,
                "%Y/%m/%d*%H:%M:%S": 0.0,
                "%Y/%m/%d": 0.0,
                "%Y-%m-%dT%z": 0.0,
                "%Y-%m-%d*%H:%M:%S:%f": 0.0,
                "%Y-%m-%d*%H:%M:%S": 0.0,
                "%Y-%m-%d'T'%H:%M:%S.%f'%z'": 0.0,
                "%Y-%m-%d'T'%H:%M:%S.%f": 0.0,
                "%Y-%m-%d'T'%H:%M:%S'%z'": 0.0,
                "%Y-%m-%d'T'%H:%M:%S%z": 0.0,
                "%Y-%m-%d'T'%H:%M:%S": 0.0,
                "%Y-%m-%d %H:%M:%S.%f%z": 0.0,
                "%Y-%m-%d %H:%M:%S.%f": 0.0,
                "%Y-%m-%d %H:%M:%S,%f%z": 0.0,
                "%Y-%m-%d %H:%M:%S,%f": 0.0,
                "%Y-%m-%d %H:%M:%S%z": 0.0,
                "%Y-%m-%d %H:%M:%S %z": 0.0,
                "%Y-%m-%d": 0.0,
                "%Y%m%d %H:%M:%S.%f": 0.0,
                "%Y %b %d %H:%M:%S.%f*%Z": 0.0,
                "%Y %b %d %H:%M:%S.%f %Z": 0.0,
                "%Y %b %d %H:%M:%S.%f": 0.0,
                "%H:%M:%S.%f": 0.0,
                "%H:%M:%S,%f": 0.0,
                "%H:%M:%S": 0.0,
            },
        },
    }

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_date_format",
        expected_return_type=dict,
        domain=column_domain,
        parameters=parameters_by_domain_id,
    )

    assert actual_node == expected_node
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Estimate a "table.row_count" range with the "kde" estimator and check it.

    Three things are verified on the built parameter node: its details (after
    the value is blanked and the estimation histogram is removed), the range
    itself within a 1% relative tolerance, and the estimation histogram via a
    two-sample Kolmogorov-Smirnov test against a reference histogram.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        estimator="kde",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=1.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=data_context,
    )

    variables: Optional[ParameterContainer] = None
    domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    assert parameter_container.parameter_nodes is None

    range_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    built_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(built_nodes) == 1

    node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.row_count_range",
        domain=domain,
        parameters=parameters,
    )

    # Take the numeric parts out of the node so that what remains can be
    # compared by plain equality; they are checked separately below.
    actual_value: np.ndarray = node.pop("value")
    node["value"] = None
    actual_estimation_histogram: np.ndarray = node.details.pop("estimation_histogram")

    expected_node_without_numbers: dict = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }
    assert node == expected_node_without_numbers

    expected_value: np.ndarray = np.array([6180, 10277])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-2
    atol: float = 0

    # kde results should be stable +/- 1%
    np.testing.assert_allclose(
        actual=actual_value,
        desired=expected_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {actual_value} differs from expected value of {expected_value} by more than {atol + rtol * abs(expected_value)} tolerance.",
    )

    expected_estimation_histogram: np.ndarray = np.array(
        [13.0, 155.0, 719.0, 1546.0, 2221.0, 2570.0, 1946.0, 683.0, 137.0, 9.0]
    )

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_outcome: tuple = stats.ks_2samp(
        data1=actual_estimation_histogram,
        data2=expected_estimation_histogram,
    )
    assert ks_outcome[1] > 9.5e-1
def test_regex_two_candidates(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Build a regex-pattern parameter for column "b" from two candidate regexes.

    The mocked data context is wired to return the fixture batch and a
    pandas-backed Validator over it.  The test expects the one-digit regex to
    be selected with a success ratio of 1.0 and the three-digit regex to be
    reported with a ratio of 0.0.
    """
    mock_data_context.get_batch_list.return_value = [batch_fixture]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(),
        batches=[batch_fixture],
    )

    metric_domain_kwargs: dict = {"column": "b"}
    candidate_regexes: List[str] = [r"^\d{1}$", r"^\d{3}$"]

    regex_builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        data_context=mock_data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    assert parameter_container.parameter_nodes is None

    regex_builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_list=[batch_fixture],
    )

    # Selected regex.
    selected_regex = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_regex_pattern_string_parameter_builder.value",
        domain=domain,
        parameters=parameters,
    )
    assert selected_regex == "^\\d{1}$"

    # Per-candidate evaluation details.
    details: dict = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_regex_pattern_string_parameter_builder.details",
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )
    expected_details: dict = {
        "evaluated_regexes": {"^\\d{1}$": 1.0, "^\\d{3}$": 0.0},
        "success_ratio": 1.0,
    }
    assert details == expected_details
def test_partition_parameter_builder_alice_continuous_changed_to_categorical(
    alice_columnar_table_single_batch_context,
):
    """Request a bucketized partition of column "event_ts" and check the result.

    Although the builder is created with ``bucketize_data=True``, the expected
    node carries a "values"/"weights" partition computed from the
    "column.value_counts" metric (three values, equal weights).
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "event_ts"},
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    partition_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    one_third: float = 0.3333333333333333
    expected_node: dict = {
        "value": {
            "values": [
                "2004-10-19 10:23:54",
                "2004-10-19 10:23:55",
                "2004-10-19 11:05:20",
            ],
            "weights": [one_third, one_third, one_third],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.value_counts",
                "domain_kwargs": {"column": "event_ts"},
                "metric_value_kwargs": {"sort": "value"},
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    actual_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=partition_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert actual_node == expected_node
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Check the "oneshot" range estimate of "column.min" on "fare_amount" at two false-positive rates.

    The range is built first with ``false_positive_rate=1.0e-2`` and then
    rebuilt into the same parameter container with ``5.0e-2`` (passing
    ``recompute_existing_parameter_values=True``).  After each build the test
    checks the node details, the exact rounded bounds and the estimation
    histogram; it finally asserts that the higher rate gives the narrower range.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    metric_domain_kwargs: dict = {"column": "fare_amount"}
    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    variables: Optional[ParameterContainer] = None
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    # What the node must look like once its "value" has been blanked and its
    # "estimation_histogram" removed; identical for both builds.
    expected_value_dict: dict = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {"column": "fare_amount"},
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    # Reference histogram; identical for both builds.
    expected_estimation_histogram: np.ndarray = np.array(
        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0]
    )

    def make_builder(false_positive_rate: float) -> ParameterBuilder:
        """Create the range builder; only the false-positive rate varies between builds."""
        return NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=false_positive_rate,
            round_decimals=1,
            json_serialize=False,
            data_context=data_context,
        )

    def pop_range_and_histogram() -> tuple:
        """Fetch the built node, detach its numeric parts, and verify the remainder.

        Returns the pair ``(value, estimation_histogram)`` popped from the node.
        """
        node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
        value_range = node.pop("value")
        node["value"] = None
        estimation_histogram = node.details.pop("estimation_histogram")
        assert node == expected_value_dict
        return value_range, estimation_histogram

    def assert_histogram_matches_expected(actual_histogram: np.ndarray) -> None:
        """Assert no significant difference between expected (null hypothesis) and actual estimation histograms."""
        ks_result: tuple = stats.ks_2samp(
            data1=actual_histogram,
            data2=expected_estimation_histogram,
        )
        p_value: float = ks_result[1]
        assert p_value > 9.5e-1

    # --- First build: false_positive_rate = 1.0e-2 ---------------------------
    range_builder: ParameterBuilder = make_builder(false_positive_rate=1.0e-2)

    assert parameter_container.parameter_nodes is None

    range_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(parameter_nodes) == 1

    actual_values_01, actual_histogram_01 = pop_range_and_histogram()

    actual_value_01_lower: float = actual_values_01[0]
    actual_value_01_upper: float = actual_values_01[1]

    assert actual_value_01_lower == -51.7
    assert actual_value_01_upper == -21.0

    assert_histogram_matches_expected(actual_histogram_01)

    # --- Second build: false_positive_rate = 5.0e-2, overwriting the first ----
    range_builder = make_builder(false_positive_rate=5.0e-2)

    range_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        recompute_existing_parameter_values=True,
        batch_request=batch_request,
    )

    actual_values_05, actual_histogram_05 = pop_range_and_histogram()

    actual_value_05_lower: float = actual_values_05[0]
    actual_value_05_upper: float = actual_values_05[1]

    assert actual_value_05_lower == -50.5
    assert actual_value_05_upper == -21.1

    # if false positive rate is higher, our range should be more narrow
    assert actual_value_01_lower < actual_value_05_lower
    assert actual_value_01_upper > actual_value_05_upper

    assert_histogram_matches_expected(actual_histogram_05)
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Detect the date format of column "pickup_datetime" from two explicit candidates.

    The builder is given the candidates "%Y-%m-%d" and "%Y-%m-%d %H:%M:%S" and
    a threshold of 0.9.  The test expects the second candidate to be selected
    with a success ratio of 1.0 and the first to be reported with 0.0.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    metric_domain_kwargs: dict = {"column": "pickup_datetime"}
    candidate_strings: List[str] = [
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    ]
    threshold: float = 0.9
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    date_format_string_parameter: SimpleDateFormatStringParameterBuilder = (
        SimpleDateFormatStringParameterBuilder(
            name="my_simple_date_format_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_strings=candidate_strings,
            threshold=threshold,
            data_context=data_context,
        )
    )

    assert date_format_string_parameter._candidate_strings == set(candidate_strings)
    assert date_format_string_parameter._threshold == 0.9

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    date_format_string_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    # The container must have been populated by the build; accepting ``None``
    # here would let this check pass without anything having been built.
    assert parameter_container.parameter_nodes is not None
    assert len(parameter_container.parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_simple_date_format_string_parameter_builder.value"
    )
    expected_value: str = "%Y-%m-%d %H:%M:%S"
    # ``expected_return_type=str`` is requested, so the result is annotated as a string.
    actual_value: str = get_parameter_value_and_validate_return_type(
        parameter_reference=fully_qualified_parameter_name_for_value,
        expected_return_type=str,
        domain=domain,
        parameters=parameters,
    )
    assert actual_value == expected_value

    fully_qualified_parameter_name_for_meta: str = (
        "$parameter.my_simple_date_format_string_parameter_builder.details"
    )
    expected_meta: dict = {
        "success_ratio": 1.0,
        "candidate_strings": {"%Y-%m-%d": 0.0, "%Y-%m-%d %H:%M:%S": 1.0},
    }
    meta: dict = get_parameter_value_and_validate_return_type(
        parameter_reference=fully_qualified_parameter_name_for_meta,
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )
    assert meta == expected_meta
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Verify that overriding the KDE "bw_method" changes the estimated range: the same
    "column.min" range is built once without "bw_method" and once with bw_method=0.5,
    and the first element of the two resulting values must differ.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    def estimate_range(**estimator_overrides) -> np.ndarray:
        """Build the range in a fresh parameter container and return its popped "value"."""
        range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            estimator="kde",
            false_positive_rate=5.0e-2,
            round_decimals=0,
            json_serialize=False,
            data_context=data_context,
            **estimator_overrides,
        )

        variables: Optional[ParameterContainer] = None
        domain: Domain = Domain(
            rule_name="my_rule",
            domain_type=MetricDomainTypes.TABLE,
        )
        parameter_container: ParameterContainer = ParameterContainer(
            parameter_nodes=None
        )
        parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

        assert parameter_container.parameter_nodes is None

        range_builder.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            batch_request=batch_request,
        )

        built_nodes: Optional[Dict[str, ParameterNode]] = (
            parameter_container.parameter_nodes or {}
        )
        assert len(built_nodes) == 1

        node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="$parameter.column_min_range",
            domain=domain,
            parameters=parameters,
        )
        return node.pop("value")

    default_bw_method_value: np.ndarray = estimate_range()
    other_bw_method_value: np.ndarray = estimate_range(bw_method=0.5)

    assert default_bw_method_value[0] != other_bw_method_value[0]
def test_get_parameter_values_for_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    """Check the flat name-to-value mapping produced for a domain's parameters and variables.

    A parameter container is populated from the fixture values, a variables
    container is built from three literals, and the mapping returned by
    ``get_parameter_values_for_fully_qualified_parameter_names`` is compared
    with the expected one, keyed by fully-qualified parameter name.
    """
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=parameter_container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
        rule_name="my_rule",
    )

    # Convert variables argument to ParameterContainer
    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )

    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    # fmt: off
    weekdays = (
        "sunday", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    )
    # One row per entry of the weekly mean values, columns ordered as in "weekdays".
    weekly_rows = [
        (71.43, 74.35, 42.3, 42.3, 82.2, 78.78, 91.39),
        (81.43, 84.35, 52.3, 43.3, 22.2, 98.78, 81.39),
        (61.43, 34.35, 82.3, 72.3, 22.2, 38.78, 51.39),
        (51.43, 64.35, 72.3, 82.3, 22.2, 98.78, 31.39),
        (72.43, 77.35, 46.3, 47.3, 88.2, 79.78, 93.39),
        (72.43, 73.35, 41.3, 49.3, 80.2, 78.78, 93.39),
        (74.43, 78.35, 49.3, 43.3, 88.2, 72.78, 97.39),
        (73.43, 72.35, 40.3, 40.3, 89.2, 77.78, 90.39),
        (72.43, 73.35, 45.3, 44.3, 89.2, 77.78, 96.39),
        (75.43, 74.25, 42.33, 42.23, 82.21, 78.76, 91.37),
        (71.43, 74.37, 42.3, 42.32, 82.23, 78.77, 91.49),
        (71.63, 74.37, 42.2, 42.1, 82.29, 78.79, 91.39),
        (71.42, 74.33, 42.33, 42.34, 82.25, 78.77, 91.69),
        (71.44, 72.35, 42.33, 42.31, 82.29, 78.68, 91.49),
        (71.44, 74.32, 42.32, 42.32, 82.29, 78.77, 91.49),
        (71.44, 74.33, 42.21, 42.31, 82.27, 78.74, 91.49),
        (71.33, 74.25, 42.31, 42.03, 82.02, 78.08, 91.38),
        (71.41, 74.31, 42.39, 42.93, 82.92, 78.75, 91.49),
        (72.43, 73.35, 42.3, 32.3, 52.2, 88.78, 81.39),
        (71.43, 74.35, 32.3, 92.3, 72.2, 74.78, 51.39),
        (72.43, 64.35, 52.3, 42.39, 82.28, 78.77, 91.36),
        (81.43, 94.35, 62.3, 52.3, 92.2, 88.78, 51.39),
        (21.43, 34.35, 42.34, 62.3, 52.2, 98.78, 81.39),
        (71.33, 74.25, 42.13, 42.93, 82.82, 78.78, 91.39),
        (72.43, 73.35, 44.3, 45.3, 86.2, 77.78, 98.39),
        (79.43, 78.35, 47.3, 46.3, 85.2, 74.78, 93.39),
        (71.42, 74.31, 42.0, 42.1, 82.23, 65.78, 91.26),
        (91.43, 84.35, 42.37, 42.36, 82.25, 78.74, 91.32),
        (71.33, 74.45, 42.35, 42.36, 82.27, 26.78, 71.39),
        (71.53, 73.35, 43.32, 42.23, 82.32, 78.18, 91.49),
        (71.53, 74.25, 52.3, 52.3, 81.23, 78.78, 78.39),
    ]
    weekly_mean_values = [dict(zip(weekdays, row)) for row in weekly_rows]

    monthly_mean_values = [
        2.3, 9.8, 42.3, 8.1, 38.5, 53.7, 71.43, 16.34, 49.43, 74.35, 51.98,
        46.42, 20.01, 69.44, 65.32, 8.83, 55.79, 82.2, 36.93, 83.78, 31.13,
        76.93, 67.67, 25.12, 58.04, 79.78, 90.91, 15.26, 61.65, 78.78, 12.99,
    ]

    daily_mean_values = dict(
        zip(weekdays, (71.43, 74.35, 42.3, 42.3, 82.2, 78.78, 91.39))
    )

    expected_parameter_values_for_fully_qualified_parameter_names: Dict[str, ParameterNode] = {
        "$variables": {
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        },
        "$parameter.weekly_taxi_fairs.mean_values": {
            "value": weekly_mean_values,
            "details": {"confidence": "high"},
        },
        "$parameter.tolerances.mostly": 0.91,
        "$parameter.tolerances.financial.usd": 1.0,
        "$parameter.monthly_taxi_fairs.mean_values": {
            "value": monthly_mean_values,
            "details": {"confidence": "low"},
        },
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format": {
            "value": "%Y-%m-%d %H:%M:%S %Z",
            "details": {"confidence": 0.78},
        },
        "$parameter.date_strings.yyyy_mm_dd_date_format": {
            "value": "%Y-%m-%d",
            "details": {"confidence": 0.78},
        },
        "$parameter.date_strings.tolerances.max_num_conversion_attempts": 5,
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds": 100,
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format": {
            "value": "%m-%Y-%d %H:%M:%S %Z",
            "details": {"confidence": 0.78},
        },
        "$parameter.date_strings.mm_yyyy_dd_date_format": {
            "value": "%m-%Y-%d",
            "details": {"confidence": 0.78},
        },
        "$parameter.daily_taxi_fairs.mean_values": {
            "value": daily_mean_values,
            "details": {"confidence": "medium"},
        },
        "$mean": 0.65,
    }
    # fmt: on

    actual_parameter_values: Dict[
        str, ParameterNode
    ] = get_parameter_values_for_fully_qualified_parameter_names(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    assert (
        actual_parameter_values
        == expected_parameter_values_for_fully_qualified_parameter_names
    )