def test_profiler_parameter_builder_added(data_context_with_taxi_data):
    """
    What does this test and why?
    Adds a simple ParameterBuilder to a Rule: a MetricMultiBatchParameterBuilder
    computes the column minimum, which feeds the min_value parameter of
    expect_column_values_to_be_greater_than.
    """
    context: DataContext = data_context_with_taxi_data

    # Select only the most recent batch of the 2018 yellow-tripdata asset.
    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )

    # Domain: every column whose name ends in "_amount".
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )

    # Parameter: per-domain column minimum, published as "my_column_min".
    min_value_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        )
    )

    # Expectation: column values must exceed the computed minimum.
    expectation_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )

    rule: Rule = Rule(
        name="rule_with_variables_and_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[min_value_parameter_builder],
        expectation_configuration_builders=[expectation_builder],
    )

    profiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    profiler.add_rule(rule=rule)

    result: RuleBasedProfilerResult = profiler.run(batch_request=batch_request)
    expectation_configurations: List[
        ExpectationConfiguration
    ] = result.expectation_configurations
    # The "_amount" suffix filter resolves to 4 column domains for this asset,
    # so one expectation configuration is produced per domain.
    assert len(expectation_configurations) == 4
def get_table_columns_metric_multi_batch_parameter_builder() -> ParameterBuilder:
    """
    Instantiate one commonly used "MetricMultiBatchParameterBuilder": it
    resolves the "table.columns" metric over all batches, with no numeric
    enforcement (the metric yields column names, not numbers).
    """
    table_columns_metric: str = "table.columns"
    return MetricMultiBatchParameterBuilder(
        name=sanitize_parameter_name(name=table_columns_metric),
        metric_name=table_columns_metric,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        # No data context here; callers are expected to supply one at run time.
        data_context=None,
    )
def build_numeric_metric_multi_batch_parameter_builder(
    metric_name: str,
    metric_domain_kwargs: Optional[
        Union[str, dict]
    ] = DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
    metric_value_kwargs: Optional[Union[str, dict]] = None,
) -> MetricMultiBatchParameterBuilder:
    """
    Instantiate a "MetricMultiBatchParameterBuilder" configured for numeric
    metrics: numeric enforcement and NaN-to-zero replacement are both enabled.

    Args:
        metric_name: name of the numeric metric to resolve; also used (after
            sanitization) as the builder's parameter name.
        metric_domain_kwargs: domain kwargs (or "$domain..." reference) for the
            metric; defaults to the fully-qualified domain kwargs parameter.
        metric_value_kwargs: optional value kwargs for the metric.

    Returns:
        A configured MetricMultiBatchParameterBuilder (no data context bound).
    """
    return MetricMultiBatchParameterBuilder(
        name=sanitize_parameter_name(name=metric_name),
        metric_name=metric_name,
        metric_domain_kwargs=metric_domain_kwargs,
        metric_value_kwargs=metric_value_kwargs,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=None,
    )
def test_profiler_save_and_load(data_context_with_taxi_data):
    """
    What does this test and why?
    Verifies that context.save_profiler() persists changes made to a profiler
    that lives in the store (here: adding a rule), and that
    context.get_profiler() returns the expected RuleBasedProfiler back.
    """
    context: DataContext = data_context_with_taxi_data

    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )

    # parameter_builder: per-domain column minimum, published as "my_column_min".
    min_value_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        )
    )

    expectation_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )

    rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[min_value_parameter_builder],
        expectation_configuration_builders=[expectation_builder],
    )

    profiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )

    # A freshly constructed profiler carries no rules yet.
    assert profiler.config.to_json_dict() == {
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "rules": None,
        "variables": {},
    }

    profiler.add_rule(rule=rule)
    context.save_profiler(name="my_rbp", profiler=profiler)

    # Round-trip through the store and confirm the added rule was persisted.
    loaded_profiler: RuleBasedProfiler = context.get_profiler(name="my_rbp")
    assert loaded_profiler.config.to_json_dict() == {
        "module_name": "great_expectations.rule_based_profiler",
        "class_name": "RuleBasedProfiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "variables": {},
        "rules": {
            "rule_with_no_variables_no_parameters": {
                "domain_builder": {
                    "module_name": "great_expectations.rule_based_profiler.domain_builder.column_domain_builder",
                    "class_name": "ColumnDomainBuilder",
                    "include_column_name_suffixes": [
                        "_amount",
                    ],
                },
                "variables": {},
                "parameter_builders": [
                    {
                        "module_name": "great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder",
                        "class_name": "MetricMultiBatchParameterBuilder",
                        "name": "my_column_min",
                        "metric_name": "column.min",
                        "metric_domain_kwargs": "$domain.domain_kwargs",
                        "enforce_numeric_metric": False,
                        "replace_nan_with_zero": False,
                        "reduce_scalar_metric": True,
                        "evaluation_parameter_builder_configs": None,
                    },
                ],
                "expectation_configuration_builders": [
                    {
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder.default_expectation_configuration_builder",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "expectation_type": "expect_column_values_to_be_greater_than",
                        "meta": {},
                        "column": "$domain.domain_kwargs.column",
                        "validation_parameter_builder_configs": None,
                        "value": "$parameter.my_column_min.value[-1]",
                    },
                ],
            },
        },
    }
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_numeric_dependencies_evaluated_separately(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Builds the two prerequisite count parameters ("my_total_count" and
    "my_null_count") as separate, explicit steps, then evaluates
    MeanUnexpectedMapMetricMultiBatchParameterBuilder — which reads both by
    name — and checks the resulting mean-unexpected value is ~0.0.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # Dependency 1: per-batch total row count.
    total_count_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_total_count",
            metric_name="table.row_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            enforce_numeric_metric=False,
            replace_nan_with_zero=False,
            reduce_scalar_metric=True,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )
    # Dependency 2: per-batch null (nonnull-unexpected) count.
    null_count_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_null_count",
            metric_name="column_values.nonnull.unexpected_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            enforce_numeric_metric=False,
            replace_nan_with_zero=False,
            reduce_scalar_metric=True,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )
    # Builder under test: references the two count builders above by name.
    mean_unexpected_builder: ParameterBuilder = (
        MeanUnexpectedMapMetricMultiBatchParameterBuilder(
            name="my_passenger_count_values_not_null_mean_unexpected_map_metric",
            map_metric_name="column_values.nonnull",
            total_count_parameter_builder_name="my_total_count",
            null_count_parameter_builder_name="my_null_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )

    metric_domain_kwargs: dict = {"column": "passenger_count"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    variables: Optional[ParameterContainer] = None
    parameters: Dict[str, ParameterContainer] = {
        domain.id: ParameterContainer(parameter_nodes=None),
    }

    # Evaluate dependencies first, then the dependent builder — this mirrors
    # the "evaluated separately" scenario in the test name.
    for builder in (
        total_count_builder,
        null_count_builder,
        mean_unexpected_builder,
    ):
        builder.build_parameters(
            domain=domain,
            variables=variables,
            parameters=parameters,
            batch_request=batch_request,
        )

    expected_parameter_value: float = 0.0
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=mean_unexpected_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    rtol: float = RTOL
    atol: float = 5.0e-1 * ATOL
    np.testing.assert_allclose(
        actual=parameter_node.value,
        desired=expected_parameter_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {parameter_node.value} differs from expected value of {expected_parameter_value} by more than {atol + rtol * abs(parameter_node.value)} tolerance.",
    )
def test_default_expectation_configuration_builder_alice_parentheses_parameter_variable_condition_true(
    alice_columnar_table_single_batch_context,
):
    """
    A parenthesized condition mixing $variables and $parameter references
    evaluates to True, so the builder emits an expectation configuration whose
    min_value is the computed column minimum (397433 for this fixture).
    """
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        )
    )

    variables: ParameterContainer = build_parameter_container_for_variables(
        {"max_user_id": 999999999999, "answer": 42}
    )
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: ParameterContainer(parameter_nodes=None),
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id.value[0]"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    # (True & True) | <anything> — the parenthesized clause already holds.
    condition: str = "($variables.max_user_id>0 & $variables.answer==42) | $parameter.my_min_user_id.value[0]<0"
    max_value: str = "$variables.max_user_id"

    default_expectation_configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=parameter_value,
        max_value=max_value,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration
    ] = default_expectation_configuration_builder.build_expectation_configuration(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    assert expectation_configuration.kwargs["min_value"] == 397433
def test_condition_not_string_exception(
    alice_columnar_table_single_batch_context,
):
    """
    Passing a non-string (dict) "condition" to
    DefaultExpectationConfigurationBuilder must raise ProfilerExecutionError
    with a message naming the offending argument and its type.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        )
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: ParameterContainer(parameter_nodes=None),
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    # Deliberately wrong type: "condition" must be a string.
    condition: dict = {"condition": "$variables.tolerance<0.8"}
    max_user_id: int = 999999999999

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as excinfo:
        # noinspection PyTypeChecker
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=condition,
            min_value=parameter_value.value[0],
            max_value=max_user_id,
        )

    assert (
        str(excinfo.value)
        == 'Argument "{\'condition\': \'$variables.tolerance<0.8\'}" in "DefaultExpectationConfigurationBuilder" must be of type "string" (value of type "<class \'dict\'>" was encountered).\n'
    )
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_separate(
    alice_columnar_table_single_batch_context,
):
    """
    With condition=None, DefaultExpectationConfigurationBuilder emits the
    expectation configuration unconditionally; min_value comes from a
    separately evaluated "my_min_user_id" parameter (397433 for this fixture).
    """
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        )
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: ParameterContainer(parameter_nodes=None),
    }

    # Evaluate the parameter builder on its own, outside any rule.
    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition: Optional[str] = None
    max_user_id: int = 999999999999

    default_expectation_configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=parameter_value.value[0],
        max_value=max_user_id,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration
    ] = default_expectation_configuration_builder.build_expectation_configuration(
        domain=domain,
        parameters=parameters,
    )

    assert expectation_configuration.kwargs["min_value"] == 397433
def metrics_parameter_builders_by_domain(
    self,
) -> Dict[Domain, List[ParameterBuilder]]:
    """
    Map each metric Domain to the ParameterBuilders evaluated for it:
    the table row count on the TABLE domain and the distinct-values count on
    the COLUMN domain. Both builders enforce numeric metrics and replace NaN
    with zero; neither binds a data context here.
    """
    row_count_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="table_row_count",
            metric_name="table.row_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            enforce_numeric_metric=True,
            replace_nan_with_zero=True,
            reduce_scalar_metric=True,
            evaluation_parameter_builder_configs=None,
            json_serialize=True,
            data_context=None,
        )
    )
    distinct_values_count_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="column_distinct_values.count",
            metric_name="column.distinct_values.count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            enforce_numeric_metric=True,
            replace_nan_with_zero=True,
            reduce_scalar_metric=True,
            evaluation_parameter_builder_configs=None,
            json_serialize=True,
            data_context=None,
        )
    )
    return {
        Domain(domain_type=MetricDomainTypes.TABLE): [
            row_count_builder,
        ],
        Domain(domain_type=MetricDomainTypes.COLUMN): [
            distinct_values_count_builder,
        ],
    }