def test_profiler_parameter_builder_added(data_context_with_taxi_data):
    """
    What does this test and why?

    This test now adds a simple ParameterBuilder to our Rule. More specifically,
    we use a MetricMultiBatchParameterBuilder to pass in the min_value parameter
    to expect_column_values_to_be_greater_than.
    """
    context: DataContext = data_context_with_taxi_data
    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    # parameter_builder
    numeric_range_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        )
    )
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )
    simple_rule: Rule = Rule(
        name="rule_with_variables_and_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[numeric_range_parameter_builder],
        expectation_configuration_builders=[config_builder],
    )
    my_rbp = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    my_rbp.add_rule(rule=simple_rule)
    result: RuleBasedProfilerResult = my_rbp.run(batch_request=batch_request)
    expectation_configurations: List[ExpectationConfiguration] = (
        result.expectation_configurations
    )
    assert len(expectation_configurations) == 4

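# Rough sketch of what the parameter built by MetricMultiBatchParameterBuilder in the
# test above might resolve to. The shape and the numeric values are assumptions for
# illustration only (not asserted by the test): "value" is assumed to hold one
# column.min result per batch, so "$parameter.my_column_min.value[-1]" would select the
# value computed against the most recent batch.
_assumed_parameter_node: Dict[str, Any] = {
    "my_column_min": {
        "value": [2.5, 3.0],  # hypothetical column.min per batch, oldest to newest
        "details": {"metric_configuration": {"metric_name": "column.min"}},
    },
}
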
def test_add_rule_and_run_profiler(data_context_with_taxi_data):
    """
    What does this test and why?

    This is the first test where we build a Rule in memory and use the add_rule()
    method to add it to our RuleBasedProfiler and run the profiler. We use the
    DomainBuilder from the previous test (against "_amount" columns) and an
    ExpectationConfigurationBuilder that uses expect_column_values_to_not_be_null,
    because it only needs a domain value.

    The test asserts that the profiler returns 4 Expectations, one per column in
    our domain.
    """
    context: DataContext = data_context_with_taxi_data
    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    default_expectation_configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_not_be_null",
        column="$domain.domain_kwargs.column",
    )
    simple_rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=domain_builder,
        expectation_configuration_builders=[default_expectation_configuration_builder],
    )
    my_rbp: RuleBasedProfiler = RuleBasedProfiler(
        name="my_simple_rbp",
        config_version=1.0,
        data_context=context,
    )
    my_rbp.add_rule(rule=simple_rule)
    result: RuleBasedProfilerResult = my_rbp.run(batch_request=batch_request)
    expectation_configurations: List[ExpectationConfiguration] = (
        result.expectation_configurations
    )
    assert len(expectation_configurations) == 4

def _init_rule(
    self,
    rule_name: str,
    rule_config: Dict[str, Any],
) -> Rule:
    # Config is validated through schema, but do a sanity check.
    attr: str
    for attr in (
        "domain_builder",
        "expectation_configuration_builders",
    ):
        if attr not in rule_config:
            raise ge_exceptions.ProfilerConfigurationError(
                message=f'Invalid rule "{rule_name}": missing mandatory {attr}.'
            )

    # Instantiate builder attributes.
    domain_builder: DomainBuilder = RuleBasedProfiler._init_rule_domain_builder(
        domain_builder_config=rule_config["domain_builder"],
        data_context=self._data_context,
    )
    parameter_builders: Optional[
        List[ParameterBuilder]
    ] = RuleBasedProfiler._init_rule_parameter_builders(
        parameter_builder_configs=rule_config.get("parameter_builders"),
        data_context=self._data_context,
    )
    expectation_configuration_builders: List[
        ExpectationConfigurationBuilder
    ] = RuleBasedProfiler._init_rule_expectation_configuration_builders(
        expectation_configuration_builder_configs=rule_config[
            "expectation_configuration_builders"
        ]
    )

    # Compile the previous steps and package them into a Rule object.
    return Rule(
        name=rule_name,
        domain_builder=domain_builder,
        parameter_builders=parameter_builders,
        expectation_configuration_builders=expectation_configuration_builders,
    )

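# Illustrative sketch (not part of the library): a rule_config dictionary of the shape
# _init_rule() expects. The keys mirror the serialized rule shown in
# test_profiler_save_and_load below; "parameter_builders" is optional, while
# "domain_builder" and "expectation_configuration_builders" are mandatory.
_example_rule_config: Dict[str, Any] = {
    "domain_builder": {
        "class_name": "ColumnDomainBuilder",
        "module_name": "great_expectations.rule_based_profiler.domain_builder",
        "include_column_name_suffixes": ["_amount"],
    },
    "parameter_builders": [
        {
            "class_name": "MetricMultiBatchParameterBuilder",
            "module_name": "great_expectations.rule_based_profiler.parameter_builder",
            "name": "my_column_min",
            "metric_name": "column.min",
            "metric_domain_kwargs": "$domain.domain_kwargs",
        },
    ],
    "expectation_configuration_builders": [
        {
            "class_name": "DefaultExpectationConfigurationBuilder",
            "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
            "expectation_type": "expect_column_values_to_be_greater_than",
            "column": "$domain.domain_kwargs.column",
            "value": "$parameter.my_column_min.value[-1]",
        },
    ],
}
# Hypothetical usage:
#   rule = profiler._init_rule(rule_name="my_rule", rule_config=_example_rule_config)
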
def __init__(
    self,
    *,
    profiler_config: Optional[Dict[str, Dict[str, Dict[str, Any]]]] = None,
    data_context: Optional[DataContext] = None,
):
    """
    Create a new Profiler using configured rules.

    For a rule or an item in a rule configuration, instantiates the following if
    available: a domain builder, a parameter builder, and a configuration builder.
    These will be used to define profiler computation patterns.

    Args:
        profiler_config: Variables and Rules configuration as a dictionary
        data_context: DataContext object that defines a full runtime environment
            (data access, etc.)
    """
    self._profiler_config = profiler_config
    self._data_context = data_context
    self._rules = []

    # Guard against profiler_config being None (the parameter is Optional).
    rules_configs: Dict[str, Dict[str, Any]] = (self._profiler_config or {}).get(
        "rules", {}
    )
    rule_name: str
    rule_config: Dict[str, Any]

    for rule_name, rule_config in rules_configs.items():
        domain_builder_config: dict = rule_config.get("domain_builder")
        if domain_builder_config is None:
            raise ge_exceptions.ProfilerConfigurationError(
                message=f'Invalid rule "{rule_name}": no domain_builder found.'
            )
        domain_builder: DomainBuilder = instantiate_class_from_config(
            config=domain_builder_config,
            runtime_environment={"data_context": data_context},
            config_defaults={
                "module_name": "great_expectations.rule_based_profiler.domain_builder"
            },
        )

        parameter_builders: List[ParameterBuilder] = []
        parameter_builder_configs: Optional[List[dict]] = rule_config.get(
            "parameter_builders"
        )
        if parameter_builder_configs:
            parameter_builder_config: dict
            for parameter_builder_config in parameter_builder_configs:
                parameter_builders.append(
                    instantiate_class_from_config(
                        config=parameter_builder_config,
                        runtime_environment={"data_context": data_context},
                        config_defaults={
                            "module_name": "great_expectations.rule_based_profiler.parameter_builder"
                        },
                    )
                )

        expectation_configuration_builders: List[ExpectationConfigurationBuilder] = []
        expectation_configuration_builder_configs: Optional[List[dict]] = rule_config.get(
            "expectation_configuration_builders"
        )
        if expectation_configuration_builder_configs:
            expectation_configuration_builder_config: dict
            for (
                expectation_configuration_builder_config
            ) in expectation_configuration_builder_configs:
                expectation_configuration_builders.append(
                    instantiate_class_from_config(
                        config=expectation_configuration_builder_config,
                        runtime_environment={},
                        config_defaults={
                            "class_name": "DefaultExpectationConfigurationBuilder",
                            "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        },
                    )
                )

        variables_configs: Dict[str, Dict] = (self._profiler_config or {}).get(
            "variables", {}
        )
        variables: Optional[ParameterContainer] = None
        if variables_configs:
            variables = build_parameter_container_for_variables(
                variables_configs=variables_configs
            )

        self._rules.append(
            Rule(
                name=rule_name,
                domain_builder=domain_builder,
                parameter_builders=parameter_builders,
                expectation_configuration_builders=expectation_configuration_builders,
                variables=variables,
            )
        )

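# Illustrative sketch (assumed names and values, not taken from the tests in this file):
# the overall shape of the profiler_config dictionary consumed by __init__ above -- a
# "variables" section plus a "rules" section keyed by rule name. The expectation
# configuration builder may omit class_name/module_name because __init__ supplies
# DefaultExpectationConfigurationBuilder via config_defaults; "$variables.<name>" is the
# reference syntax for values declared under "variables".
_example_profiler_config: dict = {
    "variables": {"min_allowed_value": 0},
    "rules": {
        "amount_columns_rule": {
            "domain_builder": {
                "class_name": "ColumnDomainBuilder",
                "include_column_name_suffixes": ["_amount"],
            },
            "expectation_configuration_builders": [
                {
                    "expectation_type": "expect_column_values_to_be_greater_than",
                    "column": "$domain.domain_kwargs.column",
                    "value": "$variables.min_allowed_value",
                },
            ],
        },
    },
}
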
def test_profiler_save_and_load(data_context_with_taxi_data):
    """
    What does this test and why?

    This tests whether context.save_profiler() can be invoked to update a profiler
    that lives in the Store. It ensures that any changes we make to the Profiler,
    such as adding a rule, are persisted, and that context.save_profiler() and
    context.get_profiler() return the expected RBP.
    """
    context: DataContext = data_context_with_taxi_data
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    # parameter_builder
    numeric_range_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        )
    )
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )
    simple_variables_rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[numeric_range_parameter_builder],
        expectation_configuration_builders=[config_builder],
    )
    my_rbp = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    res: dict = my_rbp.config.to_json_dict()
    assert res == {
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "rules": None,
        "variables": {},
    }
    my_rbp.add_rule(rule=simple_variables_rule)
    context.save_profiler(name="my_rbp", profiler=my_rbp)

    # load profiler from store
    my_loaded_profiler: RuleBasedProfiler = context.get_profiler(name="my_rbp")
    res = my_loaded_profiler.config.to_json_dict()
    assert res == {
        "module_name": "great_expectations.rule_based_profiler",
        "class_name": "RuleBasedProfiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "variables": {},
        "rules": {
            "rule_with_no_variables_no_parameters": {
                "domain_builder": {
                    "module_name": "great_expectations.rule_based_profiler.domain_builder.column_domain_builder",
                    "class_name": "ColumnDomainBuilder",
                    "include_column_name_suffixes": [
                        "_amount",
                    ],
                },
                "variables": {},
                "parameter_builders": [
                    {
                        "module_name": "great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder",
                        "class_name": "MetricMultiBatchParameterBuilder",
                        "name": "my_column_min",
                        "metric_name": "column.min",
                        "metric_domain_kwargs": "$domain.domain_kwargs",
                        "enforce_numeric_metric": False,
                        "replace_nan_with_zero": False,
                        "reduce_scalar_metric": True,
                        "evaluation_parameter_builder_configs": None,
                    },
                ],
                "expectation_configuration_builders": [
                    {
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder.default_expectation_configuration_builder",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "expectation_type": "expect_column_values_to_be_greater_than",
                        "meta": {},
                        "column": "$domain.domain_kwargs.column",
                        "validation_parameter_builder_configs": None,
                        "value": "$parameter.my_column_min.value[-1]",
                    },
                ],
            },
        },
    }

def _reconcile_rule_expectation_configuration_builder_configs(
    rule: Rule,
    expectation_configuration_builder_configs: List[dict],
) -> List[dict]:
    """
    Rule "expectation configuration builders" reconciliation involves combining the
    expectation configuration builders, instantiated from the Rule configuration
    (e.g., stored in a YAML file managed by the Profiler store), with the expectation
    configuration builder overrides, possibly supplied as part of the candidate
    override rule configuration.

    The reconciliation logic for "expectation configuration builders" is of the
    "upsert" nature: a candidate override expectation configuration builder
    configuration contributes to the expectation configuration builders list of the
    rule if the corresponding expectation configuration builder name does not exist
    in the list of instantiated expectation configuration builders of the rule;
    otherwise, once instantiated, it replaces the configuration associated with the
    original expectation configuration builder having the same name.

    :param rule: Profiler "rule", subject to expectation configuration builder overrides
    :param expectation_configuration_builder_configs: expectation configuration builder
        configuration overrides, supplied in dictionary (configuration) form
    :return: reconciled expectation configuration builder configurations, returned in
        dictionary (configuration) form
    """
    expectation_configuration_builder_config: dict
    for (
        expectation_configuration_builder_config
    ) in expectation_configuration_builder_configs:
        _validate_builder_override_config(
            builder_config=expectation_configuration_builder_config
        )

    effective_expectation_configuration_builder_configs: Dict[str, dict] = {}

    current_expectation_configuration_builders: Dict[
        str, ExpectationConfigurationBuilder
    ] = rule._get_expectation_configuration_builders_as_dict()

    expectation_configuration_builder_name: str
    expectation_configuration_builder: ExpectationConfigurationBuilder
    expectation_configuration_builder_as_dict: dict
    for (
        expectation_configuration_builder_name,
        expectation_configuration_builder,
    ) in current_expectation_configuration_builders.items():
        expectation_configuration_builder_as_dict = (
            expectation_configuration_builder.to_dict()
        )
        expectation_configuration_builder_as_dict[
            "class_name"
        ] = expectation_configuration_builder.__class__.__name__
        expectation_configuration_builder_as_dict[
            "module_name"
        ] = expectation_configuration_builder.__class__.__module__

        # Roundtrip through schema validation to add and/or restore any missing fields.
        deserialized_config: ExpectationConfigurationBuilderConfig = (
            expectationConfigurationBuilderConfigSchema.load(
                expectation_configuration_builder_as_dict
            )
        )
        serialized_config: dict = deserialized_config.to_dict()

        effective_expectation_configuration_builder_configs[
            expectation_configuration_builder_name
        ] = serialized_config

    effective_expectation_configuration_builder_configs = nested_update(
        effective_expectation_configuration_builder_configs,
        {
            expectation_configuration_builder_config["expectation_type"]: expectation_configuration_builder_config
            for expectation_configuration_builder_config in expectation_configuration_builder_configs
        },
        dedup=True,
    )

    if not effective_expectation_configuration_builder_configs:
        return []

    return list(effective_expectation_configuration_builder_configs.values())

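# Simplified sketch of the "upsert" reconciliation described in the docstring above.
# Assumptions: the function name is hypothetical, plain dict merging stands in for
# nested_update(..., dedup=True) (which deep-merges nested dictionaries and
# de-duplicates lists), and builder names coincide with their expectation_type, as they
# do for DefaultExpectationConfigurationBuilder. An override with a new name is
# inserted; an override with an existing name updates that builder's configuration.
def _demo_upsert_reconciliation(
    current_configs: Dict[str, dict],
    override_configs: List[dict],
) -> List[dict]:
    effective: Dict[str, dict] = dict(current_configs)
    override: dict
    for override in override_configs:
        name: str = override["expectation_type"]
        if name in effective:
            # "update" half of the upsert: merge the override over the existing config.
            effective[name] = {**effective[name], **override}
        else:
            # "insert" half of the upsert: append a builder not previously in the rule.
            effective[name] = override
    return list(effective.values())
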