示例#1
0
def test_profiler_parameter_builder_added(data_context_with_taxi_data):
    """
    What does this test and why?

    Runs a RuleBasedProfiler whose single Rule contains a ParameterBuilder.
    A MetricMultiBatchParameterBuilder computes the "column.min" metric for each
    domain column, and the resulting parameter is referenced as the "value" argument
    of an expect_column_values_to_be_greater_than ExpectationConfiguration.

    The test asserts that the profiler run produces 4 ExpectationConfigurations.
    """
    context: DataContext = data_context_with_taxi_data

    # Domain: columns selected by the "_amount" name suffix.
    amount_columns: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )

    # Parameter: "column.min" evaluated over the domain, published as "my_column_min".
    column_min: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        data_context=context,
        metric_name="column.min",
        metric_domain_kwargs="$domain.domain_kwargs",
        name="my_column_min",
    )

    # Expectation: consumes the last element of the computed parameter value.
    greater_than_min: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )

    rule: Rule = Rule(
        name="rule_with_variables_and_parameters",
        variables=None,
        domain_builder=amount_columns,
        parameter_builders=[column_min],
        expectation_configuration_builders=[greater_than_min],
    )

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    profiler.add_rule(rule=rule)

    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    result: RuleBasedProfilerResult = profiler.run(batch_request=batch_request)

    produced: List[ExpectationConfiguration] = result.expectation_configurations
    assert len(produced) == 4
示例#2
0
def test_add_rule_and_run_profiler(data_context_with_taxi_data):
    """
    What does this test and why?

    Builds a Rule in memory, attaches it to a RuleBasedProfiler through add_rule(),
    and runs the profiler. The Rule combines a ColumnDomainBuilder (columns selected by
    the "_amount" suffix) with a DefaultExpectationConfigurationBuilder for
    expect_column_values_to_not_be_null, which needs nothing beyond the domain column.

    The test asserts that the profiler returns 4 ExpectationConfigurations.
    """
    context: DataContext = data_context_with_taxi_data

    # No parameter builders are supplied: the expectation only references the domain.
    rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=ColumnDomainBuilder(
            include_column_name_suffixes=["_amount"],
            data_context=context,
        ),
        expectation_configuration_builders=[
            DefaultExpectationConfigurationBuilder(
                expectation_type="expect_column_values_to_not_be_null",
                column="$domain.domain_kwargs.column",
            ),
        ],
    )

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_simple_rbp",
        config_version=1.0,
        data_context=context,
    )
    profiler.add_rule(rule=rule)

    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    result: RuleBasedProfilerResult = profiler.run(batch_request=batch_request)

    produced: List[ExpectationConfiguration] = result.expectation_configurations
    assert len(produced) == 4
    def _init_rule(
        self,
        rule_name: str,
        rule_config: Dict[str, Any],
    ) -> Rule:
        """
        Assemble a Rule object from its dictionary configuration.

        Args:
            rule_name: name given to the resulting Rule (also quoted in error messages)
            rule_config: rule configuration; "domain_builder" and
                "expectation_configuration_builders" are mandatory keys, while
                "parameter_builders" is optional

        Returns:
            Rule built from the instantiated domain, parameter, and expectation
            configuration builders

        Raises:
            ProfilerConfigurationError: if a mandatory key is absent from rule_config
        """
        # Config is validated through schema, but double-check the mandatory keys;
        # the first missing one (in the order listed) is reported.
        mandatory_attrs = ("domain_builder", "expectation_configuration_builders")
        attr: Optional[str] = next(
            (name for name in mandatory_attrs if name not in rule_config),
            None,
        )
        if attr is not None:
            raise ge_exceptions.ProfilerConfigurationError(
                message=f'Invalid rule "{rule_name}": missing mandatory {attr}.'
            )

        # Keyword arguments are evaluated in order, so the builders are instantiated in
        # the sequence: domain builder, parameter builders, expectation configuration
        # builders.
        return Rule(
            name=rule_name,
            domain_builder=RuleBasedProfiler._init_rule_domain_builder(
                domain_builder_config=rule_config["domain_builder"],
                data_context=self._data_context,
            ),
            parameter_builders=RuleBasedProfiler._init_rule_parameter_builders(
                parameter_builder_configs=rule_config.get("parameter_builders"),
                data_context=self._data_context,
            ),
            expectation_configuration_builders=RuleBasedProfiler._init_rule_expectation_configuration_builders(
                expectation_configuration_builder_configs=rule_config[
                    "expectation_configuration_builders"
                ]
            ),
        )
示例#4
0
    def __init__(
        self,
        *,
        profiler_config: Optional[Dict[str, Dict[str, Dict[str, Any]]]] = None,
        data_context: Optional[DataContext] = None,
    ):
        """
        Create a new Profiler using configured rules.
        For a rule or an item in a rule configuration, instantiates the following if
        available: a domain builder, a parameter builder, and a configuration builder.
        These will be used to define profiler computation patterns.

        Args:
            profiler_config: Variables and Rules configuration as a dictionary; when omitted
                (None), the profiler is created with no rules and no variables
            data_context: DataContext object that defines a full runtime environment (data access, etc.)

        Raises:
            ProfilerConfigurationError: if a rule configuration has no "domain_builder" entry
        """
        # The declared default is None; without this guard the ".get()" calls below
        # would raise AttributeError for a caller relying on the default.
        if profiler_config is None:
            profiler_config = {}

        self._profiler_config = profiler_config
        self._data_context = data_context
        self._rules = []

        # "or {}" also covers keys that are present but explicitly set to None.
        rules_configs: Dict[str, Dict[str, Any]] = profiler_config.get("rules") or {}

        # Variables are configured once per profiler, so look them up outside the loop.
        variables_configs: Dict[str, Dict] = profiler_config.get("variables") or {}

        rule_name: str
        rule_config: Dict[str, Any]
        for rule_name, rule_config in rules_configs.items():
            domain_builder_config: Optional[dict] = rule_config.get("domain_builder")
            if domain_builder_config is None:
                raise ge_exceptions.ProfilerConfigurationError(
                    message=f'Invalid rule "{rule_name}": no domain_builder found.'
                )

            domain_builder: DomainBuilder = instantiate_class_from_config(
                config=domain_builder_config,
                runtime_environment={"data_context": data_context},
                config_defaults={
                    "module_name": "great_expectations.rule_based_profiler.domain_builder"
                },
            )

            # Parameter builders are optional; a missing or empty entry yields an empty list.
            parameter_builder_configs: Optional[List[dict]] = rule_config.get(
                "parameter_builders"
            )
            parameter_builders: List[ParameterBuilder] = [
                instantiate_class_from_config(
                    config=parameter_builder_config,
                    runtime_environment={"data_context": data_context},
                    config_defaults={
                        "module_name": "great_expectations.rule_based_profiler.parameter_builder"
                    },
                )
                for parameter_builder_config in parameter_builder_configs or []
            ]

            # Expectation configuration builders default to
            # DefaultExpectationConfigurationBuilder when "class_name" is not configured.
            expectation_configuration_builder_configs: Optional[List[dict]] = (
                rule_config.get("expectation_configuration_builders")
            )
            expectation_configuration_builders: List[ExpectationConfigurationBuilder] = [
                instantiate_class_from_config(
                    config=expectation_configuration_builder_config,
                    runtime_environment={},
                    config_defaults={
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                    },
                )
                for expectation_configuration_builder_config in (
                    expectation_configuration_builder_configs or []
                )
            ]

            # A separate container is built for every rule, so rules do not share one
            # ParameterContainer instance.
            variables: Optional[ParameterContainer] = None
            if variables_configs:
                variables = build_parameter_container_for_variables(
                    variables_configs=variables_configs
                )

            self._rules.append(
                Rule(
                    name=rule_name,
                    domain_builder=domain_builder,
                    parameter_builders=parameter_builders,
                    expectation_configuration_builders=expectation_configuration_builders,
                    variables=variables,
                )
            )
示例#5
0
def test_profiler_save_and_load(data_context_with_taxi_data):
    """
    What does this test and why?

    Checks that a RuleBasedProfiler can be written with context.save_profiler() and
    read back with context.get_profiler(), and that a change made to the profiler
    before saving (here: adding a rule) is present in the loaded profiler.

    The serialized configuration is compared twice: once for the freshly created
    profiler (no rules) and once for the profiler loaded back by name.
    """
    context: DataContext = data_context_with_taxi_data

    amount_columns: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    column_min: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        data_context=context,
        metric_name="column.min",
        metric_domain_kwargs="$domain.domain_kwargs",
        name="my_column_min",
    )
    greater_than_min: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )
    rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=amount_columns,
        parameter_builders=[column_min],
        expectation_configuration_builders=[greater_than_min],
    )

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )

    # Profiler-level fields that are expected to be identical before and after the
    # save/load round trip; only "rules" differs between the two comparisons.
    expected_profiler_fields: dict = {
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "variables": {},
    }

    # Before any rule is added, "rules" serializes as None.
    assert profiler.config.to_json_dict() == {
        **expected_profiler_fields,
        "rules": None,
    }

    profiler.add_rule(rule=rule)
    context.save_profiler(name="my_rbp", profiler=profiler)

    # load profiler from store
    loaded_profiler: RuleBasedProfiler = context.get_profiler(name="my_rbp")

    expected_domain_builder: dict = {
        "module_name": "great_expectations.rule_based_profiler.domain_builder.column_domain_builder",
        "class_name": "ColumnDomainBuilder",
        "include_column_name_suffixes": [
            "_amount",
        ],
    }
    expected_parameter_builder: dict = {
        "module_name": "great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder",
        "class_name": "MetricMultiBatchParameterBuilder",
        "name": "my_column_min",
        "metric_name": "column.min",
        "metric_domain_kwargs": "$domain.domain_kwargs",
        "enforce_numeric_metric": False,
        "replace_nan_with_zero": False,
        "reduce_scalar_metric": True,
        "evaluation_parameter_builder_configs": None,
    }
    expected_expectation_configuration_builder: dict = {
        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder.default_expectation_configuration_builder",
        "class_name": "DefaultExpectationConfigurationBuilder",
        "expectation_type": "expect_column_values_to_be_greater_than",
        "meta": {},
        "column": "$domain.domain_kwargs.column",
        "validation_parameter_builder_configs": None,
        "value": "$parameter.my_column_min.value[-1]",
    }

    assert loaded_profiler.config.to_json_dict() == {
        **expected_profiler_fields,
        "rules": {
            "rule_with_no_variables_no_parameters": {
                "domain_builder": expected_domain_builder,
                "variables": {},
                "parameter_builders": [expected_parameter_builder],
                "expectation_configuration_builders": [
                    expected_expectation_configuration_builder,
                ],
            },
        },
    }
    def _reconcile_rule_expectation_configuration_builder_configs(
            rule: Rule, expectation_configuration_builder_configs: List[dict]
    ) -> List[dict]:
        """
        Reconcile the expectation configuration builders of a Rule with candidate override configurations.

        Every override configuration is validated first.  The expectation configuration builders currently attached to
        the rule are then converted to dictionary form (with "class_name" and "module_name" added, followed by a
        roundtrip through the configuration schema) and collected under the keys returned by the rule.  Finally, the
        overrides, keyed by their "expectation_type", are combined with the collected configurations by means of
        "nested_update()" with "dedup=True".

        NOTE(review): the reconciliation is intended to behave as an "upsert" (an override either adds a new builder
        configuration or replaces the existing one under the same key); this relies on the keys returned by the rule
        matching the "expectation_type" of the overrides -- confirm against "Rule" and "nested_update()".

        :param rule: Profiler "rule", subject to expectations configuration builder overrides
        :param expectation_configuration_builder_configs: expectation configuration builder configuration overrides, supplied in dictionary (configuration) form
        :return: reconciled expectation configuration builder configurations, returned in dictionary (configuration) form
        """
        override_config: dict
        for override_config in expectation_configuration_builder_configs:
            _validate_builder_override_config(builder_config=override_config)

        reconciled_configs: Dict[str, dict] = {}

        existing_builders: Dict[str, ExpectationConfigurationBuilder] = (
            rule._get_expectation_configuration_builders_as_dict()
        )

        builder_key: str
        builder: ExpectationConfigurationBuilder
        for builder_key, builder in existing_builders.items():
            builder_as_dict: dict = builder.to_dict()
            builder_as_dict["class_name"] = builder.__class__.__name__
            builder_as_dict["module_name"] = builder.__class__.__module__

            # Roundtrip through schema validation to add/or restore any missing fields.
            builder_config: ExpectationConfigurationBuilderConfig = (
                expectationConfigurationBuilderConfigSchema.load(builder_as_dict)
            )
            reconciled_configs[builder_key] = builder_config.to_dict()

        # Overrides are indexed by the expectation type they configure.
        overrides_by_expectation_type: Dict[str, dict] = {
            override_config["expectation_type"]: override_config
            for override_config in expectation_configuration_builder_configs
        }
        reconciled_configs = nested_update(
            reconciled_configs,
            overrides_by_expectation_type,
            dedup=True,
        )

        return list(reconciled_configs.values()) if reconciled_configs else []