Example #1
def test_profiler_parameter_builder_added(data_context_with_taxi_data):
    """
    What does this test and why?

    This test now adds a simple ParameterBuilder to our Rule. More specifically,
    we use a MetricMultiBatchParameterBuilder to compute the column.min metric and
    pass it in as the value argument to expect_column_values_to_be_greater_than.
    """
    context: DataContext = data_context_with_taxi_data
    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    # parameter_builder
    numeric_range_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        ))
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        ))
    simple_rule: Rule = Rule(
        name="rule_with_variables_and_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[numeric_range_parameter_builder],
        expectation_configuration_builders=[config_builder],
    )
    my_rbp = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    my_rbp.add_rule(rule=simple_rule)
    result: RuleBasedProfilerResult = my_rbp.run(batch_request=batch_request)
    expectation_configurations: List[ExpectationConfiguration] = (
        result.expectation_configurations
    )
    assert len(expectation_configurations) == 4
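
For orientation, a minimal follow-up sketch (not part of the test) that inspects the generated configurations; it assumes `result` was produced exactly as above and shows how the "$parameter.my_column_min.value[-1]" reference resolves into the concrete `value` kwarg on each "_amount" column.

# Hypothetical inspection snippet; `result` comes from the my_rbp.run() call above.
for expectation_configuration in result.expectation_configurations:
    # One configuration per "_amount" column found by the ColumnDomainBuilder; the
    # "value" kwarg holds the resolved column.min metric from the ParameterBuilder.
    print(
        expectation_configuration.expectation_type,
        expectation_configuration.kwargs["column"],
        expectation_configuration.kwargs["value"],
    )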
Example #2
def test_profile_excludes_citations(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    # Load data context
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Load profiler config
    yaml_config: str = alice_columnar_table_single_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=alice_columnar_table_single_batch[
            "expected_expectation_suite_name"
        ],
        include_citation=False,
    )

    assert expectation_suite.meta.get("citations") is None
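
Since the load, schema round-trip, pop `class_name`/`module_name`, then `**kwargs` pattern above recurs in several of these tests, here is a small helper sketch that factors it out; the import paths are assumptions inferred from the names used in these tests, not verified against the library.

from typing import Any, Dict

# Import paths below are assumed from the identifiers used in these tests.
from great_expectations.rule_based_profiler import RuleBasedProfiler
from great_expectations.rule_based_profiler.config import ruleBasedProfilerConfigSchema


def profiler_from_config_dict(profiler_config: Dict[str, Any], data_context) -> RuleBasedProfiler:
    """Round-trip a profiler config dict through schema validation and instantiate it."""
    # Roundtrip removes illegal fields and/or restores missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # class_name/module_name are consumed by instantiate_class_from_config, so drop
    # them before the **kwargs instantiation pattern.
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    return RuleBasedProfiler(**serialized_config, data_context=data_context)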
Example #3
    def __init__(
        self,
        name: str,
        validator: Validator,
    ) -> None:
        """
        DataAssistant subclasses guide "RuleBasedProfiler" to contain Rule configurations that embody profiling behaviors
        corresponding to intended exploration and validation goals.  Executing "RuleBasedProfiler.run()" then yields a
        "RuleBasedProfilerResult" object, containing "fully_qualified_parameter_names_by_domain",
        "parameter_values_for_fully_qualified_parameter_names_by_domain", "expectation_configurations", and "citation",
        immediately available for composing an "ExpectationSuite" and validating the underlying data "Batch" objects.

        Args:
            name: the name of this DataAssistant object
            validator: Validator object, containing loaded Batch objects as well as Expectation and Metric operations
        """
        self._name = name
        self._validator = validator

        self._profiler = RuleBasedProfiler(
            name=self.name,
            config_version=1.0,
            variables=None,
            data_context=self._validator.data_context,
        )
        self._build_profiler()
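
A short sketch of how the result fields named in the docstring above might be consumed after running the underlying profiler; the attribute names follow the docstring, and the surrounding setup (a configured `profiler` and `batch_request`) is assumed.

# Hypothetical usage sketch; `profiler` and `batch_request` are assumed to exist.
result = profiler.run(batch_request=batch_request)

# Per-Domain parameter names and their resolved values, per the docstring above.
for domain, parameter_names in result.fully_qualified_parameter_names_by_domain.items():
    print(domain, parameter_names)

# The expectation configurations are ready to be composed into an ExpectationSuite.
suite = result.get_expectation_suite(expectation_suite_name="my_suite")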
Example #4
def test_profile_includes_citations(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    # Load data context
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Load profiler config
    yaml_config: str = alice_columnar_table_single_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config = yaml.load(yaml_config)
    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    profiler_config.pop("class_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **profiler_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=alice_columnar_table_single_batch[
            "expected_expectation_suite_name"
        ],
        include_citation=True,
    )

    assert len(expectation_suite.meta["citations"]) > 0
Example #5
def test_save_profiler(
    mock_data_context: mock.MagicMock,
    populated_profiler_store: ProfilerStore,
    profiler_config_with_placeholder_args: RuleBasedProfilerConfig,
):
    with mock.patch(
            "great_expectations.data_context.store.profiler_store.ProfilerStore.set",
            return_value=profiler_config_with_placeholder_args,
    ):
        mock_data_context.save_profiler(
            profiler=profiler_config_with_placeholder_args,
            profiler_store=populated_profiler_store,
            name="my_profiler",
            ge_cloud_id=None,
        )

    with mock.patch(
            "great_expectations.data_context.store.profiler_store.ProfilerStore.get",
            return_value=profiler_config_with_placeholder_args,
    ):
        profiler = RuleBasedProfiler.get_profiler(
            data_context=mock_data_context,
            profiler_store=populated_profiler_store,
            name="my_profiler",
            ge_cloud_id=None,
        )
    assert isinstance(profiler, RuleBasedProfiler)
Example #6
def test_bobster_profiler_user_workflow_multi_batch_row_count_range_rule_bootstrap_sampling_method(
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context,
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000,
):
    # Load data context
    data_context: DataContext = (
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context
    )

    # Load profiler config
    yaml_config: str = bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
        "profiler_config"
    ]

    # Instantiate Profiler
    profiler_config: CommentedMap = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    test_configuration: dict = bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
        "test_configuration_bootstrap_sampling_method"
    ]

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=test_configuration["expectation_suite_name"],
    )
    expect_table_row_count_to_be_between_expectation_configuration_kwargs: dict = (
        expectation_suite.to_json_dict()["expectations"][0]["kwargs"]
    )
    min_value: int = expect_table_row_count_to_be_between_expectation_configuration_kwargs[
        "min_value"
    ]
    max_value: int = expect_table_row_count_to_be_between_expectation_configuration_kwargs[
        "max_value"
    ]

    assert (
        test_configuration["expect_table_row_count_to_be_between_min_value_mean_value"]
        < min_value
        < test_configuration["expect_table_row_count_to_be_between_mean_value"]
    )
    assert (
        test_configuration["expect_table_row_count_to_be_between_mean_value"]
        < max_value
        < test_configuration["expect_table_row_count_to_be_between_max_value_mean_value"]
    )
Example #7
def test_get_profiler_run_usage_statistics_without_handler():
    # Without a DataContext, the usage stats handler is not propagated down to the RBP
    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_profiler",
        config_version=1.0,
    )
    payload: dict = get_profiler_run_usage_statistics(profiler=profiler)
    assert payload == {}
Example #8
    def __init__(
        self,
        name: str,
        validator: Optional[Validator],
    ) -> None:
        """
        DataAssistant subclasses guide "RuleBasedProfiler" to contain Rule configurations that embody profiling behaviors
        corresponding to intended exploration and validation goals.  Executing "RuleBasedProfiler.run()" then yields a
        "RuleBasedProfilerResult" object, containing "fully_qualified_parameter_names_by_domain",
        "parameter_values_for_fully_qualified_parameter_names_by_domain", "expectation_configurations", and "citation",
        immediately available for composing an "ExpectationSuite" and validating the underlying data "Batch" objects.

        Args:
            name: the name of this DataAssistant object
            validator: Validator object, containing loaded Batch objects as well as Expectation and Metric operations
        """
        self._name = name

        self._validator = validator

        if validator is None:
            self._data_context = None
            self._batches = None
        else:
            self._data_context = self._validator.data_context
            self._batches = self._validator.batches

        variables: Optional[Dict[str, Any]] = self.get_variables() or {}
        self._profiler = RuleBasedProfiler(
            name=self.name,
            config_version=1.0,
            variables=variables,
            data_context=self._data_context,
        )

        self._metrics_parameter_builders_by_domain = {}

        rules: Optional[List[Rule]] = self.get_rules() or []

        rule: Rule
        for rule in rules:
            self.profiler.add_rule(rule=rule)
            domain: Domain = Domain(
                domain_type=rule.domain_builder.domain_type,
                rule_name=rule.name,
            )
            self._metrics_parameter_builders_by_domain[domain] = rule.parameter_builders
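
To make the constructor's flow concrete, a minimal hypothetical subclass sketch, assuming get_variables() and get_rules() are the extension points this __init__ calls (a real DataAssistant subclass would also implement its remaining abstract hooks); the builder classes and the Rule signature mirror the in-memory examples elsewhere in this section, and the Rule import path is assumed.

from great_expectations.rule_based_profiler.domain_builder import ColumnDomainBuilder
from great_expectations.rule_based_profiler.expectation_configuration_builder import (
    DefaultExpectationConfigurationBuilder,
)
from great_expectations.rule_based_profiler.rule import Rule  # import path assumed


class AmountColumnsNotNullAssistant(DataAssistant):  # hypothetical subclass
    """Emit one rule: every "_amount" column must be non-null."""

    def get_variables(self):
        # No $variables needed for this sketch.
        return {}

    def get_rules(self):
        domain_builder = ColumnDomainBuilder(
            include_column_name_suffixes=["_amount"],
            data_context=self._data_context,
        )
        config_builder = DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_not_be_null",
            column="$domain.domain_kwargs.column",
        )
        return [
            Rule(
                name="amount_columns_not_null_rule",
                variables=None,
                domain_builder=domain_builder,
                expectation_configuration_builders=[config_builder],
            )
        ]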
Example #9
def test_profile_get_expectation_suite(
    mock_emit,
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    # Load data context
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Load profiler config
    yaml_config: str = alice_columnar_table_single_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config = yaml.load(yaml_config)
    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    profiler_config.pop("class_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **profiler_config,
        data_context=data_context,
    )

    # BatchRequest yielding exactly one batch
    alice_single_batch_data_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    result: RuleBasedProfilerResult = profiler.run(
        batch_request=alice_single_batch_data_batch_request)

    expectation_suite_name: str = "my_suite"

    suite: ExpectationSuite = result.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)

    assert suite is not None and len(suite.expectations) > 0

    assert mock_emit.call_count == 44

    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call] = mock_emit.call_args_list
    assert (
        actual_events[-1][0][0]["event"]
        == UsageStatsEvents.RULE_BASED_PROFILER_RESULT_GET_EXPECTATION_SUITE.value
    )
Example #10
def test_add_rule_and_run_profiler(data_context_with_taxi_data):
    """
    What does this test and why?

    This is the first test where we build a Rule in memory, use the add_rule() method
    to add it to our RuleBasedProfiler, and run the profiler. We use the DomainBuilder from
    the previous test (against "_amount" columns) and an ExpectationConfigurationBuilder
    that uses expect_column_values_to_not_be_null because it only needs a domain value.

    The test eventually asserts that the profiler returns 4 Expectations, one per column in
    our domain.
    """
    context: DataContext = data_context_with_taxi_data
    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    default_expectation_configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_not_be_null",
        column="$domain.domain_kwargs.column",
    )
    simple_rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=domain_builder,
        expectation_configuration_builders=[
            default_expectation_configuration_builder
        ],
    )
    my_rbp: RuleBasedProfiler = RuleBasedProfiler(
        name="my_simple_rbp",
        config_version=1.0,
        data_context=context,
    )
    my_rbp.add_rule(rule=simple_rule)
    result: RuleBasedProfilerResult = my_rbp.run(batch_request=batch_request)
    expectation_configurations: List[ExpectationConfiguration] = (
        result.expectation_configurations
    )
    assert len(expectation_configurations) == 4
Example #11
def test_profile_includes_citations(
    mock_emit,
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    # Load data context
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Load profiler config
    yaml_config: str = alice_columnar_table_single_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config = yaml.load(yaml_config)
    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    profiler_config.pop("class_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **profiler_config,
        data_context=data_context,
    )

    # BatchRequest yielding exactly one batch
    alice_single_batch_data_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    result: RuleBasedProfilerResult = profiler.run(
        batch_request=alice_single_batch_data_batch_request)

    assert result.citation is not None and len(result.citation.keys()) > 0

    assert mock_emit.call_count == 43
    assert all(
        payload[0][0]["event"] == "data_context.get_batch_list"
        for payload in mock_emit.call_args_list[:-1]
    )

    # noinspection PyUnresolvedReferences
    actual_events: List[unittest.mock._Call] = mock_emit.call_args_list
    assert (
        actual_events[-1][0][0]["event"] == UsageStatsEvents.RULE_BASED_PROFILER_RUN.value
    )
Example #12
def test_get_profiler_run_usage_statistics_with_handler_invalid_payload(
    mock_data_context: mock.MagicMock,
):
    # Ensure that the real handler gets passed down by the context
    handler: UsageStatisticsHandler = UsageStatisticsHandler(
        mock_data_context, "my_id", "my_url"
    )
    mock_data_context.usage_statistics_handler = handler

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_profiler", config_version=1.0, data_context=mock_data_context
    )

    payload: dict = get_profiler_run_usage_statistics(profiler=profiler)

    # Payload won't pass schema validation due to a lack of rules, but we can confirm that it is anonymized
    assert payload == {
        "anonymized_name": "a0061ec021855cd2b3a994dd8d90fe5d",
        "config_version": 1.0,
        "rule_count": 0,
        "variable_count": 0,
    }
Example #13
def test_bobby_profiler_user_workflow_multi_batch_row_count_range_rule_and_column_ranges_rule_oneshot_sampling_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
    bobby_columnar_table_multi_batch,
):
    # Load data context
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # Load profiler config
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=bobby_columnar_table_multi_batch[
            "test_configuration_oneshot_sampling_method"
        ]["expectation_suite_name"],
        include_citation=True,
    )

    assert (
        expectation_suite
        == bobby_columnar_table_multi_batch["test_configuration_oneshot_sampling_method"][
            "expected_expectation_suite"
        ]
    )
Example #14
def test_get_profiler_run_usage_statistics_with_handler_valid_payload(
    mock_data_context: mock.MagicMock,
):
    # Ensure that the real handler gets passed down by the context
    handler: UsageStatisticsHandler = UsageStatisticsHandler(
        mock_data_context, "my_id", "my_url"
    )
    mock_data_context.usage_statistics_handler = handler

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_profiler", config_version=1.0, data_context=mock_data_context
    )

    override_rules: Dict[str, dict] = {
        "my_override_rule": {
            "domain_builder": {
                "class_name": "ColumnDomainBuilder",
                "module_name": "great_expectations.rule_based_profiler.domain_builder",
            },
            "parameter_builders": [
                {
                    "class_name": "MetricMultiBatchParameterBuilder",
                    "module_name": "great_expectations.rule_based_profiler.parameter_builder",
                    "name": "my_parameter",
                    "metric_name": "my_metric",
                },
                {
                    "class_name": "NumericMetricRangeMultiBatchParameterBuilder",
                    "module_name": "great_expectations.rule_based_profiler.parameter_builder",
                    "name": "my_other_parameter",
                    "metric_name": "my_other_metric",
                },
            ],
            "expectation_configuration_builders": [
                {
                    "class_name": "DefaultExpectationConfigurationBuilder",
                    "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                    "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                    "column_A": "$domain.domain_kwargs.column_A",
                    "column_B": "$domain.domain_kwargs.column_B",
                    "my_one_arg": "$parameter.my_parameter.value[0]",
                    "meta": {
                        "details": {
                            "my_parameter_estimator": "$parameter.my_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
                {
                    "class_name": "DefaultExpectationConfigurationBuilder",
                    "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                    "expectation_type": "expect_column_min_to_be_between",
                    "column": "$domain.domain_kwargs.column",
                    "my_another_arg": "$parameter.my_other_parameter.value[0]",
                    "meta": {
                        "details": {
                            "my_other_parameter_estimator": "$parameter.my_other_parameter.details",
                            "note": "Important remarks about estimation algorithm.",
                        },
                    },
                },
            ],
        },
    }

    payload: dict = get_profiler_run_usage_statistics(
        profiler=profiler, rules=override_rules
    )

    assert payload == {
        "anonymized_name": "a0061ec021855cd2b3a994dd8d90fe5d",
        "anonymized_rules": [
            {
                "anonymized_domain_builder": {"parent_class": "ColumnDomainBuilder"},
                "anonymized_expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_pair_values_A_to_be_greater_than_B",
                        "parent_class": "DefaultExpectationConfigurationBuilder",
                    },
                    {
                        "expectation_type": "expect_column_min_to_be_between",
                        "parent_class": "DefaultExpectationConfigurationBuilder",
                    },
                ],
                "anonymized_name": "bd8a8b4465a94b363caf2b307c080547",
                "anonymized_parameter_builders": [
                    {
                        "anonymized_name": "25dac9e56a1969727bc0f90db6eaa833",
                        "parent_class": "MetricMultiBatchParameterBuilder",
                    },
                    {
                        "anonymized_name": "be5baa3f1064e6e19356f2168968cbeb",
                        "parent_class": "NumericMetricRangeMultiBatchParameterBuilder",
                    },
                ],
            }
        ],
        "config_version": 1.0,
        "rule_count": 1,
        "variable_count": 0,
    }
Example #15
        column: $domain.domain_kwargs.column
        min_value: $parameter.max_range.value[0]
        max_value: $parameter.max_range.value[1]
        mostly: $variables.mostly
        meta:
          profiler_details: $parameter.max_range.details
"""

data_context = DataContext()

# Instantiate RuleBasedProfiler
full_profiler_config_dict: dict = yaml.load(profiler_config)
rule_based_profiler: RuleBasedProfiler = RuleBasedProfiler(
    name=full_profiler_config_dict["name"],
    config_version=full_profiler_config_dict["config_version"],
    rules=full_profiler_config_dict["rules"],
    variables=full_profiler_config_dict["variables"],
    data_context=data_context,
)

batch_request: dict = {
    "datasource_name": "taxi_pandas",
    "data_connector_name": "monthly",
    "data_asset_name": "my_reports",
    "data_connector_query": {
        "index": "-6:-1",
    },
}

result: RuleBasedProfilerResult = rule_based_profiler.run(
    batch_request=batch_request)
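
A natural continuation of this workflow, sketched under the assumption that the result object behaves as in the other examples here: materialize an ExpectationSuite from the result and persist it through the DataContext (the suite name is hypothetical, and save_expectation_suite is assumed to be available on the context).

# Continuation sketch; builds on `result` and `data_context` from the snippet above.
expectation_suite = result.get_expectation_suite(
    expectation_suite_name="taxi_reports_suite"  # hypothetical suite name
)
data_context.save_expectation_suite(expectation_suite=expectation_suite)  # assumed DataContext method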
Example #16
def test_profiler_save_and_load(data_context_with_taxi_data):
    """
    What does this test and why?

    This tests whether context.save_profiler() can be invoked to update a profiler that lives in the Profiler Store.
    It ensures that any changes we make to the Profiler, such as adding a rule, are persisted.

    It also verifies that context.save_profiler() and context.get_profiler() return the expected RuleBasedProfiler.
    """
    context: DataContext = data_context_with_taxi_data
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    # parameter_builder
    numeric_range_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        ))
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        ))
    simple_variables_rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[numeric_range_parameter_builder],
        expectation_configuration_builders=[config_builder],
    )
    my_rbp = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    res: dict = my_rbp.config.to_json_dict()
    assert res == {
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "rules": None,
        "variables": {},
    }
    my_rbp.add_rule(rule=simple_variables_rule)
    context.save_profiler(name="my_rbp", profiler=my_rbp)

    # load profiler from store
    my_loaded_profiler: RuleBasedProfiler = context.get_profiler(name="my_rbp")

    res = my_loaded_profiler.config.to_json_dict()
    assert res == {
        "module_name": "great_expectations.rule_based_profiler",
        "class_name": "RuleBasedProfiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "variables": {},
        "rules": {
            "rule_with_no_variables_no_parameters": {
                "domain_builder": {
                    "module_name":
                    "great_expectations.rule_based_profiler.domain_builder.column_domain_builder",
                    "class_name": "ColumnDomainBuilder",
                    "include_column_name_suffixes": [
                        "_amount",
                    ],
                },
                "variables": {},
                "parameter_builders": [
                    {
                        "module_name":
                        "great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder",
                        "class_name": "MetricMultiBatchParameterBuilder",
                        "name": "my_column_min",
                        "metric_name": "column.min",
                        "metric_domain_kwargs": "$domain.domain_kwargs",
                        "enforce_numeric_metric": False,
                        "replace_nan_with_zero": False,
                        "reduce_scalar_metric": True,
                        "evaluation_parameter_builder_configs": None,
                    },
                ],
                "expectation_configuration_builders": [
                    {
                        "module_name":
                        "great_expectations.rule_based_profiler.expectation_configuration_builder.default_expectation_configuration_builder",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "expectation_type":
                        "expect_column_values_to_be_greater_than",
                        "meta": {},
                        "column": "$domain.domain_kwargs.column",
                        "validation_parameter_builder_configs": None,
                        "value": "$parameter.my_column_min.value[-1]",
                    },
                ],
            },
        },
    }
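
The serialized config asserted above can, in principle, be fed back into the constructor via the `rules=` keyword shown in the user-workflow example; a sketch under the assumption that the constructor accepts the serialized rule dictionaries unchanged (using `res` and `context` from this test).

# Rebuild an equivalent profiler from the serialized config captured above in `res`;
# class_name/module_name must again be dropped before constructor use.
res.pop("class_name")
res.pop("module_name")
rebuilt_rbp: RuleBasedProfiler = RuleBasedProfiler(
    name=res["name"],
    config_version=res["config_version"],
    rules=res["rules"],
    variables=res["variables"],
    data_context=context,
)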
Example #17
def test_quentin_profiler_user_workflow_multi_batch_quantiles_value_ranges_rule(
    quentin_columnar_table_multi_batch_data_context,
    quentin_columnar_table_multi_batch,
):
    # Load data context
    data_context: DataContext = quentin_columnar_table_multi_batch_data_context

    # Load profiler config
    yaml_config: str = quentin_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: CommentedMap = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields and/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=quentin_columnar_table_multi_batch["test_configuration"][
            "expectation_suite_name"
        ],
    )

    expectation_configuration_dict: dict
    column_name: str
    expectation_kwargs: dict
    expect_column_quantile_values_to_be_between_expectation_configurations_kwargs_dict: Dict[
        str, dict
    ] = {
        expectation_configuration_dict["kwargs"]["column"]: expectation_configuration_dict["kwargs"]
        for expectation_configuration_dict in expectation_suite.to_json_dict()["expectations"]
    }
    expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column: Dict[
        str, List[List[Number]]
    ] = {
        column_name: expectation_kwargs["value_ranges"]
        for column_name, expectation_kwargs in expect_column_quantile_values_to_be_between_expectation_configurations_kwargs_dict.items()
    }

    assert (
        expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column[
            "tolls_amount"
        ]
        == quentin_columnar_table_multi_batch["test_configuration"][
            "expect_column_quantile_values_to_be_between_quantile_ranges_by_column"
        ]["tolls_amount"]
    )

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-7
    atol: float = 5.0e-2

    value_range: List[Number]
    paired_quantiles: zip
    column_quantiles: List[List[Number]]
    idx: int
    for (
        column_name,
        column_quantiles,
    ) in expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column.items():
        paired_quantiles = zip(
            column_quantiles,
            quentin_columnar_table_multi_batch["test_configuration"][
                "expect_column_quantile_values_to_be_between_quantile_ranges_by_column"
            ][column_name],
        )
        for value_range in list(paired_quantiles):
            for idx in range(2):
                np.testing.assert_allclose(
                    actual=value_range[0][idx],
                    desired=value_range[1][idx],
                    rtol=rtol,
                    atol=atol,
                    err_msg=f"Actual value of {value_range[0][idx]} differs from expected value of "
                    f"{value_range[1][idx]} by more than {atol + rtol * abs(value_range[1][idx])} tolerance.",
                )
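
The tolerance comment above reduces to a simple bound of atol + rtol * |desired|; as a quick worked check with an illustrative (not test-derived) quantile value:

# Illustrative arithmetic only; 12.5 is a made-up "desired" quantile value.
rtol = 1.0e-7
atol = 5.0e-2
desired = 12.5
tolerance = atol + rtol * abs(desired)  # 5.0e-2 + 1.25e-6 = 0.05000125
assert abs(12.46 - desired) <= tolerance  # a 0.04 difference is within the allowed band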