    def __init__(self, expectation_suite_identifier, run_id, batch_identifier):
        """Constructs a ValidationResultIdentifier

        Args:
            expectation_suite_identifier (ExpectationSuiteIdentifier, list, tuple, or dict):
                identifying information for the fully qualified expectation suite used to validate
            run_id (RunIdentifier): The run_id for which validation occurred
            batch_identifier (str): The identifier of the batch that was validated
        """
        super().__init__()
        self._expectation_suite_identifier = expectation_suite_identifier
        if isinstance(run_id, str):
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional).",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                run_time = None
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif run_id is None:
            run_id = RunIdentifier()
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=str(run_id))

        self._run_id = run_id
        self._batch_identifier = batch_identifier
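# --- Illustrative usage sketch (not from the original source; assumes the usual
# great_expectations imports used throughout these examples). It shows the run_id
# shapes the constructor above accepts: a RunIdentifier, a dict, or None (a plain
# string still works but triggers the DeprecationWarning).
suite_id = ExpectationSuiteIdentifier(expectation_suite_name="my_suite")

vri = ValidationResultIdentifier(
    expectation_suite_identifier=suite_id,
    run_id=RunIdentifier(run_name="nightly"),  # preferred form
    batch_identifier="batch_001",
)

vri_from_dict = ValidationResultIdentifier(
    expectation_suite_identifier=suite_id,
    run_id={"run_name": "nightly"},  # expanded into RunIdentifier(**run_id)
    batch_identifier="batch_001",
)

vri_default_run = ValidationResultIdentifier(
    expectation_suite_identifier=suite_id,
    run_id=None,  # falls back to a bare RunIdentifier()
    batch_identifier="batch_001",
)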
def test_StoreAction():
    fake_in_memory_store = ValidationsStore(
        store_backend={
            "class_name": "InMemoryStoreBackend",
        }
    )
    stores = {"fake_in_memory_store": fake_in_memory_store}

    class Object:
        ge_cloud_mode = False

    data_context = Object()
    data_context.stores = stores

    action = StoreValidationResultAction(
        data_context=data_context,
        target_store_name="fake_in_memory_store",
    )
    assert fake_in_memory_store.list_keys() == []

    action.run(
        validation_result_suite_identifier=ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name="default_expectations"
            ),
            run_id=RunIdentifier(run_name="prod_20190801"),
            batch_identifier="1234",
        ),
        validation_result_suite=ExpectationSuiteValidationResult(
            success=False, results=[]
        ),
        data_asset=None,
    )

    expected_run_id = RunIdentifier(
        run_name="prod_20190801", run_time="20190926T134241.000000Z"
    )

    assert len(fake_in_memory_store.list_keys()) == 1
    stored_identifier = fake_in_memory_store.list_keys()[0]
    assert stored_identifier.batch_identifier == "1234"
    assert (
        stored_identifier.expectation_suite_identifier.expectation_suite_name
        == "default_expectations"
    )
    assert stored_identifier.run_id == expected_run_id

    assert fake_in_memory_store.get(
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name="default_expectations"
            ),
            run_id=expected_run_id,
            batch_identifier="1234",
        )
    ) == ExpectationSuiteValidationResult(success=False, results=[])
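# --- Hedged standalone sketch (illustrative, not part of the original test): the same
# in-memory ValidationsStore can be exercised directly with set/get on a
# ValidationResultIdentifier, without going through the action.
store = ValidationsStore(store_backend={"class_name": "InMemoryStoreBackend"})
key = ValidationResultIdentifier(
    expectation_suite_identifier=ExpectationSuiteIdentifier(
        expectation_suite_name="default_expectations"
    ),
    run_id=RunIdentifier(run_name="adhoc"),
    batch_identifier="1234",
)
store.set(key, ExpectationSuiteValidationResult(success=True, results=[]))
assert store.get(key) == ExpectationSuiteValidationResult(success=True, results=[])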
Example #3
    @classmethod
    def profile(
        cls,
        data_asset,
        run_id=None,
        profiler_configuration=None,
        run_name=None,
        run_time=None,
    ):
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            # deprecated-v0.11.0
            warnings.warn(
                "String run_ids are deprecated as of v0.11.0 and support will be removed in v0.16. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide"
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_name = run_name or "profiling"
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        if not cls.validate(data_asset):
            raise GreatExpectationsError(
                "Invalid data_asset for profiler; aborting")

        expectation_suite = cls._profile(data_asset,
                                         configuration=profiler_configuration)

        batch_kwargs = data_asset.batch_kwargs
        expectation_suite = cls.add_meta(expectation_suite, batch_kwargs)
        validation_results = data_asset.validate(expectation_suite,
                                                 run_id=run_id,
                                                 result_format="SUMMARY")
        expectation_suite.add_citation(
            comment=
            f"{str(cls.__name__)} added a citation based on the current batch.",
            batch_kwargs=data_asset.batch_kwargs,
            batch_markers=data_asset.batch_markers,
            batch_parameters=data_asset.batch_parameters,
        )
        return expectation_suite, validation_results
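# --- Hedged usage sketch (commented out; BasicDatasetProfiler and my_batch are
# illustrative stand-ins for whichever profiler subclass and loaded data asset you
# use). Given the signature above, pass run_name/run_time rather than a string run_id:
#
#     suite, validation_results = BasicDatasetProfiler.profile(
#         my_batch,
#         run_name="profiling",
#         run_time=datetime.datetime.now(datetime.timezone.utc),
#     )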
Example #4
def test_warning_and_failure_validation_operator(
        validation_operators_data_context):
    data_context = validation_operators_data_context
    validator_batch_kwargs = data_context.build_batch_kwargs(
        "my_datasource", "subdir_reader", "f1")

    batch = data_context.get_batch(expectation_suite_name="f1.warning",
                                   batch_kwargs=validator_batch_kwargs)

    # NOTE: 20200130 - JPC - currently the warning and failure validation operator ignores the batch-provided suite and
    # fetches its own

    assert data_context.validations_store.list_keys() == []

    # We want to demonstrate running the validation operator with both a pre-built batch (DataAsset) and with
    # a tuple of parameters for get_batch
    results = data_context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=RunIdentifier(run_name="test-100"),
        validation_operator_name="errors_and_warnings_validation_operator",
        base_expectation_suite_name="f1",
    )

    validations_keys = data_context.validations_store.list_keys()
    assert (
        len(validations_keys) == 2
    )  # we should have run two suites even though there was only one batch
    suite_names = [
        key.expectation_suite_identifier.expectation_suite_name
        for key in validations_keys
    ]
    assert "f1.warning" in suite_names
    assert "f1.failure" in suite_names
def test_TupleGCSStoreBackend_base_public_path():
    """
    What does this test and why?

    The base_public_path parameter allows users to point to a custom DNS when hosting Data Docs.

    This test exercises URL generation to verify that we get the expected URL,
    with or without base_public_path.
    """
    bucket = "leakybucket"
    prefix = "this_is_a_test_prefix"
    project = "dummy-project"
    base_public_path = "http://www.test.com/"

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store_with_base_public_path = TupleGCSStoreBackend(
            filepath_template=None,
            bucket=bucket,
            prefix=prefix,
            project=project,
            base_public_path=base_public_path,
        )

        my_store_with_base_public_path.set(("BBB", ),
                                           b"bbb",
                                           content_encoding=None,
                                           content_type="image/png")

    run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
    key = ValidationResultIdentifier(
        ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
        run_id,
        "my_batch_id",
    )
    run_time_string = run_id.to_tuple()[1]

    url = my_store_with_base_public_path.get_public_url_for_key(key.to_tuple())
    assert (
        url == "http://www.test.com/leakybucket" +
        f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id"
    )
Example #6
def test_database_evaluation_parameter_store_get_bind_params(param_store):
    # Bind params must be expressed as a string-keyed dictionary.
    # Verify that the param_store supports that
    run_id = RunIdentifier(run_name=datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ"))
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name=
        "expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )

    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name=
        "expect_table_row_count_to_be_between.result.observed_value",
        metric_kwargs_id=None,
    )

    metric_value = 512
    param_store.set(metric_identifier, metric_value)

    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset2.warning",
        metric_name=
        "expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )

    metric_value = 12.3456789
    param_store.set(metric_identifier, metric_value)

    params = param_store.get_bind_params(run_id)
    assert params == {
        "urn:great_expectations:validations:asset.warning:"
        "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol":
        12.3456789,
        "urn:great_expectations:validations:asset.warning:"
        "expect_table_row_count_to_be_between.result.observed_value":
        512,
        "urn:great_expectations:validations:asset2.warning:"
        "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol":
        12.3456789,
    }
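# --- Hedged note (derived from the expected dictionary above, not from library
# documentation): the bind-param keys follow the layout
#   urn:great_expectations:validations:<suite_name>:<metric_name>[:<metric_kwargs_id>]
suite_name = "asset.warning"
metric_name = "expect_column_values_to_match_regex.result.unexpected_percent"
metric_kwargs_id = "column=mycol"
urn = f"urn:great_expectations:validations:{suite_name}:{metric_name}:{metric_kwargs_id}"
assert urn == (
    "urn:great_expectations:validations:asset.warning:"
    "expect_column_values_to_match_regex.result.unexpected_percent:column=mycol"
)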
def ge_validation_result_suite_id() -> ValidationResultIdentifier:
    validation_result_suite_id = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier(
            "asset.default"),
        run_id=RunIdentifier(
            run_name="test_100",
            run_time=datetime.fromtimestamp(1640701702, tz=timezone.utc),
        ),
        batch_identifier="010ef8c1cd417910b971f4468f024ec5",
    )

    return validation_result_suite_id
Example #8
def test_resource_key_passes_run_name_filter():
    resource_key = ValidationResultIdentifier(
        expectation_suite_identifier=ExpectationSuiteIdentifier("test_suite"),
        run_id=RunIdentifier(run_name="foofooprofilingfoo"),
        batch_identifier="f14c3d2f6e8028c2db0c25edabdb0d61",
    )

    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"equals": "profiling"}) is False)
    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"equals": "foofooprofilingfoo"}) is
            True)

    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"not_equals": "profiling"}) is True)
    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"not_equals": "foofooprofilingfoo"}) is
            False)

    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"includes": "profiling"}) is True)
    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"includes": "foobar"}) is False)

    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"not_includes": "foobar"}) is True)
    assert (resource_key_passes_run_name_filter(
        resource_key, run_name_filter={"not_includes": "profiling"}) is False)

    assert (resource_key_passes_run_name_filter(
        resource_key,
        run_name_filter={"matches_regex": "(foo){2}profiling("
                         "foo)+"},
    ) is True)
    assert (resource_key_passes_run_name_filter(
        resource_key,
        run_name_filter={"matches_regex": "(foo){3}profiling("
                         "foo)+"},
    ) is False)
    with pytest.warns(DeprecationWarning):
        assert (resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"eq": "profiling"}) is False)
        assert (resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"eq": "foofooprofilingfoo"}) is
                True)
    with pytest.warns(DeprecationWarning):
        assert (resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"ne": "profiling"}) is True)
        assert (resource_key_passes_run_name_filter(
            resource_key, run_name_filter={"ne": "foofooprofilingfoo"}) is
                False)
Example #9
    def __init__(
        self,
        run_id,
        data_asset_name,
        expectation_suite_identifier,
        metric_name,
        metric_kwargs_id,
    ) -> None:
        super().__init__(metric_name, metric_kwargs_id)
        if not isinstance(expectation_suite_identifier, ExpectationSuiteIdentifier):
            expectation_suite_identifier = ExpectationSuiteIdentifier(
                expectation_suite_name=expectation_suite_identifier
            )

        if isinstance(run_id, str):
            # deprecated-v0.11.0
            warnings.warn(
                "String run_ids are deprecated as of v0.11.0 and support will be removed in v0.16. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional).",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                run_time = None
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif run_id is None:
            run_id = RunIdentifier()
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=str(run_id))

        self._run_id = run_id
        self._data_asset_name = data_asset_name
        self._expectation_suite_identifier = expectation_suite_identifier
Example #10
def test_database_evaluation_parameter_store_basics(param_store):
    run_id = RunIdentifier(run_name=datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ"))
    metric_identifier = ValidationMetricIdentifier(
        run_id=run_id,
        data_asset_name=None,
        expectation_suite_identifier="asset.warning",
        metric_name=
        "expect_column_values_to_match_regex.result.unexpected_percent",
        metric_kwargs_id="column=mycol",
    )
    metric_value = 12.3456789

    param_store.set(metric_identifier, metric_value)
    value = param_store.get(metric_identifier)
    assert value == metric_value
Example #11
    @classmethod
    def from_tuple(cls, tuple_):
        if len(tuple_) < 6:
            raise GreatExpectationsError(
                "ValidationMetricIdentifier tuple must have at least six components."
            )
        if tuple_[2] == "__":
            tuple_data_asset_name = None
        else:
            tuple_data_asset_name = tuple_[2]
        metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
        return cls(
            run_id=RunIdentifier.from_tuple((tuple_[0], tuple_[1])),
            data_asset_name=tuple_data_asset_name,
            expectation_suite_identifier=ExpectationSuiteIdentifier.from_tuple(
                tuple_[3:-2]
            ),
            metric_name=metric_id.metric_name,
            metric_kwargs_id=metric_id.metric_kwargs_id,
        )
Example #12
    @classmethod
    def from_fixed_length_tuple(cls, tuple_):
        if len(tuple_) != 6:
            raise GreatExpectationsError(
                "ValidationMetricIdentifier fixed length tuple must have exactly six "
                "components."
            )
        if tuple_[2] == "__":
            tuple_data_asset_name = None
        else:
            tuple_data_asset_name = tuple_[2]
        metric_id = MetricIdentifier.from_tuple(tuple_[-2:])
        return cls(
            run_id=RunIdentifier.from_fixed_length_tuple((tuple_[0], tuple_[1])),
            data_asset_name=tuple_data_asset_name,
            expectation_suite_identifier=ExpectationSuiteIdentifier.from_fixed_length_tuple(
                tuple((tuple_[3],))
            ),
            metric_name=metric_id.metric_name,
            metric_kwargs_id=metric_id.metric_kwargs_id,
        )
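# --- Hedged sketch inferred from the parsing code above (values are illustrative):
# the fixed-length tuple packs, in order, (run_name, run_time, data_asset_name or
# "__" for None, expectation_suite_name, metric_name, metric_kwargs_id).
example_metric_id = ValidationMetricIdentifier.from_fixed_length_tuple((
    "profiling",                # run_name
    "20190926T134241.000000Z",  # run_time
    "__",                       # placeholder meaning data_asset_name=None
    "asset.warning",            # expectation_suite_name
    "expect_table_row_count_to_be_between.result.observed_value",  # metric_name
    "column=mycol",             # metric_kwargs_id
))
assert example_metric_id.metric_name == (
    "expect_table_row_count_to_be_between.result.observed_value"
)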
Example #13
def test_evaluation_parameter_store_calls_proper_gcs_tuple_store_methods(
    mock_parent_list_keys,
    mock_gcs_list_keys,
):
    """
    What does this test and why?

    Demonstrate that EvaluationParameterStore works as expected with TupleGCSStoreBackend
    and that the store backend adheres to the Liskov substitution principle.
    """
    evaluation_parameter_store = EvaluationParameterStore()
    run_id = RunIdentifier()
    gcs_store = TupleGCSStoreBackend(bucket="my_bucket", project="my_project")
    evaluation_parameter_store._store_backend = gcs_store

    # Sanity check to ensure neither parent nor child method has been called
    assert not mock_gcs_list_keys.called
    assert not mock_parent_list_keys.called

    # `get_bind_params` calls the child method due to proper polymorphism
    evaluation_parameter_store.get_bind_params(run_id=run_id)
    assert mock_gcs_list_keys.called
    assert not mock_parent_list_keys.called
def test_errors_warnings_validation_operator_run_slack_query(
    warning_failure_validation_operator_data_context, assets_to_validate
):
    data_context = warning_failure_validation_operator_data_context

    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        name="test",
        slack_webhook="https://hooks.slack.com/services/test/slack/webhook",
    )

    return_obj = vo.run(
        assets_to_validate=assets_to_validate,
        run_id=RunIdentifier(run_name="test_100"),
        base_expectation_suite_name="f1",
    )
    slack_query = vo._build_slack_query(return_obj)
    expected_slack_query = {
        "blocks": [
            {"type": "divider"},
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*FailureVsWarning Validation Operator Completed.*",
                },
            },
            {"type": "divider"},
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "*Status*: Failed :x:"},
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*Batch Id List:* ['ge_batch_id=82a8de83-e063-11e9-8133-acde48001122', "
                    "'ge_batch_id=82a8de83-e063-11e9-8226-acde48001122', "
                    "'ge_batch_id=82a8de83-e063-11e9-a53d-acde48001122']",
                },
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*Failed Batches:* ['f1.failure-ge_batch_id=82a8de83-e063-11e9-8133-acde48001122']",
                },
            },
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "*Run Name:* test_100"},
            },
            {
                "type": "section",
                "text": {"type": "mrkdwn", "text": "*Run Time:* LOCALEDATE"},
            },
            {"type": "divider"},
            {
                "type": "context",
                "elements": [
                    {
                        "type": "mrkdwn",
                        "text": "Learn about FailureVsWarning Validation Operators at https://docs.greatexpectations.i"
                        "o/en/latest/reference/validation_operators/warning_and_failure_expectation_suites_val"
                        "idation_operator.html",
                    }
                ],
            },
        ]
    }

    # We're okay with system variation in locales (OS X likes 24 hour, but not Travis)
    slack_query["blocks"][7]["text"]["text"] = slack_query["blocks"][7]["text"][
        "text"
    ].replace("09/26/2019 13:42:41", "LOCALEDATE")
    slack_query["blocks"][7]["text"]["text"] = slack_query["blocks"][7]["text"][
        "text"
    ].replace("09/26/2019 01:42:41 PM", "LOCALEDATE")
    expected_slack_query["blocks"][7]["text"]["text"] = expected_slack_query["blocks"][
        7
    ]["text"]["text"].replace("09/26/2019 13:42:41", "LOCALEDATE")
    expected_slack_query["blocks"][7]["text"]["text"] = expected_slack_query["blocks"][
        7
    ]["text"]["text"].replace("09/26/2019 01:42:41 PM", "LOCALEDATE")

    import json

    print(json.dumps(slack_query, indent=2))
    print(json.dumps(expected_slack_query, indent=2))
    assert slack_query == expected_slack_query
Example #15
def test_evaluation_parameter_store_methods(
    data_context_parameterized_expectation_suite: DataContext, ):
    run_id = RunIdentifier(run_name="20191125T000000.000000Z")
    source_patient_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_patient_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_equal",
                    kwargs={
                        "value": 1024,
                    },
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 1024,
                    "element_count": 1024,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context_parameterized_expectation_suite.store_evaluation_parameters(
        source_patient_data_results)

    bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params(
        run_id)
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value":
        1024
    }
    source_diabetes_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_diabetes_data.default",
            "run_id": run_id,
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type=
                    "expect_column_unique_value_count_to_be_between",
                    kwargs={
                        "column": "patient_nbr",
                        "min": 2048,
                        "max": 2048
                    },
                ),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "observed_value": 2048,
                    "element_count": 5000,
                    "missing_percent": 0.0,
                    "missing_count": 0,
                },
            )
        ],
        success=True,
    )

    data_context_parameterized_expectation_suite.store_evaluation_parameters(
        source_diabetes_data_results)
    bound_parameters = data_context_parameterized_expectation_suite.evaluation_parameter_store.get_bind_params(
        run_id)
    assert bound_parameters == {
        "urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result"
        ".observed_value":
        1024,
        "urn:great_expectations:validations:source_diabetes_data.default"
        ":expect_column_unique_value_count_to_be_between.result.observed_value:column=patient_nbr":
        2048,
    }
def test_StoreMetricsAction(basic_in_memory_data_context_for_validation_operator):
    action = StoreMetricsAction(
        data_context=basic_in_memory_data_context_for_validation_operator,
        requested_metrics={
            "*": [
                "statistics.evaluated_expectations",
                "statistics.successful_expectations",
            ]
        },
        target_store_name="metrics_store",
    )

    run_id = RunIdentifier(run_name="bar")

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={"expectation_suite_name": "foo", "run_id": run_id},
        statistics={"evaluated_expectations": 5, "successful_expectations": 3},
    )

    # Run the action and store our metrics
    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={"expectation_suite_name": "foo.warning", "run_id": run_id},
        statistics={"evaluated_expectations": 8, "successful_expectations": 4},
    )

    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="statistics.evaluated_expectations",
                metric_kwargs_id=None,
            )
        )
        == 5
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="statistics.successful_expectations",
                metric_kwargs_id=None,
            )
        )
        == 3
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo.warning"),
                metric_name="statistics.evaluated_expectations",
                metric_kwargs_id=None,
            )
        )
        == 8
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo.warning"),
                metric_name="statistics.successful_expectations",
                metric_kwargs_id=None,
            )
        )
        == 4
    )
Example #17
File: schema.py Project: flyteorg/flytekit
    def to_python_value(
        self,
        ctx: FlyteContext,
        lv: Literal,
        expected_python_type: Type[GreatExpectationsType],
    ) -> GreatExpectationsType:
        if not (lv and lv.scalar and
                ((lv.scalar.primitive and lv.scalar.primitive.string_value)
                 or lv.scalar.schema or lv.scalar.blob
                 or lv.scalar.structured_dataset)):
            raise AssertionError(
                "Can only validate a literal string/FlyteFile/FlyteSchema value"
            )

        # fetch the configuration
        type_conf = GreatExpectationsTypeTransformer.get_config(
            expected_python_type)
        conf_dict = type_conf[1].to_dict()  # type: ignore

        ge_conf = GreatExpectationsFlyteConfig(**conf_dict)

        # fetch the data context
        context = ge.data_context.DataContext(
            ge_conf.context_root_dir)  # type: ignore

        # determine the type of data connector
        selected_datasource = list(
            filter(lambda x: x["name"] == ge_conf.datasource_name,
                   context.list_datasources()))

        if not selected_datasource:
            raise ValueError("Datasource doesn't exist!")

        data_connector_class_lookup = {
            data_connector_name: data_connector_class["class_name"]
            for data_connector_name, data_connector_class in
            selected_datasource[0]["data_connectors"].items()
        }

        specified_data_connector_class = data_connector_class_lookup[
            ge_conf.data_connector_name]

        is_runtime = False
        if specified_data_connector_class == "RuntimeDataConnector":
            is_runtime = True
            if not ge_conf.data_asset_name:
                raise ValueError(
                    "data_asset_name has to be given in a RuntimeBatchRequest")

        # file path for FlyteSchema and FlyteFile
        temp_dataset = ""

        # return value
        return_dataset = ""

        # FlyteSchema
        if lv.scalar.schema or lv.scalar.structured_dataset:
            return_dataset, temp_dataset = self._flyte_schema(
                is_runtime=is_runtime,
                ctx=ctx,
                ge_conf=ge_conf,
                lv=lv,
                expected_python_type=type_conf[0])

        # FlyteFile
        if lv.scalar.blob:
            return_dataset, temp_dataset = self._flyte_file(
                ctx=ctx,
                ge_conf=ge_conf,
                lv=lv,
                expected_python_type=type_conf[0])

        if lv.scalar.primitive:
            dataset = return_dataset = lv.scalar.primitive.string_value
        else:
            dataset = temp_dataset

        batch_request_conf = ge_conf.batch_request_config

        # minimalistic batch request
        final_batch_request = {
            "data_asset_name":
            ge_conf.data_asset_name if is_runtime else dataset,
            "datasource_name": ge_conf.datasource_name,
            "data_connector_name": ge_conf.data_connector_name,
        }

        # Great Expectations' RuntimeBatchRequest
        if batch_request_conf and (batch_request_conf["runtime_parameters"]
                                   or is_runtime):
            final_batch_request.update({
                "runtime_parameters":
                batch_request_conf["runtime_parameters"]
                if batch_request_conf["runtime_parameters"] else {},
                "batch_identifiers":
                batch_request_conf["batch_identifiers"],
                "batch_spec_passthrough":
                batch_request_conf["batch_spec_passthrough"],
            })

            if is_runtime and lv.scalar.primitive:
                final_batch_request["runtime_parameters"]["query"] = dataset
            elif is_runtime and (lv.scalar.schema
                                 or lv.scalar.structured_dataset):
                final_batch_request["runtime_parameters"][
                    "batch_data"] = return_dataset.open().all()
            else:
                raise AssertionError(
                    "Can only use runtime_parameters for query(str)/schema data"
                )

        # Great Expectations' BatchRequest
        elif batch_request_conf:
            final_batch_request.update({
                "data_connector_query":
                batch_request_conf["data_connector_query"],
                "batch_spec_passthrough":
                batch_request_conf["batch_spec_passthrough"],
            })

        if ge_conf.checkpoint_params:
            checkpoint = SimpleCheckpoint(
                f"_tmp_checkpoint_{ge_conf.expectation_suite_name}",
                context,
                **ge_conf.checkpoint_params,
            )
        else:
            checkpoint = SimpleCheckpoint(
                f"_tmp_checkpoint_{ge_conf.expectation_suite_name}", context)

        # identify every run uniquely
        run_id = RunIdentifier(
            **{
                "run_name": ge_conf.datasource_name + "_run",
                "run_time": datetime.datetime.utcnow(),
            })

        checkpoint_result = checkpoint.run(
            run_id=run_id,
            validations=[{
                "batch_request":
                final_batch_request,
                "expectation_suite_name":
                ge_conf.expectation_suite_name,
            }],
        )
        final_result = convert_to_json_serializable(
            checkpoint_result.list_validation_results())[0]

        result_string = ""
        if final_result["success"] is False:
            for every_result in final_result["results"]:
                if every_result["success"] is False:
                    result_string += (
                        every_result["expectation_config"]["kwargs"]["column"]
                        + " -> " +
                        every_result["expectation_config"]["expectation_type"]
                        + "\n")

            # raise a Great Expectations' exception
            raise ValidationError(
                "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" +
                result_string)

        logger.info("Validation succeeded!")

        return typing.cast(GreatExpectationsType, return_dataset)
Example #18
def test_run_identifier_parses_datetime_run_name():
    time = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
    run_id = RunIdentifier(run_name=time)
    assert run_id.run_name == run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ")
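# --- Hedged companion sketch to the test above (illustrative): RunIdentifier recovers
# run_time from a datetime-formatted run_name, while an explicit run_time can always be
# passed alongside a human-readable run_name.
parsed_run = RunIdentifier(run_name="20190926T134241.000000Z")
assert parsed_run.run_time.year == 2019  # run_time parsed out of the run_name

explicit_run = RunIdentifier(
    run_name="nightly-load",
    run_time=datetime.datetime(2019, 9, 26, 13, 42, 41, tzinfo=datetime.timezone.utc),
)
assert explicit_run.run_time.year == 2019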
def test_TupleGCSStoreBackend():
    # pytest.importorskip("google-cloud-storage")
    """
    What does this test test and why?

    Since no package like moto exists for GCP services, we mock the GCS client
    and assert that the store backend makes the right calls for set, get, and list.

    TODO : One option may be to have a GCS Store in Docker, which can be use to "actually" run these tests.
    """

    bucket = "leakybucket"
    prefix = "this_is_a_test_prefix"
    project = "dummy-project"
    base_public_path = "http://www.test.com/"

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store = TupleGCSStoreBackend(
            filepath_template="my_file_{0}",
            bucket=bucket,
            prefix=prefix,
            project=project,
        )

        my_store.set(("AAA", ), "aaa", content_type="text/html")

        mock_gcs_client.assert_called_with("dummy-project")
        mock_client.get_bucket.assert_called_with("leakybucket")
        mock_bucket.blob.assert_called_with(
            "this_is_a_test_prefix/my_file_AAA")
        # mock_bucket.blob.assert_any_call("this_is_a_test_prefix/.ge_store_backend_id")
        mock_blob.upload_from_string.assert_called_with(
            b"aaa", content_type="text/html")

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:
        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.blob.return_value

        my_store_with_no_filepath_template = TupleGCSStoreBackend(
            filepath_template=None,
            bucket=bucket,
            prefix=prefix,
            project=project)

        my_store_with_no_filepath_template.set(("AAA", ),
                                               b"aaa",
                                               content_encoding=None,
                                               content_type="image/png")

        mock_gcs_client.assert_called_with("dummy-project")
        mock_client.get_bucket.assert_called_with("leakybucket")
        mock_bucket.blob.assert_called_with("this_is_a_test_prefix/AAA")
        # mock_bucket.blob.assert_any_call("this_is_a_test_prefix/.ge_store_backend_id")
        mock_blob.upload_from_string.assert_called_with(
            b"aaa", content_type="image/png")

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value
        mock_bucket = mock_client.get_bucket.return_value
        mock_blob = mock_bucket.get_blob.return_value
        mock_str = mock_blob.download_as_string.return_value

        my_store.get(("BBB", ))

        mock_gcs_client.assert_called_once_with("dummy-project")
        mock_client.get_bucket.assert_called_once_with("leakybucket")
        mock_bucket.get_blob.assert_called_once_with(
            "this_is_a_test_prefix/my_file_BBB")
        mock_blob.download_as_string.assert_called_once()
        mock_str.decode.assert_called_once_with("utf-8")

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:

        mock_client = mock_gcs_client.return_value

        my_store.list_keys()

        mock_client.list_blobs.assert_called_once_with(
            "leakybucket", prefix="this_is_a_test_prefix")

        my_store.remove_key("leakybucket")

        from google.cloud.exceptions import NotFound

        try:
            mock_client.get_bucket.assert_called_once_with("leakybucket")
        except NotFound:
            pass

    with patch("google.cloud.storage.Client",
               autospec=True) as mock_gcs_client:
        mock_gcs_client.side_effect = InvalidKeyError(
            "Hi I am an InvalidKeyError")
        with pytest.raises(InvalidKeyError):
            my_store.get(("non_existent_key", ))

    run_id = RunIdentifier("my_run_id", datetime.datetime.utcnow())
    key = ValidationResultIdentifier(
        ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
        run_id,
        "my_batch_id",
    )
    run_time_string = run_id.to_tuple()[1]

    url = my_store_with_no_filepath_template.get_url_for_key(key.to_tuple())
    assert (
        url == "https://storage.googleapis.com/leakybucket" +
        f"/this_is_a_test_prefix/my_suite_name/my_run_id/{run_time_string}/my_batch_id"
    )
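# --- Hedged recap (illustrative): both get_url_for_key above and get_public_url_for_key
# in the earlier base_public_path example are driven by the same key tuple, whose
# ordering matches the path segments in the asserted URLs.
demo_key = ValidationResultIdentifier(
    ExpectationSuiteIdentifier(expectation_suite_name="my_suite_name"),
    RunIdentifier(
        run_name="my_run_id",
        run_time=datetime.datetime(2019, 9, 26, 13, 42, 41, tzinfo=datetime.timezone.utc),
    ),
    "my_batch_id",
)
assert demo_key.to_tuple()[0] == "my_suite_name"
assert demo_key.to_tuple()[-1] == "my_batch_id"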
    @classmethod
    def from_fixed_length_tuple(cls, tuple_):
        return cls(
            ExpectationSuiteIdentifier(tuple_[0]),
            RunIdentifier.from_tuple((tuple_[1], tuple_[2])),
            tuple_[3],
        )
    @classmethod
    def from_tuple(cls, tuple_):
        return cls(
            ExpectationSuiteIdentifier.from_tuple(tuple_[0:-3]),
            RunIdentifier.from_tuple((tuple_[-3], tuple_[-2])),
            tuple_[-1],
        )
    def run(
        self,
        assets_to_validate,
        run_id=None,
        base_expectation_suite_name=None,
        evaluation_parameters=None,
        run_name=None,
        run_time=None,
        result_format=None,
    ):
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide"
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        if base_expectation_suite_name is None:
            if self.base_expectation_suite_name is None:
                raise ValueError(
                    "base_expectation_suite_name must be configured in the validation operator or passed at runtime"
                )
            base_expectation_suite_name = self.base_expectation_suite_name

        run_results = {}

        for item in assets_to_validate:
            batch = self._build_batch_from_item(item)

            batch_id = batch.batch_id

            assert batch_id is not None
            assert run_id is not None

            failure_expectation_suite_identifier = ExpectationSuiteIdentifier(
                expectation_suite_name=base_expectation_suite_name +
                self.expectation_suite_name_suffixes[0])

            failure_validation_result_id = ValidationResultIdentifier(
                expectation_suite_identifier=
                failure_expectation_suite_identifier,
                run_id=run_id,
                batch_identifier=batch_id,
            )

            failure_expectation_suite = None
            try:
                failure_expectation_suite = self.data_context.stores[
                    self.data_context.expectations_store_name].get(
                        failure_expectation_suite_identifier)

            # NOTE : Abe 2019/09/17 : I'm concerned that this may be too permissive, since
            # it will catch any error in the Store, not just KeyErrors. In the longer term, a better
            # solution will be to have the Stores catch other known errors and raise KeyErrors,
            # so that methods like this can catch and handle a single error type.
            except Exception:
                logger.debug("Failure expectation suite not found: {}".format(
                    failure_expectation_suite_identifier))

            if failure_expectation_suite:
                failure_run_result_obj = {
                    "expectation_suite_severity_level": "failure"
                }
                failure_validation_result = batch.validate(
                    failure_expectation_suite,
                    run_id,
                    result_format=result_format
                    if result_format else self.result_format,
                    evaluation_parameters=evaluation_parameters,
                )
                failure_run_result_obj[
                    "validation_result"] = failure_validation_result
                failure_actions_results = self._run_actions(
                    batch,
                    failure_expectation_suite_identifier,
                    failure_expectation_suite,
                    failure_validation_result,
                    run_id,
                )
                failure_run_result_obj[
                    "actions_results"] = failure_actions_results
                run_results[
                    failure_validation_result_id] = failure_run_result_obj

                if not failure_validation_result.success and self.stop_on_first_error:
                    break

            warning_expectation_suite_identifier = ExpectationSuiteIdentifier(
                expectation_suite_name=base_expectation_suite_name +
                self.expectation_suite_name_suffixes[1])

            warning_validation_result_id = ValidationResultIdentifier(
                expectation_suite_identifier=
                warning_expectation_suite_identifier,
                run_id=run_id,
                batch_identifier=batch.batch_id,
            )

            warning_expectation_suite = None
            try:
                warning_expectation_suite = self.data_context.stores[
                    self.data_context.expectations_store_name].get(
                        warning_expectation_suite_identifier)
            except Exception:
                logger.debug("Warning expectation suite not found: {}".format(
                    warning_expectation_suite_identifier))

            if warning_expectation_suite:
                warning_run_result_obj = {
                    "expectation_suite_severity_level": "warning"
                }
                warning_validation_result = batch.validate(
                    warning_expectation_suite,
                    run_id,
                    result_format=result_format
                    if result_format else self.result_format,
                    evaluation_parameters=evaluation_parameters,
                )
                warning_run_result_obj[
                    "validation_result"] = warning_validation_result
                warning_actions_results = self._run_actions(
                    batch,
                    warning_expectation_suite_identifier,
                    warning_expectation_suite,
                    warning_validation_result,
                    run_id,
                )
                warning_run_result_obj[
                    "actions_results"] = warning_actions_results
                run_results[
                    warning_validation_result_id] = warning_run_result_obj

        validation_operator_result = ValidationOperatorResult(
            run_id=run_id,
            run_results=run_results,
            validation_operator_config=self.validation_operator_config,
            evaluation_parameters=evaluation_parameters,
            success=all([
                run_result_obj["validation_result"].success
                for run_result_obj in run_results.values() if
                run_result_obj["expectation_suite_severity_level"] == "failure"
            ]),
        )

        if self.slack_webhook:
            if (self.notify_on == "all" or self.notify_on == "success"
                    and validation_operator_result.success
                    or self.notify_on == "failure"
                    and not validation_operator_result.success):
                slack_query = self._build_slack_query(
                    validation_operator_result=validation_operator_result)
                send_slack_notification(query=slack_query,
                                        slack_webhook=self.slack_webhook)

        return validation_operator_result
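# --- Hedged sketch: how the two suite names are resolved in run() above. The suffix
# values are an assumption inferred from the suite names used in these tests
# ("f1.failure", "f1.warning"); the operator itself reads them from
# expectation_suite_name_suffixes.
expectation_suite_name_suffixes = [".failure", ".warning"]  # assumed defaults
base_expectation_suite_name = "f1"
resolved_suites = [base_expectation_suite_name + s for s in expectation_suite_name_suffixes]
assert resolved_suites == ["f1.failure", "f1.warning"]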
def test_configuration_driven_site_builder_without_how_to_buttons(
    site_builder_data_context_with_html_store_titanic_random, ):
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name":
            "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate one expectation suite and one validation
    # that is a profiling result
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # Create another validation result using the profiler's suite (no need to use a new expectation suite
    # for this test). Having two validation results - one with run id "profiling" - allows us to test
    # the logic of run_name_filter that helps filter which validation results are included in
    # the profiling and validation sections.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(datasource_name,
                                                  generator_name,
                                                  data_asset_name,
                                                  profiler_name)

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = "test_run_id_12345"
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=RunIdentifier(run_name=run_id),
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    # set this flag to false in config to hide how-to buttons and related elements
    local_site_config["show_how_to_buttons"] = False

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config)
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    assert_how_to_buttons(context,
                          index_page_locator_info,
                          index_links_dict,
                          show_how_to_buttons=False)
def test_errors_warnings_validation_operator_succeeded_vo_result_with_only_failed_warning_suite(
    warning_failure_validation_operator_data_context, assets_to_validate
):
    # This tests whether the WarningAndFailureExpectationSuitesValidationOperator properly returns
    # a successful ValidationOperatorResult when the only failed validations have a suite severity
    # level of "warning" (i.e. no "failure"-level suite has failed).

    data_context = warning_failure_validation_operator_data_context

    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        name="test",
    )

    # only pass asset that yields succeeded "failure-level" suite and failed "warning-level" suite
    return_obj = vo.run(
        assets_to_validate=[assets_to_validate[0]],
        run_id=RunIdentifier(run_name="test_100"),
        base_expectation_suite_name="f1",
    )
    run_results = list(return_obj.run_results.values())

    # make sure there are no failed validations with suite severity of failure
    assert not any(
        [
            run_result
            for run_result in run_results
            if run_result["expectation_suite_severity_level"] == "failure"
            and not run_result["validation_result"].success
        ]
    )
    # make sure there is at least one failed validation with suite severity of warning
    assert any(
        [
            run_result
            for run_result in run_results
            if run_result["expectation_suite_severity_level"] == "warning"
            and not run_result["validation_result"].success
        ]
    )
    assert return_obj.success

    # only pass asset that yields succeeded "failure-level" suite and succeeded "warning-level" suite
    return_obj_2 = vo.run(
        assets_to_validate=[assets_to_validate[2]],
        run_id=RunIdentifier(run_name="test_100"),
        base_expectation_suite_name="f1",
    )
    run_results_2 = list(return_obj_2.run_results.values())

    # make sure there are no failed validations with suite severity of failure
    assert not any(
        [
            run_result
            for run_result in run_results_2
            if run_result["expectation_suite_severity_level"] == "failure"
            and not run_result["validation_result"].success
        ]
    )
    # make sure there are no failed validation with suite severity of warning
    assert not any(
        [
            run_result
            for run_result in run_results_2
            if run_result["expectation_suite_severity_level"] == "warning"
            and not run_result["validation_result"].success
        ]
    )
    assert return_obj_2.success
    def run(
        self,
        assets_to_validate,
        run_id=None,
        evaluation_parameters=None,
        run_name=None,
        run_time=None,
        catch_exceptions=None,
        result_format=None,
        checkpoint_identifier=None,
    ):
        assert not (run_id and run_name) and not (
            run_id and run_time
        ), "Please provide either a run_id or run_name and/or run_time."
        if isinstance(run_id, str) and not run_name:
            warnings.warn(
                "String run_ids will be deprecated in the future. Please provide a run_id of type "
                "RunIdentifier(run_name=None, run_time=None), or a dictionary containing run_name "
                "and run_time (both optional). Instead of providing a run_id, you may also provide"
                "run_name and run_time separately.",
                DeprecationWarning,
            )
            try:
                run_time = parse(run_id)
            except (ValueError, TypeError):
                pass
            run_id = RunIdentifier(run_name=run_id, run_time=run_time)
        elif isinstance(run_id, dict):
            run_id = RunIdentifier(**run_id)
        elif not isinstance(run_id, RunIdentifier):
            run_id = RunIdentifier(run_name=run_name, run_time=run_time)

        ###
        # NOTE: 20211010 - jdimatteo: This method is called by both Checkpoint.run and LegacyCheckpoint.run and below
        # usage of AsyncExecutor may speed up I/O bound validations by running them in parallel with multithreading
        # (if concurrency is enabled in the data context configuration).
        #
        # When this method is called by LegacyCheckpoint.run, len(assets_to_validate) may be greater than 1. If
        # concurrency is enabled in the configuration AND len(assets_to_validate) > 1, then execution is run in multiple
        # threads with AsyncExecutor -- otherwise AsyncExecutor only uses the current single thread to execute the work.
        # Please see the below arguments used to initialize AsyncExecutor and the corresponding AsyncExecutor docstring
        # for more details on when multiple threads are used.
        #
        # When this method is called by Checkpoint.run, len(assets_to_validate) may be 1 even if there are multiple
        # validations, because Checkpoint.run calls this method in a loop for each validation. AsyncExecutor is also
        # used in the Checkpoint.run loop to optionally run each validation in parallel with multithreading, so this
        # method's AsyncExecutor is nested within the Checkpoint.run AsyncExecutor. The AsyncExecutor logic to only use
        # multithreading when max_workers > 1 ensures that no nested multithreading is ever used when
        # len(assets_to_validate) is equal to 1. So no unnecessary multithreading is ever used here even though it may
        # be nested inside another AsyncExecutor (and this is a good thing because it avoids extra overhead associated
        # with each thread and minimizes the total number of threads to simplify debugging).
        with AsyncExecutor(
                self.data_context.concurrency,
                max_workers=len(assets_to_validate)) as async_executor:
            batch_and_async_result_tuples = []
            for item in assets_to_validate:
                batch = self._build_batch_from_item(item)

                if hasattr(batch, "active_batch_id"):
                    batch_identifier = batch.active_batch_id
                else:
                    batch_identifier = batch.batch_id

                if result_format is None:
                    result_format = self.result_format

                batch_validate_arguments = {
                    "run_id": run_id,
                    "result_format": result_format,
                    "evaluation_parameters": evaluation_parameters,
                }

                if catch_exceptions is not None:
                    batch_validate_arguments["catch_exceptions"] = catch_exceptions

                batch_and_async_result_tuples.append((
                    batch,
                    async_executor.submit(
                        batch.validate,
                        **batch_validate_arguments,
                    ),
                ))

            run_results = {}
            for batch, async_batch_validation_result in batch_and_async_result_tuples:
                if self.data_context.ge_cloud_mode:
                    expectation_suite_identifier = GeCloudIdentifier(
                        resource_type="expectation_suite",
                        ge_cloud_id=batch._expectation_suite.ge_cloud_id,
                    )
                    validation_result_id = GeCloudIdentifier(
                        resource_type="suite_validation_result"
                    )
                else:
                    expectation_suite_identifier = ExpectationSuiteIdentifier(
                        expectation_suite_name=batch._expectation_suite.expectation_suite_name
                    )
                    validation_result_id = ValidationResultIdentifier(
                        batch_identifier=batch_identifier,
                        expectation_suite_identifier=expectation_suite_identifier,
                        run_id=run_id,
                    )

                batch_actions_results = self._run_actions(
                    batch=batch,
                    expectation_suite_identifier=expectation_suite_identifier,
                    expectation_suite=batch._expectation_suite,
                    batch_validation_result=async_batch_validation_result.result(),
                    run_id=run_id,
                    validation_result_id=validation_result_id,
                    checkpoint_identifier=checkpoint_identifier,
                )

                run_result_obj = {
                    "validation_result": async_batch_validation_result.result(),
                    "actions_results": batch_actions_results,
                }
                run_results[validation_result_id] = run_result_obj

        return ValidationOperatorResult(
            run_id=run_id,
            run_results=run_results,
            validation_operator_config=self.validation_operator_config,
            evaluation_parameters=evaluation_parameters,
        )
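The run_id normalization at the top of the run() method above accepts several input forms. The following is a minimal sketch of the equivalent constructions; it is illustrative only, and the RunIdentifier import path is assumed from the Great Expectations codebase rather than taken from this snippet.

# Minimal sketch of the run_id forms accepted by run() above (illustrative only).
from dateutil.parser import parse

from great_expectations.core.run_identifier import RunIdentifier  # assumed import path

# 1. Preferred: a RunIdentifier instance is used as-is.
run_id = RunIdentifier(run_name="prod_20190801")

# 2. A dict with optional run_name/run_time is expanded via RunIdentifier(**run_id).
run_id = RunIdentifier(**{"run_name": "prod_20190801", "run_time": "20190926T134241.000000Z"})

# 3. A bare string is deprecated: run() tries to parse it as a run_time and also keeps it
#    as the run_name, mirroring the warning branch above.
raw_run_id = "20190926T134241.000000Z"
run_id = RunIdentifier(run_name=raw_run_id, run_time=parse(raw_run_id))

# 4. Anything else (including None) falls through to RunIdentifier(run_name=run_name, run_time=run_time).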
def test_StoreMetricsAction_column_metric(
    basic_in_memory_data_context_for_validation_operator,
):
    action = StoreMetricsAction(
        data_context=basic_in_memory_data_context_for_validation_operator,
        requested_metrics={
            "*": [
                {
                    "column": {
                        "provider_id": [
                            "expect_column_values_to_be_unique.result.unexpected_count"
                        ]
                    }
                },
                "statistics.evaluated_expectations",
                "statistics.successful_expectations",
            ]
        },
        target_store_name="metrics_store",
    )

    run_id = RunIdentifier(run_name="bar")

    validation_result = ExpectationSuiteValidationResult(
        success=False,
        meta={"expectation_suite_name": "foo", "run_id": run_id},
        results=[
            ExpectationValidationResult(
                meta={},
                result={
                    "element_count": 10,
                    "missing_count": 0,
                    "missing_percent": 0.0,
                    "unexpected_count": 7,
                    "unexpected_percent": 0.0,
                    "unexpected_percent_nonmissing": 0.0,
                    "partial_unexpected_list": [],
                },
                success=True,
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_unique",
                    kwargs={"column": "provider_id", "result_format": "BASIC"},
                ),
                exception_info=None,
            )
        ],
        statistics={"evaluated_expectations": 5, "successful_expectations": 3},
    )

    action.run(
        validation_result,
        ValidationResultIdentifier.from_object(validation_result),
        data_asset=None,
    )

    assert (
        basic_in_memory_data_context_for_validation_operator.stores[
            "metrics_store"
        ].get(
            ValidationMetricIdentifier(
                run_id=run_id,
                data_asset_name=None,
                expectation_suite_identifier=ExpectationSuiteIdentifier("foo"),
                metric_name="expect_column_values_to_be_unique.result.unexpected_count",
                metric_kwargs_id="column=provider_id",
            )
        )
        == 7
    )
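To make the mapping in the test above explicit: a column-scoped entry in requested_metrics is stored with a metric_kwargs_id derived from the column name. The sketch below only restates what the test demonstrates; it is not library code.

# Illustration drawn from the test above (not library code).
requested_metrics = {
    "*": [  # the "*" section applies to every expectation suite
        {
            "column": {
                "provider_id": [
                    "expect_column_values_to_be_unique.result.unexpected_count"
                ]
            }
        },
        "statistics.evaluated_expectations",
    ]
}
# The column-scoped entry above is read back with
#   metric_name="expect_column_values_to_be_unique.result.unexpected_count"
#   metric_kwargs_id="column=provider_id"
# which is exactly the ValidationMetricIdentifier the assertion resolves to the value 7.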
def test_configuration_driven_site_builder_skip_and_clean_missing(
    site_builder_data_context_with_html_store_titanic_random,
):
    # tests the auto-cleaning functionality of DefaultSiteIndexBuilder:
    # when the index page is built, any HTML page without a corresponding suite or validation result
    # should be removed and should not appear on the index page
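    # Conceptually (illustration only, not the builder's actual implementation), the clean-up amounts
    # to a set difference between the keys already rendered as HTML pages and the keys still present
    # in the corresponding store, with any stale page removed from the site's store backend.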
    context = site_builder_data_context_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate an expectation suite and a validation
    # (a profiling result) for each data asset
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # create another validation result using the profiler's suite (no need for a new expectation suite
    # in this test). Having two kinds of validation results - the profiler's, with run name "profiling",
    # and this one - lets us exercise the run_name_filter logic that decides whether a validation result
    # is included in the profiling section or in the validation section.
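    # For reference (illustration only; the exact filter keys, e.g. "ne"/"not_equals", vary across
    # Great Expectations versions), a data docs site config can scope its sections with
    # run_name_filter along these lines:
    #   site_section_builders:
    #     validations:
    #       run_name_filter: {not_equals: "profiling"}
    #     profiling:
    #       run_name_filter: {equals: "profiling"}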
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        data_asset_name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(
        datasource_name, generator_name, data_asset_name, profiler_name
    )

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = RunIdentifier(run_name="test_run_id_12345")
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 6

    expectation_suite_set = set(context.stores["expectations_store"].list_keys())
    assert len(expectation_suite_set) == 5

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    site_builder.build()

    # test expectation suite pages
    expectation_suite_html_pages = {
        ExpectationSuiteIdentifier.from_tuple(suite_tuple)
        for suite_tuple in site_builder.target_store.store_backends[
            ExpectationSuiteIdentifier
        ].list_keys()
    }
    # suites in expectations store should match html pages
    assert expectation_suite_set == expectation_suite_html_pages

    # remove suites from expectations store
    for i in range(2):
        context.stores["expectations_store"].remove_key(list(expectation_suite_set)[i])

    # re-build data docs, which should remove suite HTML pages that no longer have corresponding suite in
    # expectations store
    site_builder.build()

    expectation_suite_set = set(context.stores["expectations_store"].list_keys())
    expectation_suite_html_pages = {
        ExpectationSuiteIdentifier.from_tuple(suite_tuple)
        for suite_tuple in site_builder.target_store.store_backends[
            ExpectationSuiteIdentifier
        ].list_keys()
    }
    assert expectation_suite_set == expectation_suite_html_pages

    # test validation result pages
    validation_html_pages = {
        ValidationResultIdentifier.from_tuple(result_tuple)
        for result_tuple in site_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].list_keys()
    }
    # validations in store should match html pages
    assert validations_set == validation_html_pages

    # remove validations from store
    for i in range(2):
        context.stores["validations_store"].store_backend.remove_key(
            list(validations_set)[i]
        )

    # re-build data docs, which should remove validation HTML pages that no longer have corresponding validation in
    # validations store
    site_builder.build()

    validations_set = set(context.stores["validations_store"].list_keys())
    validation_html_pages = {
        ValidationResultIdentifier.from_tuple(result_tuple)
        for result_tuple in site_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].list_keys()
    }
    assert validations_set == validation_html_pages
Example #28
def get_bind_params(self, run_id: RunIdentifier) -> dict:
    # Collect every stored evaluation parameter for the given run, keyed by its URN.
    params = {}
    for k in self._store_backend.list_keys(run_id.to_tuple()):
        key = self.tuple_to_key(k)
        params[key.to_evaluation_parameter_urn()] = self.get(key)
    return params
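The snippet above appears to be a store method: it reads the keys for a run from the backend and converts each to an evaluation-parameter URN. A hedged usage sketch follows; the context.evaluation_parameter_store attribute and the URN shape in the comment are assumptions based on Great Expectations conventions, not taken from the snippet itself.

from great_expectations.core.run_identifier import RunIdentifier  # assumed import path


def collect_evaluation_parameters(context, run_name: str) -> dict:
    # Hedged sketch: 'context' is assumed to be an initialized DataContext exposing the
    # store that defines get_bind_params above as context.evaluation_parameter_store.
    run_id = RunIdentifier(run_name=run_name)
    params = context.evaluation_parameter_store.get_bind_params(run_id)
    # Each key is expected to be a URN string along the lines of
    # "urn:great_expectations:validations:<suite_name>:<metric_name>", mapped to the stored value.
    return params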
def validation_operator_run(name, run_name, validation_config_file, suite,
                            directory):
    # Note: though the long lines here aren't pythonic, they look best when Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development):

        Specify the name of the validation operator using the --name argument
        and the name of the expectation suite using the --suite argument.

        The cli will help you specify the batch of data that you want to
        validate interactively.


    2. Non-interactive (good for production):

        Use the `--validation_config_file` argument to specify the path of the validation configuration JSON file. This file can be used to instruct a validation operator to validate multiple batches of data and use multiple expectation suites to validate each batch.

        Learn how to create a validation config file here:
        https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

        This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True. Otherwise, the command exits with 1.

    To learn more about validation operators, go here:
    https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """

    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("Failed to process <red>{}</red>".format(err.message))
        sys.exit(1)

    try:
        if validation_config_file is not None:
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (OSError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            validation_config_error_message = _validate_valdiation_config(
                validation_config)
            if validation_config_error_message is not None:
                cli_message(
                    "<red>The validation config in {:s} is misconfigured: {:s}</red>".format(
                        validation_config_file, validation_config_error_message
                    )
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

        else:
            if suite is None:
                cli_message("""
Please use --suite argument to specify the name of the expectation suite.
Call `great_expectation suite list` command to list the expectation suites in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(0)

            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run")

            if name is None:
                cli_message("""
Please use --name argument to specify the name of the validation operator.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)
            else:
                if name not in context.list_validation_operator_names():
                    cli_message(f"""
Could not find a validation operator {name}.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                    send_usage_message(
                        data_context=context,
                        event="cli.validation_operator.run",
                        success=False,
                    )
                    sys.exit(1)

            batch_kwargs = None

            cli_message("""
Let us help you specify the batch of data your want the validation operator to validate."""
                        )

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message("<red>{}</red>".format(ve))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    batch_kwargs_generator_name=None,
                    data_asset_name=None,
                    additional_batch_kwargs=None,
                )

            validation_config = {
                "validation_operator_name": name,
                "batches": [
                    {
                        "batch_kwargs": batch_kwargs,
                        "expectation_suite_names": [suite.expectation_suite_name],
                    }
                ],
            }

        try:
            validation_operator_name = validation_config[
                "validation_operator_name"]
            batches_to_validate = []
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(entry["batch_kwargs"],
                                              expectation_suite_name)
                    batches_to_validate.append(batch)

            if run_name is None:
                run_name = datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")

            run_id = RunIdentifier(run_name=run_name)

            if suite is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                if suite.evaluation_parameters is None:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                    )
                else:
                    results = context.run_validation_operator(
                        validation_operator_name,
                        assets_to_validate=batches_to_validate,
                        run_id=run_id,
                        evaluation_parameters=suite.evaluation_parameters,
                    )
        except (ge_exceptions.DataContextError, OSError, SQLAlchemyError) as e:
            cli_message("<red>{}</red>".format(e))
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=False)
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation failed!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(1)
        else:
            cli_message("Validation succeeded!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(0)
    except Exception as e:
        send_usage_message(data_context=context,
                           event="cli.validation_operator.run",
                           success=False)
        raise e
def test_configuration_driven_site_builder(
    site_builder_data_context_v013_with_html_store_titanic_random,
):
    context = site_builder_data_context_v013_with_html_store_titanic_random

    context.add_validation_operator(
        "validate_and_store",
        {
            "class_name": "ActionListValidationOperator",
            "action_list": [
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction",
                        "target_store_name": "validations_store",
                    },
                },
                {
                    "name": "extract_and_store_eval_parameters",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction",
                        "target_store_name": "evaluation_parameter_store",
                    },
                },
            ],
        },
    )

    # profiling the Titanic datasource will generate an expectation suite and a validation
    # (a profiling result) for each data asset
    datasource_name = "titanic"
    data_asset_name = "Titanic"
    profiler_name = "BasicDatasetProfiler"
    generator_name = "subdir_reader"
    context.profile_datasource(datasource_name)

    # create another validation result using the profiler's suite (no need for a new expectation suite
    # in this test). Having two kinds of validation results - the profiler's, with run name "profiling",
    # and this one - lets us exercise the run_name_filter logic that decides whether a validation result
    # is included in the profiling section or in the validation section.
    batch_kwargs = context.build_batch_kwargs(
        datasource=datasource_name,
        batch_kwargs_generator=generator_name,
        data_asset_name=data_asset_name,
    )

    expectation_suite_name = "{}.{}.{}.{}".format(
        datasource_name, generator_name, data_asset_name, profiler_name
    )

    batch = context.get_batch(
        batch_kwargs=batch_kwargs,
        expectation_suite_name=expectation_suite_name,
    )
    run_id = RunIdentifier(run_name="test_run_id_12345")
    context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    data_docs_config = context._project_config.data_docs_sites
    local_site_config = data_docs_config["local_site"]

    validations_set = set(context.stores["validations_store"].list_keys())
    assert len(validations_set) == 6
    assert (
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name=expectation_suite_name
            ),
            run_id="test_run_id_12345",
            batch_identifier=batch.batch_id,
        )
        in validations_set
    )
    assert (
        ValidationResultIdentifier(
            expectation_suite_identifier=ExpectationSuiteIdentifier(
                expectation_suite_name=expectation_suite_name
            ),
            run_id="profiling",
            batch_identifier=batch.batch_id,
        )
        in validations_set
    )

    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    res = site_builder.build()

    index_page_locator_info = res[0]
    index_links_dict = res[1]

    # assert that how-to buttons and related elements are rendered (default behavior)
    assert_how_to_buttons(context, index_page_locator_info, index_links_dict)
    # print(json.dumps(index_page_locator_info, indent=2))
    assert (
        index_page_locator_info
        == "file://"
        + context.root_directory
        + "/uncommitted/data_docs/local_site/index.html"
    )

    # print(json.dumps(index_links_dict, indent=2))

    assert "site_name" in index_links_dict

    assert "expectations_links" in index_links_dict
    assert len(index_links_dict["expectations_links"]) == 5

    assert "validations_links" in index_links_dict
    assert (
        len(index_links_dict["validations_links"]) == 1
    ), """
    The only rendered validation should be the one not generated by the profiler
    """

    assert "profiling_links" in index_links_dict
    assert len(index_links_dict["profiling_links"]) == 5

    # save documentation locally
    os.makedirs("./tests/render/output", exist_ok=True)
    os.makedirs("./tests/render/output/documentation", exist_ok=True)

    if os.path.isdir("./tests/render/output/documentation"):
        shutil.rmtree("./tests/render/output/documentation")
    shutil.copytree(
        os.path.join(
            site_builder_data_context_v013_with_html_store_titanic_random.root_directory,
            "uncommitted/data_docs/",
        ),
        "./tests/render/output/documentation",
    )

    # let's create another validation result and run the site builder to add it
    # to the data docs
    # the operator's action list does not include a data-docs action, so the site
    # will not be updated without our explicit call to the site builder
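    # For reference (illustration only; this exact action entry is an assumption about the
    # library's config, not part of this test), automatic updates would require adding a
    # data-docs action to the operator's action_list, e.g.:
    #   {"name": "update_data_docs", "action": {"class_name": "UpdateDataDocsAction"}}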

    expectation_suite_path_component = expectation_suite_name.replace(".", "/")
    validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id.run_name,
        run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"),
        batch.batch_id + ".html",
    )

    ts_last_mod_0 = os.path.getmtime(validation_result_page_path)

    run_id = RunIdentifier(run_name="test_run_id_12346")
    operator_result = context.run_validation_operator(
        assets_to_validate=[batch],
        run_id=run_id,
        validation_operator_name="validate_and_store",
    )

    validation_result_id = operator_result.list_validation_result_identifiers()[0]
    res = site_builder.build(resource_identifiers=[validation_result_id])

    index_links_dict = res[1]

    # verify that an additional validation result HTML file was generated
    assert len(index_links_dict["validations_links"]) == 2

    # verify that the validation result HTML file rendered in the previous run was NOT updated
    ts_last_mod_1 = os.path.getmtime(validation_result_page_path)

    assert ts_last_mod_0 == ts_last_mod_1

    # verify that the site builder method which returns the URL of the HTML file rendering
    # a given resource points at the newly generated validation result page

    new_validation_result_page_path = os.path.join(
        site_builder.site_index_builder.target_store.store_backends[
            ValidationResultIdentifier
        ].full_base_directory,
        "validations",
        expectation_suite_path_component,
        run_id.run_name,
        run_id.run_time.strftime("%Y%m%dT%H%M%S.%fZ"),
        batch.batch_id + ".html",
    )

    html_url = site_builder.get_resource_url(resource_identifier=validation_result_id)
    assert "file://" + new_validation_result_page_path == html_url

    html_url = site_builder.get_resource_url()
    assert (
        "file://"
        + os.path.join(
            site_builder.site_index_builder.target_store.store_backends[
                ValidationResultIdentifier
            ].full_base_directory,
            "index.html",
        )
        == html_url
    )

    team_site_config = data_docs_config["team_site"]
    team_site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **team_site_config
    )
    team_site_builder.clean_site()
    obs = [
        url_dict
        for url_dict in context.get_docs_sites_urls(site_name="team_site")
        if url_dict.get("site_url")
    ]
    assert len(obs) == 0

    # exercise clean_site
    site_builder.clean_site()
    obs = [
        url_dict
        for url_dict in context.get_docs_sites_urls()
        if url_dict.get("site_url")
    ]
    assert len(obs) == 0

    # restore site
    context = site_builder_data_context_v013_with_html_store_titanic_random
    site_builder = SiteBuilder(
        data_context=context,
        runtime_environment={"root_directory": context.root_directory},
        **local_site_config
    )
    res = site_builder.build()