示例#1
0
def checkpoint(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """Return a ``Checkpoint`` named ``my_checkpoint`` bound to the given Data Context.

    The checkpoint is configured with three actions (store the validation
    result, store the evaluation parameters, update data docs with an empty
    ``site_names`` list) and a single validation that pairs a batch request
    against ``example_datasource`` with the ``test_suite`` expectation suite.
    Every other setting is passed explicitly as ``None`` or as an empty
    container.
    """
    # Actions are listed in the same order as they appear in the configuration.
    actions = [
        {
            "name": "store_validation_result",
            "action": {"class_name": "StoreValidationResultAction"},
        },
        {
            "name": "store_evaluation_params",
            "action": {"class_name": "StoreEvaluationParametersAction"},
        },
        {
            "name": "update_data_docs",
            "action": {"class_name": "UpdateDataDocsAction", "site_names": []},
        },
    ]

    # The only validation: one batch request paired with one expectation suite.
    validation_entries = [
        {
            "batch_request": {
                "datasource_name": "example_datasource",
                "data_connector_name": "default_runtime_data_connector_name",
                "data_asset_name": "my_data_asset",
            },
            "expectation_suite_name": "test_suite",
        }
    ]

    return Checkpoint(
        data_context=titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
        name="my_checkpoint",
        config_version=1.0,
        template_name=None,
        run_name_template=None,
        expectation_suite_name=None,
        batch_request=None,
        action_list=actions,
        evaluation_parameters={},
        runtime_configuration={},
        validations=validation_entries,
        profilers=[],
        ge_cloud_id=None,
        expectation_suite_ge_cloud_id=None,
    )
            "batch_request": {
                "datasource_name": "taxi_datasource",
                "data_connector_name": "default_inferred_data_connector_name",
                "data_asset_name": "yellow_tripdata_sample_2019-01",
                "data_connector_query": {"index": -1},
            },
            "expectation_suite_name": "my_expectation_suite",
        }
    ],
}
# </snippet>

# Initialize your checkpoint with the Data Context and configuration
# from before. The entries of `python_config` are unpacked as keyword
# arguments to the `Checkpoint` constructor.
# <snippet>
my_checkpoint = Checkpoint(data_context=context, **python_config)
# </snippet>

# Run your Checkpoint.
# <snippet>
results = my_checkpoint.run()
# </snippet>

# The following asserts are for testing purposes and do not need to be included in typical scripts.
# The run as a whole must report success.
assert results.success is True
# The run identifier must be exactly a RunIdentifier (exact type match, not isinstance).
run_id_type = type(results.run_id)
assert run_id_type == RunIdentifier
# Every key of `run_results` must share one single type ...
validation_result_id_type_set = {type(k) for k in results.run_results.keys()}
assert len(validation_result_id_type_set) == 1
# ... and that type must be ValidationResultIdentifier.
validation_result_id_type = next(iter(validation_result_id_type_set))
assert validation_result_id_type == ValidationResultIdentifier
示例#3
0
def test_checkpoint_config_deepcopy(
    titanic_pandas_data_context_with_v013_datasource_stats_enabled_with_checkpoints_v1_with_templates,
    monkeypatch,
):
    """Check that deep-copying a substituted checkpoint config preserves its content.

    A nested checkpoint is built from a ``CheckpointConfig``, its substituted
    configuration is requested with a set of runtime overrides (including a
    ``RuntimeBatchRequest`` that carries an in-memory DataFrame), and the
    result is compared with its ``copy.deepcopy`` after both have been passed
    through ``deep_filter_properties_iterable`` with ``clean_falsy=True``.
    """

    def users_validation(connector_name, partition_index):
        # One validation entry for the "users" asset of "my_datasource"; only
        # the data connector and the partition index differ between entries.
        return {
            "batch_request": {
                "datasource_name": "my_datasource",
                "data_connector_name": connector_name,
                "data_asset_name": "users",
                "data_connector_query": {"partition_index": partition_index},
            }
        }

    # $GE_ENVIRONMENT and $MY_PARAM are referenced by the runtime evaluation
    # parameters below. NOTE(review): VAR and OLD_PARAM are presumably consumed
    # by the checkpoint templates stored in the fixture context -- confirm.
    for env_name, env_value in (
        ("GE_ENVIRONMENT", "my_ge_environment"),
        ("VAR", "test"),
        ("MY_PARAM", "1"),
        ("OLD_PARAM", "2"),
    ):
        monkeypatch.setenv(env_name, env_value)

    context = titanic_pandas_data_context_with_v013_datasource_stats_enabled_with_checkpoints_v1_with_templates

    test_df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    runtime_batch_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_runtime_data_connector",
        data_asset_name="default_data_asset_name",
        batch_identifiers={
            "pipeline_stage_name": "core_processing",
            "airflow_run_id": 1234567890,
        },
        runtime_parameters={"batch_data": test_df},
    )

    nested_checkpoint_config = CheckpointConfig(
        name="my_nested_checkpoint",
        config_version=1,
        template_name="my_nested_checkpoint_template_2",
        expectation_suite_name="users.delivery",
        validations=[
            users_validation("my_special_data_connector", -1),
            users_validation("my_other_data_connector", -2),
        ],
    )

    # Strip the class/module bookkeeping keys and falsy values so the remaining
    # entries can be handed to the Checkpoint constructor as keyword arguments.
    checkpoint_kwargs = filter_properties_dict(
        properties=nested_checkpoint_config.to_json_dict(),
        delete_fields={"class_name", "module_name"},
        clean_falsy=True,
    )
    nested_checkpoint = Checkpoint(data_context=context, **checkpoint_kwargs)

    runtime_actions = [
        {
            "name": "store_validation_result",
            "action": {
                "class_name": "StoreValidationResultAction",
            },
        },
        {
            "name": "store_evaluation_params",
            "action": {
                "class_name": "MyCustomRuntimeStoreEvaluationParametersAction",
            },
        },
        {
            "name": "update_data_docs",
            "action": None,
        },
        {
            "name": "update_data_docs_deluxe_runtime",
            "action": {
                "class_name": "UpdateDataDocsAction",
            },
        },
    ]

    runtime_overrides = {
        "batch_request": runtime_batch_request,
        "expectation_suite_name": "runtime_suite_name",
        "template_name": "my_nested_checkpoint_template_3",
        "validations": [
            users_validation("my_other_data_connector_2_runtime", -3),
            users_validation("my_other_data_connector_3_runtime", -4),
        ],
        "run_name_template": "runtime_run_template",
        "action_list": runtime_actions,
        "evaluation_parameters": {
            "environment": "runtime-$GE_ENVIRONMENT",
            "tolerance": 1.0e-2,
            "aux_param_0": "runtime-$MY_PARAM",
            "aux_param_1": "1 + $MY_PARAM",
            "new_runtime_eval_param": "bloopy!",
        },
        "runtime_configuration": {
            "result_format": "BASIC",
            "partial_unexpected_count": 999,
            "new_runtime_config_key": "bleepy!",
        },
    }

    substituted_config = nested_checkpoint.get_substituted_config(
        runtime_kwargs=runtime_overrides
    )
    copied_config = copy.deepcopy(substituted_config)

    # Filter the copy first, then the original, and compare the two results.
    filtered_copy = deep_filter_properties_iterable(
        properties=copied_config,
        clean_falsy=True,
    )
    filtered_original = deep_filter_properties_iterable(
        properties=substituted_config,
        clean_falsy=True,
    )
    assert filtered_copy == filtered_original
示例#4
0
    def run(
        self,
        checkpoint_name: str = None,
        ge_checkpoint: Checkpoint = None,
        checkpoint_kwargs: dict = None,
        context: ge.DataContext = None,
        assets_to_validate: list = None,
        batch_kwargs: dict = None,
        expectation_suite_name: str = None,
        context_root_dir: str = None,
        runtime_environment: Optional[dict] = None,
        run_name: str = None,
        run_info_at_end: bool = True,
        disable_markdown_artifact: bool = False,
        validation_operator: str = "action_list_operator",
        evaluation_parameters: Optional[dict] = None,
    ):
        """
        Task run method.

        Exactly one validation source must be supplied: `checkpoint_name`, `ge_checkpoint`,
        `assets_to_validate`, or the pair `expectation_suite_name` + `batch_kwargs`.

        Args:
            - checkpoint_name (str, optional): the name of a pre-configured checkpoint; should match the
                filename of the checkpoint without the extension. Either checkpoint_name or
                ge_checkpoint is required when using the Great Expectations v3 API.
            - ge_checkpoint (Checkpoint, optional): an in-memory GE `Checkpoint` object used to perform
                validation. If not provided then `checkpoint_name` will be used to load the specified
                checkpoint.
            - checkpoint_kwargs (Dict, optional): A dictionary whose keys match the parameters of
                `CheckpointConfig` which can be used to update and populate the task's Checkpoint at
                runtime. It is forwarded as keyword arguments to the checkpoint's `run` call.
            - context (DataContext, optional): an in-memory GE `DataContext` object. e.g.
                `ge.data_context.DataContext()` If not provided then `context_root_dir` will be used to
                look for one.
            - assets_to_validate (list, optional): A list of assets to validate when running the
                validation operator. Only used in the Great Expectations v2 API
            - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
                assets. Only used in the Great Expectations v2 API
            - expectation_suite_name (str, optional): the name of an expectation suite to be used when
                validating assets. Only used in the Great Expectations v2 API
            - context_root_dir (str, optional): the absolute or relative path to the directory holding
                your `great_expectations.yml`
            - runtime_environment (dict, optional): a dictionary of great expectation config key-value
                pairs to overwrite your config in `great_expectations.yml`
            - run_name (str, optional): the name of this Great Expectations validation run; defaults to
                the task slug
            - run_info_at_end (bool, optional): add run info to the end of the artifact generated by this
                task. Defaults to `True`.
            - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
                this task. Defaults to `False`.
            - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when
                running validation. For more information, see
                [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation)
                and
                [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html).
            - validation_operator (str, optional): configure the actions to be executed after running
                validation. Defaults to `action_list_operator`.

        Raises:
            - 'ValueError' if not exactly one validation source is provided
            - 'signals.FAIL' if the validation was not a success

        Returns:
            - result
                ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
                The Great Expectations metadata returned from the validation if the v2 (batch_kwargs) API
                is used.

                ('great_expectations.checkpoint.checkpoint.CheckpointResult'):
                The Great Expectations metadata returned from running the provided checkpoint if a
                checkpoint name is provided.

        """

        if version.parse(ge.__version__) < version.parse("0.13.8"):
            # Adjacent string literals are joined verbatim, so the separating
            # space has to be part of the first literal.
            self.logger.warning(
                f"You are using great_expectations version {ge.__version__} which may cause "
                "errors in this task. Please upgrade great_expectations to 0.13.8 or later."
            )

        runtime_environment = runtime_environment or dict()
        checkpoint_kwargs = checkpoint_kwargs or dict()

        # Load context if not provided directly
        if not context:
            context = ge.DataContext(
                context_root_dir=context_root_dir,
                runtime_environment=runtime_environment,
            )

        # Check that the parameters are mutually exclusive: count how many of
        # the four possible validation sources were supplied.
        provided_sources = [
            (expectation_suite_name and batch_kwargs),
            assets_to_validate,
            checkpoint_name,
            ge_checkpoint,
        ]
        if sum(bool(source) for source in provided_sources) != 1:
            raise ValueError(
                "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, "
                "checkpoint_name, or ge_checkpoint is required to run validation."
            )

        # Both execution paths identify the run the same way: the explicit
        # run name, falling back to the task slug from the Prefect context.
        run_id = {"run_name": run_name or prefect.context.get("task_slug")}

        # If there is a checkpoint or checkpoint name provided, run the checkpoint.
        # Checkpoints are the preferred deployment of validation configuration.
        if ge_checkpoint or checkpoint_name:
            ge_checkpoint = ge_checkpoint or context.get_checkpoint(checkpoint_name)
            results = ge_checkpoint.run(
                evaluation_parameters=evaluation_parameters,
                run_id=run_id,
                **checkpoint_kwargs,
            )
        else:
            # If assets are not provided directly through `assets_to_validate` then they need be loaded
            #   get batch from `batch_kwargs` and `expectation_suite_name`
            if not assets_to_validate:
                assets_to_validate = [
                    context.get_batch(batch_kwargs, expectation_suite_name)
                ]

            # Run validation operator
            results = context.run_validation_operator(
                validation_operator,
                assets_to_validate=assets_to_validate,
                run_id=run_id,
                evaluation_parameters=evaluation_parameters,
            )

        # Generate artifact markdown
        if not disable_markdown_artifact:
            validation_results_page_renderer = (
                ge.render.renderer.ValidationResultsPageRenderer(
                    run_info_at_end=run_info_at_end
                )
            )
            rendered_content_list = validation_results_page_renderer.render_validation_operator_result(
                # This also works with a CheckpointResult because of duck typing.
                # The passed in object needs a list_validation_results method that
                # returns a list of ExpectationSuiteValidationResult.
                validation_operator_result=results
            )
            markdown_artifact = " ".join(
                ge.render.view.DefaultMarkdownPageView().render(rendered_content_list)
            )

            create_markdown_artifact(markdown_artifact)

        # Signal task failure (carrying the results) only on an explicit False.
        if results.success is False:
            raise signals.FAIL(result=results)

        return results
    action:
      class_name: UpdateDataDocsAction
      site_names: []
validations:
  - batch_request:
      datasource_name: taxi_datasource
      data_connector_name: default_inferred_data_connector_name
      data_asset_name: yellow_tripdata_sample_2019-01
    expectation_suite_name: my_expectation_suite
"""
# </snippet>

# Initialize your checkpoint with the Data Context and Checkpoint configuration
# from before. The YAML text in `yaml_config` is parsed and the resulting
# entries are unpacked as keyword arguments to the `Checkpoint` constructor.
# NOTE(review): `yaml` is defined outside this excerpt -- presumably a
# safe-loading YAML parser instance; confirm, since PyYAML's module-level
# `yaml.load` without an explicit Loader is unsafe/unsupported.
# <snippet>
my_checkpoint = Checkpoint(data_context=context, **yaml.load(yaml_config))
# </snippet>

# Run your Checkpoint.
# <snippet>
results = my_checkpoint.run()
# </snippet>

# The following asserts are for testing purposes and do not need to be included in typical scripts.
# The run as a whole must report success.
assert results.success is True
# The run identifier must be exactly a RunIdentifier (exact type match, not isinstance).
run_id_type = type(results.run_id)
assert run_id_type == RunIdentifier
# Every key of `run_results` must share one single type ...
validation_result_id_type_set = {type(k) for k in results.run_results.keys()}
assert len(validation_result_id_type_set) == 1
# ... and that type must be ValidationResultIdentifier.
validation_result_id_type = next(iter(validation_result_id_type_set))
assert validation_result_id_type == ValidationResultIdentifier