def checkpoint(
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """Return a ``Checkpoint`` named "my_checkpoint" built on the Titanic test context.

    The checkpoint carries the three standard post-validation actions
    (store validation result, store evaluation params, update data docs)
    and a single validation against the ``my_data_asset`` asset of
    ``example_datasource``.
    """
    context: DataContext = (
        titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    )

    # Standard post-validation action list.
    action_list = [
        {
            "name": "store_validation_result",
            "action": {"class_name": "StoreValidationResultAction"},
        },
        {
            "name": "store_evaluation_params",
            "action": {"class_name": "StoreEvaluationParametersAction"},
        },
        {
            "name": "update_data_docs",
            "action": {"class_name": "UpdateDataDocsAction", "site_names": []},
        },
    ]

    # Single validation against the runtime data connector's asset.
    validations = [
        {
            "batch_request": {
                "datasource_name": "example_datasource",
                "data_connector_name": "default_runtime_data_connector_name",
                "data_asset_name": "my_data_asset",
            },
            "expectation_suite_name": "test_suite",
        }
    ]

    checkpoint_config = {
        "name": "my_checkpoint",
        "config_version": 1.0,
        "template_name": None,
        "run_name_template": None,
        "expectation_suite_name": None,
        "batch_request": None,
        "action_list": action_list,
        "evaluation_parameters": {},
        "runtime_configuration": {},
        "validations": validations,
        "profilers": [],
        "ge_cloud_id": None,
        "expectation_suite_ge_cloud_id": None,
    }
    return Checkpoint(data_context=context, **checkpoint_config)
"batch_request": { "datasource_name": "taxi_datasource", "data_connector_name": "default_inferred_data_connector_name", "data_asset_name": "yellow_tripdata_sample_2019-01", "data_connector_query": {"index": -1}, }, "expectation_suite_name": "my_expectation_suite", } ], } # </snippet> # Initialize your checkpoint with the Data Context and configuration # from before. # <snippet> my_checkpoint = Checkpoint(data_context=context, **python_config) # </snippet> # Run your Checkpoint. # <snippet> results = my_checkpoint.run() # </snippet> # The following asserts are for testing purposes and do not need to be included in typical scripts. assert results.success is True run_id_type = type(results.run_id) assert run_id_type == RunIdentifier validation_result_id_type_set = {type(k) for k in results.run_results.keys()} assert len(validation_result_id_type_set) == 1 validation_result_id_type = next(iter(validation_result_id_type_set)) assert validation_result_id_type == ValidationResultIdentifier
def test_checkpoint_config_deepcopy(
    titanic_pandas_data_context_with_v013_datasource_stats_enabled_with_checkpoints_v1_with_templates,
    monkeypatch,
):
    """A ``copy.deepcopy`` of a fully substituted checkpoint config must be
    value-equal to its source once both are passed through
    ``deep_filter_properties_iterable`` with ``clean_falsy=True``.
    """
    # Environment variables referenced by the checkpoint templates and by the
    # "$..." evaluation parameters below.
    for env_var, env_value in (
        ("GE_ENVIRONMENT", "my_ge_environment"),
        ("VAR", "test"),
        ("MY_PARAM", "1"),
        ("OLD_PARAM", "2"),
    ):
        monkeypatch.setenv(env_var, env_value)

    context: DataContext = (
        titanic_pandas_data_context_with_v013_datasource_stats_enabled_with_checkpoints_v1_with_templates
    )

    df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    runtime_request: RuntimeBatchRequest = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_runtime_data_connector",
        data_asset_name="default_data_asset_name",
        batch_identifiers={
            "pipeline_stage_name": "core_processing",
            "airflow_run_id": 1234567890,
        },
        runtime_parameters={"batch_data": df},
    )

    nested_config: CheckpointConfig = CheckpointConfig(
        name="my_nested_checkpoint",
        config_version=1,
        template_name="my_nested_checkpoint_template_2",
        expectation_suite_name="users.delivery",
        validations=[
            {
                "batch_request": {
                    "datasource_name": "my_datasource",
                    "data_connector_name": "my_special_data_connector",
                    "data_asset_name": "users",
                    "data_connector_query": {"partition_index": -1},
                }
            },
            {
                "batch_request": {
                    "datasource_name": "my_datasource",
                    "data_connector_name": "my_other_data_connector",
                    "data_asset_name": "users",
                    "data_connector_query": {"partition_index": -2},
                }
            },
        ],
    )
    nested_checkpoint: Checkpoint = Checkpoint(
        data_context=context,
        **filter_properties_dict(
            properties=nested_config.to_json_dict(),
            delete_fields={"class_name", "module_name"},
            clean_falsy=True,
        ),
    )

    # Runtime overrides: template, validations, actions, and parameters that
    # the substitution step must fold into the final config.
    runtime_kwargs: dict = {
        "batch_request": runtime_request,
        "expectation_suite_name": "runtime_suite_name",
        "template_name": "my_nested_checkpoint_template_3",
        "validations": [
            {
                "batch_request": {
                    "datasource_name": "my_datasource",
                    "data_connector_name": "my_other_data_connector_2_runtime",
                    "data_asset_name": "users",
                    "data_connector_query": {"partition_index": -3},
                }
            },
            {
                "batch_request": {
                    "datasource_name": "my_datasource",
                    "data_connector_name": "my_other_data_connector_3_runtime",
                    "data_asset_name": "users",
                    "data_connector_query": {"partition_index": -4},
                }
            },
        ],
        "run_name_template": "runtime_run_template",
        "action_list": [
            {
                "name": "store_validation_result",
                "action": {"class_name": "StoreValidationResultAction"},
            },
            {
                "name": "store_evaluation_params",
                "action": {
                    "class_name": "MyCustomRuntimeStoreEvaluationParametersAction"
                },
            },
            {"name": "update_data_docs", "action": None},
            {
                "name": "update_data_docs_deluxe_runtime",
                "action": {"class_name": "UpdateDataDocsAction"},
            },
        ],
        "evaluation_parameters": {
            "environment": "runtime-$GE_ENVIRONMENT",
            "tolerance": 1.0e-2,
            "aux_param_0": "runtime-$MY_PARAM",
            "aux_param_1": "1 + $MY_PARAM",
            "new_runtime_eval_param": "bloopy!",
        },
        "runtime_configuration": {
            "result_format": "BASIC",
            "partial_unexpected_count": 999,
            "new_runtime_config_key": "bleepy!",
        },
    }
    substituted: dict = nested_checkpoint.get_substituted_config(
        runtime_kwargs=runtime_kwargs
    )

    substituted_copy: dict = copy.deepcopy(substituted)
    assert deep_filter_properties_iterable(
        properties=substituted_copy,
        clean_falsy=True,
    ) == deep_filter_properties_iterable(
        properties=substituted,
        clean_falsy=True,
    )
def run(
    self,
    checkpoint_name: str = None,
    ge_checkpoint: Checkpoint = None,
    checkpoint_kwargs: dict = None,
    context: ge.DataContext = None,
    assets_to_validate: list = None,
    batch_kwargs: dict = None,
    expectation_suite_name: str = None,
    context_root_dir: str = None,
    runtime_environment: Optional[dict] = None,
    run_name: str = None,
    run_info_at_end: bool = True,
    disable_markdown_artifact: bool = False,
    validation_operator: str = "action_list_operator",
    evaluation_parameters: Optional[dict] = None,
):
    """
    Task run method.

    Args:
        - checkpoint_name (str, optional): the name of a pre-configured checkpoint; should match the
            filename of the checkpoint without the extension. Either checkpoint_name or
            checkpoint_config is required when using the Great Expectations v3 API.
        - ge_checkpoint (Checkpoint, optional): an in-memory GE `Checkpoint` object used to perform
            validation. If not provided then `checkpoint_name` will be used to load the specified
            checkpoint.
        - checkpoint_kwargs (Dict, optional): A dictionary whose keys match the parameters of
            `CheckpointConfig` which can be used to update and populate the task's Checkpoint at
            runtime.
        - context (DataContext, optional): an in-memory GE `DataContext` object. e.g.
            `ge.data_context.DataContext()` If not provided then `context_root_dir` will be used to
            look for one.
        - assets_to_validate (list, optional): A list of assets to validate when running the
            validation operator. Only used in the Great Expectations v2 API
        - batch_kwargs (dict, optional): a dictionary of batch kwargs to be used when validating
            assets. Only used in the Great Expectations v2 API
        - expectation_suite_name (str, optional): the name of an expectation suite to be used when
            validating assets. Only used in the Great Expectations v2 API
        - context_root_dir (str, optional): the absolute or relative path to the directory holding
            your `great_expectations.yml`
        - runtime_environment (dict, optional): a dictionary of great expectation config key-value
            pairs to overwrite your config in `great_expectations.yml`
        - run_name (str, optional): the name of this Great Expectations validation run; defaults to
            the task slug
        - run_info_at_end (bool, optional): add run info to the end of the artifact generated by this
            task. Defaults to `True`.
        - disable_markdown_artifact (bool, optional): toggle the posting of a markdown artifact from
            this task. Defaults to `False`.
        - evaluation_parameters (Optional[dict], optional): the evaluation parameters to use when
            running validation. For more information, see
            [example](https://docs.prefect.io/api/latest/tasks/great_expectations.html#rungreatexpectationsvalidation)
            and
            [docs](https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html).
        - validation_operator (str, optional): configure the actions to be executed after running
            validation. Defaults to `action_list_operator`.

    Raises:
        - 'signals.FAIL' if the validation was not a success

    Returns:
        - result
            ('great_expectations.validation_operators.types.validation_operator_result.ValidationOperatorResult'):
            The Great Expectations metadata returned from the validation if the v2 (batch_kwargs) API
            is used.

            ('great_expectations.checkpoint.checkpoint.CheckpointResult'):
            The Great Expectations metadata returned from running the provided checkpoint if a
            checkpoint name is provided.
    """
    if version.parse(ge.__version__) < version.parse("0.13.8"):
        # NOTE: the original implicitly-concatenated literals were missing a
        # separating space ("...may causeerrors...") and misspelled the
        # package name ("great_expections"); both are fixed here.
        self.logger.warning(
            f"You are using great_expectations version {ge.__version__} which may cause "
            "errors in this task. Please upgrade great_expectations to 0.13.8 or later."
        )

    runtime_environment = runtime_environment or dict()
    checkpoint_kwargs = checkpoint_kwargs or dict()

    # Load context if not provided directly.
    if not context:
        context = ge.DataContext(
            context_root_dir=context_root_dir,
            runtime_environment=runtime_environment,
        )

    # Exactly one validation entry point must be supplied: the v2 pair
    # (expectation_suite_name + batch_kwargs), assets_to_validate, or one of
    # the v3 checkpoint options.
    entry_points = [
        (expectation_suite_name and batch_kwargs),
        assets_to_validate,
        checkpoint_name,
        ge_checkpoint,
    ]
    if sum(bool(x) for x in entry_points) != 1:
        raise ValueError(
            "Exactly one of expectation_suite_name + batch_kwargs, assets_to_validate, "
            "checkpoint_name, or ge_checkpoint is required to run validation."
        )

    # Built once; both the checkpoint and the validation-operator paths use it.
    run_id = {"run_name": run_name or prefect.context.get("task_slug")}

    results = None
    # If there is a checkpoint or checkpoint name provided, run the checkpoint.
    # Checkpoints are the preferred deployment of validation configuration.
    if ge_checkpoint or checkpoint_name:
        ge_checkpoint = ge_checkpoint or context.get_checkpoint(checkpoint_name)
        results = ge_checkpoint.run(
            evaluation_parameters=evaluation_parameters,
            run_id=run_id,
            **checkpoint_kwargs,
        )
    else:
        # If assets are not provided directly through `assets_to_validate`,
        # load a batch from `batch_kwargs` and `expectation_suite_name`.
        if not assets_to_validate:
            assets_to_validate = [
                context.get_batch(batch_kwargs, expectation_suite_name)
            ]

        # Run validation operator (v2 API).
        results = context.run_validation_operator(
            validation_operator,
            assets_to_validate=assets_to_validate,
            run_id=run_id,
            evaluation_parameters=evaluation_parameters,
        )

    # Generate artifact markdown.
    if not disable_markdown_artifact:
        validation_results_page_renderer = (
            ge.render.renderer.ValidationResultsPageRenderer(
                run_info_at_end=run_info_at_end
            )
        )
        # This also works with a CheckpointResult because of duck typing:
        # the passed-in object only needs a list_validation_results method
        # that returns a list of ExpectationSuiteValidationResult.
        rendered_content_list = (
            validation_results_page_renderer.render_validation_operator_result(
                validation_operator_result=results
            )
        )
        markdown_artifact = " ".join(
            ge.render.view.DefaultMarkdownPageView().render(rendered_content_list)
        )
        create_markdown_artifact(markdown_artifact)

    if results.success is False:
        raise signals.FAIL(result=results)

    return results
action: class_name: UpdateDataDocsAction site_names: [] validations: - batch_request: datasource_name: taxi_datasource data_connector_name: default_inferred_data_connector_name data_asset_name: yellow_tripdata_sample_2019-01 expectation_suite_name: my_expectation_suite """ # </snippet> # Initialize your checkpoint with the Data Context and Checkpoint configuration # from before. # <snippet> my_checkpoint = Checkpoint(data_context=context, **yaml.load(yaml_config)) # </snippet> # Run your Checkpoint. # <snippet> results = my_checkpoint.run() # </snippet> # The following asserts are for testing purposes and do not need to be included in typical scripts. assert results.success is True run_id_type = type(results.run_id) assert run_id_type == RunIdentifier validation_result_id_type_set = {type(k) for k in results.run_results.keys()} assert len(validation_result_id_type_set) == 1 validation_result_id_type = next(iter(validation_result_id_type_set)) assert validation_result_id_type == ValidationResultIdentifier