def validate_validation_dict(validation_dict: dict) -> None:
    """Verify that a single validation entry has everything needed to run.

    Args:
        validation_dict: One validation specification.

    Raises:
        ge_exceptions.CheckpointError: If "batch_request" is missing or None,
            or if "expectation_suite_name" or "action_list" is missing or
            falsy.
    """
    batch_request = validation_dict.get("batch_request")
    if batch_request is None:
        raise ge_exceptions.CheckpointError("validation batch_request cannot be None")

    # Unlike batch_request, these entries are rejected on ANY falsy value
    # (None, empty string, empty list), not only on None.
    required_entries = (
        (
            "expectation_suite_name",
            "validation expectation_suite_name must be specified",
        ),
        ("action_list", "validation action_list cannot be empty"),
    )
    for entry_name, error_message in required_entries:
        if not validation_dict.get(entry_name):
            raise ge_exceptions.CheckpointError(error_message)
def get_checkpoint(
    self, name: Optional[str], ge_cloud_id: Optional[str]
) -> CheckpointConfig:
    """Fetch a Checkpoint configuration from this store and sanity-check it.

    Args:
        name: Name of the Checkpoint; forwarded to ``self.determine_key``.
        ge_cloud_id: Cloud identifier of the Checkpoint; forwarded to
            ``self.determine_key``.

    Returns:
        The configuration returned by ``self.get`` for the resolved key.

    Raises:
        ge_exceptions.CheckpointNotFoundError: If the store reports an
            invalid key.
        ge_exceptions.InvalidCheckpointConfigError: If the stored
            configuration fails validation.
        ge_exceptions.CheckpointError: If the configuration has no
            ``config_version`` and its "batches" entry is missing, or is
            non-empty without the "batch_kwargs" and
            "expectation_suite_names" keys.
    """
    key: Union[GeCloudIdentifier, ConfigurationIdentifier] = self.determine_key(
        name=name, ge_cloud_id=ge_cloud_id
    )
    try:
        checkpoint_config: CheckpointConfig = self.get(key=key)
    except ge_exceptions.InvalidKeyError as exc_ik:
        # NOTE(review): the message reads key.configuration_key; confirm that
        # GeCloudIdentifier keys expose that attribute as well.
        raise ge_exceptions.CheckpointNotFoundError(
            message=f'Non-existent Checkpoint configuration named "{key.configuration_key}".\n\nDetails: {exc_ik}'
        ) from exc_ik
    except ValidationError as exc_ve:
        raise ge_exceptions.InvalidCheckpointConfigError(
            message="Invalid Checkpoint configuration", validation_error=exc_ve
        ) from exc_ve

    # A configuration without a config_version is treated as a legacy one and
    # must describe its batches in the legacy layout.
    if checkpoint_config.config_version is None:
        config_dict: dict = checkpoint_config.to_json_dict()
        batches: Optional[list] = config_dict.get("batches")
        required_keys = {"batch_kwargs", "expectation_suite_names"}
        batches_are_valid = batches is not None and (
            len(batches) == 0
            # The union of keys over all batch entries must cover the
            # required keys (issubset accepts any iterable).
            or required_keys.issubset(
                itertools.chain.from_iterable(item.keys() for item in batches)
            )
        )
        if not batches_are_valid:
            raise ge_exceptions.CheckpointError(
                message="Attempt to instantiate LegacyCheckpoint with insufficient and/or incorrect arguments."
            )

    return checkpoint_config
def get_substituted_batch_request(
    substituted_runtime_config: dict,
    validation_batch_request: Optional[Union[BatchRequestBase, dict]] = None,
) -> Optional[Union[BatchRequest, RuntimeBatchRequest]]:
    """Combine the top-level and validation-level batch requests into one.

    Both inputs are passed through ``get_batch_request_as_dict`` and merged.
    An attribute may be given a non-None value at only one of the two levels.

    Args:
        substituted_runtime_config: Checkpoint configuration whose optional
            "batch_request" entry is the top-level batch request.
        validation_batch_request: Batch request given on the validation.

    Returns:
        None when neither level supplies a batch request; otherwise the
        result of ``materialize_batch_request`` on the merged attributes.

    Raises:
        ge_exceptions.CheckpointError: If an attribute has a non-None value
            at both levels.
    """
    substituted_runtime_batch_request = substituted_runtime_config.get("batch_request")
    if substituted_runtime_batch_request is None and validation_batch_request is None:
        return None

    validation_batch_request = get_batch_request_as_dict(
        batch_request=(
            {} if validation_batch_request is None else validation_batch_request
        )
    )
    substituted_runtime_batch_request = get_batch_request_as_dict(
        batch_request=(
            {}
            if substituted_runtime_batch_request is None
            else substituted_runtime_batch_request
        )
    )

    # Merge explicitly instead of dict(**a, **b): that form raises TypeError
    # as soon as a key exists in both dicts, even when one side is only None.
    effective_batch_request: dict = dict(substituted_runtime_batch_request)
    for key, value in validation_batch_request.items():
        if value is None:
            # An unspecified validation value never hides a top-level one.
            effective_batch_request.setdefault(key, None)
            continue
        if substituted_runtime_batch_request.get(key) is not None:
            raise ge_exceptions.CheckpointError(
                f'BatchRequest attribute "{key}" was specified in both validation and top-level CheckpointConfig.'
            )
        effective_batch_request[key] = value

    return materialize_batch_request(batch_request=effective_batch_request)
def get_runtime_batch_request(
    substituted_runtime_config: CheckpointConfig,
    validation_batch_request: Optional[dict] = None,
) -> Union[BatchRequest, RuntimeBatchRequest]:
    """Build the batch request for one validation from both config levels.

    ``RuntimeBatchRequest`` is used when either level contains a
    "runtime_parameters" key; otherwise ``BatchRequest`` is used.

    Args:
        substituted_runtime_config: Configuration whose ``batch_request``
            attribute holds the top-level batch request (or None).
        validation_batch_request: Batch request given on the validation.

    Returns:
        The constructed batch request, or None when neither level supplies
        one (despite the non-Optional return annotation).

    Raises:
        ge_exceptions.CheckpointError: If an attribute has a non-None value
            at both levels.
    """
    runtime_config_batch_request = substituted_runtime_config.batch_request

    uses_runtime_parameters = any(
        spec is not None and "runtime_parameters" in spec
        for spec in (runtime_config_batch_request, validation_batch_request)
    )
    batch_request_class = (
        RuntimeBatchRequest if uses_runtime_parameters else BatchRequest
    )

    if runtime_config_batch_request is None:
        if validation_batch_request is None:
            return None
        return batch_request_class(**validation_batch_request)
    if validation_batch_request is None:
        return batch_request_class(**runtime_config_batch_request)

    runtime_batch_request_dict: dict = copy.deepcopy(validation_batch_request)
    for key, val in runtime_batch_request_dict.items():
        if val is not None and runtime_config_batch_request.get(key) is not None:
            raise ge_exceptions.CheckpointError(
                f'BatchRequest attribute "{key}" was specified in both validation and top-level CheckpointConfig.'
            )

    # None means "unspecified" in the conflict check above, so a top-level
    # None must not overwrite a value that the validation did specify.
    for key, val in runtime_config_batch_request.items():
        if val is not None or key not in runtime_batch_request_dict:
            runtime_batch_request_dict[key] = val

    return batch_request_class(**runtime_batch_request_dict)
def _get_substituted_template( self, source_config: dict, ) -> dict: substituted_config: dict template_name = source_config.get("template_name") if template_name: checkpoint: Checkpoint = self.data_context.get_checkpoint( name=template_name) template_config: dict = checkpoint.config.to_json_dict() if template_config["config_version"] != source_config[ "config_version"]: raise ge_exceptions.CheckpointError( f"Invalid template '{template_name}' (ver. {template_config['config_version']}) for Checkpoint " f"'{source_config}' (ver. {source_config['config_version']}. Checkpoints can only use templates with the same config_version." ) substituted_template_config: dict = self._get_substituted_template( source_config=template_config) substituted_config = substitute_template_config( source_config=source_config, template_config=substituted_template_config) else: substituted_config = copy.deepcopy(source_config) if self.data_context.ge_cloud_mode: return substituted_config return self._substitute_config_variables(config=substituted_config)
def get_substituted_config(
    self,
    config: Optional[Union[CheckpointConfig, dict]] = None,
    runtime_kwargs: Optional[dict] = None,
) -> CheckpointConfig:
    """Resolve the effective configuration from config, template and overrides.

    Args:
        config: Configuration to resolve; defaults to ``self.config``. A dict
            is converted with ``CheckpointConfig(**config)``.
        runtime_kwargs: Overrides supplied at run time; its "template_name"
            entry, when truthy, takes precedence over ``config.template_name``.

    Returns:
        The result of ``self._substitute_config_variables`` applied to the
        resolved configuration.

    Raises:
        ge_exceptions.CheckpointError: If the template's ``config_version``
            differs from that of ``config``.
    """
    runtime_kwargs = runtime_kwargs or {}
    if config is None:
        config = self.config
    if isinstance(config, dict):
        config = CheckpointConfig(**config)
    substituted_config: Union[CheckpointConfig, dict]
    # Fast path: reuse the cached result when no template is involved.
    # NOTE(review): the cache is used whatever `config` was passed in, as
    # long as that config names no template -- confirm this is intended.
    if (
        self._substituted_config is not None
        and not runtime_kwargs.get("template_name")
        and not config.template_name
    ):
        # Work on a copy so the overrides do not touch the cached object.
        substituted_config = deepcopy(self._substituted_config)
        if any(runtime_kwargs.values()):
            substituted_config.update(runtime_kwargs=runtime_kwargs)
    else:
        template_name = runtime_kwargs.get("template_name") or config.template_name
        if not template_name:
            substituted_config = copy.deepcopy(config)
            if any(runtime_kwargs.values()):
                substituted_config.update(runtime_kwargs=runtime_kwargs)
            # NOTE(review): the object cached here is the one that was just
            # updated with runtime_kwargs; if update() mutates in place, these
            # overrides are visible to later calls -- confirm.
            self._substituted_config = substituted_config
        else:
            checkpoint = self.data_context.get_checkpoint(name=template_name)
            template_config = checkpoint.config
            if template_config.config_version != config.config_version:
                raise ge_exceptions.CheckpointError(
                    f"Invalid template '{template_name}' (ver. {template_config.config_version}) for Checkpoint "
                    f"'{config}' (ver. {config.config_version}. Checkpoints can only use templates with the same config_version."
                )
            # A template may itself name a template: resolve it recursively.
            if template_config.template_name is not None:
                substituted_config = self.get_substituted_config(
                    config=template_config
                )
            else:
                # NOTE(review): this aliases checkpoint.config, so the update()
                # below is applied to the template checkpoint's own config
                # object -- presumably harmless; verify.
                substituted_config = template_config
            # merge template with config
            substituted_config.update(
                other_config=config, runtime_kwargs=runtime_kwargs
            )
            # don't replace _substituted_config if already exists
            if self._substituted_config is None:
                self._substituted_config = substituted_config
    return self._substitute_config_variables(config=substituted_config)
def get_runtime_batch_request(
    substituted_runtime_config: CheckpointConfig,
    validation_batch_request: Optional[dict] = None,
) -> BatchRequest:
    """Build the ``BatchRequest`` for one validation from both config levels.

    Args:
        substituted_runtime_config: Configuration whose ``batch_request``
            attribute holds the top-level batch request (or None).
        validation_batch_request: Batch request given on the validation.

    Returns:
        The constructed ``BatchRequest``, or None when neither level supplies
        one (despite the non-Optional return annotation).

    Raises:
        ge_exceptions.CheckpointError: If an attribute has a non-None value
            at both levels.
    """
    top_level_batch_request = substituted_runtime_config.batch_request

    if top_level_batch_request is None:
        if validation_batch_request is None:
            return None
        return BatchRequest(**validation_batch_request)
    if validation_batch_request is None:
        return BatchRequest(**top_level_batch_request)

    runtime_batch_request_dict: dict = copy.deepcopy(validation_batch_request)
    for key, val in runtime_batch_request_dict.items():
        if val is not None and top_level_batch_request.get(key) is not None:
            raise ge_exceptions.CheckpointError(
                f'BatchRequest attribute "{key}" was specified in both validation and top-level CheckpointConfig.'
            )

    # None means "unspecified" in the conflict check above, so a top-level
    # None must not overwrite a value that the validation did specify.
    for key, val in top_level_batch_request.items():
        if val is not None or key not in runtime_batch_request_dict:
            runtime_batch_request_dict[key] = val

    return BatchRequest(**runtime_batch_request_dict)
def get_checkpoint(
    data_context: "DataContext",  # noqa: F821
    checkpoint_store: CheckpointStore,
    name: Optional[str] = None,
    ge_cloud_id: Optional[str] = None,
) -> Union[Checkpoint, LegacyCheckpoint]:
    """Load a Checkpoint configuration from the store and instantiate it.

    Args:
        data_context: Passed to the instantiated Checkpoint through the
            runtime environment.
        checkpoint_store: Store queried for the configuration.
        name: Checkpoint name; used as the store key when ``ge_cloud_id`` is
            falsy and, when truthy, written into the configuration as "name".
        ge_cloud_id: Cloud identifier; when truthy it is used as the store
            key instead of ``name``.

    Returns:
        The object built by ``instantiate_class_from_config`` from the
        stored configuration.

    Raises:
        ge_exceptions.CheckpointNotFoundError: If the store reports an
            invalid key.
        ge_exceptions.InvalidCheckpointConfigError: If the stored
            configuration fails validation.
        ge_exceptions.CheckpointError: If the configuration has no
            ``config_version`` and its "batches" entry is missing, None, or
            non-empty without the "batch_kwargs" and
            "expectation_suite_names" keys.
    """
    key: Union[GeCloudIdentifier, ConfigurationIdentifier]
    if ge_cloud_id:
        key = GeCloudIdentifier(resource_type="contract", ge_cloud_id=ge_cloud_id)
    else:
        key = ConfigurationIdentifier(configuration_key=name)

    try:
        checkpoint_config: CheckpointConfig = checkpoint_store.get(key=key)
    except ge_exceptions.InvalidKeyError as exc_ik:
        # NOTE(review): the message reads key.configuration_key; confirm that
        # GeCloudIdentifier keys expose that attribute as well.
        raise ge_exceptions.CheckpointNotFoundError(
            message=f'Non-existent Checkpoint configuration named "{key.configuration_key}".\n\nDetails: {exc_ik}'
        ) from exc_ik
    except ValidationError as exc_ve:
        raise ge_exceptions.InvalidCheckpointConfigError(
            message="Invalid Checkpoint configuration", validation_error=exc_ve
        ) from exc_ve

    # Serialize once and reuse the dict for both the legacy check and the
    # instantiation below.
    config: dict = checkpoint_config.to_json_dict()

    # A configuration without a config_version is treated as a legacy one and
    # must describe its batches in the legacy layout.
    if checkpoint_config.config_version is None:
        # .get() makes an explicit `batches: None` fail with CheckpointError
        # rather than with a TypeError from len(None).
        batches = config.get("batches")
        required_keys = {"batch_kwargs", "expectation_suite_names"}
        batches_are_valid = batches is not None and (
            len(batches) == 0
            or required_keys.issubset(
                itertools.chain.from_iterable(item.keys() for item in batches)
            )
        )
        if not batches_are_valid:
            raise ge_exceptions.CheckpointError(
                message="Attempt to instantiate LegacyCheckpoint with insufficient and/or incorrect arguments."
            )

    if name:
        config.update({"name": name})
    config = filter_properties_dict(properties=config, clean_falsy=True)

    checkpoint: Union[Checkpoint, LegacyCheckpoint] = instantiate_class_from_config(
        config=config,
        runtime_environment={
            "data_context": data_context,
        },
        config_defaults={
            "module_name": "great_expectations.checkpoint",
        },
    )
    return checkpoint
def run(
    self,
    template_name: Optional[str] = None,
    run_name_template: Optional[str] = None,
    expectation_suite_name: Optional[str] = None,
    batch_request: Optional[Union[BatchRequest, dict]] = None,
    action_list: Optional[List[dict]] = None,
    evaluation_parameters: Optional[dict] = None,
    runtime_configuration: Optional[dict] = None,
    validations: Optional[List[dict]] = None,
    profilers: Optional[List[dict]] = None,
    run_id: Optional[Union[str, RunIdentifier]] = None,
    run_name: Optional[str] = None,
    run_time: Optional[Union[str, datetime.datetime]] = None,
    result_format: Optional[str] = None,
    **kwargs,
) -> CheckpointResult:
    """Run every validation of this Checkpoint, one after another.

    The arguments from ``template_name`` through ``profilers`` are run-time
    overrides handed to ``self.get_substituted_config``.

    Args:
        run_id: Identifier of the run; exclusive with ``run_name`` and
            ``run_time``.
        run_name: Name of the run; when None it is derived from the resolved
            ``run_name_template``, if there is one.
        run_time: Time of the run; defaults to the current time.
        result_format: Result format; falls back to the "result_format"
            entry of ``runtime_configuration`` and then to
            ``{"result_format": "SUMMARY"}``.
        **kwargs: Accepted but not used by this method.

    Returns:
        A ``CheckpointResult`` with the run results of all validations.

    Raises:
        AssertionError: If ``run_id`` is combined with ``run_name`` or
            ``run_time``.
        ge_exceptions.CheckpointError: If the resolved configuration has no
            validations, or if a validation fails with a CheckpointError or
            an ExecutionEngineError.
    """
    # NOTE: kept as an assert so callers still see AssertionError; be aware
    # that the check disappears when Python runs with -O.
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."

    run_time = run_time or datetime.datetime.now()
    runtime_configuration = runtime_configuration or {}
    result_format = result_format or runtime_configuration.get("result_format")
    if result_format is None:
        result_format = {"result_format": "SUMMARY"}

    runtime_kwargs = {
        "template_name": template_name,
        "run_name_template": run_name_template,
        "expectation_suite_name": expectation_suite_name,
        "batch_request": batch_request,
        "action_list": action_list,
        "evaluation_parameters": evaluation_parameters,
        "runtime_configuration": runtime_configuration,
        "validations": validations,
        "profilers": profilers,
    }
    substituted_runtime_config: CheckpointConfig = self.get_substituted_config(
        runtime_kwargs=runtime_kwargs
    )
    run_name_template = substituted_runtime_config.run_name_template
    validations = substituted_runtime_config.validations
    # `not validations` also covers a configuration whose validations are
    # None, which len() would reject with an unrelated TypeError.
    if not validations:
        raise ge_exceptions.CheckpointError(
            f'Checkpoint "{self.name}" does not contain any validations.'
        )

    run_results = {}
    if run_name is None and run_name_template is not None:
        run_name = get_datetime_string_from_strftime_format(
            format_str=run_name_template, datetime_obj=run_time
        )
    run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time)

    for idx, validation_dict in enumerate(validations):
        try:
            substituted_validation_dict: dict = get_substituted_validation_dict(
                substituted_runtime_config=substituted_runtime_config,
                validation_dict=validation_dict,
            )
            batch_request = substituted_validation_dict.get("batch_request")
            expectation_suite_name = substituted_validation_dict.get(
                "expectation_suite_name"
            )
            action_list = substituted_validation_dict.get("action_list")

            validator: Validator = self.data_context.get_validator(
                batch_request=batch_request,
                expectation_suite_name=expectation_suite_name,
            )
            action_list_validation_operator: ActionListValidationOperator = (
                ActionListValidationOperator(
                    data_context=self.data_context,
                    action_list=action_list,
                    result_format=result_format,
                    name=f"{self.name}-checkpoint-validation[{idx}]",
                )
            )
            val_op_run_result: ValidationOperatorResult = (
                action_list_validation_operator.run(
                    assets_to_validate=[validator],
                    run_id=run_id,
                    evaluation_parameters=substituted_validation_dict.get(
                        "evaluation_parameters"
                    ),
                    result_format=result_format,
                )
            )
            run_results.update(val_op_run_result.run_results)
        except (
            ge_exceptions.CheckpointError,
            ge_exceptions.ExecutionEngineError,
        ) as e:
            # Chain explicitly so the original failure stays attached as the
            # cause of the wrapping error.
            raise ge_exceptions.CheckpointError(
                f"Exception occurred while running validation[{idx}] of Checkpoint '{self.name}': {e.message}."
            ) from e

    return CheckpointResult(
        run_id=run_id, run_results=run_results, checkpoint_config=self.config
    )
def run(
    self,
    template_name: Optional[str] = None,
    run_name_template: Optional[str] = None,
    expectation_suite_name: Optional[str] = None,
    batch_request: Optional[Union[BatchRequestBase, dict]] = None,
    action_list: Optional[List[dict]] = None,
    evaluation_parameters: Optional[dict] = None,
    runtime_configuration: Optional[dict] = None,
    validations: Optional[List[dict]] = None,
    profilers: Optional[List[dict]] = None,
    run_id: Optional[Union[str, RunIdentifier]] = None,
    run_name: Optional[str] = None,
    run_time: Optional[Union[str, datetime.datetime]] = None,
    result_format: Optional[Union[str, dict]] = None,
    expectation_suite_ge_cloud_id: Optional[str] = None,
) -> CheckpointResult:
    """Run this Checkpoint's validations through an ``AsyncExecutor``.

    The arguments from ``template_name`` through ``profilers``, together
    with ``expectation_suite_ge_cloud_id``, are run-time overrides handed to
    ``self.get_substituted_config``.

    Args:
        run_id: Identifier of the run; exclusive with ``run_name`` and
            ``run_time``.
        run_name: Name of the run; when None it is derived from the resolved
            "run_name_template", if there is one.
        run_time: Time of the run; defaults to the current time.
        result_format: Result format; falls back to the "result_format"
            entry of ``runtime_configuration``.

    Returns:
        A ``CheckpointResult`` with the run results of all validations.

    Raises:
        AssertionError: If ``run_id`` is combined with ``run_name`` or
            ``run_time``.
        ge_exceptions.CheckpointError: If the resolved configuration has
            neither validations nor a batch_request.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."

    run_time = run_time or datetime.datetime.now()
    runtime_configuration = runtime_configuration or {}
    result_format = result_format or runtime_configuration.get("result_format")

    batch_request = get_batch_request_as_dict(batch_request=batch_request)
    validations = get_validations_with_batch_request_as_dict(validations=validations)

    runtime_kwargs: dict = {
        "template_name": template_name,
        "run_name_template": run_name_template,
        "expectation_suite_name": expectation_suite_name,
        "batch_request": batch_request or {},
        "action_list": action_list or [],
        "evaluation_parameters": evaluation_parameters or {},
        "runtime_configuration": runtime_configuration or {},
        "validations": validations or [],
        "profilers": profilers or [],
        "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
    }
    substituted_runtime_config: dict = self.get_substituted_config(
        runtime_kwargs=runtime_kwargs
    )

    run_name_template = substituted_runtime_config.get("run_name_template")
    batch_request = substituted_runtime_config.get("batch_request")
    validations = substituted_runtime_config.get("validations") or []
    if not (validations or batch_request):
        raise ge_exceptions.CheckpointError(
            f'Checkpoint "{self.name}" must contain either a batch_request or validations.'
        )

    if run_name is None and run_name_template is not None:
        run_name = get_datetime_string_from_strftime_format(
            format_str=run_name_template, datetime_obj=run_time
        )
    run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time)

    # Validations are submitted through AsyncExecutor so that I/O bound ones
    # can run in parallel threads when concurrency is enabled in the data
    # context configuration; see the arguments below and the AsyncExecutor
    # docstring for when multiple threads are actually used.
    with AsyncExecutor(
        self.data_context.concurrency, max_workers=len(validations)
    ) as async_executor:
        # noinspection PyUnresolvedReferences
        async_validation_operator_results: List[
            AsyncResult[ValidationOperatorResult]
        ] = []
        shared_kwargs = {
            "substituted_runtime_config": substituted_runtime_config,
            "async_validation_operator_results": async_validation_operator_results,
            "async_executor": async_executor,
            "result_format": result_format,
            "run_id": run_id,
        }
        if validations:
            for idx, validation_dict in enumerate(validations):
                self._run_validation(
                    **shared_kwargs, idx=idx, validation_dict=validation_dict
                )
        else:
            # No validations listed: one run driven by the top-level settings.
            self._run_validation(**shared_kwargs)

        run_results: dict = {}
        for pending_result in async_validation_operator_results:
            run_results.update(pending_result.result().run_results)

    return CheckpointResult(
        run_id=run_id,
        run_results=run_results,
        checkpoint_config=self.config,
    )
def _run_validation(
    self,
    substituted_runtime_config: dict,
    async_validation_operator_results: List[AsyncResult],
    async_executor: AsyncExecutor,
    result_format: Optional[dict],
    run_id: Optional[Union[str, RunIdentifier]],
    idx: Optional[int] = 0,
    validation_dict: Optional[dict] = None,
) -> None:
    """Submit one validation to the executor and record its pending result.

    Args:
        substituted_runtime_config: Resolved Checkpoint configuration.
        async_validation_operator_results: List to which the result of
            ``async_executor.submit`` is appended.
        async_executor: Executor that runs the validation operator.
        result_format: Result format; falls back to the validation's
            "runtime_configuration" and then to
            ``{"result_format": "SUMMARY"}``.
        run_id: Identifier of the run, forwarded to the operator.
        idx: Position of the validation, used in the operator name and in
            error messages.
        validation_dict: Validation specification; None is treated as an
            empty dict.

    Raises:
        ge_exceptions.CheckpointError: If a CheckpointError,
            ExecutionEngineError or MetricError is raised while preparing or
            submitting the validation.
    """
    if validation_dict is None:
        validation_dict = {}

    try:
        substituted_validation_dict: dict = get_substituted_validation_dict(
            substituted_runtime_config=substituted_runtime_config,
            validation_dict=validation_dict,
        )
        batch_request: Union[
            BatchRequest, RuntimeBatchRequest
        ] = substituted_validation_dict.get("batch_request")
        expectation_suite_name: str = substituted_validation_dict.get(
            "expectation_suite_name"
        )
        expectation_suite_ge_cloud_id: str = substituted_validation_dict.get(
            "expectation_suite_ge_cloud_id"
        )
        include_rendered_content: bool = substituted_validation_dict.get(
            "include_rendered_content", False
        )

        # In cloud mode the suite is addressed by its cloud id, otherwise by
        # its name; the other argument is passed as None.
        validator: Validator = self.data_context.get_validator(
            batch_request=batch_request,
            expectation_suite_name=(
                expectation_suite_name
                if not self.data_context.ge_cloud_mode
                else None
            ),
            expectation_suite_ge_cloud_id=(
                expectation_suite_ge_cloud_id
                if self.data_context.ge_cloud_mode
                else None
            ),
            include_rendered_content=include_rendered_content,
        )

        action_list: list = substituted_validation_dict.get("action_list")
        # `or {}` also covers a "runtime_configuration" key that is present
        # but None, which a .get() default would let through.
        runtime_configuration_validation = (
            substituted_validation_dict.get("runtime_configuration") or {}
        )
        catch_exceptions_validation = runtime_configuration_validation.get(
            "catch_exceptions"
        )
        result_format_validation = runtime_configuration_validation.get(
            "result_format"
        )
        result_format = result_format or result_format_validation
        if result_format is None:
            result_format = {"result_format": "SUMMARY"}

        action_list_validation_operator: ActionListValidationOperator = (
            ActionListValidationOperator(
                data_context=self.data_context,
                action_list=action_list,
                result_format=result_format,
                name=f"{self.name}-checkpoint-validation[{idx}]",
            )
        )

        checkpoint_identifier = None
        if self.data_context.ge_cloud_mode:
            checkpoint_identifier = GeCloudIdentifier(
                resource_type=GeCloudRESTResource.CONTRACT,
                ge_cloud_id=str(self.ge_cloud_id),
            )

        # catch_exceptions is forwarded only when the validation sets it, so
        # the operator's own default applies otherwise.
        operator_run_kwargs = {}
        if catch_exceptions_validation is not None:
            operator_run_kwargs["catch_exceptions"] = catch_exceptions_validation

        async_validation_operator_results.append(
            async_executor.submit(
                action_list_validation_operator.run,
                assets_to_validate=[validator],
                run_id=run_id,
                evaluation_parameters=substituted_validation_dict.get(
                    "evaluation_parameters"
                ),
                result_format=result_format,
                checkpoint_identifier=checkpoint_identifier,
                checkpoint_name=self.name,
                **operator_run_kwargs,
            )
        )
    except (
        ge_exceptions.CheckpointError,
        ge_exceptions.ExecutionEngineError,
        ge_exceptions.MetricError,
    ) as e:
        # Chain explicitly so the original failure stays attached as the
        # cause of the wrapping error.
        raise ge_exceptions.CheckpointError(
            f"Exception occurred while running validation[{idx}] of Checkpoint '{self.name}': {e.message}."
        ) from e
def resolve_config_using_acceptable_arguments(
    checkpoint: "Checkpoint",  # noqa: F821
    template_name: Optional[str] = None,
    run_name_template: Optional[str] = None,
    expectation_suite_name: Optional[str] = None,
    batch_request: Optional[Union[BatchRequest, RuntimeBatchRequest, dict]] = None,
    action_list: Optional[List[dict]] = None,
    evaluation_parameters: Optional[dict] = None,
    runtime_configuration: Optional[dict] = None,
    validations: Optional[List[dict]] = None,
    profilers: Optional[List[dict]] = None,
    run_id: Optional[Union[str, RunIdentifier]] = None,
    run_name: Optional[str] = None,
    run_time: Optional[Union[str, datetime.datetime]] = None,
    result_format: Optional[Union[str, dict]] = None,
    expectation_suite_ge_cloud_id: Optional[str] = None,
) -> dict:
    """
    Reconcile the Checkpoint configuration (e.g., as obtained from the Checkpoint store) with the dynamically supplied
    arguments, producing the Checkpoint specification that is ready to have validation run on it.

    This step is necessary because the Checkpoint configuration is hierarchical: entities such as BatchRequest,
    expectation_suite_name, and action_list can be specified at the top Checkpoint level, with suitable overrides
    provided at lower levels (e.g., in the validations section).  Reconciling and normalizing the configuration is
    essential for usage statistics, which need the exact values of these entities in their formally validated form
    (e.g., BatchRequest).

    Returns the substituted configuration; each entry of its "validations" is updated in place with the resolved
    "batch_request", "expectation_suite_name", "expectation_suite_ge_cloud_id" and "action_list".

    Raises AssertionError if run_id is combined with run_name or run_time, and CheckpointError if the substituted
    configuration has neither validations nor a batch_request.
    """
    assert not (run_id and run_name) and not (
        run_id and run_time
    ), "Please provide either a run_id or run_name and/or run_time."

    run_time = run_time or datetime.datetime.now()
    runtime_configuration = runtime_configuration or {}

    batch_request = get_batch_request_as_dict(batch_request=batch_request)
    validations = get_validations_with_batch_request_as_dict(validations=validations)

    runtime_kwargs: dict = {
        "template_name": template_name,
        "run_name_template": run_name_template,
        "expectation_suite_name": expectation_suite_name,
        "batch_request": batch_request,
        "action_list": action_list,
        "evaluation_parameters": evaluation_parameters,
        "runtime_configuration": runtime_configuration,
        "validations": validations,
        "profilers": profilers,
        "expectation_suite_ge_cloud_id": expectation_suite_ge_cloud_id,
    }
    substituted_runtime_config: dict = checkpoint.get_substituted_config(
        runtime_kwargs=runtime_kwargs
    )

    run_name_template = substituted_runtime_config.get("run_name_template")
    validations = substituted_runtime_config.get("validations") or []
    batch_request = substituted_runtime_config.get("batch_request")
    if not (validations or batch_request):
        raise ge_exceptions.CheckpointError(
            f'Checkpoint "{checkpoint.name}" must contain either a batch_request or validations.'
        )

    # The run identifier is built the same way as for an actual run; it is
    # not used further in this function.
    if run_name is None and run_name_template is not None:
        run_name = get_datetime_string_from_strftime_format(
            format_str=run_name_template, datetime_obj=run_time
        )
    run_id = run_id or RunIdentifier(run_name=run_name, run_time=run_time)

    # Copy the resolved values back onto each validation entry; the entries
    # belong to the substituted configuration that is returned.
    resolved_fields = (
        "batch_request",
        "expectation_suite_name",
        "expectation_suite_ge_cloud_id",
        "action_list",
    )
    validation_dict: dict
    for validation_dict in validations:
        substituted_validation_dict: dict = get_substituted_validation_dict(
            substituted_runtime_config=substituted_runtime_config,
            validation_dict=validation_dict,
        )
        for field_name in resolved_fields:
            validation_dict[field_name] = substituted_validation_dict.get(field_name)

    return substituted_runtime_config