yaml_config = my_checkpoint_config.replace( "getting_started_datasource", GETTING_STARTED_DATASOURCE_NAME ) my_checkpoint_config = my_checkpoint_config.replace( "getting_started_expectation_suite_taxi.demo", GETTING_STARTED_EXPECTATION_SUITE_NAME, ) my_checkpoint_config = yaml.load(my_checkpoint_config) # NOTE: The following code (up to and including the assert) is only for testing and can be ignored by users. # In the current test, site_names are set to None because we do not want to update and build data_docs # If you would like to build data_docs then either remove `site_names=None` or pass in a list of site_names you would like to build the docs on. checkpoint = SimpleCheckpoint( **my_checkpoint_config, data_context=context, site_names=None ) checkpoint_result = checkpoint.run(site_names=None) assert checkpoint_result.run_results # Create second checkpoint on yellow_tripdata_sample_2019-02.csv # <snippet> yaml_config = f""" name: getting_started_checkpoint config_version: 1.0 class_name: SimpleCheckpoint run_name_template: "%Y%m%d-%H%M%S-my-run-name-template" validations: - batch_request: datasource_name: getting_started_datasource
def execute(self, **kwargs) -> Any:
    """Validate the task's single input dataset with Great Expectations.

    Accepts exactly one input — a ``FlyteFile``, ``FlyteSchema``, or ``str`` —
    builds a (Runtime)BatchRequest for it, runs a temporary
    ``SimpleCheckpoint`` against the configured expectation suite, and
    returns the JSON-serializable validation result.

    Raises:
        TypeError: if there is not exactly one input, or its type is not
            FlyteFile/FlyteSchema/str.
        ValueError: if the configured datasource does not exist, or a
            RuntimeDataConnector is used without ``data_asset_name``.
        AssertionError: if runtime parameters are requested for a datatype
            that supports neither a query string nor schema batch data.
        ValidationError: if the checkpoint run reports failed expectations.
    """
    # Load the Great Expectations data context from the configured root dir.
    context = ge.data_context.DataContext(
        self._context_root_dir)  # type: ignore
    # The task contract is exactly one input: the dataset to validate.
    if len(self.python_interface.inputs.keys()) != 1:
        raise TypeError(
            "Expected one input argument to validate the dataset")
    dataset_key = list(self.python_interface.inputs.keys())[0]
    dataset = kwargs[dataset_key]
    datatype = self.python_interface.inputs[dataset_key]
    if not issubclass(datatype, (FlyteFile, FlyteSchema, str)):
        raise TypeError(
            "'dataset' has to have FlyteFile/FlyteSchema/str datatype")
    # determine the type of data connector
    selected_datasource = list(
        filter(lambda x: x["name"] == self._datasource_name,
               context.list_datasources()))
    if not selected_datasource:
        raise ValueError("Datasource doesn't exist!")
    # Map each data connector name of the selected datasource to its class name.
    data_connector_class_lookup = {
        data_connector_name: data_connector_class["class_name"]
        for data_connector_name, data_connector_class in
        selected_datasource[0]["data_connectors"].items()
    }
    specified_data_connector_class = data_connector_class_lookup[
        self._data_connector_name]
    # A RuntimeDataConnector means the batch is supplied in-memory/by query,
    # so an explicit data_asset_name is mandatory.
    is_runtime = False
    if specified_data_connector_class == "RuntimeDataConnector":
        is_runtime = True
        if not self._data_asset_name:
            raise ValueError(
                "data_asset_name has to be given in a RuntimeBatchRequest")
    # FlyteFile
    if issubclass(datatype, FlyteFile):
        dataset = self._flyte_file(dataset)
    # FlyteSchema
    # convert schema to parquet file
    if issubclass(datatype, FlyteSchema) and not is_runtime:
        dataset = self._flyte_schema(dataset)
    # minimalistic batch request
    # For a runtime connector the asset name comes from config; otherwise the
    # (possibly converted) dataset path/name itself identifies the asset.
    final_batch_request = {
        "data_asset_name":
        self._data_asset_name if is_runtime else dataset,
        "datasource_name": self._datasource_name,
        "data_connector_name": self._data_connector_name,
    }
    # Great Expectations' RuntimeBatchRequest
    if self._batch_request_config and (
            self._batch_request_config.runtime_parameters or is_runtime):
        final_batch_request.update({
            "runtime_parameters":
            self._batch_request_config.runtime_parameters
            if self._batch_request_config.runtime_parameters else {},
            "batch_identifiers":
            self._batch_request_config.batch_identifiers,
            "batch_spec_passthrough":
            self._batch_request_config.batch_spec_passthrough,
        })
        # str inputs are treated as a SQL query; FlyteSchema inputs are
        # materialized as in-memory batch data.
        if is_runtime and issubclass(datatype, str):
            final_batch_request["runtime_parameters"]["query"] = dataset
        elif is_runtime and issubclass(datatype, FlyteSchema):
            final_batch_request["runtime_parameters"][
                "batch_data"] = dataset.open().all()
        else:
            # Reached when runtime_parameters were configured for a
            # non-runtime connector, or the datatype is a FlyteFile.
            raise AssertionError(
                "Can only use runtime_parameters for query(str)/schema data"
            )
    # Great Expectations' BatchRequest
    elif self._batch_request_config:
        final_batch_request.update({
            "data_connector_query":
            self._batch_request_config.data_connector_query,
            "batch_spec_passthrough":
            self._batch_request_config.batch_spec_passthrough,
        })
    # Build a throwaway checkpoint named after the expectation suite; extra
    # checkpoint parameters are forwarded when configured.
    if self._checkpoint_params:
        checkpoint = SimpleCheckpoint(
            f"_tmp_checkpoint_{self._expectation_suite_name}",
            context,
            **self._checkpoint_params,
        )
    else:
        checkpoint = SimpleCheckpoint(
            f"_tmp_checkpoint_{self._expectation_suite_name}",
            context,
        )
    # identify every run uniquely
    run_id = RunIdentifier(
        **{
            "run_name": self._datasource_name + "_run",
            "run_time": datetime.datetime.utcnow(),
        })
    checkpoint_result = checkpoint.run(
        run_id=run_id,
        validations=[{
            "batch_request": final_batch_request,
            "expectation_suite_name": self._expectation_suite_name,
        }],
    )
    # Only one validation is submitted, so only the first result matters.
    final_result = convert_to_json_serializable(
        checkpoint_result.list_validation_results())[0]
    result_string = ""
    if final_result["success"] is False:
        # Collect "<column> -> <expectation_type>" lines for every failure.
        for every_result in final_result["results"]:
            if every_result["success"] is False:
                result_string += (
                    every_result["expectation_config"]["kwargs"]["column"]
                    + " -> " +
                    every_result["expectation_config"]["expectation_type"]
                    + "\n")
        # raise a Great Expectations' exception
        raise ValidationError(
            "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" +
            result_string)
    logger.info("Validation succeeded!")
    return final_result
def to_python_value(
    self,
    ctx: FlyteContext,
    lv: Literal,
    expected_python_type: Type[GreatExpectationsType],
) -> GreatExpectationsType:
    """Convert a Flyte literal to its Python value, validating it first.

    Supports literals carrying a string primitive, a FlyteSchema (schema or
    structured dataset), or a FlyteFile (blob). The Great Expectations
    configuration is read from the annotated type, a (Runtime)BatchRequest
    is built, and a temporary ``SimpleCheckpoint`` runs the expectation
    suite. On success the downloaded/decoded dataset is returned.

    Raises:
        AssertionError: if the literal is not a string/FlyteFile/FlyteSchema,
            or runtime parameters are used with an unsupported datatype.
        ValueError: if the datasource does not exist, or a
            RuntimeDataConnector is used without ``data_asset_name``.
        ValidationError: if the checkpoint run reports failed expectations.
    """
    # Guard: only string primitives, schemas, blobs, or structured datasets
    # are accepted.
    if not (lv and lv.scalar and
            ((lv.scalar.primitive and lv.scalar.primitive.string_value)
             or lv.scalar.schema or lv.scalar.blob
             or lv.scalar.structured_dataset)):
        raise AssertionError(
            "Can only validate a literal string/FlyteFile/FlyteSchema value"
        )
    # fetch the configuration
    # type_conf[0] is the underlying python type, type_conf[1] the GE config.
    type_conf = GreatExpectationsTypeTransformer.get_config(
        expected_python_type)
    conf_dict = type_conf[1].to_dict()  # type: ignore
    ge_conf = GreatExpectationsFlyteConfig(**conf_dict)
    # fetch the data context
    context = ge.data_context.DataContext(
        ge_conf.context_root_dir)  # type: ignore
    # determine the type of data connector
    selected_datasource = list(
        filter(lambda x: x["name"] == ge_conf.datasource_name,
               context.list_datasources()))
    if not selected_datasource:
        raise ValueError("Datasource doesn't exist!")
    # Map each data connector name of the selected datasource to its class name.
    data_connector_class_lookup = {
        data_connector_name: data_connector_class["class_name"]
        for data_connector_name, data_connector_class in
        selected_datasource[0]["data_connectors"].items()
    }
    specified_data_connector_class = data_connector_class_lookup[
        ge_conf.data_connector_name]
    # A RuntimeDataConnector requires an explicit data_asset_name in config.
    is_runtime = False
    if specified_data_connector_class == "RuntimeDataConnector":
        is_runtime = True
        if not ge_conf.data_asset_name:
            raise ValueError(
                "data_asset_name has to be given in a RuntimeBatchRequest")
    # file path for FlyteSchema and FlyteFile
    temp_dataset = ""
    # return value
    return_dataset = ""
    # FlyteSchema
    if lv.scalar.schema or lv.scalar.structured_dataset:
        return_dataset, temp_dataset = self._flyte_schema(
            is_runtime=is_runtime,
            ctx=ctx,
            ge_conf=ge_conf,
            lv=lv,
            expected_python_type=type_conf[0])
    # FlyteFile
    if lv.scalar.blob:
        return_dataset, temp_dataset = self._flyte_file(
            ctx=ctx,
            ge_conf=ge_conf,
            lv=lv,
            expected_python_type=type_conf[0])
    # For string primitives the value itself is both the dataset identifier
    # and the return value; otherwise use the temp path produced above.
    if lv.scalar.primitive:
        dataset = return_dataset = lv.scalar.primitive.string_value
    else:
        dataset = temp_dataset
    # NOTE(review): unlike `execute`, the batch request config is accessed
    # here as a mapping (["..."]), not via attributes — presumably a dict
    # at this point; confirm against GreatExpectationsFlyteConfig.
    batch_request_conf = ge_conf.batch_request_config
    # minimalistic batch request
    final_batch_request = {
        "data_asset_name":
        ge_conf.data_asset_name if is_runtime else dataset,
        "datasource_name": ge_conf.datasource_name,
        "data_connector_name": ge_conf.data_connector_name,
    }
    # Great Expectations' RuntimeBatchRequest
    if batch_request_conf and (batch_request_conf["runtime_parameters"]
                               or is_runtime):
        final_batch_request.update({
            "runtime_parameters":
            batch_request_conf["runtime_parameters"]
            if batch_request_conf["runtime_parameters"] else {},
            "batch_identifiers":
            batch_request_conf["batch_identifiers"],
            "batch_spec_passthrough":
            batch_request_conf["batch_spec_passthrough"],
        })
        # String literals become a SQL query; schemas become in-memory
        # batch data read from the downloaded dataset.
        if is_runtime and lv.scalar.primitive:
            final_batch_request["runtime_parameters"]["query"] = dataset
        elif is_runtime and (lv.scalar.schema
                             or lv.scalar.structured_dataset):
            final_batch_request["runtime_parameters"][
                "batch_data"] = return_dataset.open().all()
        else:
            # Reached when runtime_parameters were configured for a
            # non-runtime connector, or the literal is a blob (FlyteFile).
            raise AssertionError(
                "Can only use runtime_parameters for query(str)/schema data"
            )
    # Great Expectations' BatchRequest
    elif batch_request_conf:
        final_batch_request.update({
            "data_connector_query":
            batch_request_conf["data_connector_query"],
            "batch_spec_passthrough":
            batch_request_conf["batch_spec_passthrough"],
        })
    # Build a throwaway checkpoint named after the expectation suite.
    if ge_conf.checkpoint_params:
        checkpoint = SimpleCheckpoint(
            f"_tmp_checkpoint_{ge_conf.expectation_suite_name}",
            context,
            **ge_conf.checkpoint_params,
        )
    else:
        checkpoint = SimpleCheckpoint(
            f"_tmp_checkpoint_{ge_conf.expectation_suite_name}", context)
    # identify every run uniquely
    run_id = RunIdentifier(
        **{
            "run_name": ge_conf.datasource_name + "_run",
            "run_time": datetime.datetime.utcnow(),
        })
    checkpoint_result = checkpoint.run(
        run_id=run_id,
        validations=[{
            "batch_request": final_batch_request,
            "expectation_suite_name": ge_conf.expectation_suite_name,
        }],
    )
    # Only one validation is submitted, so only the first result matters.
    final_result = convert_to_json_serializable(
        checkpoint_result.list_validation_results())[0]
    result_string = ""
    if final_result["success"] is False:
        # Collect "<column> -> <expectation_type>" lines for every failure.
        for every_result in final_result["results"]:
            if every_result["success"] is False:
                result_string += (
                    every_result["expectation_config"]["kwargs"]["column"]
                    + " -> " +
                    every_result["expectation_config"]["expectation_type"]
                    + "\n")
        # raise a Great Expectations' exception
        raise ValidationError(
            "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" +
            result_string)
    logger.info("Validation succeeded!")
    # Hand back the validated dataset (str value, schema, or file), cast to
    # the expected GreatExpectationsType.
    return typing.cast(GreatExpectationsType, return_dataset)
# Use a SimpleCheckpoint to verify that your new Expectation Suite works. # <snippet> checkpoint_config = { "class_name": "SimpleCheckpoint", "validations": [{ "batch_request": multi_batch_all_years_batch_request, "expectation_suite_name": expectation_suite_name, }], } # </snippet> # <snippet> checkpoint = SimpleCheckpoint( f"{validator.active_batch_definition.data_asset_name}_{expectation_suite_name}", context, **checkpoint_config, ) checkpoint_result = checkpoint.run() assert checkpoint_result["success"] is True # </snippet> # If you are using code from this script as part of a Jupyter Notebook, uncommenting and running the # following lines will open your Data Docs for the `checkpoint`'s results: # context.build_data_docs() # validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0] # context.open_data_docs(resource_identifier=validation_result_identifier)