# Substitute the tutorial's placeholder names with the test-suite constants
# before parsing the checkpoint configuration.
# BUG FIX: the first replace() result was previously assigned to `yaml_config`
# and discarded (yaml_config is re-bound further down before any use), so the
# datasource-name substitution never reached yaml.load(). Both substitutions
# must chain on `my_checkpoint_config`.
my_checkpoint_config = my_checkpoint_config.replace(
    "getting_started_datasource", GETTING_STARTED_DATASOURCE_NAME
)
my_checkpoint_config = my_checkpoint_config.replace(
    "getting_started_expectation_suite_taxi.demo",
    GETTING_STARTED_EXPECTATION_SUITE_NAME,
)


# Parse the YAML string into a dict so it can be splatted into SimpleCheckpoint.
my_checkpoint_config = yaml.load(my_checkpoint_config)

# NOTE: The following code (up to and including the assert) is only for testing and can be ignored by users.
# In the current test, site_names are set to None because we do not want to update and build data_docs
# If you would like to build data_docs then either remove `site_names=None` or pass in a list of site_names you would like to build the docs on.
checkpoint = SimpleCheckpoint(
    **my_checkpoint_config, data_context=context, site_names=None
)
checkpoint_result = checkpoint.run(site_names=None)
assert checkpoint_result.run_results


# Create second checkpoint on yellow_tripdata_sample_2019-02.csv
# <snippet>
yaml_config = f"""
name: getting_started_checkpoint
config_version: 1.0
class_name: SimpleCheckpoint
run_name_template: "%Y%m%d-%H%M%S-my-run-name-template"
validations:
  - batch_request:
      datasource_name: getting_started_datasource
# ---- Example #2 ----
    def execute(self, **kwargs) -> Any:
        """Validate the single input dataset with Great Expectations.

        Expects exactly one input declared on ``self.python_interface``; its
        value must be a FlyteFile, FlyteSchema, or str. Builds a
        (Runtime)BatchRequest for the configured datasource/data connector,
        runs it through a temporary SimpleCheckpoint, and returns the first
        validation result as a JSON-serializable dict.

        Raises:
            TypeError: not exactly one input, or an unsupported datatype.
            ValueError: the configured datasource does not exist, or a
                RuntimeDataConnector is used without a data_asset_name.
            AssertionError: runtime_parameters requested for a datatype other
                than str (query) or FlyteSchema (in-memory batch).
            ValidationError: one or more expectations failed.
        """
        context = ge.data_context.DataContext(
            self._context_root_dir)  # type: ignore

        # exactly one task input is expected: the dataset to validate
        if len(self.python_interface.inputs.keys()) != 1:
            raise TypeError(
                "Expected one input argument to validate the dataset")

        dataset_key = list(self.python_interface.inputs.keys())[0]
        dataset = kwargs[dataset_key]
        datatype = self.python_interface.inputs[dataset_key]

        if not issubclass(datatype, (FlyteFile, FlyteSchema, str)):
            raise TypeError(
                "'dataset' has to have FlyteFile/FlyteSchema/str datatype")

        # determine the type of data connector
        selected_datasource = list(
            filter(lambda x: x["name"] == self._datasource_name,
                   context.list_datasources()))

        if not selected_datasource:
            raise ValueError("Datasource doesn't exist!")

        # map each data connector name -> its class_name for the chosen datasource
        data_connector_class_lookup = {
            data_connector_name: data_connector_class["class_name"]
            for data_connector_name, data_connector_class in
            selected_datasource[0]["data_connectors"].items()
        }

        specified_data_connector_class = data_connector_class_lookup[
            self._data_connector_name]

        # a RuntimeDataConnector implies a RuntimeBatchRequest, which needs an
        # explicit data_asset_name (it cannot be derived from the dataset)
        is_runtime = False
        if specified_data_connector_class == "RuntimeDataConnector":
            is_runtime = True
            if not self._data_asset_name:
                raise ValueError(
                    "data_asset_name has to be given in a RuntimeBatchRequest")

        # FlyteFile
        if issubclass(datatype, FlyteFile):
            dataset = self._flyte_file(dataset)

        # FlyteSchema
        # convert schema to parquet file
        if issubclass(datatype, FlyteSchema) and not is_runtime:
            dataset = self._flyte_schema(dataset)

        # minimalistic batch request
        final_batch_request = {
            "data_asset_name":
            self._data_asset_name if is_runtime else dataset,
            "datasource_name": self._datasource_name,
            "data_connector_name": self._data_connector_name,
        }

        # Great Expectations' RuntimeBatchRequest
        if self._batch_request_config and (
                self._batch_request_config.runtime_parameters or is_runtime):
            final_batch_request.update({
                "runtime_parameters":
                self._batch_request_config.runtime_parameters
                if self._batch_request_config.runtime_parameters else {},
                "batch_identifiers":
                self._batch_request_config.batch_identifiers,
                "batch_spec_passthrough":
                self._batch_request_config.batch_spec_passthrough,
            })

            # a str dataset is treated as a SQL query; a FlyteSchema is passed
            # in-memory as batch_data; anything else cannot be a runtime batch
            if is_runtime and issubclass(datatype, str):
                final_batch_request["runtime_parameters"]["query"] = dataset
            elif is_runtime and issubclass(datatype, FlyteSchema):
                final_batch_request["runtime_parameters"][
                    "batch_data"] = dataset.open().all()
            else:
                raise AssertionError(
                    "Can only use runtime_parameters for query(str)/schema data"
                )

        # Great Expectations' BatchRequest
        elif self._batch_request_config:
            final_batch_request.update({
                "data_connector_query":
                self._batch_request_config.data_connector_query,
                "batch_spec_passthrough":
                self._batch_request_config.batch_spec_passthrough,
            })

        # a throwaway checkpoint is built per run; user-supplied
        # checkpoint_params are forwarded when present
        if self._checkpoint_params:
            checkpoint = SimpleCheckpoint(
                f"_tmp_checkpoint_{self._expectation_suite_name}",
                context,
                **self._checkpoint_params,
            )
        else:
            checkpoint = SimpleCheckpoint(
                f"_tmp_checkpoint_{self._expectation_suite_name}",
                context,
            )

        # identify every run uniquely
        run_id = RunIdentifier(
            **{
                "run_name": self._datasource_name + "_run",
                "run_time": datetime.datetime.utcnow(),
            })

        checkpoint_result = checkpoint.run(
            run_id=run_id,
            validations=[{
                "batch_request":
                final_batch_request,
                "expectation_suite_name":
                self._expectation_suite_name,
            }],
        )
        # only the first validation result is returned to the caller
        final_result = convert_to_json_serializable(
            checkpoint_result.list_validation_results())[0]

        # summarize failed expectations as "column -> expectation_type" lines
        result_string = ""
        if final_result["success"] is False:
            for every_result in final_result["results"]:
                if every_result["success"] is False:
                    result_string += (
                        every_result["expectation_config"]["kwargs"]["column"]
                        + " -> " +
                        every_result["expectation_config"]["expectation_type"]
                        + "\n")

            # raise a Great Expectations' exception
            raise ValidationError(
                "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" +
                result_string)

        logger.info("Validation succeeded!")

        return final_result
# ---- Example #3 ----
    def to_python_value(
        self,
        ctx: FlyteContext,
        lv: Literal,
        expected_python_type: Type[GreatExpectationsType],
    ) -> GreatExpectationsType:
        """Convert a literal to a GreatExpectationsType, validating it first.

        Accepts a literal string (query/path), FlyteFile (blob), or
        FlyteSchema (schema/structured dataset). Builds a
        (Runtime)BatchRequest from the GreatExpectationsFlyteConfig attached
        to ``expected_python_type``, runs it through a temporary
        SimpleCheckpoint, and returns the dataset on success.

        Raises:
            AssertionError: the literal is not a string/FlyteFile/FlyteSchema,
                or runtime_parameters are requested for an unsupported type.
            ValueError: the configured datasource does not exist, or a
                RuntimeDataConnector is used without a data_asset_name.
            ValidationError: one or more expectations failed.
        """
        if not (lv and lv.scalar and
                ((lv.scalar.primitive and lv.scalar.primitive.string_value)
                 or lv.scalar.schema or lv.scalar.blob
                 or lv.scalar.structured_dataset)):
            raise AssertionError(
                "Can only validate a literal string/FlyteFile/FlyteSchema value"
            )

        # fetch the configuration
        type_conf = GreatExpectationsTypeTransformer.get_config(
            expected_python_type)
        conf_dict = type_conf[1].to_dict()  # type: ignore

        ge_conf = GreatExpectationsFlyteConfig(**conf_dict)

        # fetch the data context
        context = ge.data_context.DataContext(
            ge_conf.context_root_dir)  # type: ignore

        # determine the type of data connector
        selected_datasource = list(
            filter(lambda x: x["name"] == ge_conf.datasource_name,
                   context.list_datasources()))

        if not selected_datasource:
            raise ValueError("Datasource doesn't exist!")

        # map each data connector name -> its class_name for the chosen datasource
        data_connector_class_lookup = {
            data_connector_name: data_connector_class["class_name"]
            for data_connector_name, data_connector_class in
            selected_datasource[0]["data_connectors"].items()
        }

        specified_data_connector_class = data_connector_class_lookup[
            ge_conf.data_connector_name]

        # a RuntimeDataConnector implies a RuntimeBatchRequest, which needs an
        # explicit data_asset_name (it cannot be derived from the dataset)
        is_runtime = False
        if specified_data_connector_class == "RuntimeDataConnector":
            is_runtime = True
            if not ge_conf.data_asset_name:
                raise ValueError(
                    "data_asset_name has to be given in a RuntimeBatchRequest")

        # file path for FlyteSchema and FlyteFile
        temp_dataset = ""

        # return value
        return_dataset = ""

        # FlyteSchema
        if lv.scalar.schema or lv.scalar.structured_dataset:
            return_dataset, temp_dataset = self._flyte_schema(
                is_runtime=is_runtime,
                ctx=ctx,
                ge_conf=ge_conf,
                lv=lv,
                expected_python_type=type_conf[0])

        # FlyteFile
        if lv.scalar.blob:
            return_dataset, temp_dataset = self._flyte_file(
                ctx=ctx,
                ge_conf=ge_conf,
                lv=lv,
                expected_python_type=type_conf[0])

        # a literal string is both the dataset name and the return value;
        # otherwise validate against the materialized temp path
        if lv.scalar.primitive:
            dataset = return_dataset = lv.scalar.primitive.string_value
        else:
            dataset = temp_dataset

        batch_request_conf = ge_conf.batch_request_config

        # minimalistic batch request
        final_batch_request = {
            "data_asset_name":
            ge_conf.data_asset_name if is_runtime else dataset,
            "datasource_name": ge_conf.datasource_name,
            "data_connector_name": ge_conf.data_connector_name,
        }

        # Great Expectations' RuntimeBatchRequest
        if batch_request_conf and (batch_request_conf["runtime_parameters"]
                                   or is_runtime):
            final_batch_request.update({
                "runtime_parameters":
                batch_request_conf["runtime_parameters"]
                if batch_request_conf["runtime_parameters"] else {},
                "batch_identifiers":
                batch_request_conf["batch_identifiers"],
                "batch_spec_passthrough":
                batch_request_conf["batch_spec_passthrough"],
            })

            # a primitive string is treated as a SQL query; a schema is passed
            # in-memory as batch_data
            if is_runtime and lv.scalar.primitive:
                final_batch_request["runtime_parameters"]["query"] = dataset
            elif is_runtime and (lv.scalar.schema
                                 or lv.scalar.structured_dataset):
                # NOTE(review): this branch reads return_dataset while the
                # query branch uses dataset — confirm that asymmetry is intended
                final_batch_request["runtime_parameters"][
                    "batch_data"] = return_dataset.open().all()
            else:
                raise AssertionError(
                    "Can only use runtime_parameters for query(str)/schema data"
                )

        # Great Expectations' BatchRequest
        elif batch_request_conf:
            final_batch_request.update({
                "data_connector_query":
                batch_request_conf["data_connector_query"],
                "batch_spec_passthrough":
                batch_request_conf["batch_spec_passthrough"],
            })

        # a throwaway checkpoint is built per conversion; user-supplied
        # checkpoint_params are forwarded when present
        if ge_conf.checkpoint_params:
            checkpoint = SimpleCheckpoint(
                f"_tmp_checkpoint_{ge_conf.expectation_suite_name}",
                context,
                **ge_conf.checkpoint_params,
            )
        else:
            checkpoint = SimpleCheckpoint(
                f"_tmp_checkpoint_{ge_conf.expectation_suite_name}", context)

        # identify every run uniquely
        run_id = RunIdentifier(
            **{
                "run_name": ge_conf.datasource_name + "_run",
                "run_time": datetime.datetime.utcnow(),
            })

        checkpoint_result = checkpoint.run(
            run_id=run_id,
            validations=[{
                "batch_request":
                final_batch_request,
                "expectation_suite_name":
                ge_conf.expectation_suite_name,
            }],
        )
        # only the first validation result is inspected
        final_result = convert_to_json_serializable(
            checkpoint_result.list_validation_results())[0]

        # summarize failed expectations as "column -> expectation_type" lines
        result_string = ""
        if final_result["success"] is False:
            for every_result in final_result["results"]:
                if every_result["success"] is False:
                    result_string += (
                        every_result["expectation_config"]["kwargs"]["column"]
                        + " -> " +
                        every_result["expectation_config"]["expectation_type"]
                        + "\n")

            # raise a Great Expectations' exception
            raise ValidationError(
                "Validation failed!\nCOLUMN\t\tFAILED EXPECTATION\n" +
                result_string)

        logger.info("Validation succeeded!")

        return typing.cast(GreatExpectationsType, return_dataset)
# ---- Example #4 ----
# Use a SimpleCheckpoint to verify that your new Expectation Suite works.

# <snippet>
# Checkpoint configuration: validate the multi-batch request against the
# expectation suite created above.
checkpoint_config = {
    "class_name": "SimpleCheckpoint",
    "validations": [
        {
            "batch_request": multi_batch_all_years_batch_request,
            "expectation_suite_name": expectation_suite_name,
        }
    ],
}
# </snippet>

# <snippet>
# Name the checkpoint after the active data asset and suite, then run it.
checkpoint_name = (
    f"{validator.active_batch_definition.data_asset_name}"
    f"_{expectation_suite_name}"
)
checkpoint = SimpleCheckpoint(checkpoint_name, context, **checkpoint_config)
checkpoint_result = checkpoint.run()

assert checkpoint_result["success"] is True
# </snippet>

# If you are using code from this script as part of a Jupyter Notebook, uncommenting and running the
# following lines will open your Data Docs for the `checkpoint`'s results:

# context.build_data_docs()
# validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0]
# context.open_data_docs(resource_identifier=validation_result_identifier)