    def _get_reader_fn(self, reader_method=None, path=None):
        """Static helper for parsing reader types. If reader_method is not provided, path will be used to guess the
        correct reader_method.

        Args:
            reader_method (str): the name of the reader method to use, if available.
            path (str): the path used to guess

        Returns:
            ReaderMethod to use for the filepath

        """
        if reader_method is None and path is None:
            raise ge_exceptions.ExecutionEngineError(
                "Unable to determine pandas reader function without reader_method or path."
            )

        reader_options = {}
        if reader_method is None:
            path_guess = self.guess_reader_method_from_path(path)
            reader_method = path_guess["reader_method"]
            reader_options = path_guess.get(
                "reader_options"
            )  # may be absent; .get() returns None in that case

        try:
            reader_fn = getattr(pd, reader_method)
            if reader_options:
                reader_fn = partial(reader_fn, **reader_options)
            return reader_fn
        except AttributeError:
            raise ge_exceptions.ExecutionEngineError(
                f'Unable to find reader_method "{reader_method}" in pandas.')
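
# A minimal sketch of the same dispatch pattern using only pandas; the file
# path shown is a hypothetical placeholder:
import pandas as pd
from functools import partial

reader_fn = getattr(pd, "read_csv")                 # resolve the reader by name
reader_fn = partial(reader_fn, compression="gzip")  # bind inferred reader_options
# df = reader_fn("data/events.csv.gz")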
Example #2
    def __init__(
        self,
        name: str,
        execution_engine=None,
        data_context_root_directory: Optional[str] = None,
    ):
        """
        Build a new Datasource.

        Args:
            name: the name for the datasource
            execution_engine (ClassConfig): configuration for the execution engine to instantiate
            data_context_root_directory: Installation directory path (if installed on a filesystem).
        """
        self._name = name

        self._data_context_root_directory = data_context_root_directory

        try:
            self._execution_engine = instantiate_class_from_config(
                config=execution_engine,
                runtime_environment={},
                config_defaults={
                    "module_name": "great_expectations.execution_engine"
                },
            )
            self._datasource_config = {
                "execution_engine": execution_engine,
            }
        except Exception as e:
            raise ge_exceptions.ExecutionEngineError(message=str(e))

        self._data_connectors = {}
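
# Self-contained analogue of instantiate_class_from_config, assuming the config
# dict carries "class_name", an optional "module_name", and constructor kwargs
# (an illustration, not the Great Expectations implementation):
import importlib

def _instantiate(config: dict, default_module: str):
    module = importlib.import_module(config.get("module_name", default_module))
    cls = getattr(module, config["class_name"])
    kwargs = {k: v for k, v in config.items() if k not in ("class_name", "module_name")}
    return cls(**kwargs)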
    @staticmethod
    def guess_reader_method_from_path(path):
        """Helper method for deciding which reader to use to read in a certain path.

        Args:
            path (str): the path used to guess the reader method.

        Returns:
            ReaderMethod to use for the filepath

        """
        if path.endswith(".csv") or path.endswith(".tsv"):
            return {"reader_method": "read_csv"}
        elif path.endswith(".parquet"):
            return {"reader_method": "read_parquet"}
        elif path.endswith(".xlsx") or path.endswith(".xls"):
            return {"reader_method": "read_excel"}
        elif path.endswith(".json"):
            return {"reader_method": "read_json"}
        elif path.endswith(".pkl"):
            return {"reader_method": "read_pickle"}
        elif path.endswith(".feather"):
            return {"reader_method": "read_feather"}
        elif path.endswith(".csv.gz") or path.endswith(".tsv.gz"):
            return {
                "reader_method": "read_csv",
                "reader_options": {
                    "compression": "gzip"
                },
            }
        elif path.endswith(".sas7bdat") or path.endswith(".xpt"):
            return {"reader_method": "read_sas"}

        else:
            raise ge_exceptions.ExecutionEngineError(
                f'Unable to determine reader method from path: "{path}".')
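
# Illustrative expected results for the path-based guesses above:
# guess_reader_method_from_path("events.csv")
#   -> {"reader_method": "read_csv"}
# guess_reader_method_from_path("events.csv.gz")
#   -> {"reader_method": "read_csv", "reader_options": {"compression": "gzip"}}
# guess_reader_method_from_path("events.sas7bdat")
#   -> {"reader_method": "read_sas"}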
Example #4
            def inner_func(
                cls,
                execution_engine: PandasExecutionEngine,
                metric_domain_kwargs: Dict,
                metric_value_kwargs: Dict,
                metrics: Dict[str, Any],
                runtime_configuration: Dict,
            ):
                filter_column_isnull = kwargs.get(
                    "filter_column_isnull",
                    getattr(cls, "filter_column_isnull", False))

                df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
                    domain_kwargs=metric_domain_kwargs,
                    domain_type=domain_type)

                column_name = accessor_domain_kwargs["column"]

                if column_name not in metrics["table.columns"]:
                    raise ge_exceptions.ExecutionEngineError(
                        message=f'Error: The column "{column_name}" in BatchData does not exist.'
                    )

                if filter_column_isnull:
                    df = df[df[column_name].notnull()]

                return metric_fn(
                    cls,
                    column=df[column_name],
                    **metric_value_kwargs,
                    _metrics=metrics,
                )
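
# Hedged sketch of the enclosing closure that supplies metric_fn, domain_type,
# and kwargs to the inner_func above; the decorator name and structure here are
# illustrative, not the Great Expectations source:
def column_aggregate_metric(domain_type, **kwargs):
    def decorator(metric_fn):
        def inner_func(cls, execution_engine, metric_domain_kwargs,
                       metric_value_kwargs, metrics, runtime_configuration):
            ...  # body as in the snippet, closing over metric_fn / domain_type / kwargs
        return inner_func
    return decorator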
    def active_batch_data_id(self, batch_id) -> None:
        if batch_id in self.loaded_batch_data_dict:
            self._active_batch_data_id = batch_id
        else:
            raise ge_exceptions.ExecutionEngineError(
                f"Unable to set active_batch_data_id to {batch_id}. The batch data may not be loaded."
            )
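
# Usage sketch, assuming the method above is wired up as a property setter and
# that a batch with id "b1" has already been loaded:
# engine.active_batch_data_id = "b1"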
    def sample_using_hash(
        self,
        df: DataFrame,
        batch_spec: BatchSpec,
    ) -> DataFrame:
        """Hash the values in the named column, and only keep rows that match the given hash_value.

        Args:
            df: dataframe to sample
            batch_spec: should contain keys `column_name` and optionally `hash_digits`
                (default is 1 if not provided), `hash_value` (default is "f" if not provided),
                and `hash_function_name` (default is "md5" if not provided)

        Returns:
            Sampled dataframe

        Raises:
            SamplerError
        """
        self.verify_batch_spec_sampling_kwargs_exists(batch_spec)
        self.verify_batch_spec_sampling_kwargs_key_exists(
            "column_name", batch_spec)
        column_name: str = self.get_sampling_kwargs_value_or_default(
            batch_spec, "column_name")
        hash_digits: int = self.get_sampling_kwargs_value_or_default(
            batch_spec=batch_spec,
            sampling_kwargs_key="hash_digits",
            default_value=1)
        hash_value: str = self.get_sampling_kwargs_value_or_default(
            batch_spec=batch_spec,
            sampling_kwargs_key="hash_value",
            default_value="f")

        hash_function_name: str = self.get_sampling_kwargs_value_or_default(
            batch_spec=batch_spec,
            sampling_kwargs_key="hash_function_name",
            default_value="md5",
        )

        try:
            getattr(hashlib, str(hash_function_name))
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The sampling method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")

        def _encrypt_value(to_encode):
            to_encode_str = str(to_encode)
            hash_func = getattr(hashlib, hash_function_name)
            hashed_value = hash_func(
                to_encode_str.encode()).hexdigest()[-1 * hash_digits:]
            return hashed_value

        encrypt_udf = F.udf(_encrypt_value, sparktypes.StringType())
        res = (
            df.withColumn("encrypted_value", encrypt_udf(column_name))
            .filter(F.col("encrypted_value") == hash_value)
            .drop("encrypted_value")
        )
        return res
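
# Self-contained illustration of the per-row predicate the UDF above applies:
import hashlib

suffix = hashlib.md5(str(12345).encode()).hexdigest()[-1:]  # last hex digit
keep_row = suffix == "f"  # the row survives only when the suffix equals hash_value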
    def sample_using_hash(
        self,
        df: pd.DataFrame,
        batch_spec: BatchSpec,
    ) -> pd.DataFrame:
        """Hash the values in the named column, and only keep rows that match the given hash_value.

        Args:
            df: dataframe to sample
            batch_spec: should contain keys `column_name` and optionally `hash_digits`
                (default is 1 if not provided), `hash_value` (default is "f" if not provided),
                and `hash_function_name` (default is "md5" if not provided)

        Returns:
            Sampled dataframe

        Raises:
            SamplerError
        """
        self.verify_batch_spec_sampling_kwargs_exists(batch_spec)
        self.verify_batch_spec_sampling_kwargs_key_exists(
            "column_name", batch_spec)
        column_name: str = self.get_sampling_kwargs_value_or_default(
            batch_spec, "column_name")
        hash_digits: int = self.get_sampling_kwargs_value_or_default(
            batch_spec=batch_spec,
            sampling_kwargs_key="hash_digits",
            default_value=1)
        hash_value: str = self.get_sampling_kwargs_value_or_default(
            batch_spec=batch_spec,
            sampling_kwargs_key="hash_value",
            default_value="f")

        hash_function_name: str = self.get_sampling_kwargs_value_or_default(
            batch_spec=batch_spec,
            sampling_kwargs_key="hash_function_name",
            default_value="md5",
        )

        try:
            hash_func = getattr(hashlib, hash_function_name)
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The sampling method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")

        matches = df[column_name].map(
            lambda x: hash_func(str(x).encode()).hexdigest()[-1 * hash_digits:]
            == hash_value)
        return df[matches]
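
# Toy demonstration of the pandas variant with its defaults (md5, one hex
# digit, hash_value "f"); deterministically keeps roughly 1/16 of the rows:
import hashlib
import pandas as pd

toy = pd.DataFrame({"id": range(100)})
mask = toy["id"].map(lambda x: hashlib.md5(str(x).encode()).hexdigest()[-1:] == "f")
sampled = toy[mask]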
Example #8
            def inner_func(
                cls,
                execution_engine: SqlAlchemyExecutionEngine,
                metric_domain_kwargs: Dict,
                metric_value_kwargs: Dict,
                metrics: Dict[str, Any],
                runtime_configuration: Dict,
            ):
                filter_column_isnull = kwargs.get(
                    "filter_column_isnull",
                    getattr(cls, "filter_column_isnull", False))
                if filter_column_isnull:
                    compute_domain_kwargs = execution_engine.add_column_row_condition(
                        metric_domain_kwargs)
                else:
                    # We do not copy here because if compute domain is different, it will be copied by get_compute_domain
                    compute_domain_kwargs = metric_domain_kwargs
                (
                    selectable,
                    compute_domain_kwargs,
                    accessor_domain_kwargs,
                ) = execution_engine.get_compute_domain(
                    compute_domain_kwargs, domain_type=domain_type)

                column_name: str = accessor_domain_kwargs["column"]

                sqlalchemy_engine: sa.engine.Engine = execution_engine.engine

                if column_name not in metrics["table.columns"]:
                    raise ge_exceptions.ExecutionEngineError(
                        message=f'Error: The column "{column_name}" in BatchData does not exist.'
                    )

                dialect = sqlalchemy_engine.dialect
                metric_aggregate = metric_fn(
                    cls,
                    column=sa.column(column_name),
                    **metric_value_kwargs,
                    _dialect=dialect,
                    _table=selectable,
                    _column_name=column_name,
                    _sqlalchemy_engine=sqlalchemy_engine,
                    _metrics=metrics,
                )
                return metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs
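
# Hedged sketch of a metric_fn compatible with the SQLAlchemy inner_func above;
# it must return a SQLAlchemy aggregate expression over the supplied column
# (the function name is illustrative):
import sqlalchemy as sa

def _sqlalchemy_mean(cls, column, **kwargs):
    return sa.func.avg(column)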
Example #9
    def _sample_using_hash(
        df,
        column_name: str,
        hash_digits: int = 1,
        hash_value: str = "f",
        hash_function_name: str = "md5",
    ):
        """Hash the values in the named column, and split on that"""
        try:
            hash_func = getattr(hashlib, hash_function_name)
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The sampling method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")

        matches = df[column_name].map(
            lambda x: hash_func(str(x).encode()).hexdigest()[-1 * hash_digits:]
            == hash_value)
        return df[matches]
Example #10
    def _split_on_hashed_column(
        df,
        column_name: str,
        hash_digits: int,
        batch_identifiers: dict,
        hash_function_name: str = "md5",
    ):
        """Split on the hashed value of the named column"""
        try:
            hash_method = getattr(hashlib, hash_function_name)
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The splitting method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")
        matching_rows = df[column_name].map(
            lambda x: hash_method(str(x).encode()).hexdigest()[-1 * hash_digits:]
            == batch_identifiers["hash_value"])
        return df[matching_rows]
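
# Hedged usage sketch: splitting selects exactly the partition whose hash
# suffix matches the requested batch identifier (values are illustrative):
# partition = _split_on_hashed_column(
#     df, column_name="user_id", hash_digits=1,
#     batch_identifiers={"hash_value": "a"})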
            def inner_func(
                cls,
                execution_engine: "SparkDFExecutionEngine",
                metric_domain_kwargs: Dict,
                metric_value_kwargs: Dict,
                metrics: Dict[Tuple, Any],
                runtime_configuration: Dict,
            ):
                filter_column_isnull = kwargs.get(
                    "filter_column_isnull", getattr(cls, "filter_column_isnull", False)
                )

                if filter_column_isnull:
                    compute_domain_kwargs = execution_engine.add_column_row_condition(
                        metric_domain_kwargs
                    )
                else:
                    # We do not copy here because if compute domain is different, it will be copied by get_compute_domain
                    compute_domain_kwargs = metric_domain_kwargs

                (
                    data,
                    compute_domain_kwargs,
                    accessor_domain_kwargs,
                ) = execution_engine.get_compute_domain(
                    domain_kwargs=compute_domain_kwargs, domain_type=domain_type
                )

                column_name = accessor_domain_kwargs["column"]
                if column_name not in data.columns:
                    raise ge_exceptions.ExecutionEngineError(
                        message=f'Error: The column "{column_name}" in BatchData does not exist.'
                    )

                column = data[column_name]
                metric_aggregate = metric_fn(
                    cls,
                    column=column,
                    **metric_value_kwargs,
                    _table=data,
                    _column_name=column_name,
                    _metrics=metrics,
                )
                return metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        batch_data: PandasBatchData
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"
        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            s3_url = S3Url(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            if "compression" not in reader_options.keys():
                reader_options["compression"] = sniff_s3_compression(s3_url)
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
            logger.debug(
                "Fetching s3 object. Bucket: {} Key: {}".format(
                    s3_url.bucket, s3_url.key
                )
            )
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)
        else:
            raise ge_exceptions.BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers
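
# Hedged usage sketch for the runtime path above (spec/engine names come from
# the snippet's own context; the DataFrame is a placeholder):
# spec = RuntimeDataBatchSpec(batch_data=pd.DataFrame({"a": [1, 2]}))
# batch_data, markers = engine.get_batch_data_and_markers(batch_spec=spec)
# markers["ge_load_time"]  # e.g. "20240101T000000.000000Z"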
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config.""")
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"

        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                self._instantiate_s3_client()
            # if we were not able to instantiate S3 client, then raise error
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            try:
                reader_method: str = batch_spec.reader_method
                reader_options: dict = batch_spec.reader_options or {}
                path: str = batch_spec.path
                s3_url = S3Url(path)
                if "compression" not in reader_options.keys():
                    inferred_compression_param = sniff_s3_compression(s3_url)
                    if inferred_compression_param is not None:
                        reader_options[
                            "compression"] = inferred_compression_param
                s3_object = s3_engine.get_object(Bucket=s3_url.bucket,
                                                 Key=s3_url.key)
            except (ParamValidationError, ClientError) as error:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine encountered the following error while trying to read data from S3 Bucket: {error}"""
                )
            logger.debug(
                f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}"
            )
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, AzureBatchSpec):
            if self._azure is None:
                self._instantiate_azure_client()
            # if we were not able to instantiate Azure client, then raise error
            if self._azure is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a AzureBatchSpec,
                        but the ExecutionEngine does not have an Azure client configured. Please check your config."""
                )
            azure_engine = self._azure
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            azure_url = AzureUrl(path)
            blob_client = azure_engine.get_blob_client(
                container=azure_url.container, blob=azure_url.blob)
            azure_object = blob_client.download_blob()
            logger.debug(
                f"Fetching Azure blob. Container: {azure_url.container} Blob: {azure_url.blob}"
            )
            reader_fn = self._get_reader_fn(reader_method, azure_url.blob)
            buf = BytesIO(azure_object.readall())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, GCSBatchSpec):
            if self._gcs is None:
                self._instantiate_gcs_client()
            # if we were not able to instantiate GCS client, then raise error
            if self._gcs is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a GCSBatchSpec,
                        but the ExecutionEngine does not have a GCS client configured. Please check your config."""
                )
            gcs_engine = self._gcs
            gcs_url = GCSUrl(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            try:
                gcs_bucket = gcs_engine.get_bucket(gcs_url.bucket)
                gcs_blob = gcs_bucket.blob(gcs_url.blob)
                logger.debug(
                    f"Fetching GCS blob. Bucket: {gcs_url.bucket} Blob: {gcs_url.blob}"
                )
            except GoogleAPIError as error:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine encountered the following error while trying to read data from GCS Bucket: {error}"""
                )
            reader_fn = self._get_reader_fn(reader_method, gcs_url.blob)
            buf = BytesIO(gcs_blob.download_as_bytes())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)

        else:
            raise ge_exceptions.BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, S3BatchSpec, or AzureBatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers
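
# Self-contained sketch of suffix-based compression inference, in the spirit of
# sniff_s3_compression used above (the real helper's logic may differ):
def _infer_compression(key: str):
    for suffix, compression in (
        (".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"), (".xz", "xz"),
    ):
        if key.endswith(suffix):
            return compression
    return None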