def _sample_using_hash(
    df,
    column_name: str,
    hash_digits: int = 1,
    hash_value: str = "f",
    hash_function_name: str = "md5",
):
    """Sample rows whose hashed column value ends in ``hash_value``.

    Each value in ``column_name`` is stringified and hashed with the named
    hashlib algorithm; a row is kept when the last ``hash_digits`` hex
    characters of the digest equal ``hash_value``.

    Args:
        df: Spark DataFrame to sample.
        column_name: name of the column whose values are hashed.
        hash_digits: number of trailing digest characters compared.
        hash_value: expected trailing digest characters.
        hash_function_name: name of a hashlib algorithm (e.g. "md5").

    Returns:
        A Spark DataFrame containing only the matching rows.

    Raises:
        ge_exceptions.ExecutionEngineError: if ``hash_function_name`` does
            not name a valid hashlib algorithm.
    """
    # Validate the algorithm name eagerly so a bad config fails on the
    # driver instead of inside the UDF on the executors.
    try:
        getattr(hashlib, str(hash_function_name))
    except (TypeError, AttributeError) as e:
        # Chain the lookup failure so the root cause is preserved.
        raise ge_exceptions.ExecutionEngineError(
            f"""The sampling method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name. Reference to {hash_function_name} cannot be found."""
        ) from e

    def _encrypt_value(to_encode):
        # Re-resolve the hash function inside the UDF so the closure stays
        # trivially serializable when shipped to executors.
        to_encode_str = str(to_encode)
        hash_func = getattr(hashlib, hash_function_name)
        hashed_value = hash_func(to_encode_str.encode()).hexdigest()[-1 * hash_digits:]
        return hashed_value

    encrypt_udf = F.udf(_encrypt_value, StringType())
    res = (
        df.withColumn("encrypted_value", encrypt_udf(column_name))
        .filter(F.col("encrypted_value") == hash_value)
        .drop("encrypted_value")
    )
    return res
def _split_on_hashed_column(
    df,
    column_name: str,
    hash_digits: int,
    partition_definition: dict,
    hash_function_name: str = "sha256",
):
    """Split on the hashed value of the named column.

    Keeps the rows whose hashed ``column_name`` value ends with
    ``partition_definition["hash_value"]``.

    Args:
        df: Spark DataFrame to split.
        column_name: name of the column whose values are hashed.
        hash_digits: number of trailing digest characters compared.
        partition_definition: dict carrying the expected digest suffix under
            the key "hash_value".
        hash_function_name: name of a hashlib algorithm (e.g. "sha256").

    Returns:
        A Spark DataFrame containing only the matching rows.

    Raises:
        ge_exceptions.ExecutionEngineError: if ``hash_function_name`` does
            not name a valid hashlib algorithm.
    """
    # Validate the algorithm name eagerly so a bad config fails on the
    # driver instead of inside the UDF on the executors.
    try:
        getattr(hashlib, hash_function_name)
    except (TypeError, AttributeError) as e:
        # Chain the lookup failure so the root cause is preserved.
        raise ge_exceptions.ExecutionEngineError(
            f"""The splitting method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name. Reference to {hash_function_name} cannot be found."""
        ) from e

    def _encrypt_value(to_encode):
        # NOTE(review): unlike the sampling variant, this encodes to_encode
        # directly (no str() coercion) — assumes a string column; confirm.
        hash_func = getattr(hashlib, hash_function_name)
        hashed_value = hash_func(to_encode.encode()).hexdigest()[-1 * hash_digits:]
        return hashed_value

    encrypt_udf = F.udf(_encrypt_value, StringType())
    res = (
        df.withColumn("encrypted_value", encrypt_udf(column_name))
        .filter(F.col("encrypted_value") == partition_definition["hash_value"])
        .drop("encrypted_value")
    )
    return res
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Resolve ``batch_spec`` into in-memory batch data plus batch markers.

    Supports three spec types: RuntimeDataBatchSpec (data supplied
    directly), PathBatchSpec (read from a path via a reader function), and
    S3BatchSpec (fetched through the configured boto3 client).  Configured
    splitting/sampling methods are applied after loading, and a
    pandas-data fingerprint marker is recorded for frames whose memory
    footprint is below HASH_THRESHOLD.

    Args:
        batch_spec: specification describing where/how to obtain the batch.

    Returns:
        Tuple of (typed batch data, BatchMarkers).

    Raises:
        ge_exceptions.ExecutionEngineError: an S3BatchSpec was given but no
            boto3 client is configured on this engine.
        BatchSpecError: unsupported batch_spec type.
    """
    # batch_data
    # We need to build a batch_markers to be used in the dataframe
    # Record a UTC load timestamp for provenance.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.get("reader_method")
        # reader_options may be absent or None in the spec; default to {}.
        reader_options: dict = batch_spec.get("reader_options") or {}
        path: str = batch_spec["path"]
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        batch_data = reader_fn(path, **reader_options)
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine has been passed a S3BatchSpec, but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        s3_url = S3Url(batch_spec.get("s3"))
        reader_method: str = batch_spec.get("reader_method")
        reader_options: dict = batch_spec.get("reader_options") or {}
        s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        logger.debug(
            "Fetching s3 object. \nBucket: {} Key: {}".format(s3_url.bucket, s3_url.key)
        )
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        # Decode the object body using its declared ContentEncoding
        # (defaulting to utf-8) and hand the reader a text buffer.
        batch_data = reader_fn(
            StringIO(
                s3_object["Body"].read().decode(
                    s3_object.get("ContentEncoding", "utf-8")
                )
            ),
            **reader_options,
        )
    else:
        raise BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
        )
    # Apply any configured splitting and sampling to the loaded data.
    batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
    # Fingerprint only small frames; hashing large frames is expensive.
    if batch_data.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(batch_data)
    typed_batch_data = self._get_typed_batch_data(batch_data)
    return typed_batch_data, batch_markers
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Resolve ``batch_spec`` into a Spark DataFrame plus batch markers.

    Supports RuntimeDataBatchSpec (a DataFrame supplied directly; a string
    is rejected) and PathBatchSpec (read via ``self.spark.read``).  The
    loaded data is wrapped in SparkDFBatchData after any configured
    splitting/sampling is applied.

    Args:
        batch_spec: specification describing where/how to obtain the batch.

    Returns:
        Tuple of (SparkDFBatchData, BatchMarkers).

    Raises:
        ge_exceptions.ExecutionEngineError: a string was passed as batch_data.
        ExecutionEngineError: pyspark could not be loaded.
        BatchSpecError: unsupported batch_spec type.
    """
    # batch_data
    # We need to build a batch_markers to be used in the dataframe
    # Record a UTC load timestamp for provenance.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    batch_data: Any
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        if isinstance(batch_data, str):
            raise ge_exceptions.ExecutionEngineError(
                f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal. Please check your config."""
            )
        # Replace the DataFrame in the spec with a placeholder string so the
        # spec stays lightweight; the data itself lives in batch_data.
        batch_spec.batch_data = "SparkDataFrame"
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        # Guard against a None reader_options in the spec so the ** expansion
        # below cannot fail.
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        try:
            # Bind the configured reader to its own name instead of rebinding
            # reader_options (the original shadowed the options dict).
            reader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError:
            raise ExecutionEngineError(
                """
                Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                """
            )
    else:
        raise BatchSpecError(
            """
            Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
            """
        )
    batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
    typed_batch_data = SparkDFBatchData(execution_engine=self, dataframe=batch_data)
    return typed_batch_data, batch_markers
def _sample_using_hash( df, column_name: str, hash_digits: int = 1, hash_value: str = "f", hash_function_name: str = "md5", ): """Hash the values in the named column, and split on that""" try: hash_func = getattr(hashlib, hash_function_name) except (TypeError, AttributeError) as e: raise (ge_exceptions.ExecutionEngineError( f"""The sampling method used with PandasExecutionEngine has a reference to an invalid hash_function_name. Reference to {hash_function_name} cannot be found.""")) matches = df[column_name].map(lambda x: hash_func(str(x).encode( )).hexdigest()[-1 * hash_digits:] == hash_value) return df[matches]
def _split_on_hashed_column( df, column_name: str, hash_digits: int, partition_definition: dict, hash_function_name: str = "md5", ): """Split on the hashed value of the named column""" try: hash_method = getattr(hashlib, hash_function_name) except (TypeError, AttributeError) as e: raise (ge_exceptions.ExecutionEngineError( f"""The splitting method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name. Reference to {hash_function_name} cannot be found.""")) matching_rows = df[column_name].map( lambda x: hash_method(str(x).encode()).hexdigest()[ -1 * hash_digits:] == partition_definition["hash_value"]) return df[matching_rows]
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Resolve ``batch_spec`` into a Spark DataFrame plus batch markers.

    Handles RuntimeDataBatchSpec (a DataFrame supplied directly; strings
    are rejected), AzureBatchSpec (read via Spark with optional
    account-key configuration), and the generic PathBatchSpec (which
    S3/GCS specs extend — see the note below).  The loaded data is wrapped
    in SparkDFBatchData after any configured splitting/sampling is applied.

    Args:
        batch_spec: specification describing where/how to obtain the batch.

    Returns:
        Tuple of (SparkDFBatchData, BatchMarkers).

    Raises:
        ge_exceptions.ExecutionEngineError: a string was passed as batch_data.
        ExecutionEngineError: pyspark could not be loaded, or the path
            could not be read (AnalysisException).
        BatchSpecError: unsupported batch_spec type.
    """
    # batch_data
    # We need to build a batch_markers to be used in the dataframe
    # Record a UTC load timestamp for provenance.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    """
    As documented in Azure DataConnector implementations, Pandas and Spark execution engines utilize separate path
    formats for accessing Azure Blob Storage service.  However, Pandas and Spark execution engines utilize identical
    path formats for accessing all other supported cloud storage services (AWS S3 and Google Cloud Storage).
    Moreover, these formats (encapsulated in S3BatchSpec and GCSBatchSpec) extend PathBatchSpec (common to them).
    Therefore, at the present time, all cases with the exception of Azure Blob Storage , are handled generically.
    """
    batch_data: Any
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        if isinstance(batch_data, str):
            raise ge_exceptions.ExecutionEngineError(
                f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config."""
            )
        # Replace the DataFrame in the spec with a placeholder string so the
        # spec stays lightweight; the data itself lives in batch_data.
        batch_spec.batch_data = "SparkDataFrame"
    elif isinstance(batch_spec, AzureBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        azure_url = AzureUrl(path)
        try:
            credential = self._azure_options.get("credential")
            storage_account_url = azure_url.account_url
            if credential:
                # Point Hadoop at the native Azure filesystem and register the
                # storage-account key so Spark can read the blob path.
                self.spark.conf.set(
                    "fs.wasb.impl",
                    "org.apache.hadoop.fs.azure.NativeAzureFileSystem",
                )
                self.spark.conf.set(
                    f"fs.azure.account.key.{storage_account_url}", credential
                )
            reader: DataFrameReader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError:
            raise ExecutionEngineError(
                """
                Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                """
            )
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        try:
            reader: DataFrameReader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError:
            raise ExecutionEngineError(
                """
                Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                """
            )
        # pyspark will raise an AnalysisException error if path is incorrect
        except pyspark.sql.utils.AnalysisException:
            raise ExecutionEngineError(
                f"""Unable to read in batch from the following path: {path}. Please check your configuration."""
            )
    else:
        raise BatchSpecError(
            """
            Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
            """
        )
    # Apply any configured splitting and sampling to the loaded data.
    batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
    typed_batch_data = SparkDFBatchData(execution_engine=self, dataframe=batch_data)
    return typed_batch_data, batch_markers
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Resolve ``batch_spec`` into a pandas DataFrame plus batch markers.

    Supports RuntimeDataBatchSpec (a DataFrame or PandasBatchData supplied
    directly), S3BatchSpec (fetched through the configured boto3 client
    with compression sniffed when unspecified), and PathBatchSpec (read
    from a path via a reader function).  Configured splitting/sampling
    methods are applied after loading, and a pandas-data fingerprint
    marker is recorded for frames below HASH_THRESHOLD.

    Args:
        batch_spec: specification describing where/how to obtain the batch.

    Returns:
        Tuple of (PandasBatchData, BatchMarkers).

    Raises:
        ValueError: RuntimeDataBatchSpec carries neither a DataFrame nor a
            PandasBatchData.
        ge_exceptions.ExecutionEngineError: an S3BatchSpec was given but no
            boto3 client is configured on this engine.
        BatchSpecError: unsupported batch_spec type.
    """
    # batch_data
    # We need to build a batch_markers to be used in the dataframe
    # Record a UTC load timestamp for provenance.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    batch_data: PandasBatchData
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        if isinstance(batch_spec.batch_data, pd.DataFrame):
            df = batch_spec.batch_data
        elif isinstance(batch_spec.batch_data, PandasBatchData):
            df = batch_spec.batch_data.dataframe
        else:
            raise ValueError(
                "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
            )
        # Replace the DataFrame in the spec with a placeholder string so the
        # spec stays lightweight; the data itself lives in df.
        batch_spec.batch_data = "PandasDataFrame"
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine has been passed a S3BatchSpec, but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        s3_url = S3Url(batch_spec.path)
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        # Infer compression from the object key when the caller did not
        # specify it explicitly.
        if "compression" not in reader_options.keys():
            reader_options["compression"] = sniff_s3_compression(s3_url)
        s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        logger.debug(
            "Fetching s3 object. \nBucket: {} Key: {}".format(s3_url.bucket, s3_url.key)
        )
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        # Hand the reader a rewound binary buffer so compressed/binary
        # formats are handled correctly.
        buf = BytesIO(s3_object["Body"].read())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        # BUGFIX: guard against a None reader_options in the spec so the **
        # expansion below cannot fail, matching the S3 branch above.
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)
    else:
        raise BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
        )
    # Apply any configured splitting and sampling to the loaded data.
    df = self._apply_splitting_and_sampling_methods(batch_spec, df)
    # Fingerprint only small frames; hashing large frames is expensive.
    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)
    typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)
    return typed_batch_data, batch_markers