def get_batch(self, batch_kwargs, batch_parameters=None):
    """class-private implementation of get_data_asset"""
    if self.spark is None:
        logger.error("No spark session available")
        return None

    reader_options = batch_kwargs.get("reader_options", {})

    # We need to build batch_markers to be used with the DataFrame
    batch_markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    if "path" in batch_kwargs or "s3" in batch_kwargs:
        # If both are present, let s3 override
        path = batch_kwargs.get("path")
        path = batch_kwargs.get("s3", path)
        reader_method = batch_kwargs.get("reader_method")
        reader = self.spark.read

        for option in reader_options.items():
            reader = reader.option(*option)
        reader_fn = self._get_reader_fn(reader, reader_method, path)
        df = reader_fn(path)

    elif "query" in batch_kwargs:
        df = self.spark.sql(batch_kwargs["query"])

    elif "dataset" in batch_kwargs and isinstance(
        batch_kwargs["dataset"], (DataFrame, SparkDFDataset)
    ):
        df = batch_kwargs.get("dataset")
        # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
        batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != "dataset"}
        if isinstance(df, SparkDFDataset):
            # Grab just the spark_df reference, since we want to override everything else
            df = df.spark_df

        # Record this in the kwargs *and* the id
        batch_kwargs["SparkDFRef"] = True
        batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

    else:
        raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

    if "limit" in batch_kwargs:
        df = df.limit(batch_kwargs["limit"])

    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=df,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context,
    )
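
# Usage sketch (not from the source): assumes a configured SparkDFDatasource bound
# to `datasource`; the path and options are hypothetical. When "reader_method" is
# omitted, _get_reader_fn infers the reader from the file extension.
example_spark_batch_kwargs = {
    "path": "/tmp/example.csv",            # hypothetical file path
    "reader_options": {"header": "true"},  # each item is applied via reader.option(...)
    "limit": 1000,                         # df.limit(1000) is applied last
}
# batch = datasource.get_batch(example_spark_batch_kwargs)
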
def get_batch(self, batch_kwargs, batch_parameters=None):
    # pandas cannot take unicode as a delimiter, which can happen in py2. Handle this case explicitly.
    # We handle it here so that the updated value will be in the batch_kwargs for transparency to the user.
    if PY2 and "reader_options" in batch_kwargs and "sep" in batch_kwargs["reader_options"] and \
            batch_kwargs["reader_options"]["sep"] is not None:
        batch_kwargs["reader_options"]["sep"] = str(batch_kwargs["reader_options"]["sep"])

    # We will use and manipulate reader_options along the way
    reader_options = batch_kwargs.get("reader_options", {})

    # We need to build batch_markers to be used in the dataframe
    batch_markers = BatchMarkers({
        "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
    })

    if "path" in batch_kwargs:
        path = batch_kwargs["path"]
        reader_method = batch_kwargs.get("reader_method")
        reader_fn = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)

    elif "s3" in batch_kwargs:
        try:
            import boto3
            s3 = boto3.client("s3", **self._boto3_options)
        except ImportError:
            raise BatchKwargsError("Unable to load boto3 client to read s3 asset.", batch_kwargs)
        raw_url = batch_kwargs["s3"]
        reader_method = batch_kwargs.get("reader_method")
        url = S3Url(raw_url)
        logger.debug("Fetching s3 object. Bucket: %s Key: %s" % (url.bucket, url.key))
        s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
        reader_fn = self._get_reader_fn(reader_method, url.key)
        df = reader_fn(
            StringIO(s3_object["Body"].read().decode(s3_object.get("ContentEncoding", "utf-8"))),
            **reader_options
        )

    elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (pd.DataFrame, pd.Series)):
        df = batch_kwargs.get("dataset")
        # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
        batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != "dataset"}
        batch_kwargs["PandasInMemoryDF"] = True
        batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

    else:
        raise BatchKwargsError("Invalid batch_kwargs: path, s3, or df is required for a PandasDatasource",
                               batch_kwargs)

    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hashlib.md5(
            pd.util.hash_pandas_object(df, index=True).values).hexdigest()

    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=df,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context
    )
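
# Usage sketch (hypothetical bucket/key): with "s3" batch_kwargs this variant fetches
# the object via boto3, decodes the body using its ContentEncoding (defaulting to
# utf-8), and hands a StringIO buffer to the reader function inferred from the key.
example_s3_batch_kwargs = {
    "s3": "s3://example-bucket/data/example.csv",  # hypothetical object URL
    "reader_options": {"sep": ","},                # forwarded to the pandas reader
}
# batch = datasource.get_batch(example_s3_batch_kwargs)
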
def get_batch(self, batch_kwargs, batch_parameters=None):
    # We need to build batch_markers to attach to the batch
    batch_markers = BatchMarkers({
        "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
    })

    if "bigquery_temp_table" in batch_kwargs:
        query_support_table_name = batch_kwargs.get("bigquery_temp_table")
    elif "snowflake_transient_table" in batch_kwargs:
        # Snowflake uses a transient table, so we expect a table_name to be provided
        query_support_table_name = batch_kwargs.get("snowflake_transient_table")
    else:
        query_support_table_name = None

    if "query" in batch_kwargs:
        if "limit" in batch_kwargs or "offset" in batch_kwargs:
            logger.warning("Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                           "adding limit and offset directly to the generated query.")
        if "query_parameters" in batch_kwargs:
            query = Template(batch_kwargs["query"]).safe_substitute(batch_kwargs["query_parameters"])
        else:
            query = batch_kwargs["query"]
        batch_reference = SqlAlchemyBatchReference(engine=self.engine, query=query,
                                                   table_name=query_support_table_name,
                                                   schema=batch_kwargs.get("schema"))
    elif "table" in batch_kwargs:
        limit = batch_kwargs.get("limit")
        offset = batch_kwargs.get("offset")
        if limit is not None or offset is not None:
            logger.info("Generating query from table batch_kwargs based on limit and offset")
            raw_query = sqlalchemy.select([sqlalchemy.text("*")]) \
                .select_from(sqlalchemy.schema.Table(batch_kwargs["table"], sqlalchemy.MetaData(),
                                                     schema=batch_kwargs.get("schema"))) \
                .offset(offset) \
                .limit(limit)
            query = str(raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}))
            batch_reference = SqlAlchemyBatchReference(engine=self.engine, query=query,
                                                       table_name=query_support_table_name,
                                                       schema=batch_kwargs.get("schema"))
        else:
            batch_reference = SqlAlchemyBatchReference(engine=self.engine, table_name=batch_kwargs["table"],
                                                       schema=batch_kwargs.get("schema"))
    else:
        raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")

    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=batch_reference,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context
    )
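
# Usage sketch (assumed table and parameter names): query-based batch_kwargs support
# $-style placeholders resolved with string.Template.safe_substitute, as above.
example_query_batch_kwargs = {
    "query": "SELECT * FROM orders WHERE created_at >= '$start_date'",
    "query_parameters": {"start_date": "2020-01-01"},
}
# batch = datasource.get_batch(example_query_batch_kwargs)
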
def _run_suite(
    self,
    dataset_name: str,
    dataset_path: Optional[str],
    df: Any,
    target_expectation_suite_name: str,
    run_id: str,
):
    target_suite = self.expectation_context.get_expectation_suite(
        target_expectation_suite_name
    )
    batch_markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )
    batch_kwargs = {"datasource": generate_datasource_name(dataset_name)}
    if dataset_path:
        dataasset_name, _ = os.path.splitext(os.path.basename(dataset_path))
        batch_kwargs["path"] = str(dataset_path)
        batch_kwargs["data_asset_name"] = dataasset_name

    batch = Batch(
        "kedro",
        batch_kwargs=BatchKwargs(batch_kwargs),
        data=df,
        batch_parameters=None,
        batch_markers=batch_markers,
        data_context=self.expectation_context,
    )
    try:
        v = Validator(
            batch=batch,
            expectation_suite=target_suite,
        )
    except ValueError:
        raise UnsupportedDataSet
    validator_dataset_batch = v.get_dataset()

    return self.expectation_context.run_validation_operator(
        "action_list_operator", [validator_dataset_batch], run_id=run_id
    )
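
# Invocation sketch (hypothetical names): _run_suite is internal to the kedro hook,
# so this only illustrates the expected arguments; `hook`, the dataframe, and the
# suite name are assumptions.
# hook._run_suite(
#     dataset_name="companies",
#     dataset_path="data/01_raw/companies.csv",
#     df=companies_df,
#     target_expectation_suite_name="companies.warning",
#     run_id="20210101T000000.000000Z",
# )
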
def _run_suite(self, dataset, target_expectation_suite_name, run_id):
    class_name = self._get_ge_class_name(dataset)
    target_suite = self.expectation_context.get_expectation_suite(
        target_expectation_suite_name)
    df = dataset.load()
    batch = Batch(
        "kedro",
        BatchKwargs({"path": "kedro", "datasource": "kedro"}),
        df,
        None,
        BatchMarkers({
            "ge_load_time": datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        }),
        self.expectation_context)
    v = Validator(batch, target_suite, {
        "module_name": "great_expectations.dataset",
        "class_name": class_name
    })
    vgdf = v.get_dataset()
    self.expectation_context.run_validation_operator(
        "action_list_operator", [vgdf], run_id=run_id)
def get_batch(self, batch_kwargs, batch_parameters=None):
    # We will use and manipulate reader_options along the way
    reader_options = batch_kwargs.get("reader_options", {})

    # We need to build batch_markers to be used in the dataframe
    batch_markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    if "path" in batch_kwargs:
        path = batch_kwargs["path"]
        reader_method = batch_kwargs.get("reader_method")
        reader_fn = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)

    elif "s3" in batch_kwargs:
        try:
            import boto3

            s3 = boto3.client("s3", **self._boto3_options)
        except ImportError:
            raise BatchKwargsError(
                "Unable to load boto3 client to read s3 asset.", batch_kwargs
            )
        raw_url = batch_kwargs["s3"]
        reader_method = batch_kwargs.get("reader_method")
        url = S3Url(raw_url)
        logger.debug("Fetching s3 object. Bucket: {} Key: {}".format(url.bucket, url.key))
        s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
        reader_fn = self._get_reader_fn(reader_method, url.key)
        default_reader_options = self._infer_default_options(reader_fn, reader_options)
        if not reader_options.get("encoding") and default_reader_options.get("encoding"):
            reader_options["encoding"] = s3_object.get(
                "ContentEncoding", default_reader_options.get("encoding")
            )
        df = reader_fn(BytesIO(s3_object["Body"].read()), **reader_options)

    elif "dataset" in batch_kwargs and isinstance(
        batch_kwargs["dataset"], (pd.DataFrame, pd.Series)
    ):
        df = batch_kwargs.get("dataset")
        # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
        batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != "dataset"}
        batch_kwargs["PandasInMemoryDF"] = True
        batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

    else:
        raise BatchKwargsError(
            "Invalid batch_kwargs: path, s3, or df is required for a PandasDatasource",
            batch_kwargs,
        )

    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=df,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context,
    )
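
# Usage sketch (not from the source): assumes a configured PandasDatasource bound
# to `datasource`. An in-memory DataFrame is passed under the "dataset" key; as
# shown above, get_batch strips it out and records PandasInMemoryDF/ge_batch_id.
import pandas as pd

example_df = pd.DataFrame({"id": [1, 2, 3]})
example_inmemory_batch_kwargs = {"dataset": example_df}
# batch = datasource.get_batch(example_inmemory_batch_kwargs)
# assert batch.batch_kwargs["PandasInMemoryDF"] is True
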
def get_batch(self, batch_kwargs, batch_parameters=None):
    # We need to build batch_markers to attach to the batch
    batch_markers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    if "bigquery_temp_table" in batch_kwargs:
        query_support_table_name = batch_kwargs.get("bigquery_temp_table")
    elif "snowflake_transient_table" in batch_kwargs:
        # Snowflake uses a transient table, so we expect a table_name to be provided
        query_support_table_name = batch_kwargs.get("snowflake_transient_table")
    else:
        query_support_table_name = None

    if "query" in batch_kwargs:
        if "limit" in batch_kwargs or "offset" in batch_kwargs:
            logger.warning(
                "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                "adding limit and offset directly to the generated query."
            )
        if "query_parameters" in batch_kwargs:
            query = Template(batch_kwargs["query"]).safe_substitute(
                batch_kwargs["query_parameters"]
            )
        else:
            query = batch_kwargs["query"]
        batch_reference = SqlAlchemyBatchReference(
            engine=self.engine,
            query=query,
            table_name=query_support_table_name,
            schema=batch_kwargs.get("schema"),
        )

    elif "table" in batch_kwargs:
        table = batch_kwargs["table"]
        limit = batch_kwargs.get("limit")
        offset = batch_kwargs.get("offset")
        if limit is not None or offset is not None:
            # AWS Athena does not support offset
            if offset is not None and self.engine.dialect.name.lower() == "awsathena":
                raise NotImplementedError("AWS Athena does not support OFFSET.")
            logger.info(
                "Generating query from table batch_kwargs based on limit and offset"
            )
            # In BigQuery the table name is already qualified with its schema name
            if self.engine.dialect.name.lower() == "bigquery":
                schema = None
            else:
                schema = batch_kwargs.get("schema")
            raw_query = (
                sqlalchemy.select([sqlalchemy.text("*")])
                .select_from(
                    sqlalchemy.schema.Table(table, sqlalchemy.MetaData(), schema=schema)
                )
                .offset(offset)
                .limit(limit)
            )
            query = str(
                raw_query.compile(self.engine, compile_kwargs={"literal_binds": True})
            )
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=query_support_table_name,
                schema=batch_kwargs.get("schema"),
            )
        else:
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                table_name=table,
                schema=batch_kwargs.get("schema"),
            )
    else:
        raise ValueError(
            "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
        )

    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=batch_reference,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context,
    )
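
# Usage sketch (hypothetical table/schema names): table-based batch_kwargs with
# limit or offset compile to a literal SELECT; note that offset raises
# NotImplementedError on the awsathena dialect, per the guard above.
example_table_batch_kwargs = {
    "table": "events",
    "schema": "analytics",
    "limit": 500,
    "offset": 100,
}
# batch = datasource.get_batch(example_table_batch_kwargs)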