def get_batch(self, batch_kwargs, batch_parameters=None):
    """Build a Batch from SQL-based batch_kwargs.

    Exactly one of ``"table"`` or ``"query"`` must be present in
    ``batch_kwargs``.

    Args:
        batch_kwargs: dict describing the batch. Recognized keys include
            "table" or "query", plus optional "schema", "limit", "offset",
            "query_parameters", "bigquery_temp_table", and
            "snowflake_transient_table".
        batch_parameters: optional dict passed through unchanged to the
            returned Batch.

    Returns:
        A ``Batch`` whose ``data`` is a ``SqlAlchemyBatchReference``.

    Raises:
        ValueError: if neither "table" nor "query" is supplied.
    """
    # We need to build a batch_id to be used in the dataframe.
    # FIX: datetime.utcnow() is deprecated; an aware UTC now() produces the
    # identical string under this strftime pattern (the literal "Z" is part
    # of the format), so the marker value is unchanged.
    batch_markers = BatchMarkers({
        "ge_load_time": datetime.datetime.now(
            datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
    })
    # Some dialects need a named side table to support query execution.
    if "bigquery_temp_table" in batch_kwargs:
        query_support_table_name = batch_kwargs.get("bigquery_temp_table")
    elif "snowflake_transient_table" in batch_kwargs:
        # Snowflake uses a transient table, so we expect a table_name to be provided
        query_support_table_name = batch_kwargs.get("snowflake_transient_table")
    else:
        query_support_table_name = None
    if "query" in batch_kwargs:
        if "limit" in batch_kwargs or "offset" in batch_kwargs:
            logger.warning(
                "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                "adding limit and offset directly to the generated query.")
        if "query_parameters" in batch_kwargs:
            # $-style template substitution of user-supplied parameters.
            query = Template(batch_kwargs["query"]).safe_substitute(
                batch_kwargs["query_parameters"])
        else:
            query = batch_kwargs["query"]
        batch_reference = SqlAlchemyBatchReference(
            engine=self.engine,
            query=query,
            table_name=query_support_table_name,
            schema=batch_kwargs.get("schema"))
    elif "table" in batch_kwargs:
        limit = batch_kwargs.get('limit')
        offset = batch_kwargs.get('offset')
        if limit is not None or offset is not None:
            # Turn the table reference into an explicit SELECT so limit/offset
            # can be compiled into literal SQL for the target dialect.
            logger.info("Generating query from table batch_kwargs based on limit and offset")
            raw_query = sqlalchemy.select([sqlalchemy.text("*")])\
                .select_from(sqlalchemy.schema.Table(
                    batch_kwargs['table'], sqlalchemy.MetaData(),
                    schema=batch_kwargs.get("schema")))\
                .offset(offset)\
                .limit(limit)
            query = str(raw_query.compile(
                self.engine, compile_kwargs={"literal_binds": True}))
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=query_support_table_name,
                schema=batch_kwargs.get("schema"))
        else:
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                table_name=batch_kwargs["table"],
                schema=batch_kwargs.get("schema"))
    else:
        # FIX: this message literal was split by a stray line break mid-string
        # in the original; reassembled to one message, matching the sibling
        # versions of this method.
        raise ValueError(
            "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")
    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=batch_reference,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context)
def get_batch(self, batch_kwargs, batch_parameters=None):
    """Construct a Batch from table- or query-style batch_kwargs.

    Exactly one of "table" or "query" must appear in batch_kwargs;
    a ValueError is raised otherwise. Returns a Batch whose data is a
    SqlAlchemyBatchReference.
    """
    # Aware UTC timestamp identifying when this batch was loaded.
    load_time = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
    batch_markers = BatchMarkers({"ge_load_time": load_time})

    # Some dialects need a named side table to support query execution.
    if "bigquery_temp_table" in batch_kwargs:
        support_table = batch_kwargs.get("bigquery_temp_table")
    elif "snowflake_transient_table" in batch_kwargs:
        # Snowflake can use either a transient or temp table, so we allow a
        # table_name to be provided.
        support_table = batch_kwargs.get("snowflake_transient_table")
    else:
        support_table = None

    if "query" in batch_kwargs:
        if "limit" in batch_kwargs or "offset" in batch_kwargs:
            logger.warning(
                "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                "adding limit and offset directly to the generated query.")
        query = batch_kwargs["query"]
        if "query_parameters" in batch_kwargs:
            # $-style template substitution of user-supplied parameters.
            query = Template(query).safe_substitute(
                batch_kwargs["query_parameters"])
        batch_reference = SqlAlchemyBatchReference(
            engine=self.engine,
            query=query,
            table_name=support_table,
            schema=batch_kwargs.get("schema"),
        )
    elif "table" in batch_kwargs:
        table = batch_kwargs["table"]
        if batch_kwargs.get("use_quoted_name"):
            table = quoted_name(table, quote=True)
        row_limit = batch_kwargs.get("limit")
        row_offset = batch_kwargs.get("offset")
        if row_limit is None and row_offset is None:
            # Plain table reference — no generated SELECT needed.
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                table_name=table,
                schema=batch_kwargs.get("schema"),
            )
        else:
            dialect = self.engine.dialect.name.lower()
            # AWS Athena does not support offset
            if row_offset is not None and dialect == "awsathena":
                raise NotImplementedError(
                    "AWS Athena does not support OFFSET.")
            logger.info(
                "Generating query from table batch_kwargs based on limit and offset"
            )
            # In BigQuery the table name is already qualified with its schema name
            schema = None if dialect == "bigquery" else batch_kwargs.get("schema")
            selectable = sqlalchemy.schema.Table(
                table, sqlalchemy.MetaData(), schema=schema)
            raw_query = (
                sqlalchemy.select([sqlalchemy.text("*")])
                .select_from(selectable)
                .offset(row_offset)
                .limit(row_limit)
            )
            # Compile to literal SQL for the target dialect.
            query = str(raw_query.compile(
                self.engine, compile_kwargs={"literal_binds": True}))
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=support_table,
                schema=batch_kwargs.get("schema"),
            )
    else:
        raise ValueError(
            "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
        )

    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=batch_reference,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context,
    )
def get_batch(self, batch_kwargs, batch_parameters=None):
    """Build a Batch from SQL-based batch_kwargs.

    Exactly one of ``"table"`` or ``"query"`` must be present in
    ``batch_kwargs``.

    Args:
        batch_kwargs: dict describing the batch. Recognized keys include
            "table" or "query", plus optional "schema", "limit", "offset",
            "query_parameters", "use_quoted_name",
            "snowflake_transient_table", and the deprecated
            "bigquery_temp_table".
        batch_parameters: optional dict passed through unchanged to the
            returned Batch.

    Returns:
        A ``Batch`` whose ``data`` is a ``SqlAlchemyBatchReference``.

    Raises:
        ValueError: if neither "table" nor "query" is supplied.
    """
    # We need to build a batch_id to be used in the dataframe.
    batch_markers = BatchMarkers({
        "ge_load_time": datetime.datetime.now(
            datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
    })
    if "bigquery_temp_table" in batch_kwargs:
        # deprecated-v0.15.3
        warnings.warn(
            "BigQuery tables that are created as the result of a query are no longer created as "
            "permanent tables. Thus, a named permanent table through the `bigquery_temp_table`"
            "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of"
            "v0.15.3 and will be removed in v0.18.",
            DeprecationWarning,
        )
    if "snowflake_transient_table" in batch_kwargs:
        # Snowflake can use either a transient or temp table, so we allow a table_name to be provided
        query_support_table_name = batch_kwargs.get(
            "snowflake_transient_table")
    else:
        query_support_table_name = None
    if "query" in batch_kwargs:
        if "limit" in batch_kwargs or "offset" in batch_kwargs:
            logger.warning(
                "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                "adding limit and offset directly to the generated query.")
        if "query_parameters" in batch_kwargs:
            # $-style template substitution of user-supplied parameters.
            query = Template(batch_kwargs["query"]).safe_substitute(
                batch_kwargs["query_parameters"])
        else:
            query = batch_kwargs["query"]
        batch_reference = SqlAlchemyBatchReference(
            engine=self.engine,
            query=query,
            table_name=query_support_table_name,
            schema=batch_kwargs.get("schema"),
        )
    elif "table" in batch_kwargs:
        table = batch_kwargs["table"]
        if batch_kwargs.get("use_quoted_name"):
            table = quoted_name(table, quote=True)
        limit = batch_kwargs.get("limit")
        offset = batch_kwargs.get("offset")
        if limit is not None or offset is not None:
            logger.info(
                "Generating query from table batch_kwargs based on limit and offset"
            )
            # In BigQuery the table name is already qualified with its schema name
            if self.engine.dialect.name.lower() == "bigquery":
                schema = None
            else:
                schema = batch_kwargs.get("schema")
            # limit doesn't compile properly for oracle so we will append
            # rownum to the query string later.
            if self.engine.dialect.name.lower() == "oracle":
                raw_query = sqlalchemy.select(
                    [sqlalchemy.text("*")]).select_from(
                        sqlalchemy.schema.Table(table,
                                                sqlalchemy.MetaData(),
                                                schema=schema))
            else:
                raw_query = (sqlalchemy.select(
                    [sqlalchemy.text("*")]).select_from(
                        sqlalchemy.schema.Table(
                            table, sqlalchemy.MetaData(),
                            schema=schema)).offset(offset).limit(limit))
            query = str(
                raw_query.compile(self.engine,
                                  compile_kwargs={"literal_binds": True}))
            # use rownum instead of limit in oracle
            if self.engine.dialect.name.lower() == "oracle":
                # BUG FIX: this branch is entered whenever limit OR offset is
                # set; with limit=None the original "%d" % limit raised
                # TypeError. Only apply the ROWNUM cap when a limit was
                # actually requested (offset is not applied on Oracle here).
                if limit is not None:
                    query += "\nWHERE ROWNUM <= %d" % limit
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=query_support_table_name,
                schema=batch_kwargs.get("schema"),
            )
        else:
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                table_name=table,
                schema=batch_kwargs.get("schema"),
            )
    else:
        raise ValueError(
            "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
        )
    return Batch(
        datasource_name=self.name,
        batch_kwargs=batch_kwargs,
        data=batch_reference,
        batch_parameters=batch_parameters,
        batch_markers=batch_markers,
        data_context=self._data_context,
    )