import logging
from string import Template

from google.cloud import bigquery

import config  # Project-local configuration module referenced below.


def execute_transformation_query(bq_client):
    """Executes transformation query to a new destination table.

    Args:
        bq_client: Object representing a reference to a BigQuery Client
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(
            project=config.billing_project_id,
            dataset_id=config.output_dataset_id))
    table_ref = dataset_ref.table(config.output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.time_partitioning = bigquery.TimePartitioning(
        field='usage_start_time', expiration_ms=None)
    sql = Template(file_to_string(config.sql_file_path))
    sql = sql.safe_substitute(
        billing_table=config.billing_project_id + '.' +
        config.billing_dataset_id + '.' + config.billing_table_name,
        allocation_method=config.allocation_method)
    logging.info('Attempting query on all dates...')
    # Execute the transformation query into the destination table.
    query_job = bq_client.query(sql, job_config=job_config)
    query_job.result()  # Waits for the query to finish
    logging.info('Transformation query complete. All partitions are updated.')
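# file_to_string is called above but not defined in this snippet. A minimal
# sketch of what it presumably does (read the SQL template file into a
# string); the project's actual helper may differ.
def file_to_string(sql_path):
    """Converts a SQL file holding a SQL query to a string.

    Args:
        sql_path: String in the form of a file path

    Returns:
        String form of the SQL file's contents
    """
    with open(sql_path, 'r') as sql_file:
        return sql_file.read()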
import logging
from typing import Dict

from google.cloud import bigquery


def execute_query(bq_client: bigquery.Client, env_vars: Dict[str, str],
                  query_path: str, output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary mapping environment variable names to values
        query_path: Path of the SQL query to execute
        output_table_name: Name of the table that holds the output
        time_partition: Whether to time-partition the output table
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(
            project=bq_client.project,
            dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    # Time partitioning is only needed for the final output query.
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)
    logging.info('Attempting query...')
    # Execute the rendered query against the destination table.
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)
    query_job.result()  # Waits for the query to finish
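# render_template is called above but not defined in this snippet. A plausible
# sketch, assuming it substitutes env_vars into a SQL template the same way
# the Template-based snippet above does; the real helper may behave
# differently.
from string import Template


def render_template(query_path: str, env_vars: Dict[str, str]) -> str:
    """Reads the SQL file at query_path and substitutes env_vars into it."""
    with open(query_path, 'r') as query_file:
        return Template(query_file.read()).safe_substitute(**env_vars)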
def csv_in_gcs_to_table(event, context):
    """Loads a CSV uploaded to Cloud Storage into a BigQuery table.

    Args:
        event: Dictionary holding the Cloud Storage event payload
        context: Metadata of the triggering event (unused)
    """
    from google.cloud import bigquery

    client = bigquery.Client()
    bucket_name = "egen-poc-bucket"
    object_name = event['name']
    table_id = 'cparkar-project-310718.egen_poc_dataset.egen_poc_table'
    schema = [
        bigquery.SchemaField('Sno', 'INTEGER'),
        bigquery.SchemaField('State', 'STRING'),
        bigquery.SchemaField('ConfirmedIndianNational', 'INTEGER'),
        bigquery.SchemaField('ConfirmedForeignNational', 'INTEGER'),
        bigquery.SchemaField('Cured', 'INTEGER'),
        bigquery.SchemaField('Deaths', 'INTEGER'),
        bigquery.SchemaField('Confirmed', 'INTEGER'),
    ]
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.skip_leading_rows = 1  # Skip the CSV header row.
    uri = "gs://{}/{}".format(bucket_name, object_name)
    load_job = client.load_table_from_uri(uri, table_id,
                                          job_config=job_config)
    load_job.result()  # Waits for the load job to finish.
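# csv_in_gcs_to_table above is shaped like a background Cloud Function
# triggered by a storage.objects.finalize event. A quick local smoke test
# might pass a fake event dict; the object name below is illustrative only.
if __name__ == '__main__':
    fake_event = {'name': 'covid_19_india.csv'}  # Hypothetical object name.
    csv_in_gcs_to_table(fake_event, context=None)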
import logging

from google.cloud import bigquery

import config  # Project-local configuration module referenced below.


def execute_query(bq_client):
    """Executes transformation query to a new destination table.

    Args:
        bq_client: Object representing a reference to a BigQuery Client
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(
            project=config.config_vars['project_id'],
            dataset_id=config.config_vars['output_dataset_id']))
    table_ref = dataset_ref.table(config.config_vars['output_table_name'])
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    sql = file_to_string(config.config_vars['sql_file_path'])
    logging.info('Attempting query on all dates...')
    # Execute the query into the configured destination table.
    query_job = bq_client.query(sql, job_config=job_config)
    query_job.result()  # Waits for the query to finish
    logging.info('Query complete. The table is updated.')
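# A minimal driver for execute_query above, assuming config.config_vars is
# already populated; 'main' is a hypothetical entry point, not part of the
# original snippet.
def main():
    logging.basicConfig(level=logging.INFO)
    bq_client = bigquery.Client(project=config.config_vars['project_id'])
    execute_query(bq_client)


if __name__ == '__main__':
    main()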
from typing import List

from google.cloud import bigquery


def csv_in_gcs_to_table(bucket_name: str, object_name: str, dataset_id: str,
                        table_id: str,
                        schema: List[bigquery.SchemaField]) -> None:
    """Uploads a CSV to a BigQuery table.

    If the table already exists, it overwrites the table data.

    Args:
        bucket_name: Bucket name for holding the object
        object_name: Name of object to be uploaded
        dataset_id: Dataset id where the table is located
        table_id: String holding id of the table
        schema: Schema of the destination table
    """
    client = bigquery.Client()
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    uri = "gs://{}/{}".format(bucket_name, object_name)
    load_job = client.load_table_from_uri(uri, dataset_ref.table(table_id),
                                          job_config=job_config)
    load_job.result()  # Waits for the load job to finish.
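# Example invocation of csv_in_gcs_to_table; the bucket, object, dataset,
# table, and schema below are placeholders, not values from the original
# snippet.
if __name__ == '__main__':
    example_schema = [
        bigquery.SchemaField('name', 'STRING'),
        bigquery.SchemaField('age', 'INTEGER'),
    ]
    csv_in_gcs_to_table('my-bucket', 'people.csv', 'my_dataset', 'people',
                        example_schema)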