def create_table(client, dataset_ref, table_name, is_partitioned=False):
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)
    if is_partitioned:
        time_partitioning = TimePartitioning()
        time_partitioning.field = 'partition_value'
        table_obj.time_partitioning = time_partitioning
    return client.create_table(table_obj)
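# A minimal usage sketch for the helper above, not part of the original source.
# It assumes default application credentials, an existing dataset named
# 'my_dataset', and a module-level TABLE_SCHEMA; all of those names are
# illustrative assumptions. The partitioning field must be a TIMESTAMP, DATE,
# or DATETIME column, so 'partition_value' is declared as TIMESTAMP here.
from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference, SchemaField, Table, TimePartitioning

TABLE_SCHEMA = [
    SchemaField('partition_value', 'TIMESTAMP'),
    SchemaField('payload', 'STRING'),
]

client = bigquery.Client()
dataset_ref = DatasetReference(client.project, 'my_dataset')
created = create_table(client, dataset_ref, 'example_table', is_partitioned=True)
print(created.full_table_id)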
def __create_test_table(self, table_name, dataset_id):
    table_schema = [
        SchemaField("int_data", "INT64"),
        SchemaField("str_data", "STRING")
    ]
    table_reference = TableReference(dataset_id, table_name)
    test_table = Table(table_reference, table_schema)
    test_table.time_partitioning = TimePartitioning("DAY")
    self.__delete_if_exists(test_table)
    self.GCP_BIGQUERY_CLIENT.create_table(test_table)
    return test_table
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does
    exist, it will create a partition in the designated table or append to the
    same partition. This is necessary for storing data that has "RECORD" type
    fields, which do not conform to a dataframe. The data is loaded from a JSON
    file object since that is one of the formats BigQuery accepts.

    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from
        process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as
        'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the
        destination table
    :return: the bq job_id for the loading of digital health data
    """
    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    # Create the day-partitioned destination table if it does not exist yet
    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    # Serialize the records as newline-delimited JSON in an in-memory file
    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')

    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
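# Hypothetical call sketch for the loader above, not part of the original
# source. The project, dataset, and record fields shown are placeholders only;
# the real schema is looked up via get_table_schema(DIGITAL_HEALTH_SHARING_STATUS).
sample_json_data = [
    {'person_id': 1, 'status': {'device': 'fitbit', 'is_sharing': True}},
    {'person_id': 2, 'status': {'device': 'fitbit', 'is_sharing': False}},
]
job_id = store_digital_health_status_data(
    'my-gcp-project',
    sample_json_data,
    'my-gcp-project.my_dataset.digital_health_sharing_status')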
def setUp(self):
    self.project_id = os.environ.get(PROJECT_ID)
    self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)

    self.schema = [
        SchemaField("person_id", "INT64"),
        SchemaField("first_name", "STRING"),
        SchemaField("last_name", "STRING"),
        SchemaField("algorithm", "STRING")
    ]

    self.ps_api_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable')
    ]

    self.id_match_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable'),
        dict(name='algorithm', type='string', mode='nullable')
    ]

    self.hpo_id = 'fake_site'
    self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
    self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

    # Create and populate the ps_values site table
    schema = bq.get_table_schema(PS_API_VALUES)
    tablename = self.ps_values_table_id

    table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                  schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table)

    populate_query = POPULATE_PS_VALUES.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        ps_values_table_id=self.ps_values_table_id)
    job = self.client.query(populate_query)
    job.result()
def test_should_create_table_from_table_object(self):
    # given
    table_id = f'{self.dataset_manager.project_id}.{self.dataset_manager.dataset_name}.example_test_table'
    table = Table(table_id,
                  schema=[
                      {
                          "mode": "NULLABLE",
                          "name": "example_field",
                          "type": "STRING"
                      },
                  ])
    table.time_partitioning = TimePartitioning()

    # when
    self.dataset_manager.create_table_from_schema('example_test_table',
                                                  schema=None,
                                                  table=table)

    # then
    self.table_should_exists()
def create_table_from_schema(self,
                             table_id: str,
                             schema: typing.Union[typing.List[dict], Path, None] = None,
                             table=None):
    from google.cloud.bigquery import Table, TimePartitioning

    if schema and table:
        raise ValueError(
            "You can't provide both schema and table, because the table you "
            "provide should already contain the schema.")
    if not schema and not table:
        raise ValueError("You must provide either schema or table.")

    if isinstance(schema, Path):
        schema = json.loads(schema.read_text())

    if table is None:
        table = Table(table_id, schema=schema)
        table.time_partitioning = TimePartitioning()

    self.logger.info(f'CREATING TABLE FROM SCHEMA: {table.schema}')
    self.bigquery_client.create_table(table)
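# A minimal usage sketch for create_table_from_schema, not part of the original
# source. It assumes a dataset_manager instance exposing project_id and
# dataset_name, as in the test above; the field definition is illustrative.
example_schema = [
    {"mode": "NULLABLE", "name": "example_field", "type": "STRING"},
]
dataset_manager.create_table_from_schema(
    f'{dataset_manager.project_id}.{dataset_manager.dataset_name}.example_test_table',
    schema=example_schema)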
def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(
            field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(temp_table_ref,
                                 dest_table_ref,
                                 location='US',
                                 job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(temp_table_ref,
                                     dest_table_ref,
                                     location='US',
                                     job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)

        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template,
                                                   template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(merge_sql,
                                 location='US',
                                 job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
def setUp(self):
    self.maxDiff = None
    self.project_id = os.environ.get(PROJECT_ID)
    self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)

    self.hpo_id = 'fake_site'
    self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
    self.ps_values_table_id = f'{PS_API_VALUES}_{self.hpo_id}'
    self.pii_address_table_id = f'{self.hpo_id}_pii_address'
    self.pii_email_table_id = f'{self.hpo_id}_pii_email'
    self.pii_phone_number_table_id = f'{self.hpo_id}_pii_phone_number'
    self.pii_name_table_id = f'{self.hpo_id}_pii_name'
    self.person_table_id = f'{self.hpo_id}_person'
    self.location_table_id = f'{self.hpo_id}_location'
    self.fq_concept_table = f'{self.project_id}.{self.dataset_id}.concept'

    # Create and populate the ps_values site table
    schema = resources.fields_for(f'{PS_API_VALUES}')
    table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.ps_values_table_id}',
        schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table, exists_ok=True)

    populate_query = POPULATE_PS_VALUES.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        ps_values_table_id=self.ps_values_table_id)
    job = self.client.query(populate_query)
    job.result()

    # Create and populate the drc_id_match_table
    schema = resources.fields_for(f'{IDENTITY_MATCH_TABLE}')
    table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.id_match_table_id}',
        schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table, exists_ok=True)

    populate_query = POPULATE_ID_MATCH.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        id_match_table_id=self.id_match_table_id)
    job = self.client.query(populate_query)
    job.result()

    # Create and populate pii_name, pii_email, pii_phone_number, and pii_address table
    schema = resources.fields_for(f'{PII_NAME}')
    table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.pii_name_table_id}',
        schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table, exists_ok=True)

    schema = resources.fields_for(f'{PII_EMAIL}')
    table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.pii_email_table_id}',
        schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table, exists_ok=True)

    schema = resources.fields_for(f'{PII_PHONE_NUMBER}')
    table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.pii_phone_number_table_id}',
        schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table, exists_ok=True)

    schema = resources.fields_for(f'{PII_ADDRESS}')
    table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.pii_address_table_id}',
        schema=schema)
    table.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table, exists_ok=True)

    person_table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.person_table_id}',
        schema=person_schema)
    person_table = self.client.create_table(person_table, exists_ok=True)

    location_table = Table(
        f'{self.project_id}.{self.dataset_id}.{self.location_table_id}',
        schema=location_schema)
    location_table = self.client.create_table(location_table, exists_ok=True)

    concept_table = Table(f'{self.project_id}.{self.dataset_id}.concept',
                          schema=concept_schema)
    concept_table = self.client.create_table(concept_table, exists_ok=True)
def commit(self):
    # The commit is where the upload is actually done for BigQuery (special case).
    # The _create_or_update_table method can be called multiple times;
    # each time, data is appended to the .avro file. When "committing",
    # this .avro file is uploaded and, depending on the load strategy, used.

    if not hasattr(self, "avro_file_name"):
        # There was no data ever uploaded - do nothing
        self.log.info("Nothing to upload!")
        return

    # Clean up after yourself first
    self.avro_writer.close()

    # Fetch the relevant configuration
    project_id = self.table_creation_config.get("database_name",
                                                self.database_name)
    assert project_id, "Missing Project ID!"
    load_strategy = self.table_creation_config["load_strategy"]
    primary_key = self.table_creation_config["primary_key"]
    schema_name = self.table_creation_config["schema_name"]
    schema_suffix = self.table_creation_config["schema_suffix"]
    table_name_final = self.table_creation_config["table_name"]
    table_suffix = "__ewah_tmp"
    columns_definition = self.table_creation_config["columns_definition"]

    new_schema_name = schema_name + schema_suffix
    is_full_refresh = (load_strategy == EC.LS_INSERT_REPLACE or
                       not self.test_if_table_exists(
                           table_name=table_name_final,
                           schema_name=new_schema_name,
                           project_id=project_id,
                       ))

    conn = self.dwh_hook.dbconn
    ds_new = conn.get_dataset(new_schema_name)

    # Create temp table with .avro file
    if is_full_refresh:
        # temp table is also the final table for full refresh!
        table_name = table_name_final
    else:
        table_name = table_name_final + table_suffix

    # Drop temp table if it already exists
    if self.test_if_table_exists(
            table_name=table_name,
            schema_name=new_schema_name,
            project_id=project_id,
    ):
        # Drop table before re-creating it
        conn.delete_table(
            conn.get_table(
                TableReference(dataset_ref=ds_new, table_id=table_name)))

    # Create temp table with .avro file
    table_obj = Table(".".join([project_id, new_schema_name, table_name]))
    if is_full_refresh and self.partition_field:
        table_obj.time_partitioning = bigquery.TimePartitioning(
            type_=self.partition_type,
            field=self.partition_field,
        )
        if self.require_partition_filter:
            table_obj.require_partition_filter = True

    self.log.info("Uploading data into table now...")
    with open(self.avro_file_name, "rb") as source_file:
        job = conn.load_table_from_file(
            file_obj=source_file,
            destination=table_obj,
            job_id_prefix="ewah_",
            rewind=True,
            job_config=LoadJobConfig(
                autodetect=False,
                source_format="AVRO",
                schema=[
                    SchemaField(name=name, field_type=field["data_type"])
                    for name, field in columns_definition.items()
                ],
            ),
        )
    try:
        job.result()
    except:
        self.log.info("Errors occurred - job errors: {0}".format(job.errors))
        raise
    assert job.state == "DONE", "Invalid job state: {0}".format(job.state)

    if not is_full_refresh:
        # Need to merge new rows into the existing table
        fields_pk = set(primary_key or [])
        fields_all = set(columns_definition.keys() or [])
        fields_non_pk = fields_all - fields_pk

        if load_strategy == EC.LS_UPSERT:
            assert fields_pk
        elif load_strategy == EC.LS_INSERT_ADD:
            fields_pk = []  # Ignore if set
        else:
            raise Exception("Not implemented!")

        merge_statement = """
            MERGE INTO `{target}` AS TARGET
            USING `{source}` AS SOURCE
            ON {condition}
            WHEN MATCHED THEN
                UPDATE SET {update_fields}
            WHEN NOT MATCHED THEN
                INSERT ({insert_fields}) VALUES ({insert_fields})
        """.format(
            target=".".join([project_id, new_schema_name, table_name_final]),
            source=".".join([project_id, new_schema_name, table_name]),
            condition=" AND ".join([
                "TARGET.`{0}` = SOURCE.`{0}`".format(field)
                for field in fields_pk
            ]) or "FALSE",
            insert_fields="`{0}`".format("`, `".join(fields_all)),
            update_fields=", ".join([
                "`{0}` = SOURCE.`{0}`".format(field)
                for field in fields_non_pk
            ]),
        )

        self.log.info("Executing query:\n\n{0}\n\n".format(merge_statement))
        job = conn.query(
            query=merge_statement,
            job_id_prefix="ewah_",
        )
        try:
            job.result()
        except:
            self.log.info("Errors occurred - job errors: {0}".format(
                job.errors))
            raise
        assert job.state == "DONE", "Invalid job state: {0}".format(job.state)

        # Remove old temp table from dataset
        conn.delete_table(
            conn.get_table(
                TableReference(dataset_ref=ds_new, table_id=table_name)))

    self.log.info("Done!")
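# For illustration only (not from the original source): with a hypothetical
# primary key ['id'] and columns {'id', 'name'}, the MERGE statement rendered
# above would look roughly like this (column order depends on set iteration):
#
#   MERGE INTO `my-project.target_schema_suffix.final_table` AS TARGET
#   USING `my-project.target_schema_suffix.final_table__ewah_tmp` AS SOURCE
#   ON TARGET.`id` = SOURCE.`id`
#   WHEN MATCHED THEN
#       UPDATE SET `name` = SOURCE.`name`
#   WHEN NOT MATCHED THEN
#       INSERT (`id`, `name`) VALUES (`id`, `name`)
#
# With load_strategy == EC.LS_INSERT_ADD, fields_pk is emptied, the ON clause
# collapses to FALSE, and every source row is inserted instead of updated.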