def add_bigquery(temp_data, table_name, table_path, dataset_name, schema):
    """
    Given a dataset name and a table_name (in BigQuery), create a table in
    BigQuery with the schema given in 'schema' and the data stored in the
    Google Storage path 'table_path'. It deduces the file format (NDJSON or
    CSV) from temp_data.
    """
    if local:  # 'local' is assumed to be a module-level flag
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/tmp/key.json'

    bq = bigquery.Client(project='gabinete-compartilhado')
    ds = bq.dataset(dataset_name)

    # Create dataset if non-existent:
    try:
        bq.create_dataset(ds)
    except Exception:
        pass  # dataset already exists

    # Create a local object (BigQuery table):
    table_ref = ds.table(table_name)
    table = bigquery.Table(table_ref)

    # Configure the BigQuery table:
    if os.path.exists(temp_data):
        external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    elif os.path.exists(temp_data.replace('.json', '.csv')):
        external_config = bigquery.ExternalConfig('CSV')
    else:
        raise Exception('unknown temp_data file extension')

    external_config.schema = schema
    # external_config.autodetect = True
    external_config.ignore_unknown_values = True
    external_config.max_bad_records = 100
    source_uris = [table_path]
    external_config.source_uris = source_uris
    table.external_data_configuration = external_config

    # Create table (first delete an existing table):
    try:
        bq.delete_table(table)
    except Exception as e:
        print(e)

    bq.create_table(table)
    print('Table created.')
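A minimal invocation sketch for add_bigquery; the paths, dataset, and schema below are illustrative assumptions, not part of the original source.

# Hypothetical usage of add_bigquery (all names and paths are assumed).
from google.cloud import bigquery

schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("value", "INTEGER"),
]
add_bigquery(
    temp_data="/tmp/example.json",               # local copy, used only to infer NDJSON vs CSV
    table_name="example_table",
    table_path="gs://my-bucket/example/*.json",  # where BigQuery actually reads the data
    dataset_name="example_dataset",
    schema=schema,
)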
def create_external_tables(self, bucket_name, prefix, date, tables, project,
                           dataset, table_prefix, version):
    if table_prefix:
        table_prefix += "_"
    else:
        table_prefix = ""

    gcs_loc = f"gs://{bucket_name}/{prefix}/v{version}/{date}"

    client = bigquery.Client(project=project)
    dataset_ref = client.dataset(dataset)

    for leanplum_name in tables:
        table_name = f"{table_prefix}{leanplum_name}_v{version}_{date}"
        logging.info(f"Creating table {table_name}")

        table_ref = bigquery.TableReference(dataset_ref, table_name)
        table = bigquery.Table(table_ref)

        client.delete_table(table, not_found_ok=True)

        external_config = bigquery.ExternalConfig('CSV')
        external_config.source_uris = [f"{gcs_loc}/{leanplum_name}/*"]
        external_config.autodetect = True
        table.external_data_configuration = external_config

        client.create_table(table)
def create_external_table_hive_partitioning(
        bq_client: bigquery.Client,
        dataset: bigquery.Dataset,
        table_id: str,
        gcs_directory_path: str) -> bigquery.Table:
    """
    Creates an external table with AUTO hive partitioning in GCS

    :param bq_client: Client object to bigquery
    :param dataset: dataset object. Check 'get_or_create_dataset' method
    :param table_id: Table to be created
    :param gcs_directory_path: Directory of GCS with the data. For example:
        If you have a structure like this:
        "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
        You should pass: "******"
    :return:
    """
    table = bigquery.Table(dataset.table(table_id))

    external_config = bigquery.ExternalConfig(bigquery.SourceFormat.PARQUET)
    external_config.source_uris = [f"{gcs_directory_path}/*"]

    hive_part_opt = HivePartitioningOptions()
    hive_part_opt.mode = "AUTO"
    hive_part_opt.source_uri_prefix = gcs_directory_path

    external_config.hive_partitioning = hive_part_opt
    table.external_data_configuration = external_config
    table = bq_client.create_table(table, exists_ok=True)

    return table
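A usage sketch for create_external_table_hive_partitioning, assuming an existing dataset and a hive-layout GCS prefix; the dataset and bucket names are hypothetical.

# Hypothetical call; the 'images' dataset and the bucket path are assumptions.
from google.cloud import bigquery

client = bigquery.Client()
dataset = client.get_dataset("images")  # see 'get_or_create_dataset' in the docstring
table = create_external_table_hive_partitioning(
    bq_client=client,
    dataset=dataset,
    table_id="images_metadata",
    # Pass the prefix *above* the key=value directories, e.g. for
    # gs://bucket/images_metadata/source_id=abc/date=2018-02-20:
    gcs_directory_path="gs://bucket/images_metadata",
)
print(table.full_table_id)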
def external_config(self):
    if self.source_format == "csv":
        _external_config = bigquery.ExternalConfig("CSV")
        _external_config.options.skip_leading_rows = 1
        _external_config.options.allow_quoted_newlines = True
        _external_config.options.allow_jagged_rows = True
        _external_config.autodetect = False
        _external_config.schema = self.table_obj._load_schema(self.mode)
    # You can add new formats here
    else:
        raise NotImplementedError(
            "Base dos Dados only supports comma-separated CSV files")

    # source_uris expects a list of URIs
    _external_config.source_uris = [
        f"gs://{self.table_obj.bucket_name}/staging/"
        f"{self.table_obj.dataset_id}/{self.table_obj.table_id}/*"
    ]

    if self.partitioned:
        _external_config.hive_partitioning = self.partition()

    return _external_config
def query_external_gcs_temporary_table():
    # [START bigquery_query_external_gcs_temp]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    ]
    external_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    external_config.options.skip_leading_rows = 1
    table_id = "us_states"
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    # Example query to find states starting with 'W'.
    sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)

    query_job = client.query(sql, job_config=job_config)  # Make an API request.

    w_states = list(query_job)  # Wait for the job to complete.
    print("There are {} states with names starting with W.".format(
        len(w_states)))
def createTempTable(gscfiles):
    gcsbucketname = os.environ['gcsproject'].lower().strip()
    gcsbucketname += '.appspot.com'
    gcsprefix = ''
    if (os.environ['gcspath']):
        gcsprefix += os.environ['gcspath'].strip() + '/'

    tables = []
    for gcf in gscfiles:
        gscfile = gcf.strip()
        gcsfullurl = "gs://{}/{}{}.csv".format(gcsbucketname, gcsprefix, gscfile)
        gcsduration = int(os.environ['bigquery_temptable_duration'].strip())

        # Set expiration of the table
        gcsexpiration = datetime.utcnow() + timedelta(seconds=gcsduration)

        bgqclient = bigquery.Client()
        bgqdataset = os.environ['bigquery_dataset'].strip()
        bgqdatasetid = bgqclient.dataset(bgqdataset)

        brazefields = [bf.lower().strip()
                       for bf in os.environ['brazesegmentfields'].split(',')]
        brazetype = [bt.upper().strip()
                     for bt in os.environ['brazesegmenttype'].split(',')]

        # Generate schema based on field type
        bgqschema = []
        for bf, bt in zip(brazefields, brazetype):
            bgqschema.append(bigquery.SchemaField(bf, bt, mode="NULLABLE"))

        table = bigquery.Table(bgqdatasetid.table(gscfile), schema=bgqschema)
        table.expires = gcsexpiration

        external_config = bigquery.ExternalConfig("CSV")
        external_config.options.skip_leading_rows = 1
        external_config.source_uris = [gcsfullurl]
        table.external_data_configuration = external_config

        table = bgqclient.create_table(table)
        tables.append(table)
    return tables
def main():
    log_uri = "gs://andy-00000002-bucket/api/api-test.log"
    #log_uri = "gs://andy-00000002-bucket/envoy/api/api-0kg1-application-http-egress-2020-07-27-06-25-11.log"

    client = bigquery.Client()
    dataset_ref = client.dataset("service_log_parser")
    print(dataset_ref.table('envoy_access_log'))

    dataset_id = "service_log_parser"
    project_id = "green-reporter-266619"
    table_name = table_name_generator()
    table_id = project_id + '.' + dataset_id + '.' + table_name
    #dest_table_id = project_id + '.' + dataset_id + '.' + 'envoy_access_log'
    dest_table_id = '{}.{}'.format(dataset_id, 'envoy_access_log')

    insert_sql = insert_table_query_generator(table_name, dest_table_id)
    #print(insert_sql)

    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        log_uri,
    ]
    external_config.schema = [
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
    ]

    job_config = bigquery.QueryJobConfig(
        table_definitions={table_name: external_config})

    query_job = client.query(insert_sql, job_config=job_config)

    if len(list(query_job)) == 0:  # Waits for the query to finish
        print("Load data completed!")
def add_external_gcs_source(
    self,
    gcs_url: str,
    dataset_name: str,
    table_name: str,
    skip_rows: int = 0,
    delimiter: str = ",",
    quote: str = '"',
    source_format: str = "CSV",
    project_id: str = None,
):
    dataset_ref = self._dataset_ref(dataset_name=dataset_name,
                                    project_id=project_id)
    # Wrap the TableReference in a Table so the external config can be attached.
    table = bigquery.Table(dataset_ref.table(table_name))

    external_config = bigquery.ExternalConfig(source_format=source_format)
    external_config.autodetect = True
    external_config.source_uris = [gcs_url]
    external_config.options.skip_leading_rows = skip_rows
    external_config.options.field_delimiter = delimiter
    # CSVOptions exposes this as quote_character
    external_config.options.quote_character = quote
    table.external_data_configuration = external_config

    return self.client.create_table(table)
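A hedged usage sketch for add_external_gcs_source; `bq_wrapper` stands in for whatever client wrapper exposes this method, and the bucket path is made up.

# Hypothetical wrapper instance and GCS path.
table = bq_wrapper.add_external_gcs_source(
    gcs_url="gs://my-bucket/exports/users.tsv",
    dataset_name="raw",
    table_name="users_ext",
    skip_rows=1,      # skip the header row
    delimiter="\t",   # tab-separated instead of the default comma
)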
def run_federated_query(self, query_type, query):
    """Runs native queries on EXTERNAL files

    Args:
        query_type(str): Code for the category of the query to run
            (SIMPLE_SELECT_*, SELECT_ONE_STRING, SELECT_50_PERCENT).
        query(str): The query to run.
    """
    file_formats = file_constants.FILE_CONSTANTS['sourceFormats']
    source_format = file_formats[self.file_type]
    external_config = bigquery.ExternalConfig(source_format=source_format)
    external_config.source_uris = [self.file_uri + '/*']
    if source_format != 'AVRO' and source_format != 'PARQUET':
        main_table_util = table_util.TableUtil(self.native_table_id,
                                               self.dataset_id)
        external_config.schema = main_table_util.table.schema
    if source_format == 'CSV':
        external_config.options.skip_leading_rows = 1
    external_config.compression = self.compression.upper()

    table_id = self.native_table_id + '_external'
    results_destination = '{0:s}.{1:s}.{2:s}_query_results'.format(
        self.bq_project, self.dataset_id, table_id)
    logging.info('Storing query results in {0:s}'.format(results_destination))
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config},
        use_legacy_sql=False,
        allow_large_results=True,
        destination=results_destination)

    bql = query.format(table_id)
    print(bql)
    query_job = self.bq_client.query(bql, job_config=job_config)
    logging.info("Running external {0:s} query.".format(query_type))
    query_job.result()

    query_result = benchmark_result_util.QueryBenchmarkResultUtil(
        job=query_job,
        job_type=self.job_type,
        benchmark_name=self.benchmark_name,
        project_id=self.bq_project,
        results_table_name=self.results_table_name,
        results_dataset_id=self.results_table_dataset_id,
        bq_logs_dataset=self.bq_logs_dataset_id,
        bql=bql,
        query_category=query_type,
        main_table_name=self.native_table_id,
        table_dataset_id=self.dataset_id,
        table_type=EXTERNAL_TYPE_ID,
        file_uri=self.file_uri,
    )
    query_result.insert_results_row()

    self.bq_client.delete_table(results_destination)
    logging.info('Deleting results destination table {0:s}'.format(
        results_destination))
def query_external_sheets_permanent_table(dataset_id):
    # [START bigquery_query_external_sheets_perm]
    from google.cloud import bigquery
    import google.auth

    # Create credentials with Drive & BigQuery API scopes.
    # Both APIs must be enabled for your project before running this code.
    credentials, project = google.auth.default(scopes=[
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ])

    # Construct a BigQuery client object.
    client = bigquery.Client(credentials=credentials, project=project)

    # TODO(developer): Set dataset_id to the ID of the dataset to fetch.
    # dataset_id = "your-project.your_dataset"

    # Configure the external data source.
    dataset = client.get_dataset(dataset_id)
    table_id = "us_states"
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    table = bigquery.Table(dataset.table(table_id), schema=schema)
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public).
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing")
    external_config.source_uris = [sheet_url]
    external_config.options.skip_leading_rows = 1  # Optionally skip header row.
    external_config.options.range = (
        "us-states!A20:B49"  # Optionally set range of the sheet to query from.
    )
    table.external_data_configuration = external_config

    # Create a permanent table linked to the Sheets file.
    table = client.create_table(table)  # Make an API request.

    # Example query to find states starting with "W".
    sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
        dataset_id, table_id)

    query_job = client.query(sql)  # Make an API request.

    # Wait for the query to complete.
    w_states = list(query_job)
    print(
        "There are {} states with names starting with W in the selected range."
        .format(len(w_states)))
def ingest_file_external(data, context):
    '''
    Triggered by a Cloud Storage bucket. This function is deprecated because
    a BQ external data source can lead to potentially high query costs; the
    code will be removed once that decision is confirmed.
    '''
    bucket_name = data['bucket']
    file_name = data['name']
    file_path = 'gs://{}/{}'.format(bucket_name, file_name)
    logging.info('Triggered by file {}'.format(file_path))

    db_ref = DB.document(u'streaming_files/%s' % file_name)
    dataset_ref = BQ.dataset(BQ_DATASET)
    schema = [
        bigquery.SchemaField('hash', 'STRING'),
        bigquery.SchemaField('size', 'STRING'),
        bigquery.SchemaField('stripped_size', 'STRING'),
        bigquery.SchemaField('weight', 'STRING'),
        bigquery.SchemaField('number', 'STRING'),
        bigquery.SchemaField('version', 'STRING'),
        bigquery.SchemaField('merkle_root', 'STRING'),
        bigquery.SchemaField('timestamp', 'STRING'),
        bigquery.SchemaField('timestamp_month', 'STRING'),
        bigquery.SchemaField('nonce', 'STRING'),
        bigquery.SchemaField('bits', 'STRING'),
        bigquery.SchemaField('coinbase_param', 'STRING'),
        bigquery.SchemaField('transaction_count', 'STRING')
    ]

    table_suffix = _table_suffix()
    table_id = '{}_{}'.format(BQ_TABLE, table_suffix)
    logging.info('Start table creation {}'.format(table_id))

    table = bigquery.Table(dataset_ref.table(table_id), schema=schema)

    external_config = bigquery.ExternalConfig('CSV')
    external_config.source_uris = [file_path]
    external_config.options.skip_leading_rows = 1
    table.external_data_configuration = external_config

    if _was_already_ingested(db_ref):
        _handle_duplication(db_ref)
    else:
        try:
            BQ.create_table(table)
            _handle_success(db_ref)
            logging.info('table creation success {}'.format(table_id))
        except Exception:
            _handle_error(db_ref)
            logging.info('table creation failed {}'.format(table_id))
def apply_gcs_changes(data_type, **kwargs):
    '''
    Sets up a file in Google Cloud Storage as a temporary table, and merges it
    with an existing table in BigQuery. The file's location in GCS is
    retrieved through XCOM, and the schema for the temporary table is loaded
    from GCS. Data types should be: 'accounts', 'offers', or 'trustlines'.

    Parameters:
        data_type - type of the data being uploaded; should be a string
    Returns:
        N/A
    '''
    key_path = Variable.get('api_key_path')
    credentials = service_account.Credentials.from_service_account_file(
        key_path)
    client = bigquery.Client(credentials=credentials,
                             project=credentials.project_id)

    gcs_bucket_name = Variable.get('gcs_exported_data_bucket_name')
    gcs_filepath = kwargs['task_instance'].xcom_pull(
        task_ids=f'load_{data_type}_to_gcs')
    schema = read_local_schema(data_type)

    external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    external_config.source_uris = [f'gs://{gcs_bucket_name}/{gcs_filepath}']
    external_config.schema = [
        bigquery.SchemaField(field['name'], field['type'], mode=field['mode'])
        for field in schema
    ]

    # The temporary table's name matches the file used to create it. Table ids
    # cannot contain '-', so it is replaced with '_'.
    table_id = f'{splitext(basename(gcs_filepath))[0]}'
    table_id = table_id.replace('-', '_')
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    sql_query = create_merge_query(table_id, data_type, schema)
    logging.info(f'Merge query is: {sql_query}')
    logging.info(f'Running BigQuery job with config: {job_config._properties}')

    query_job = client.query(sql_query, job_config=job_config)
    result_rows = query_job.result()
    if query_job.error_result:
        logging.info(f'Query errors: {query_job.errors}')
        raise AirflowException(f'Query job failed: {query_job.error_result}')

    logging.info(
        f'Job timeline: {[t._properties for t in query_job.timeline]}')
    logging.info(
        f'{query_job.total_bytes_billed} bytes billed at billing tier {query_job.billing_tier}'
    )
    logging.info(f'Total rows affected: {query_job.num_dml_affected_rows}')
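The merge statement itself comes from create_merge_query, which is not shown here. The string below is only a hedged sketch of the shape such a statement could take, using hypothetical column names and key; it is not the project's actual query.

# Hypothetical sketch of what create_merge_query might emit (columns and key assumed).
sql_query_sketch = """
MERGE `{project}.{dataset}.accounts` AS target
USING `{temp_external_table_id}` AS source
ON target.account_id = source.account_id
WHEN MATCHED THEN UPDATE SET target.balance = source.balance
WHEN NOT MATCHED THEN INSERT ROW
"""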
def createTable(bqDataset, bqExternalTable, fileLocation):
    bigqueryClient = bigquery.Client()

    extConfig = bigquery.ExternalConfig("CSV")
    extConfig.source_uris = [fileLocation]
    extConfig.options.skip_leading_rows = 1

    bqSchema = getSchema()

    tableRef = bigqueryClient.dataset(bqDataset).table(bqExternalTable)
    table = bigquery.Table(tableRef, schema=bqSchema)
    table.external_data_configuration = extConfig
    table = bigqueryClient.create_table(table, exists_ok=True)

    return 0
def query_external_sheets_temporary_table():
    # [START bigquery_query_external_sheets_temp]
    # [START bigquery_auth_drive_scope]
    from google.cloud import bigquery
    import google.auth

    # Create credentials with Drive & BigQuery API scopes.
    # Both APIs must be enabled for your project before running this code.
    credentials, project = google.auth.default(scopes=[
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ])

    # Construct a BigQuery client object.
    client = bigquery.Client(credentials=credentials, project=project)
    # [END bigquery_auth_drive_scope]

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public).
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing")
    external_config.source_uris = [sheet_url]
    external_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    external_config.options.skip_leading_rows = 1  # Optionally skip header row.
    external_config.options.range = (
        "us-states!A20:B49"  # Optionally set range of the sheet to query from.
    )
    table_id = "us_states"
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    # Example query to find states starting with "W".
    sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)

    query_job = client.query(sql, job_config=job_config)  # Make an API request.

    # Wait for the query to complete.
    w_states = list(query_job)
    print(
        "There are {} states with names starting with W in the selected range."
        .format(len(w_states)))
def create_external_table(dest_dataset, dest_table, gs_path):
    dataset_ref = client.dataset(dest_dataset)
    table_ref = bigquery.TableReference(dataset_ref, dest_table)
    table = bigquery.Table(table_ref)

    external_config = bigquery.ExternalConfig('CSV')
    external_config.autodetect = True
    external_config.max_bad_records = 100000000
    source_uris = [gs_path]
    external_config.source_uris = source_uris
    table.external_data_configuration = external_config

    client.create_table(table)
    print('Table {}.{} created.'.format(dest_dataset, dest_table))
def test_query_external_gcs_permanent_table(client, to_delete):
    dataset_id = "query_external_gcs_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_query_external_gcs_perm]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    # Configure the external data source
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    table_id = "us_states"
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    table = bigquery.Table(dataset_ref.table(table_id), schema=schema)
    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    ]
    external_config.options.skip_leading_rows = 1  # optionally skip header row
    table.external_data_configuration = external_config

    # Create a permanent table linked to the GCS file
    table = client.create_table(table)  # API request

    # Example query to find states starting with 'W'
    sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
        dataset_id, table_id)

    query_job = client.query(sql)  # API request

    w_states = list(query_job)  # Waits for query to finish
    print("There are {} states with names starting with W.".format(
        len(w_states)))
    # [END bigquery_query_external_gcs_perm]

    assert len(w_states) == 4
def load_gcs_to_bq(log_uri):
    dataset_id = get_env_var('DATASET_ID')
    target_table_name = get_env_var('TABLE_NAME')

    client = bigquery.Client()
    temp_table_name = table_name_generator()
    target_table_id = '{}.{}'.format(dataset_id, target_table_name)
    insert_sql = insert_table_query_generator(temp_table_name, target_table_id)

    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        log_uri,
    ]
    external_config.schema = [
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
    ]

    job_config = bigquery.QueryJobConfig(
        table_definitions={temp_table_name: external_config})

    query_job = client.query(insert_sql, job_config=job_config)

    if len(list(query_job)) == 0:  # Waits for the query to finish
        print("Load {} to {} completed!".format(log_uri, target_table_id))
def create_external_table(project_id: str) -> None:
    client = bigquery.Client()
    dataset_id = "social_dataset"
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    table_id = "ja_kakei_chousa_income_divide_over_two_member"
    table = bigquery.Table(dataset_ref.table(table_id))

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [
        "gs://ja-kakei-chousa-income-divide-over-two-member/*"
    ]
    external_config.autodetect = True

    hive_partitioning = bigquery.external_config.HivePartitioningOptions()
    hive_partitioning.mode = "AUTO"
    hive_partitioning.require_partition_filter = False
    hive_partitioning.source_uri_prefix = (
        "gs://ja-kakei-chousa-income-divide-over-two-member")
    external_config.hive_partitioning = hive_partitioning

    table.external_data_configuration = external_config
    table = client.create_table(table, exists_ok=True)
def create_external_table(project_id: str) -> None:
    client = bigquery.Client()
    dataset_id = "social_dataset"
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    table_id = "jasso_gakuseiseikatsu_stats_annual_income_divide_university"
    table = bigquery.Table(dataset_ref.table(table_id))

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [
        "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university/*"
    ]
    external_config.autodetect = True

    hive_partitioning = bigquery.external_config.HivePartitioningOptions()
    hive_partitioning.mode = "AUTO"
    hive_partitioning.require_partition_filter = False
    hive_partitioning.source_uri_prefix = (
        "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university")
    external_config.hive_partitioning = hive_partitioning

    table.external_data_configuration = external_config
    table = client.create_table(table, exists_ok=True)
def create_bq_table(project, gcs_uri, dataset, table,
                    require_hive_partition_filter=True):
    bq_client = bigquery.Client(project=project)
    table_ref = bq_client.dataset(dataset).table(table)
    table = bigquery.Table(table_ref)

    hive_partition_options = HivePartitioningOptions()
    hive_partition_options.mode = "AUTO"
    hive_partition_options.source_uri_prefix = gcs_uri
    # To prevent accidentally scanning the whole table, require a
    # partition filter.
    #
    # table.require_partition_filter = True is not supported by the class yet.
    # hive_partition_options.require_partition_filter = True is not
    # supported by the class yet.
    # So I need to do the following to include the option:
    hive_partition_options._properties[
        "require_partition_filter"] = require_hive_partition_filter

    extconfig = bigquery.ExternalConfig('CSV')
    extconfig.schema = [bigquery.SchemaField('line', 'STRING')]
    extconfig.options.field_delimiter = u'\u00ff'
    extconfig.options.quote_character = ''
    # extconfig.compression = 'GZIP'
    extconfig.options.allow_jagged_rows = False
    extconfig.options.allow_quoted_newlines = False
    extconfig.max_bad_records = 10000000
    extconfig.source_uris = [os.path.join(gcs_uri, "*")]
    extconfig.hive_partitioning = hive_partition_options

    table.external_data_configuration = extconfig
    bq_client.create_table(table)
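Note that the PARQUET snippets earlier in this section set require_partition_filter directly on HivePartitioningOptions, so the `_properties` workaround above is only needed on older google-cloud-bigquery releases. A version-dependent alternative:

# On newer google-cloud-bigquery releases the option is a public property,
# so the private-dict workaround can be replaced with:
hive_partition_options.require_partition_filter = require_hive_partition_filter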
def test_query_sheet_linked_bq_table(self):
    dataset = bq.create_dataset(self.project_id, self.dataset_id,
                                self.description, self.label_or_tag)

    # add Google Drive scope
    external_data_scopes = [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/cloud-platform"
    ]
    client = bq.get_client(self.project_id, external_data_scopes)

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

    # Grant viewing access to the test sheet to BigQuery test accounts
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1JI-KyigmwZU9I2J6TZqVTPNoEAWVqiFeF8Y549-dvzM/edit#gid=0")
    schema = [
        bigquery.SchemaField("site_name", "STRING"),
        bigquery.SchemaField("hpo_id", "STRING"),
        bigquery.SchemaField("site_point_of_contact", "STRING"),
    ]
    external_config.source_uris = [sheet_url]
    external_config.schema = schema
    external_config.options.range = (
        "contacts!A1:C5"  # limit scope so that other items can be added to the sheet
    )
    external_config.options.skip_leading_rows = 1  # Optionally skip header row.

    table_id = consts.HPO_ID_CONTACT_LIST_TABLE_ID
    table = bigquery.Table(dataset.table(table_id), schema=schema)
    table.external_data_configuration = external_config
    table = client.create_table(table)

    table_content_query = f'SELECT * FROM `{dataset.dataset_id}.{table.table_id}`'

    actual_df = bq.query_sheet_linked_bq_table(self.project_id,
                                               table_content_query,
                                               external_data_scopes)

    expected_dict = [{
        'site_name': 'Fake Site Name 1',
        'hpo_id': 'fake_1',
        'site_point_of_contact':
            'fake.email.1@site_1.fakedomain; fake.email.2@site_1.fakedomain'
    }, {
        'site_name': 'Fake Site Name 2',
        'hpo_id': 'fake_2',
        'site_point_of_contact': 'no data steward'
    }, {
        'site_name': 'Fake Site Name 3',
        'hpo_id': 'fake_3',
        'site_point_of_contact':
            'Fake.Email.1@site_3.fake_domain; Fake.Email.2@site_3.fake_domain'
    }, {
        'site_name': 'Fake Site Name 4',
        'hpo_id': 'fake_4',
        'site_point_of_contact': '[email protected]; [email protected]'
    }]
    expected_df = pd.DataFrame(
        expected_dict,
        columns=["site_name", "hpo_id", "site_point_of_contact"])

    pd.testing.assert_frame_equal(actual_df, expected_df)
def apply_gcs_changes(data_type, **kwargs):
    '''
    Sets up a file in Google Cloud Storage as a temporary table, and merges it
    with an existing table in BigQuery. The file's location in GCS is
    retrieved through XCOM, and the schema for the temporary table is loaded
    from GCS. Data types should be: 'accounts', 'offers', 'trustlines',
    'dimAccounts', 'dimOffers', or 'dimMarkets'.

    Parameters:
        data_type - type of the data being uploaded; should be a string
    Returns:
        N/A
    '''
    key_path = Variable.get('api_key_path')
    credentials = service_account.Credentials.from_service_account_file(
        key_path)
    client = bigquery.Client(credentials=credentials,
                             project=credentials.project_id)

    gcs_bucket_name = Variable.get('gcs_exported_data_bucket_name')
    gcs_filepath = kwargs['task_instance'].xcom_pull(
        task_ids=f'load_{data_type}_to_gcs')
    schema_dict = read_local_schema(data_type)
    bq_schema_list = [
        bigquery.SchemaField(field['name'], field['type'], mode=field['mode'])
        for field in schema_dict
    ]

    external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    external_config.source_uris = [f'gs://{gcs_bucket_name}/{gcs_filepath}']
    external_config.schema = bq_schema_list

    # The temporary table's name matches the file used to create it. Table ids
    # cannot contain '-', so it is replaced with '_'.
    table_id = f'{splitext(basename(gcs_filepath))[0]}'
    table_id = table_id.replace('-', '_')
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    # Check if the table already exists; if it does not, create it using the
    # schema that was already read in
    true_table_id = f'{Variable.get("bq_project")}.{Variable.get("bq_dataset")}.{Variable.get("table_ids", deserialize_json=True)[data_type]}'
    try:
        _ = client.get_table(true_table_id)
    except NotFound:
        logging.info(
            f'Bigquery table not found; creating table with id {true_table_id}')
        table = bigquery.Table(true_table_id, schema=bq_schema_list)
        client.create_table(table)

    if data_type in ['accounts', 'offers', 'trustlines']:
        logging.info('Using merge query...')
        sql_query = create_merge_query(table_id, data_type, schema_dict)
    elif data_type in ['dimAccounts', 'dimOffers', 'dimMarkets']:
        logging.info('Using insert unique query...')
        sql_query = create_insert_unique_query(table_id, data_type, schema_dict)
    else:
        raise AirflowException(f'Data type {data_type} has no corresponding query')

    logging.info(f'Query is: {sql_query}')
    logging.info(f'Running BigQuery job with config: {job_config._properties}')

    query_job = client.query(sql_query, job_config=job_config)
    result_rows = query_job.result()
    if query_job.error_result:
        logging.info(f'Query errors: {query_job.errors}')
        raise AirflowException(f'Query job failed: {query_job.error_result}')

    logging.info(f'Job timeline: {[t._properties for t in query_job.timeline]}')
    logging.info(
        f'{query_job.total_bytes_billed} bytes billed at billing tier {query_job.billing_tier}'
    )
    logging.info(f'Total rows affected: {query_job.num_dml_affected_rows}')
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud import bigquery

SCHEMA = [bigquery.SchemaField('line', 'STRING')]

bq_client = bigquery.Client(project='bigquery-public-data-staging')
table_ref = bq_client.dataset('wikipedia_pipeline').table('view_gcs')
table = bigquery.Table(table_ref, schema=SCHEMA)

extconfig = bigquery.ExternalConfig('CSV')
extconfig.schema = SCHEMA
extconfig.options.field_delimiter = u'\u00ff'
extconfig.options.quote_character = ''
extconfig.compression = 'GZIP'
extconfig.options.allow_jagged_rows = False
extconfig.options.allow_quoted_newlines = False
extconfig.max_bad_records = 10000000
extconfig.source_uris = [
    "gs://wiki-staging/dumps.wikimedia.org/other/pageviews/*"
]
table.external_data_configuration = extconfig

bq_client.create_table(table)
def create(
    self,
    path=None,
    job_config_params=None,
    partitioned=False,
    force_dataset=True,
    if_table_exists="raise",
    if_storage_data_exists="raise",
    if_table_config_exists="raise",
):
    """Creates a BigQuery table at the staging dataset.

    If you add a path, it automatically saves the data in the storage,
    creates a datasets folder and BigQuery location, besides creating the
    table and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>`
    in BigQuery.

    It looks for data saved in Storage at
    `<bucket_name>/staging/<dataset_id>/<table_id>/*` and builds the table.

    It currently supports the types:

    - Comma-Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you
            want to upload to create a table with
        job_config_params (dict): Optional. Job configuration params from
            bigquery
        partitioned (bool): Optional. Whether data is partitioned
        if_table_exists (str): Optional. What to do if the table exists

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery
            Dataset if it doesn't exist.
        if_table_config_exists (str): Optional. What to do if config files
            already exist

            * 'raise' : Raises FileExistsError
            * 'replace' : Replace with blank template
            * 'pass' : Do nothing
        if_storage_data_exists (str): Optional. What to do if data already
            exists on your bucket:

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing

    Todo:

        * Implement if_table_exists=raise
        * Implement if_table_exists=pass
    """
    if path is None:
        # Look if table data already exists at Storage
        data = self.client["storage_staging"].list_blobs(
            self.bucket_name,
            prefix=f"staging/{self.dataset_id}/{self.table_id}")

        # Raise: Cannot create table without external data
        if not data:
            raise BaseDosDadosException(
                "You must provide a path for uploading data")

    # Add data to storage
    if isinstance(path, (str, Path)):
        Storage(self.dataset_id, self.table_id,
                **self.main_vars).upload(path,
                                         mode="staging",
                                         if_exists=if_storage_data_exists)

    # Create Dataset if it doesn't exist
    if force_dataset:
        dataset_obj = Dataset(self.dataset_id, **self.main_vars)
        try:
            dataset_obj.init()
        except FileExistsError:
            pass
        dataset_obj.create(if_exists="pass")

    self.init(
        data_sample_path=path,
        if_folder_exists="replace",
        if_table_config_exists=if_table_config_exists,
    )

    external_config = bigquery.ExternalConfig("CSV")
    external_config.options.skip_leading_rows = 1
    external_config.options.allow_quoted_newlines = True
    external_config.options.allow_jagged_rows = True
    external_config.autodetect = False
    external_config.schema = self._load_schema("staging")
    # source_uris expects a list of URIs
    external_config.source_uris = [
        f"gs://{self.bucket_name}/staging/{self.dataset_id}/{self.table_id}/*"
    ]

    if partitioned:
        hive_partitioning = bigquery.external_config.HivePartitioningOptions()
        hive_partitioning.mode = "AUTO"
        hive_partitioning.source_uri_prefix = self.uri.format(
            dataset=self.dataset_id, table=self.table_id).replace("*", "")
        external_config.hive_partitioning = hive_partitioning

    table = bigquery.Table(self.table_full_name["staging"])
    table.external_data_configuration = external_config

    # Look up whether the table already exists
    table_ref = None
    try:
        table_ref = self.client["bigquery_staging"].get_table(
            self.table_full_name["staging"])
    except google.api_core.exceptions.NotFound:
        pass

    if isinstance(table_ref, google.cloud.bigquery.table.Table):
        if if_table_exists == "pass":
            return None
        elif if_table_exists == "raise":
            raise FileExistsError(
                "Table already exists, choose replace if you want to overwrite it"
            )

    if if_table_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)
#!/usr/bin/env python
"""Import VAT rates from Google Sheets."""
import google.auth
from google.cloud import bigquery

# Use job config with external table definition
external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
external_config.source_uris = [
    "https://docs.google.com/spreadsheets/d/1-eUbaYIuppfwoCyawKEfNVHRx_HJfn_VBOc7c2YYS48"
]
external_config.schema = [
    bigquery.SchemaField("country_code", "STRING"),
    bigquery.SchemaField("country", "STRING"),
    bigquery.SchemaField("vat", "STRING"),
    bigquery.SchemaField("effective_date", "DATE"),
]
external_config.options.skip_leading_rows = 2
job_config = bigquery.QueryJobConfig(
    table_definitions={"vat_rates_sheet": external_config})

# Use credentials that include a google drive scope
credentials, project = google.auth.default(scopes=[
    "https://www.googleapis.com/auth/drive",
    "https://www.googleapis.com/auth/bigquery",
])
client = bigquery.Client(credentials=credentials, project=project)

query = client.query(
    # The snippet was truncated mid-statement at this point; the body below
    # is a plausible reconstruction that materializes the sheet into the
    # target table, not the project's verified SQL.
    """CREATE OR REPLACE TABLE
      `moz-fx-data-shared-prod`.mozilla_vpn_derived.vat_rates_v1
    AS
    SELECT * FROM vat_rates_sheet
    """,
    job_config=job_config,
)
query.result()
def update_from_cloud_storage(args):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = args.google_key_path
    client = bigquery.Client()

    bucket_name = "igenie-tweets"
    blob_name = "historical/{}.json".format("tweets-raw")
    GS_URL = 'gs://{}/{}'.format(bucket_name, blob_name)

    external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON")
    external_config.autodetect = True
    external_config.source_uris = [GS_URL]
    job_config = bigquery.QueryJobConfig()
    job_config.table_definitions = {"temp": external_config}

    file_name = "tweets-enriched.json"

    QUERY = ('SELECT id,'
             'id_str,'
             'constituent,'
             'text,'
             'coordinates,'
             'created_at,'
             'favorited,'
             'place,'
             'lang,'
             'metadata,'
             'retweeted,'
             'relevance,'  # selected so row.relevance below resolves
             'entities.hashtags,'
             'entities.symbols,'
             'source,'
             'user.time_zone,'
             'user.location,'
             'user.friends_count,'
             'user.followers_count,'
             'favorite_count,'
             'retweet_count,'
             'geo ,'
             'search_term '
             'FROM `temp`')
    TIMEOUT = 100  # in seconds

    query_job = client.query(
        QUERY, job_config=job_config)  # API request - starts the query
    assert query_job.state == 'RUNNING'

    # Waits for the query to finish
    iterator = query_job.result(timeout=TIMEOUT)

    with open(file_name, "a") as f:
        for row in iterator:
            # Included attributes
            result = {}
            result["id"] = row.id
            result['id_str'] = row.id_str
            result['text'] = row.text
            result['coordinates'] = row.coordinates
            result['favorited'] = row.favorited
            result['place'] = row.place
            result['lang'] = row.lang
            result['metadata'] = row.metadata
            result['retweeted'] = row.retweeted
            result['entities_hashtags'] = row["entities.hashtags"]
            result['entities_symbols'] = row["entities.symbols"]
            result['source'] = row.source
            result['user_time_zone'] = row["user.time_zone"]
            result['user_location'] = row["user.location"]
            result['user_friends_count'] = row["user.friends_count"]
            result['user_followers_count'] = row["user.followers_count"]
            result['favorite_count'] = row.favorite_count
            result['retweet_count'] = row.retweet_count
            result['geo'] = row.geo
            result['search_term'] = row.search_term

            # Extra attributes

            # constituent_id, constituent_name
            constituent_id, constituent_name = get_constituent_id_name(
                row.constituent)
            result['constituent_id'] = constituent_id
            result['constituent_name'] = constituent_name

            # created at - date
            result['date'] = datetime.strptime(row.created_at,
                                               '%a %b %d %H:%M:%S %z %Y')

            if not row.relevance:
                result["relevance"] = -1
            else:
                result["relevance"] = row.relevance

            # sentiment score
            result["sentiment_score"] = get_nltk_sentiment(row.text)

            update_tags(result)
            f.write(json.dumps(result, cls=MongoEncoder) + '\n')
def create(
    self,
    path=None,
    job_config_params=None,
    partitioned=False,
    if_exists="raise",
    force_dataset=True,
):
    """Creates a BigQuery table at the staging dataset.

    If you add a path, it automatically saves the data in the storage,
    creates a datasets folder and BigQuery location, besides creating the
    table and its configuration files.

    The new table should be located at `<dataset_id>_staging.<table_id>`
    in BigQuery.

    It looks for data saved in Storage at
    `<bucket_name>/staging/<dataset_id>/<table_id>/*` and builds the table.

    It currently supports the types:

    - Comma-Delimited CSV

    Data can also be partitioned following the hive partitioning scheme
    `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

    Args:
        path (str or pathlib.PosixPath): Where to find the file that you
            want to upload to create a table with
        job_config_params (dict): Optional. Job configuration params from
            bigquery
        partitioned (bool): Optional. Whether data is partitioned
        if_exists (str): Optional. What to do if the table exists

            * 'raise' : Raises Conflict exception
            * 'replace' : Replace table
            * 'pass' : Do nothing
        force_dataset (bool): Creates `<dataset_id>` folder and BigQuery
            Dataset if it doesn't exist.

    Todo:

        * Implement if_exists=raise
        * Implement if_exists=pass
    """
    # Add data to storage
    if isinstance(path, (str, PosixPath)):
        Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
            path, mode="staging", if_exists="replace")

    # Create Dataset if it doesn't exist
    if force_dataset:
        dataset_obj = Dataset(self.dataset_id, **self.main_vars)
        try:
            dataset_obj.init()
        except FileExistsError:
            pass
        dataset_obj.create(if_exists="pass")

    self.init(data_sample_path=path, if_exists="replace")

    external_config = bigquery.ExternalConfig("CSV")
    external_config.options.skip_leading_rows = 1
    external_config.options.allow_quoted_newlines = True
    external_config.options.allow_jagged_rows = True
    external_config.autodetect = False
    external_config.schema = self._load_schema("staging")
    # source_uris expects a list of URIs
    external_config.source_uris = [
        f"gs://basedosdados/staging/{self.dataset_id}/{self.table_id}/*"
    ]

    if partitioned:
        hive_partitioning = bigquery.external_config.HivePartitioningOptions()
        hive_partitioning.mode = "AUTO"
        hive_partitioning.source_uri_prefix = self.uri.format(
            dataset=self.dataset_id, table=self.table_id).replace("*", "")
        external_config.hive_partitioning = hive_partitioning

    table = bigquery.Table(self.table_full_name["staging"])
    table.external_data_configuration = external_config

    if if_exists == "replace":
        self.delete(mode="staging")

    self.client["bigquery_staging"].create_table(table)
def create_external_table(self,
                          table_name,
                          source_uris: list = [],
                          schema=None,
                          source_format: str = 'NEWLINE_DELIMITED_JSON',
                          delete_if_exists=True,
                          **kwargs):
    """
    Create a BigQuery external table using source_uris as data files.
    The default source format is 'NEWLINE_DELIMITED_JSON'.
    For a list of all available table source formats, run:
        > bq help mkdef

    If schema is omitted then the table is created with an autodetected
    schema. This only works with source formats which contain their own
    schema, such as json or avro files.

    :param table_name: full table name including dataset name
    :param source_uris: list of Google Storage URI data file paths
        (i.e., gs://bucket_name/myfiles/*)
    :param schema: table schema. If not defined then schema autodetect is used
    :param source_format: default is 'NEWLINE_DELIMITED_JSON'. For the full
        list run > bq help mkdef
    :param delete_if_exists: delete the table if it already exists
    :return: None
    """
    bq = self.client

    # delete existing table
    if delete_if_exists:
        self.delete_table(table_name)

    try:
        # create external table
        full_table_name = f"{bq.project}.{table_name}"
        table = bigquery.Table(full_table_name)

        # external table configurations
        external_table_config = bigquery.ExternalConfig(
            source_format=source_format)
        if schema:
            external_table_config.schema = schema
        else:
            external_table_config.autodetect = True

        external_table_config.ignore_unknown_values = True
        # table_config.max_bad_records = 10

        # apply any extra config passed through kwargs
        for k, v in kwargs.items():
            if hasattr(external_table_config, k):
                setattr(external_table_config, k, v)
            elif hasattr(external_table_config.options, k):
                setattr(external_table_config.options, k, v)

        external_table_config.source_uris = source_uris
        table.external_data_configuration = external_table_config

        logger.info(
            f"creating bigquery external table: {table.dataset_id}.{table.table_id} \t "
            f"format:'{external_table_config.source_format}' "
            f"source_uris: {external_table_config.source_uris}")

        # create the external table
        table = bq.create_table(table)
        logger.info(
            f"created bigquery external table: {table.dataset_id}.{table.table_id} "
            f"timestamp: {table.created}, location: {table.location}")
        return table
    except google_exceptions.BadRequest as err:
        logger.fatal("critical error while creating bigquery external table.")
        logger.fatal(str(err))
        raise err
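A usage sketch for create_external_table showing the kwargs passthrough: any keyword matching an ExternalConfig attribute (or one of its options) is applied via the setattr loop above. The wrapper instance and paths are hypothetical.

# Hypothetical wrapper instance; kwargs are routed by the setattr loop above.
table = warehouse.create_external_table(
    "analytics.events_ext",
    source_uris=["gs://my-bucket/events/*.json"],
    schema=None,         # omitted -> autodetect from the JSON files
    max_bad_records=10,  # matches an ExternalConfig attribute
)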
def _bq_client_create_external_table(
    table_name,
    schema_fields,
    source_objects,
    source_format,
    geojson=False,
    hive_options=None,
    bucket=None,
    post_hook=None,
):
    # TODO: must be fully qualified table name
    ext = bigquery.ExternalConfig(source_format)
    ext.source_uris = source_objects
    ext.autodetect = schema_fields is None
    ext.ignore_unknown_values = True

    if geojson:
        ext.json_extension = "GEOJSON"

    if hive_options:
        assert (len(source_objects) == 1
                ), "cannot use hive partitioning with more than one URI"
        opt = bigquery.external_config.HivePartitioningOptions()
        # _Strongly_ recommend using CUSTOM mode and explicitly-defined
        # key schema for more than a trivial number of files
        opt.mode = hive_options.get("mode", "AUTO")
        opt.require_partition_filter = hive_options.get(
            "require_partition_filter", False)
        # TODO: this is very fragile, we should probably be calculating it from
        # the source_objects and validating the format (prefix, trailing slashes)
        prefix = hive_options["source_uri_prefix"]
        if prefix and bucket:
            opt.source_uri_prefix = bucket + "/" + prefix
        else:
            opt.source_uri_prefix = prefix
        ext.hive_partitioning = opt

    client = bigquery.Client(project=get_project_id(),
                             location=CALITP_BQ_LOCATION)

    dataset_name, _ = table_name.split(".")
    full_dataset_name = ".".join((get_project_id(), dataset_name))

    try:
        client.get_dataset(full_dataset_name)
    except NotFound:
        print(f"Dataset {full_dataset_name} not found, creating.")
        dataset = bigquery.Dataset(full_dataset_name)
        dataset.location = "us-west2"
        client.create_dataset(dataset, timeout=30)

    # for some reason, you can set the project name in the bigquery client, and
    # it doesn't need to be in the SQL code, but this bigquery API still requires
    # the fully qualified table name when initializing a Table.
    full_table_name = f"{get_project_id()}.{table_name}"

    # First delete the table if it exists
    print(f"Deleting external table if exists: {full_table_name}")
    client.delete_table(full_table_name, not_found_ok=True)

    # (re)create the table
    tbl = bigquery.Table(full_table_name, schema_fields)
    tbl.external_data_configuration = ext

    print(
        f"Creating external table: {full_table_name} {tbl} {source_objects} {hive_options}"
    )
    created_table = client.create_table(tbl, timeout=300, exists_ok=True)

    if post_hook:
        client.query(post_hook).result()
        print(f"Successfully ran {post_hook}")

    return created_table
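A hedged invocation sketch for _bq_client_create_external_table with hive partitioning; the table, bucket, and prefix are assumptions, chosen so that source_uri_prefix resolves to bucket + "/" + prefix as the code above requires.

# Hypothetical invocation; all names are illustrative only.
_bq_client_create_external_table(
    table_name="staging.payments",
    schema_fields=None,  # None -> ext.autodetect = True
    source_objects=["gs://my-bucket/payments/dt=2021-01-01/*.jsonl"],
    source_format="NEWLINE_DELIMITED_JSON",
    hive_options={
        "mode": "AUTO",
        "require_partition_filter": False,
        "source_uri_prefix": "payments/",  # combined with the bucket below
    },
    bucket="gs://my-bucket",
)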
def execute(self, data):
    # setup
    client = self.client
    bigquery = self.bigquery
    datetime = self.datetime
    pytz = self.pytz
    time = self.time

    name = data.get("titleName")
    emails = data.get("emails")
    query = data.get("query")
    table = ""

    #
    # create a dataset first if needed
    dataset_main = self.make_dataset()
    table_id = "{}.{}".format(dataset_main, name)

    #
    # create external table
    if (self.env.get("create_external_table")):
        try:
            # Configure the external data source
            dataset_id = dataset_main
            table_id = "{}.{}".format(dataset_main, query)
            schema = [
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("post_abbr", "STRING"),
            ]
            table = bigquery.Table(table_id, schema=schema)
            external_config = bigquery.ExternalConfig("CSV")
            external_config.source_uris = [
                "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
            ]
            external_config.options.skip_leading_rows = 1  # optionally skip header row
            table.external_data_configuration = external_config

            # Create a permanent table linked to the GCS file
            table = client.create_table(table)  # API request

            # Example query to find states starting with 'W'
            sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)

            query_job = client.query(sql)  # API request
            w_states = list(query_job)  # Waits for query to finish
            return ("There are {} states with names starting with W. We "
                    "pulled the data from us-states.csv in cloud storage"
                    ).format(len(w_states))
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occurred, check the output from the backend'

    #
    # create temp external table
    elif (self.env.get("create_temp_external_table")):
        try:
            schema = ["filename", "name"]

            # Configure the external data source and query job.
            external_config = bigquery.ExternalConfig("CSV")
            external_config.source_uris = [
                "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
            ]
            external_config.schema = [
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("post_abbr", "STRING"),
            ]
            external_config.options.skip_leading_rows = 1
            table_id = "usa_states"
            job_config = bigquery.QueryJobConfig(
                table_definitions={table_id: external_config})

            # Example query to find states starting with 'W'.
            sql = """
            SELECT _FILE_NAME AS {},{}
            FROM `{}`
            WHERE name LIKE "W%"
            """.format(schema[0], schema[1], table_id)

            query_job = client.query(sql, job_config=job_config)  # Make an API request.
            query_job.result()

            return json.dumps({
                "schema": [{"field": x} for x in schema],
                "data": [
                    # Row values can be accessed by field name or index.
                    {schema[0]: row[schema[0]], schema[1]: row[schema[1]]}
                    for row in query_job
                ]
            })
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occurred, check the output from the backend'

    #
    # drive create external table
    elif (self.env.get("drive_create_external_table")):
        try:
            dataset_id = dataset_main

            # Configure the external data source.
            dataset = client.get_dataset(dataset_id)
            table_id = query
            schema = [
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("post_abbr", "STRING"),
            ]
            table = bigquery.Table(dataset.table(table_id), schema=schema)
            external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

            # Use a shareable link or grant viewing access to the email address you
            # used to authenticate with BigQuery (this example Sheet is public).
            sheet_url = (
                "https://docs.google.com/spreadsheets/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
            )
            external_config.source_uris = [sheet_url]
            external_config.options.skip_leading_rows = 1  # Optionally skip header row.
            external_config.options.range = (
                "us-states!A20:B49"  # Optionally set range of the sheet to query from.
            )
            table.external_data_configuration = external_config

            # Create a permanent table linked to the Sheets file.
            table = client.create_table(table)  # Make an API request.

            # Example query to find states starting with "W".
            sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
                dataset_id, table_id)

            query_job = client.query(sql)  # Make an API request.

            # Wait for the query to complete.
            w_states = list(query_job)
            return ("There are {} states with names starting with W in the "
                    "selected range. This data came from Google Drive"
                    ).format(len(w_states))
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occurred, check the output from the backend'

    #
    # drive create temp external table
    elif (self.env.get("drive_create_temp_external_table")):
        try:
            schema = ["name", "post_abbr"]

            # Configure the external data source and query job.
            external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
            sheet_url = (
                "https://docs.google.com/spreadsheets"
                "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
            )
            external_config.source_uris = [sheet_url]
            external_config.schema = [
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("post_abbr", "STRING"),
            ]
            external_config.options.skip_leading_rows = 1  # Optionally skip header row.
            external_config.options.range = (
                "us-states!A20:B49"  # Optionally set range of the sheet to query from.
            )
            table_id = "usa_states"
            job_config = bigquery.QueryJobConfig(
                table_definitions={table_id: external_config})

            # Example query to find states starting with 'W'.
            sql = """
            SELECT *
            FROM `{}`
            WHERE name LIKE "W%"
            """.format(table_id)

            query_job = client.query(sql, job_config=job_config)  # Make an API request.
            query_job.result()
            [print(row) for row in query_job]

            return json.dumps({
                "schema": [{"field": x} for x in schema],
                "data": [
                    # Row values can be accessed by field name or index.
                    {schema[0]: row[schema[0]], schema[1]: row[schema[1]]}
                    for row in query_job
                ]
            })
        except BaseException as e:
            print('my custom error\n')
            print(e.__class__.__name__)
            print('\n')
            print(e)
            return 'an error occurred, check the output from the backend'

    #
    return ("Check the backend env dictionary; no flag was set, so the "
            "backend didn't do anything")