def instance_database_bigquery(name_database):
    dataset_ref = client.dataset(name_database)
    dataset = bigquery.Dataset(dataset_ref)
    return dataset
def test_data_scientist_tutorial(delete_dataset):
    # [START bqml_data_scientist_tutorial_create_dataset]
    dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
    dataset.location = 'US'
    client.create_dataset(dataset)
    # [END bqml_data_scientist_tutorial_create_dataset]

    # [START bqml_data_scientist_tutorial_create_model]
    sql = """
        CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
        OPTIONS(model_type='logistic_reg') AS
        SELECT
            IF(totals.transactions IS NULL, 0, 1) AS label,
            IFNULL(device.operatingSystem, "") AS os,
            device.isMobile AS is_mobile,
            IFNULL(geoNetwork.country, "") AS country,
            IFNULL(totals.pageviews, 0) AS pageviews
        FROM
            `bigquery-public-data.google_analytics_sample.ga_sessions_*`
        WHERE
            _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_create_model]

    # [START bqml_data_scientist_tutorial_get_training_statistics]
    sql = """
        SELECT *
        FROM ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_get_training_statistics]

    # [START bqml_data_scientist_tutorial_evaluate_model]
    sql = """
        SELECT *
        FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
            SELECT
                IF(totals.transactions IS NULL, 0, 1) AS label,
                IFNULL(device.operatingSystem, "") AS os,
                device.isMobile AS is_mobile,
                IFNULL(geoNetwork.country, "") AS country,
                IFNULL(totals.pageviews, 0) AS pageviews
            FROM
                `bigquery-public-data.google_analytics_sample.ga_sessions_*`
            WHERE
                _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_evaluate_model]

    # [START bqml_data_scientist_tutorial_predict_transactions]
    sql = """
        SELECT
            country,
            SUM(predicted_label) as total_predicted_purchases
        FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
            SELECT
                IFNULL(device.operatingSystem, "") AS os,
                device.isMobile AS is_mobile,
                IFNULL(totals.pageviews, 0) AS pageviews,
                IFNULL(geoNetwork.country, "") AS country
            FROM
                `bigquery-public-data.google_analytics_sample.ga_sessions_*`
            WHERE
                _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
        GROUP BY country
        ORDER BY total_predicted_purchases DESC
        LIMIT 10
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_predict_transactions]

    # [START bqml_data_scientist_tutorial_predict_purchases]
    sql = """
        SELECT
            fullVisitorId,
            SUM(predicted_label) as total_predicted_purchases
        FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
            SELECT
                IFNULL(device.operatingSystem, "") AS os,
                device.isMobile AS is_mobile,
                IFNULL(totals.pageviews, 0) AS pageviews,
                IFNULL(geoNetwork.country, "") AS country,
                fullVisitorId
            FROM
                `bigquery-public-data.google_analytics_sample.ga_sessions_*`
            WHERE
                _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
        GROUP BY fullVisitorId
        ORDER BY total_predicted_purchases DESC
        LIMIT 10
    """
    df = client.query(sql).to_dataframe()
    print(df)
    # [END bqml_data_scientist_tutorial_predict_purchases]
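
# The test above takes a `delete_dataset` argument, which suggests a pytest
# fixture that tears the tutorial dataset down after the test runs. The
# fixture below is a hypothetical sketch of that cleanup; the fixture name
# and the yield-based teardown are assumptions, not part of the original
# snippet.
import pytest


@pytest.fixture
def delete_dataset():
    yield  # run the test body first
    # Remove the dataset, its tables, and the trained model afterwards.
    client.delete_dataset(
        'bqml_tutorial', delete_contents=True, not_found_ok=True)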
def create_dataset(dataset_name, bigquery_client, location=DEFAULT_LOCATION):
    dataset = bigquery.Dataset('{project_id}.{dataset_name}'.format(
        project_id=bigquery_client.project, dataset_name=dataset_name))
    dataset.location = location
    return bigquery_client.create_dataset(dataset, exists_ok=True)
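
# A minimal usage sketch for the helper above. The client construction and
# the "analytics" dataset name are illustrative assumptions.
from google.cloud import bigquery

bigquery_client = bigquery.Client()
# exists_ok=True makes the call idempotent: re-running it against an
# existing dataset returns the dataset instead of raising Conflict.
dataset = create_dataset('analytics', bigquery_client, location='US')
print('Dataset {} is ready'.format(dataset.dataset_id))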
import os

import requests
from google.cloud import bigquery
from google.oauth2 import service_account
from urllib.parse import urlparse, urlencode, urlunparse, parse_qsl

# Construct a credentials object.
credentials = service_account.Credentials.from_service_account_file(
    '<path of your service account>',
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Construct a BigQuery client object.
bigquery_client = bigquery.Client(credentials=credentials,
                                  project=credentials.project_id)
dataset = bigquery.Dataset('<project-name>.<schema-name>')


def set_query_field(url, field, value, replace=False):
    # Parse out the different parts of the URL.
    components = urlparse(url)
    query_pairs = parse_qsl(urlparse(url).query)
    if replace:
        query_pairs = [(f, v) for (f, v) in query_pairs if f != field]
    query_pairs.append((field, value))
    new_query_str = urlencode(query_pairs)
    # Finally, construct the new URL from the same six components that
    # urlparse() produced, swapping in the rebuilt query string.
    new_components = (components.scheme, components.netloc, components.path,
                      components.params, new_query_str, components.fragment)
    return urlunparse(new_components)
def _create_dataset(client, dataset_ref, location):
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location
    return client.create_dataset(dataset)
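
# Example call for the helper above, sketched with placeholder names; it
# assumes the current google-cloud-bigquery DatasetReference constructor.
from google.cloud import bigquery

client = bigquery.Client()
dataset_ref = bigquery.DatasetReference(client.project, 'staging_area')
dataset = _create_dataset(client, dataset_ref, 'EU')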
from google.cloud import bigquery
from google.api_core.exceptions import Conflict

from pysparkdp.config import Config, logger

Config().init_connect()
bc = bigquery.Client()

# ds == dataset
# Dataset IDs must be alphanumeric (plus underscores)
# and must be at most 1024 characters long.
ds_id = "bigquery_python"
ds_ref = bc.dataset(ds_id)
ds = bigquery.Dataset(ds_ref)
try:
    ds = bc.create_dataset(ds)
except Conflict:
    logger.info("Dataset %s already exists", ds_id)
    ds = bc.get_dataset(ds_ref)

table_id = "lines"
table_ref = ds_ref.table(table_id)
table = bigquery.Table(table_ref)
table.description = "Lines from GS process"
table.schema = [
    bigquery.SchemaField('line', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED')
]
def load_csv_to_table(self, dataset_id, table_id, csv_path,
                      auto_detect_schema=True, schema=None, skip_rows=0,
                      append_to_table=False):
    '''
    :param dataset_id: dataset name
    :param table_id: table name
    :param csv_path: path to exported csv file
    :param auto_detect_schema: auto detect schema types from CSV (default True)
    :param schema: (default None)
        schema = [
            bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED',
                                 description='blah'),
            bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
        ]
    :param skip_rows: how many rows in CSV to skip (default 0)
    :return: None

    Details on loading local data:
    https://cloud.google.com/bigquery/docs/loading-data-local
    '''
    location = 'US'
    dataset_ref = self.client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = location

    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#overwriting_a_table_with_csv_data
    # job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY
    job_config.skip_leading_rows = skip_rows
    job_config.quote_character = '\"'
    if not append_to_table:
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY

    if auto_detect_schema:
        assert not schema, 'Auto-detect is True, but schema is specified'
        job_config.autodetect = True
    else:
        job_config.autodetect = False
        assert schema, 'Auto-detect is False, but no schema specified'
        job_config.schema = schema

    with open(csv_path, 'rb') as csv_file:
        load_table_job = self.client.load_table_from_file(
            csv_file,
            table_ref,
            location=location,
            job_config=job_config,
        )  # API request

    log.info('Uploading table... ')
    try:
        load_table_job.result()  # Wait for load to complete
    except Exception:
        errors = load_table_job.errors
        log.error(errors)
        print(load_table_job.error_result)
        return errors

    table = self.client.get_table(table_ref)
    log.info('''
        {} rows loaded into table {} in dataset {} in project: {}.
        '''.format(load_table_job.output_rows, table.table_id, dataset_id,
                   self.client.project))
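
# A usage sketch for load_csv_to_table, assuming it is a method on a small
# wrapper class that exposes a bigquery.Client as self.client. The class
# name, dataset, table, and file path below are illustrative.
loader = BigQueryLoader()  # hypothetical wrapper holding self.client
loader.load_csv_to_table(
    dataset_id='analytics',
    table_id='daily_export',
    csv_path='/tmp/daily_export.csv',
    auto_detect_schema=False,
    schema=[
        bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
    ],
    skip_rows=1,           # skip the CSV header row
    append_to_table=True,  # keep BigQuery's default append behavior
)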
import json
import os

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

# BigQuery connection
bigquery_client = bigquery.Client.from_service_account_json(
    os.environ['bigquery_creds'])

# Create BigQuery structures if they do not exist.
# Define dataset and table.
dataset = bigquery_client.dataset('trafficdata')
table = dataset.table('chicagodata')

# Create dataset if it does not exist.
try:
    bigquery_client.get_dataset(dataset)
except NotFound:
    dataset_ref = bigquery_client.create_dataset(bigquery.Dataset(dataset))
    print('Dataset created.')

# Create table if it does not exist.
try:
    bigquery_client.get_table(table)
except NotFound:
    schema = [
        bigquery.SchemaField('dateAdded', 'TIMESTAMP', mode='REQUIRED'),
        bigquery.SchemaField('segmentid', 'INTEGER', mode='REQUIRED'),
        bigquery.SchemaField('_strheading', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('_traffic', 'INTEGER', mode='REQUIRED'),
        bigquery.SchemaField('_tost', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('_fromst', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('_last_updt', 'TIMESTAMP', mode='REQUIRED'),
        bigquery.SchemaField('_length', 'FLOAT', mode='REQUIRED'),
def createDatasetBQ(self, dataset_id, client):
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "asia-south1"
    dataset = client.create_dataset(dataset)
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
    return dataset
# Create a new Google BigQuery client using Cloud Platform project defaults.
client = bigquery.Client()

# Reference to project id and new or existing dataset. Note that BigQuery
# dataset IDs may contain only letters, numbers, and underscores, so a
# hyphenated id like "tour-dataset" would be rejected by the API.
project_id = "bigquery-tour"
dataset_id = "tour_dataset"
full_dataset_id = "{}.{}".format(project_id, dataset_id)

# Check whether the dataset already exists within the project; if not,
# create a new BigQuery dataset.
try:
    dataset = client.get_dataset(full_dataset_id)
    print("Dataset {} exists".format(full_dataset_id))
except NotFound:
    print("Dataset {} not found, creating new dataset".format(full_dataset_id))
    db = bigquery.Dataset(full_dataset_id)
    dataset = client.create_dataset(db)

# Reference to new table for storing query results.
table_id = dataset.table("nearest_ports")

# Configure query job and set destination as the new table mentioned above.
job_config = bigquery.QueryJobConfig(destination=table_id)

# Query to find 5 nearest ports to Singapore's JURONG ISLAND port. Project
# ids containing dashes must be backtick-quoted in Standard SQL.
query = """
    SELECT
      port_name,
      ST_DISTANCE(port_geom,
                  (SELECT port_geom
                   FROM `bigquery-public-data.geo_international_ports.world_port_index`
                   WHERE country = "SG" AND port_name = "JURONG ISLAND")
                 ) AS distance_in_meters
    FROM `bigquery-public-data.geo_international_ports.world_port_index`
    ORDER BY 2
def main(mysqlConfigs, redshiftConfigs):
    SCHEMA = [
        bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED'),
        bigquery.SchemaField('name', 'STRING', mode='REQUIRED'),
    ]

    project_id = os.getenv('PROJECT_ID')
    dataset_id = 'testdb'
    table_id = 'testtbl'

    client = bigquery.Client()
    client.delete_dataset(client.dataset(dataset_id), delete_contents=True)

    # The location must be set before create_dataset() is called; assigning
    # it afterwards has no effect on the already-created dataset.
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    dataset.location = 'US'
    dataset = client.create_dataset(dataset)

    table = bigquery.Table(dataset.table(table_id), schema=SCHEMA)
    table = client.create_table(table)

    query = "SELECT * FROM `{}`.`{}`.`{}` limit 100".format(
        project_id, dataset_id, table_id)

    client = bigquery.Client()
    table_ref = client.dataset(dataset_id).table(table_id)
    '''
    table = client.get_table(table_ref)
    rows_to_insert = [
        ('Phred Phlyntstone', 32),
        ('Wylma Phlyntstone', 1),
    ]
    errors = client.insert_rows(table, rows_to_insert)
    print(errors)
    assert errors == []
    '''
    query_job = client.query(query, location='US')
    for row in query_job:
        print(row)

    conn = pymysql.connect(**mysqlConfigs)
    stream = BinLogStreamReader(
        connection_settings=mysqlConfigs,
        server_id=100,
        blocking=True,
        resume_stream=True,
        only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent])
    cursor = conn.cursor()

    for binlogevent in stream:
        e_start_pos, last_pos = stream.log_pos, stream.log_pos
        #print([a for a in dir(binlogevent) if not a.startswith('__')])
        for row in binlogevent.rows:
            event = {"schema": binlogevent.schema,
                     "table": binlogevent.table,
                     "type": type(binlogevent).__name__,
                     "row": row}
            #if isinstance(binlog_event, QueryEvent) and binlog_event.query == 'BEGIN':
            #    e_start_pos = last_pos
            #print(json.dumps(event))
            binlog2sql = concat_sql_from_binlog_event(
                cursor=cursor, binlog_event=binlogevent, row=row,
                e_start_pos=e_start_pos
            ).replace('`', "").replace('testtbl', '`testdb.testtbl`')
            print(binlog2sql)
            query_job = client.query(binlog2sql, location='US')
            result = query_job.result()
def run_natality_tutorial(override_values={}):
    # [START bigquery_query_natality_tutorial]
    """Create a Google BigQuery linear regression input table.

    In the code below, the following actions are taken:
    * A new dataset is created "natality_regression."
    * A query is run against the public dataset,
      bigquery-public-data.samples.natality, selecting only the data of
      interest to the regression, the output of which is stored in a new
      "regression_input" table.
    * The output table is moved over the wire to the user's default project
      via the built-in BigQuery Connector for Spark that bridges BigQuery
      and Cloud Dataproc.
    """
    from google.cloud import bigquery

    # Create a new Google BigQuery client using Google Cloud Platform project
    # defaults.
    client = bigquery.Client()

    # Prepare a reference to a new dataset for storing the query results.
    dataset_id = 'natality_regression'
    # [END bigquery_query_natality_tutorial]
    # To facilitate testing, we replace values with alternatives
    # provided by the testing harness.
    dataset_id = override_values.get("dataset_id", dataset_id)

    # [START bigquery_query_natality_tutorial]
    dataset = bigquery.Dataset(client.dataset(dataset_id))

    # Create the new BigQuery dataset.
    dataset = client.create_dataset(dataset)

    # In the new BigQuery dataset, create a reference to a new table for
    # storing the query results.
    table_ref = dataset.table('regression_input')

    # Configure the query job.
    job_config = bigquery.QueryJobConfig()

    # Set the destination table to the table reference created above.
    job_config.destination = table_ref

    # Set up a query in Standard SQL, which is the default for the BigQuery
    # Python client library.
    # The query selects the fields of interest.
    query = """
        SELECT
            weight_pounds, mother_age, father_age, gestation_weeks,
            weight_gain_pounds, apgar_5min
        FROM
            `bigquery-public-data.samples.natality`
        WHERE
            weight_pounds IS NOT NULL
            AND mother_age IS NOT NULL
            AND father_age IS NOT NULL
            AND gestation_weeks IS NOT NULL
            AND weight_gain_pounds IS NOT NULL
            AND apgar_5min IS NOT NULL
    """

    # Run the query.
    query_job = client.query(query, job_config=job_config)
    query_job.result()  # Waits for the query to finish
def get_dataset(self, name):
    dataset_ref = self._bigquery_client.dataset(name)
    return bigquery.Dataset(dataset_ref)
def create_dataset(self, name):
    dataset_ref = self._bigquery_client.dataset(name)
    dataset = bigquery.Dataset(dataset_ref)
    dataset = self._bigquery_client.create_dataset(dataset)
    return dataset
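
# The two methods above differ in a subtle way: get_dataset() only builds a
# local bigquery.Dataset object from a reference (no API call), while
# create_dataset() actually issues the API request. A hedged sketch of a
# caller, assuming a wrapper class named BQWrapper that sets
# self._bigquery_client:
wrapper = BQWrapper()  # hypothetical
local_only = wrapper.get_dataset('reports')  # no dataset exists on the server yet
created = wrapper.create_dataset('reports')  # API request; dataset now exists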
def test_load_table_add_column(client, to_delete):
    dataset_id = "load_table_add_column_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)
    to_delete.append(dataset)

    snippets_dir = os.path.abspath(os.path.dirname(__file__))
    filepath = os.path.join(snippets_dir, "..", "tests", "data", "people.csv")
    table_ref = dataset_ref.table("my_table")
    old_schema = [bigquery.SchemaField("full_name", "STRING", mode="REQUIRED")]
    table = client.create_table(bigquery.Table(table_ref, schema=old_schema))

    # [START bigquery_add_column_load_append]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_ref = client.dataset('my_dataset')
    # filepath = 'path/to/your_file.csv'

    # Retrieves the destination table and checks the length of the schema
    table_id = "my_table"
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)
    print("Table {} contains {} columns.".format(table_id, len(table.schema)))

    # Configures the load job to append the data to the destination table,
    # allowing field addition
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
    ]
    # In this example, the existing table contains only the 'full_name' column.
    # 'REQUIRED' fields cannot be added to an existing schema, so the
    # additional column must be 'NULLABLE'.
    job_config.schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
    ]
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1

    with open(filepath, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location="US",  # Must match the destination dataset location.
            job_config=job_config,
        )  # API request

    job.result()  # Waits for table load to complete.
    print(
        "Loaded {} rows into {}:{}.".format(
            job.output_rows, dataset_id, table_ref.table_id
        )
    )

    # Checks the updated length of the schema
    table = client.get_table(table)
    print("Table {} now contains {} columns.".format(table_id, len(table.schema)))
    # [END bigquery_add_column_load_append]

    assert len(table.schema) == 2
    assert table.num_rows > 0
def _create_dataset(self):
    from google.cloud import bigquery

    dataset = bigquery.Dataset(self.full_dataset_name)
    dataset.location = 'US'
    self.client.create_dataset(dataset, timeout=30)
def test_load_table_relax_column(client, to_delete):
    dataset_id = "load_table_relax_column_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)
    to_delete.append(dataset)

    snippets_dir = os.path.abspath(os.path.dirname(__file__))
    filepath = os.path.join(snippets_dir, "..", "tests", "data", "people.csv")
    table_ref = dataset_ref.table("my_table")
    old_schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("favorite_color", "STRING", mode="REQUIRED"),
    ]
    table = client.create_table(bigquery.Table(table_ref, schema=old_schema))

    # [START bigquery_relax_column_load_append]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_ref = client.dataset('my_dataset')
    # filepath = 'path/to/your_file.csv'

    # Retrieves the destination table and checks the number of required fields
    table_id = "my_table"
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)
    original_required_fields = sum(
        field.mode == "REQUIRED" for field in table.schema)
    # In this example, the existing table has 3 required fields.
    print("{} fields in the schema are required.".format(
        original_required_fields))

    # Configures the load job to append the data to a destination table,
    # allowing field relaxation
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION
    ]
    # In this example, the existing table contains three required fields
    # ('full_name', 'age', and 'favorite_color'), while the data to load
    # contains only the first two fields.
    job_config.schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1

    with open(filepath, "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            table_ref,
            location="US",  # Must match the destination dataset location.
            job_config=job_config,
        )  # API request

    job.result()  # Waits for table load to complete.
    print(
        "Loaded {} rows into {}:{}.".format(
            job.output_rows, dataset_id, table_ref.table_id
        )
    )

    # Checks the updated number of required fields
    table = client.get_table(table)
    current_required_fields = sum(
        field.mode == "REQUIRED" for field in table.schema)
    print("{} fields in the schema are now required.".format(
        current_required_fields))
    # [END bigquery_relax_column_load_append]

    assert original_required_fields - current_required_fields == 1
    assert len(table.schema) == 3
    assert table.schema[2].mode == "NULLABLE"
    assert table.num_rows > 0
#### Authentication
import json

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=scopes)
client = bigquery.Client(credentials=credentials, project=project)

#### Create dataset if it does not exist
dataset_id_full = f'{client.project}.{dataset_id}'
table_id_full = f"{project}.{dataset_id}.{table_id}"

# Checking only the first entry of list_datasets() would miss the dataset
# whenever other datasets exist, so check membership across all of them.
datasets = list(client.list_datasets())
if dataset_id in [d.dataset_id for d in datasets]:
    dataset = client.get_dataset(dataset_id_full)
else:
    dataset = client.create_dataset(bigquery.Dataset(dataset_id_full))

#### Sort schema_json in the same order as the df columns
def _get_schema(schema_fname, csv_fname, table_id):
    with open(schema_fname, 'r') as fp:
        schema_json = json.load(fp)[table_id]
    df = pd.read_csv(csv_fname)
    schema = list()
    for key in df.columns:
        val = schema_json[key]
        schema.append(
            bigquery.SchemaField(val['name'], val['type'], val['mode'])
        )
    return schema
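
#### Usage sketch for _get_schema (illustrative file names and table id; the
#### surrounding load-job code is an assumption about how the returned
#### schema would typically be consumed)
schema = _get_schema('schemas.json', 'export.csv', 'my_table')
job_config = bigquery.LoadJobConfig()
job_config.skip_leading_rows = 1
job_config.schema = schema
with open('export.csv', 'rb') as f:
    client.load_table_from_file(f, table_id_full,
                                job_config=job_config).result()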
def setup(self):
    """Sets up resources for tests."""
    # Set up BQ datasets to hold staging tables.
    self.staging_dataset_id = "bqbml_test_staging_dataset"
    self.resized_dataset_id = "bqbml_test_resized_staging_dataset"
    self.bq_client = bigquery.Client()
    staging_dataset_ref = self.bq_client.dataset(self.staging_dataset_id)
    self.staging_dataset = bigquery.Dataset(staging_dataset_ref)
    resized_staging_dataset_ref = self.bq_client.dataset(
        self.resized_dataset_id)
    self.resized_dataset = bigquery.Dataset(resized_staging_dataset_ref)

    # Define path that holds schemas used for creating staging tables.
    abs_path = os.path.abspath(os.path.dirname(__file__))
    self.json_schema_path = os.path.join(abs_path, 'test_schemas')

    # Add test schemas to schema path.
    schema_50_string_50_numeric = {
        "fields": [{
            "type": "STRING",
            "name": "string1",
            "mode": "REQUIRED"
        }, {
            "type": "STRING",
            "name": "string2",
            "mode": "REQUIRED"
        }, {
            "type": "NUMERIC",
            "name": "numeric1",
            "mode": "REQUIRED"
        }, {
            "type": "NUMERIC",
            "name": "numeric2",
            "mode": "REQUIRED"
        }]
    }
    with open(self.json_schema_path + '/50_STRING_50_NUMERIC_4.json',
              'w') as sch2:
        json.dump(schema_50_string_50_numeric, sch2)

    # Set up test params.
    self.test_file_parameters = {
        'fileType': ['csv', 'json'],
        'fileCompressionTypes': {
            'csv': ['none'],
            'json': ['none']
        },
        'numColumns': [4],
        'numFiles': [1, 100, 1000, 10000],
        'targetDataSizes': [.000001],
        'stagingDataSizes': ['1KB'],
        'columnTypes': ['50_STRING_50_NUMERIC'],
    }

    # Set up GCS resources needed for the Dataflow job.
    df_staging_bucket_id = 'bq_benchmark_dataflow_test'
    gcs_client = storage.Client()
    self.df_staging_bucket = gcs_client.create_bucket(df_staging_bucket_id)
    staging_blob = self.df_staging_bucket.blob('staging/')
    temp_blob = self.df_staging_bucket.blob('temp/')
    staging_blob.upload_from_string('')
    temp_blob.upload_from_string('')
    self.df_staging_path = 'gs://{0}/staging'.format(df_staging_bucket_id)
    self.df_temp_path = 'gs://{0}/temp'.format(df_staging_bucket_id)
def submit(self, sql, create, dml=None):
    """Submit the sql query to create a de-identified table.

    :param sql: The sql to send.
    :param create: a flag to identify if this query should create a new
        table or append to an existing table.
    :param dml: boolean flag identifying if a statement is a dml statement
    """
    dml = False if dml is None else dml
    table_name = self.get_tablename()
    client = bq.Client.from_service_account_json(self.private_key)

    # Let's make sure the output dataset exists.
    datasets = list(client.list_datasets())
    found = np.sum(
        [1 for dataset in datasets if dataset.dataset_id == self.odataset])
    if not found:
        dataset = bq.Dataset(client.dataset(self.odataset))
        client.create_dataset(dataset)

    # Create the output table.
    if create:
        LOGGER.info('creating new table:\t%s', self.tablename)
        bq_utils.create_standard_table(self.tablename,
                                       self.tablename,
                                       drop_existing=True,
                                       dataset_id=self.odataset)
        write_disposition = bq_consts.WRITE_EMPTY
    else:
        write_disposition = bq_consts.WRITE_APPEND
        LOGGER.info('appending results to table:\t%s', self.tablename)

    job = bq.QueryJobConfig()
    job.priority = self.priority
    job.dry_run = True

    dml_job = None
    if not dml:
        job.destination = client.dataset(self.odataset).table(self.tablename)
        job.use_query_cache = True
        job.allow_large_results = True
        job.write_disposition = write_disposition
        if self.partition:
            job._properties['timePartitioning'] = {'type': 'DAY'}
            job._properties['clustering'] = {'field': 'person_id'}
    else:
        # Create a copy of the job config to use if the dry-run passes.
        dml_job = copy(job)

    LOGGER.info(
        'submitting a dry-run for:\t%s\t\tpriority:\t%s\t\tpartition:\t%s',
        self.get_tablename(), self.priority, self.partition)

    logpath = os.path.join(self.logpath, self.idataset)
    try:
        os.makedirs(logpath)
    except OSError:
        # Log path already exists and we don't care.
        pass

    try:
        response = client.query(sql, location='US', job_config=job)
    except Exception:
        LOGGER.exception(
            'dry run query failed for:\t%s\n'
            '\t\tSQL:\t%s\n'
            '\t\tjob config:\t%s', self.get_tablename(), sql, job)
    else:
        if response.state == 'DONE':
            if dml_job:
                job = dml_job
            job.dry_run = False

            LOGGER.info('dry-run passed. submitting query for execution.')
            response = client.query(sql, location='US', job_config=job)
            LOGGER.info(
                'submitted a %s job for table:\t%s\t\tstatus:\t%s\t\tvalue:\t%s',
                'bigquery', table_name, 'pending', response.job_id)
            self.wait(client, response.job_id)
def __enter__(self):
    client = bigquery.Client()
    dataset = bigquery.Dataset(f"{client.project}.{self.name}")
    dataset.location = "US"
    dataset = client.create_dataset(dataset, exists_ok=True)
    return dataset
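
# __enter__ above is half of a context manager. Below is a hedged sketch of
# the full class it could belong to, with a matching __exit__ that tears the
# dataset down; the class name and the delete-on-exit behavior are
# assumptions, not part of the original snippet.
from google.cloud import bigquery


class TemporaryDataset:
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        client = bigquery.Client()
        dataset = bigquery.Dataset(f"{client.project}.{self.name}")
        dataset.location = "US"
        dataset = client.create_dataset(dataset, exists_ok=True)
        return dataset

    def __exit__(self, exc_type, exc_value, traceback):
        client = bigquery.Client()
        # Remove the dataset and anything created inside it.
        client.delete_dataset(f"{client.project}.{self.name}",
                              delete_contents=True, not_found_ok=True)


# Usage: tables created inside the block live in a throwaway dataset.
# with TemporaryDataset("scratch_ds") as ds:
#     ...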
# In[ ]:


get_ipython().run_cell_magic(
    'bash', '',
    'gcloud config set project $PROJECT\n'
    'gcloud config set compute/region $REGION')


# ## Create BigQuery tables

# If you have not already created a BigQuery dataset for our data, run the
# following cell:

# In[ ]:


# Create a BigQuery dataset for our data.
from google.api_core.exceptions import Conflict

bq = bigquery.Client(project=PROJECT)
dataset = bigquery.Dataset(bq.dataset("taxifare"))
try:
    bq.create_dataset(dataset)
    print("Dataset created")
except Conflict:
    print("Dataset already exists")


# Let's create a table with 1 million examples.
#
# Note that the order of columns is exactly what was in our CSV files.

# In[ ]:
def create_resized_tables(self):
    """Creates resized staging tables using the targetDataSizes parameters.

    Utilizes the bq_table_resizer module from the Dataflow Data Generator
    tool to create resized staging tables.
    """
    staging_dataset_ref = self.bq_client.dataset(self.staging_dataset_id)
    staging_dataset = bigquery.Dataset(staging_dataset_ref)
    sizes = self.file_params['targetDataSizes']
    sizes.sort()
    abs_path = os.path.abspath(os.path.dirname(__file__))
    bq_resizer_path = os.path.join(
        abs_path,
        '../../dataflow-data-generator/bigquery-scripts'
    )

    # Gather staging tables that were created in
    # self.create_staging_tables().
    tables = list(self.bq_client.list_tables(staging_dataset))
    for table in tables:
        for i in range(0, len(sizes)):
            # If the size of the current iteration is the smallest size in
            # the sizes list, use the corresponding staging table created in
            # create_staging_tables() as the source base table. Otherwise,
            # if the size of the current iteration is greater than that of
            # the last iteration, use the resized table from the previous
            # iteration as the source base table, so that each table can
            # take advantage of the size of the last.
            size = sizes[i]
            if size > sizes[i - 1]:
                base = sizes[i - 1]
                source_table = get_resized_table_name(table.table_id, base)
                source_dataset = self.resized_dataset_id
            else:
                source_table = table.table_id
                source_dataset = self.staging_dataset_id
            destination_table = get_resized_table_name(table.table_id, size)
            target_gb = size
            command_str = ('python {0:s}/bq_table_resizer.py '
                           '--project {1:s} '
                           '--source_dataset {2:s} '
                           '--source_table {3:s} '
                           '--destination_dataset {4:s} '
                           '--destination_table {5:s} '
                           '--target_gb {6:f} ')
            command = command_str.format(
                bq_resizer_path,
                self.project,
                source_dataset,
                source_table,
                self.resized_dataset_id,
                destination_table,
                target_gb,
            )
            os.system(command)
            logging.info('Created resized table from {0:s}'.format(
                source_table))
            logging.info('Resized table complete: {0:s}'.format(
                destination_table))
def bigquery(data_directory, ignore_missing_dependency, **params):
    try:
        import google.api_core.exceptions
        from google.cloud import bigquery
    except ImportError:
        msg = 'google-cloud-bigquery dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
    bqclient = bigquery.Client(project=project_id)

    # Create testing dataset.
    testing_dataset = bqclient.dataset('testing')
    try:
        bqclient.create_dataset(bigquery.Dataset(testing_dataset))
    except google.api_core.exceptions.Conflict:
        pass  # Skip if already created.

    # Set up main data table.
    data_directory = Path(data_directory)
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    functional_alltypes_schema = []
    schema_path = data_directory / 'functional_alltypes_bigquery_schema.json'
    with open(str(schema_path)) as schemafile:
        schema_json = json.load(schemafile)
        for field in schema_json:
            functional_alltypes_schema.append(
                bigquery.SchemaField.from_api_repr(field)
            )
    load_config = bigquery.LoadJobConfig()
    load_config.skip_leading_rows = 1  # skip the header row.
    load_config.schema = functional_alltypes_schema

    # Load main data table.
    functional_alltypes_schema = []
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes'),
            job_config=load_config,
        ).result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Load an ingestion time partitioned table.
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        load_config.time_partitioning = bigquery.TimePartitioning()
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes_parted'),
            job_config=load_config,
        ).result()
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Create a table with complex data types (nested and repeated).
    struct_table_path = data_directory / 'struct_table.avro'
    with open(str(struct_table_path), 'rb') as avrofile:
        load_config = bigquery.LoadJobConfig()
        load_config.source_format = 'AVRO'
        job = bqclient.load_table_from_file(
            avrofile,
            testing_dataset.table('struct_table'),
            job_config=load_config,
        )
    if job.error_result:
        raise click.ClickException(str(job.error_result))

    # Create empty date-partitioned table.
    date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
    date_table.schema = [
        bigquery.SchemaField('my_date_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    date_table.time_partitioning = bigquery.TimePartitioning(
        field='my_date_parted_col'
    )
    bqclient.create_table(date_table)

    # Create empty timestamp-partitioned tables.
    timestamp_table = bigquery.Table(
        testing_dataset.table('timestamp_column_parted')
    )
    timestamp_table.schema = [
        bigquery.SchemaField('my_timestamp_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    timestamp_table.time_partitioning = bigquery.TimePartitioning(
        field='my_timestamp_parted_col'
    )
    bqclient.create_table(timestamp_table)

    # Create a table with a numeric column.
    numeric_table = bigquery.Table(testing_dataset.table('numeric_table'))
    numeric_table.schema = [
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('numeric_col', 'NUMERIC'),
    ]
    bqclient.create_table(numeric_table)

    df = pd.read_csv(
        str(data_directory / 'functional_alltypes.csv'),
        usecols=['string_col', 'double_col'],
        header=0,
    )
    with tempfile.NamedTemporaryFile(mode='a+b') as csvfile:
        df.to_csv(csvfile, header=False, index=False)
        csvfile.seek(0)

        load_config = bigquery.LoadJobConfig()
        load_config.skip_leading_rows = 1  # skip the header row.
        load_config.schema = numeric_table.schema

        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('numeric_table'),
            job_config=load_config,
        ).result()
        if job.error_result:
            raise click.ClickException(str(job.error_result))
def create_dataset():
    print('Creating destination dataset')
    dataset_ref = bq_client.dataset(TO_DATASET, project=TO_PROJECT)
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = TO_LOCATION
    dataset = bq_client.create_dataset(dataset)
def run_authorized_view_tutorial():
    # Note to user: This is a group email for testing purposes. Replace with
    # your own group email address when running this code.
    analyst_group_email = '*****@*****.**'

    # [START bigquery_authorized_view_tutorial]
    # Create a source dataset
    # [START bigquery_avt_create_source_dataset]
    from google.cloud import bigquery

    client = bigquery.Client()
    source_dataset_id = 'github_source_data'

    source_dataset = bigquery.Dataset(client.dataset(source_dataset_id))
    # Specify the geographic location where the dataset should reside.
    source_dataset.location = 'US'
    source_dataset = client.create_dataset(source_dataset)  # API request
    # [END bigquery_avt_create_source_dataset]

    # Populate a source table
    # [START bigquery_avt_create_source_table]
    source_table_id = 'github_contributors'
    job_config = bigquery.QueryJobConfig()
    job_config.destination = source_dataset.table(source_table_id)
    sql = """
        SELECT commit, author, committer, repo_name
        FROM `bigquery-public-data.github_repos.commits`
        LIMIT 1000
    """
    query_job = client.query(
        sql,
        # Location must match that of the dataset(s) referenced in the query
        # and of the destination table.
        location='US',
        job_config=job_config)  # API request - starts the query

    query_job.result()  # Waits for the query to finish
    # [END bigquery_avt_create_source_table]

    # Create a separate dataset to store your view
    # [START bigquery_avt_create_shared_dataset]
    shared_dataset_id = 'shared_views'
    shared_dataset = bigquery.Dataset(client.dataset(shared_dataset_id))
    shared_dataset.location = 'US'
    shared_dataset = client.create_dataset(shared_dataset)  # API request
    # [END bigquery_avt_create_shared_dataset]

    # Create the view in the new dataset
    # [START bigquery_avt_create_view]
    shared_view_id = 'github_analyst_view'
    view = bigquery.Table(shared_dataset.table(shared_view_id))
    sql_template = """
        SELECT
            commit, author.name as author,
            committer.name as committer, repo_name
        FROM
            `{}.{}.{}`
    """
    view.view_query = sql_template.format(
        client.project, source_dataset_id, source_table_id)
    view = client.create_table(view)  # API request
    # [END bigquery_avt_create_view]

    # Assign access controls to the dataset containing the view
    # [START bigquery_avt_shared_dataset_access]
    # analyst_group_email = '*****@*****.**'
    access_entries = shared_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry('READER', 'groupByEmail', analyst_group_email)
    )
    shared_dataset.access_entries = access_entries
    shared_dataset = client.update_dataset(
        shared_dataset, ['access_entries'])  # API request
    # [END bigquery_avt_shared_dataset_access]

    # Authorize the view to access the source dataset
    # [START bigquery_avt_source_dataset_access]
    access_entries = source_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry(None, 'view', view.reference.to_api_repr())
    )
    source_dataset.access_entries = access_entries
    source_dataset = client.update_dataset(
        source_dataset, ['access_entries'])  # API request
    # [END bigquery_avt_source_dataset_access]
    # [END bigquery_authorized_view_tutorial]

    return (source_dataset, shared_dataset)
if datasets:
    print("Datasets in project {}:".format(project))
    for dataset in datasets:  # API request(s)
        print("\t{}".format(dataset.dataset_id))
else:
    print("{} project does not contain any datasets.".format(project))

dataset_id = "hackathon_dataset"
print(f"dataset ids: {[d.dataset_id for d in datasets]}")

if dataset_id in [d.dataset_id for d in datasets]:
    print("Already created")
else:
    # Construct a full Dataset object to send to the API. When built from a
    # string, Dataset() needs the fully qualified "<project>.<dataset>" id,
    # not the bare dataset id.
    dataset = bigquery.Dataset(f"{client.project}.{dataset_id}")
    dataset = client.create_dataset(dataset)  # API request
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

# Read Airports avro file and append to dataset.
airports_uri = "gs://hackathon-mtl-2019/avro/flight_airports.avro/*"
airport_table_id = "flight_airports"

job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.source_format = bigquery.SourceFormat.AVRO

dataset_ref = client.dataset(dataset_id)
def test_manage_views(client, to_delete):
    project = client.project
    source_dataset_id = "source_dataset_{}".format(_millis())
    source_dataset_ref = client.dataset(source_dataset_id)
    source_dataset = bigquery.Dataset(source_dataset_ref)
    source_dataset = client.create_dataset(source_dataset)
    to_delete.append(source_dataset)

    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    job_config.skip_leading_rows = 1
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    source_table_id = "us_states"
    load_job = client.load_table_from_uri(
        uri, source_dataset.table(source_table_id), job_config=job_config
    )
    load_job.result()

    shared_dataset_id = "shared_dataset_{}".format(_millis())
    shared_dataset_ref = client.dataset(shared_dataset_id)
    shared_dataset = bigquery.Dataset(shared_dataset_ref)
    shared_dataset = client.create_dataset(shared_dataset)
    to_delete.append(shared_dataset)

    # [START bigquery_create_view]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = 'my-project'
    # source_dataset_id = 'my_source_dataset'
    # source_table_id = 'us_states'
    # shared_dataset_ref = client.dataset('my_shared_dataset')

    # This example shows how to create a shared view of a source table of
    # US States. The source table contains all 50 states, while the view will
    # contain only states with names starting with 'W'.
    view_ref = shared_dataset_ref.table("my_shared_view")
    view = bigquery.Table(view_ref)
    sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "W%"'
    view.view_query = sql_template.format(
        project, source_dataset_id, source_table_id)
    view = client.create_table(view)  # API request

    print("Successfully created view at {}".format(view.full_table_id))
    # [END bigquery_create_view]

    # [START bigquery_update_view_query]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = 'my-project'
    # source_dataset_id = 'my_source_dataset'
    # source_table_id = 'us_states'
    # shared_dataset_ref = client.dataset('my_shared_dataset')

    # This example shows how to update a shared view of a source table of
    # US States. The view's query will be updated to contain only states
    # with names starting with 'M'.
    view_ref = shared_dataset_ref.table("my_shared_view")
    view = bigquery.Table(view_ref)
    sql_template = 'SELECT name, post_abbr FROM `{}.{}.{}` WHERE name LIKE "M%"'
    view.view_query = sql_template.format(
        project, source_dataset_id, source_table_id)
    view = client.update_table(view, ["view_query"])  # API request
    # [END bigquery_update_view_query]

    # [START bigquery_get_view]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # shared_dataset_id = 'my_shared_dataset'

    view_ref = client.dataset(shared_dataset_id).table("my_shared_view")
    view = client.get_table(view_ref)  # API Request

    # Display view properties
    print("View at {}".format(view.full_table_id))
    print("View Query:\n{}".format(view.view_query))
    # [END bigquery_get_view]

    assert view.view_query is not None

    analyst_group_email = "*****@*****.**"
    # [START bigquery_grant_view_access]
    # from google.cloud import bigquery
    # client = bigquery.Client()

    # Assign access controls to the dataset containing the view
    # shared_dataset_id = 'my_shared_dataset'
    # analyst_group_email = '*****@*****.**'
    shared_dataset = client.get_dataset(
        client.dataset(shared_dataset_id)
    )  # API request
    access_entries = shared_dataset.access_entries
    access_entries.append(
        bigquery.AccessEntry("READER", "groupByEmail", analyst_group_email)
    )
    shared_dataset.access_entries = access_entries
    shared_dataset = client.update_dataset(
        shared_dataset, ["access_entries"]
    )  # API request

    # Authorize the view to access the source dataset
    # project = 'my-project'
    # source_dataset_id = 'my_source_dataset'
    source_dataset = client.get_dataset(
        client.dataset(source_dataset_id)
    )  # API request
    view_reference = {
        "projectId": project,
        "datasetId": shared_dataset_id,
        "tableId": "my_shared_view",
    }
    access_entries = source_dataset.access_entries
    access_entries.append(bigquery.AccessEntry(None, "view", view_reference))
    source_dataset.access_entries = access_entries
    source_dataset = client.update_dataset(
        source_dataset, ["access_entries"]
    )  # API request
    exit()

project_name = sys.argv[1]
bucket_name = sys.argv[2]

# Set up our GCS, BigQuery, and Natural Language clients.
storage_client = storage.Client()
bq_client = bigquery.Client(project=project_name)
nl_client = language.LanguageServiceClient()

# Set up client objects for the Vision and translate_v2 API libraries.
vision_client = vision_v1.ImageAnnotatorClient()

# Set up the BigQuery dataset and table objects.
dataset_ref = bq_client.dataset('image_classification_dataset')
dataset = bigquery.Dataset(dataset_ref)
table_ref = dataset.table('image_text_detail')
table = bq_client.get_table(table_ref)

# Create an array to store results data to be inserted into the BigQuery
# table.
rows_for_bq = []

# Get a list of the files in the Cloud Storage bucket.
files = storage_client.bucket(bucket_name).list_blobs()
bucket = storage_client.bucket(bucket_name)

print('Processing image files from GCS. This will take a few minutes..')

# Process files from Cloud Storage and save the result to send to BigQuery.
for file in files:
    if file.name.endswith('jpg') or file.name.endswith('png'):
import csv

from google.api_core.exceptions import NotFound
from google.cloud import bigquery
from google.cloud.bigquery import LoadJobConfig

# TODO: support for multiple datasets
DATASET_NAME = 'g_sheets'

# from_service_account_json is a classmethod, so call it on the class rather
# than on an already-constructed client.
client = bigquery.Client.from_service_account_json('service-account-key.json')
dataset = client.dataset(DATASET_NAME)

if DATASET_NAME not in [l.dataset_id for l in client.list_datasets()]:
    dataset = client.create_dataset(bigquery.Dataset(dataset))


def load_from_file(file_path, table):
    table_ref = dataset.table(table)
    load_config = LoadJobConfig()
    load_config.autodetect = True
    with open(file_path, 'rb') as readable:
        result = client.load_table_from_file(readable, table_ref,
                                             job_config=load_config)
    # return result
    return table_ref.dataset_id + '.' + table_ref.table_id


def table_exists(table_name):
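
# The snippet cuts off at table_exists(). A plausible completion, sketched
# under the assumption that it probes the module-level dataset for the table
# using the NotFound exception already imported above:
def table_exists(table_name):
    try:
        client.get_table(dataset.table(table_name))
        return True
    except NotFound:
        return False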