def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(
        bigquery_client, dataset_id):
    """Test that a DataFrame containing a column with None values and Int64 dtype
    can be uploaded without specifying a schema.

    https://github.com/googleapis/python-bigquery/issues/22
    """
    table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format(
        bigquery_client.project, dataset_id)
    df_data = collections.OrderedDict(
        [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))])
    dataframe = pandas.DataFrame(df_data, columns=df_data.keys())
    load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
    load_job.result()
    table = bigquery_client.get_table(table_id)
    assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),)
    assert table.num_rows == 4

def gcs_to_gbq():
    # Connect to Google Cloud Storage and set the bucket
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(config.gcp['bucket_name'])
    blobs = bucket.list_blobs()

    # Connect to Google BigQuery and define BigQuery options
    client = bigquery.Client()
    dataset_ref = client.dataset(config.gcp['dataset_id'])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.allow_quoted_newlines = True
    job_config.allow_jagged_rows = True
    job_config.autodetect = False
    job_config.skip_leading_rows = 1
    job_config.field_delimiter = "|"
    job_config.source_format = bigquery.SourceFormat.CSV

    # Iterate through all files of the defined Google Cloud Storage bucket
    for blob in blobs:
        # Set dynamic URI for the current Cloud Storage file and BigQuery schema file
        uri = 'gs://' + config.gcp['bucket_name'] + '/' + blob.name
        file_name_json = blob.name.replace(".csv", ".json")
        file_name_json_path = os.path.join(config.general['csv_cache_folder'], file_name_json)

        # Load the JSON file for the schema and set the schema fields for BigQuery
        with open(file_name_json_path) as input_json:
            input_json_config = json.load(input_json)
        job_config.schema = [
            bigquery.SchemaField(item["name"], item["type"])
            for item in input_json_config["fields"]
        ]

        # Set dynamic table name for Google BigQuery
        table_name = blob.name.replace(".csv", "")

        # Create new BigQuery table / replace the existing one
        load_job = client.load_table_from_uri(uri, dataset_ref.table(table_name),
                                              job_config=job_config)
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'

    # Send Slack message when the load jobs are done
    send_message_to_slack(
        ':bigquery: Backend Data Sync: Update DWH Data OK :thumbs-up-green:',
        config.slack_hooks['slack_channel_1'])

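# gcs_to_gbq calls send_message_to_slack, which is not part of this snippet. A
# minimal sketch, assuming the second argument is a Slack incoming-webhook URL
# (an assumption; the original helper is not shown):
import json

import requests

def send_message_to_slack(text, webhook_url):
    # Post a simple text payload to the Slack incoming webhook.
    requests.post(webhook_url,
                  data=json.dumps({"text": text}),
                  headers={"Content-Type": "application/json"})
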
def _construct_schema(uuid):
    """
    Creates a BigQuery schema for the data set with the given uuid.

    https://cloud.google.com/bigquery/docs/schemas
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.schema.SchemaField.html
    https://socratadiscovery.docs.apiary.io/#reference/0/find-by-id/search-by-id
    """
    catalog_url = '{0}/api/catalog/v1?ids={1}'.format(URI, uuid)
    response = urllib.request.urlopen(catalog_url, context=context)
    catalog_data = json.load(response)["results"][0]["resource"]

    schema = []
    for i in range(0, len(catalog_data["columns_field_name"])):
        name = catalog_data["columns_field_name"][i]
        field_type = _encode_datatype(catalog_data["columns_datatype"][i])
        description = catalog_data["columns_description"][i]
        schema.append(bigquery.SchemaField(name, field_type, mode='NULLABLE',
                                           description=description))
    return schema

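# _construct_schema relies on _encode_datatype, which is not shown here. A
# minimal sketch that maps common Socrata column datatypes to BigQuery types;
# the mapping below is an assumption, not the original implementation:
def _encode_datatype(socrata_datatype):
    type_map = {
        'text': 'STRING',
        'url': 'STRING',
        'number': 'FLOAT',
        'calendar date': 'TIMESTAMP',
        'checkbox': 'BOOLEAN',
        'point': 'STRING',
    }
    # Fall back to STRING for any datatype not in the map.
    return type_map.get(socrata_datatype.lower(), 'STRING')
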
def prepare_steaming_sink(project_id, bq_dataset, bq_table):
    # Create the BigQuery table (uses the table.exists()/create() API of older
    # google-cloud-bigquery releases)
    schema = [bigquery.SchemaField(field_name, data_type)
              for field_name, data_type in BQ_SCHEMA_DEF.items()]

    bq_client = bigquery.Client(project=project_id)
    dataset = bigquery.Dataset(bq_dataset, bq_client)
    table = bigquery.Table(bq_table, dataset, schema=schema)

    if table.exists():
        print('Deleting BQ Table {}...'.format(bq_table))
        table.delete()

    print('Creating BQ Table {}...'.format(bq_table))
    table.create()
    print('BQ Table {} is up and running'.format(bq_table))
    print("")

def finalize(self, client):
    i_dataset, i_table = self.get_tablename().split('.')
    ischema = client.get_table(client.dataset(i_dataset).table(i_table)).schema
    table = client.get_table(client.dataset(i_dataset + '_deid').table(i_table))
    fields = [field.name for field in table.schema]
    newfields = []
    for field in ischema:
        if field.name in fields:
            temp_field = bq.SchemaField(
                name=field.name,
                field_type=field.field_type,
                description=field.description,
                mode=field.mode
            )
            newfields.append(temp_field)
    table.schema = newfields
    client.update_table(table, ['schema'])

def test_copy_bq_views_not_table(self, mock_table_exists: mock.MagicMock,
                                 mock_copy_view: mock.MagicMock) -> None:
    """Check that copy_view is not called on a table that does not have a view_query."""
    table_ref = bigquery.TableReference(self.mock_source_dataset, "table_not_view")
    schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")]
    table_not_view = bigquery.Table(table_ref, schema_fields)

    self.mock_client.list_tables.return_value = [table_not_view]
    self.mock_client.get_table.return_value = table_not_view
    mock_table_exists.side_effect = self.table_exists_side_effect

    copy_bq_views(
        source_project_id=self.mock_source_project_id,
        source_dataset_id=self.mock_source_dataset_id,
        destination_project_id=self.mock_destination_project_id,
        destination_dataset_id=self.mock_destination_dataset_id,
    )

    mock_copy_view.assert_not_called()

def update_table(self):
    if is_exist(self.dataset_id, table_id=self.table_id):
        table = client.get_table(self.table_ref)
        original_schema = table.schema
        new_schema = [
            bigquery.SchemaField(field["name"], field["type"])
            for field in self.parse_file.schema()
        ]
        if not schema_match(original_schema, new_schema):
            print("Recreating table {} with new schema...".format(self.table_id))
            self.create_table()
    else:
        print("Table does not exist. Creating {} table...".format(self.table_id))
        self.create_table()

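# update_table depends on helpers such as is_exist and schema_match that are
# defined elsewhere. A minimal sketch of schema_match, assuming two schemas
# match when field names and types agree in the same order (an assumption):
def schema_match(original_schema, new_schema):
    original = [(f.name, f.field_type.upper()) for f in original_schema]
    new = [(f.name, f.field_type.upper()) for f in new_schema]
    return original == new
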
def _extract_schema(self, df):
    schema = []
    for column_name, dtype in dict(df.dtypes).items():
        type_map = {
            np.dtype('object'): bigquery.SchemaField(column_name, 'STRING'),
            np.dtype('int32'): bigquery.SchemaField(column_name, 'INTEGER'),
            np.dtype('int64'): bigquery.SchemaField(column_name, 'INTEGER'),
            np.dtype('float64'): bigquery.SchemaField(column_name, 'FLOAT'),
            np.dtype('datetime64[ns]'): bigquery.SchemaField(column_name, 'DATE'),
            pd.DatetimeTZDtype(tz='UTC'): bigquery.SchemaField(column_name, 'DATE'),
        }
        schema.append(type_map[dtype])
    return schema

def uploadToBigQuery(df):
    # Since string columns use the "object" dtype, pass in a (partial) schema
    # to ensure the correct BigQuery data type.
    job_config = bigquery.LoadJobConfig(schema=[
        bigquery.SchemaField("Timestamp", "TIMESTAMP"),
        bigquery.SchemaField("Activity", "STRING"),
        bigquery.SchemaField("Description", "STRING"),
        bigquery.SchemaField("Location", "STRING"),
        bigquery.SchemaField("SoraRank", "INTEGER"),
        bigquery.SchemaField("client_ip", "STRING"),
    ])

    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)

    # Wait for the load job to complete.
    print(job.result())

def create_ga_kb_exit_rate_by_product():
    schema = [
        bigquery.SchemaField('ga_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitPagePath', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitRate', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('ga_exits', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('ga_pageviews', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('product', 'STRING', mode='NULLABLE'),
    ]
    table_ref = dataset_ref.table('ga_kb_exit_rate_by_product')
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)  # API request
    assert table.table_id == 'ga_kb_exit_rate_by_product'

def csv_loader(data, context):
    client = bigquery.Client()
    dataset_id = os.environ['DATASET']
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = [
        bigquery.SchemaField('Date', 'DATE'),
        bigquery.SchemaField('Fund', 'STRING'),
        bigquery.SchemaField('Company', 'STRING'),
        bigquery.SchemaField('Ticker', 'STRING'),
        bigquery.SchemaField('CUSIP', 'STRING'),
        bigquery.SchemaField('Shares', 'FLOAT'),
        bigquery.SchemaField('Market_Value', 'FLOAT'),
        bigquery.SchemaField('Weight', 'FLOAT'),
    ]
    job_config.skip_leading_rows = 1
    job_config.source_format = bigquery.SourceFormat.CSV

    # Get the URI for the uploaded CSV in GCS from 'data'
    uri = f"gs://{os.environ['BUCKET']}/{data['name']}"

    # Let's do this
    load_job = client.load_table_from_uri(
        uri, dataset_ref.table(os.environ['TABLE']), job_config=job_config)
    print(f'Starting job: {load_job.job_id}')
    print(f"Function=csv_loader, Version={os.environ['VERSION']}")
    print(f"Loading file: {data['name']}")

    load_job.result()  # wait for table load to complete
    print('Job finished.')

    destination_table = client.get_table(dataset_ref.table(os.environ['TABLE']))
    print(f"Table: {os.environ['TABLE']} now has {destination_table.num_rows} rows.")

    time.sleep(1)
    tbl = f"{dataset_id}.{os.environ['TABLE']}"
    dedup(tbl)

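# csv_loader ends by calling dedup(tbl), which is not shown here. A minimal
# sketch that rewrites the table with only distinct rows (an assumption about
# what dedup does in the original):
from google.cloud import bigquery

def dedup(table_id):
    # table_id is "dataset.table"; qualify it with the client's project.
    client = bigquery.Client()
    destination = bigquery.TableReference.from_string(
        table_id, default_project=client.project)
    job_config = bigquery.QueryJobConfig(
        destination=destination,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    client.query("SELECT DISTINCT * FROM `{}`".format(table_id),
                 job_config=job_config).result()
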
def create_gtrends_queries(dataset_name, table_name):
    schema = [
        bigquery.SchemaField('update_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('region', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('original_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('translated_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('query_key_ts', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('search_increase_pct', 'INTEGER', mode='NULLABLE'),
    ]
    dataset_ref = client.dataset(dataset_name)
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)

def stage_imdb_title_principals():
    uri = 'gs://udacity-de/imdb/title.principals.tsv.gz'
    source_format = bigquery.SourceFormat.CSV
    table_id = 'imdb.title_principals'
    schema = [
        bigquery.SchemaField("tconst", "STRING"),
        bigquery.SchemaField("ordering", "INT64"),
        bigquery.SchemaField("nconst", "STRING"),
        bigquery.SchemaField("category", "STRING"),
        bigquery.SchemaField("job", "STRING"),
        bigquery.SchemaField("characters", "STRING"),
    ]
    field_delimiter = '\t'
    load_csv(uri, table_id, schema, field_delimiter)

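# stage_imdb_title_principals delegates to a load_csv helper that is not shown.
# A minimal sketch with the same call signature; the job options below
# (CSV with a header row) are assumptions:
from google.cloud import bigquery

def load_csv(uri, table_id, schema, field_delimiter=','):
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        source_format=bigquery.SourceFormat.CSV,
        field_delimiter=field_delimiter,
        skip_leading_rows=1,  # assumes the source file has a header row
    )
    load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()  # wait for the load to finish
    return client.get_table(table_id)
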
def push_data_to_bigquery(client, df, dataset_id, table_id, date_column_name):
    table_schema = []
    for name, dtype in zip(df.columns, df.dtypes):
        if name == date_column_name:
            table_schema.append('DATETIME')
        elif dtype.name == 'object':
            table_schema.append('STRING')
        else:
            table_schema.append(str(dtype).upper())

    # Assigning the references
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)

    # Extracting the column names
    bq_column_names = list(df.columns)

    # Converting the date column
    df[date_column_name] = pd.to_datetime(df[date_column_name])
    df[date_column_name] = df[date_column_name].dt.strftime("%Y-%m-%dT%H:%M:%S")

    # Customise the LoadJobConfig setup
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON

    # Dynamically creating the schema from the two lists (column names and types)
    schema_results = []
    for name, schema in zip(bq_column_names, table_schema):
        schema_results.append(
            bigquery.SchemaField(name, schema, mode='NULLABLE'))
    job_config.schema = schema_results

    # Running the job inside of a StringIO stream
    with io.StringIO(df.to_json(orient="records", lines=True)) as source_file:
        job = client.load_table_from_file(source_file, table_ref,
                                          job_config=job_config)
        job.result()

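# A usage sketch for push_data_to_bigquery; the client, dataset/table names,
# and DataFrame contents below are illustrative only:
import pandas as pd

from google.cloud import bigquery

client = bigquery.Client()
events = pd.DataFrame({
    "event_time": ["2021-01-01 10:00:00", "2021-01-01 11:30:00"],
    "event_name": ["signup", "purchase"],
    "value": [0, 25],
})
# event_time becomes DATETIME, event_name STRING, value INT64
push_data_to_bigquery(client, events, "analytics", "events", "event_time")
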
def create_table(client, table_name, schema, project=None, dataset=None,
                 partitioning_type=None, partitioned_field=None, clustering_fields=None):
    """
    Create a table with the given schema.

    :param client: BQ client
    :param table_name: table name
    :param schema: table schema
    :param project: defaults to client.project
    :param dataset: defaults to client.dataset
    :param partitioning_type: either `time` or `range`
    :param partitioned_field: field name to use for partitioning
    :param clustering_fields: fields to use for clustering
    :return: the created table
    """
    partitioning_types = {
        "time": TimePartitioning(type_=TimePartitioningType.HOUR,
                                 field=partitioned_field,
                                 require_partition_filter=True),
        "range": RangePartitioning(range_=PartitionRange(start=0, end=100, interval=10),
                                   field=partitioned_field),
    }
    try:
        if project is None:
            project = client.project
        if dataset is None:
            dataset = client.dataset
        logging.info("Project: {}\tDataset: {}\tTable: {}\t\tPartitioning Type: {}".format(
            project, dataset.dataset_id, table_name, partitioning_type))
        table_id = "{}.{}.{}".format(project, dataset.dataset_id, table_name)
        table = bigquery.Table(table_id, schema=schema)

        if partitioning_type is not None:
            partitioning_type = partitioning_type.lower()
            if partitioning_type == "time":
                logging.info("Table Partitioning: {}".format(partitioning_type))
                schema.append(bigquery.SchemaField("ZONE_PARTITIONTIME", "TIMESTAMP"))
                table.schema = schema
                table.time_partitioning = partitioning_types.get(partitioning_type)
            elif partitioning_type == "range":
                table.range_partitioning = partitioning_types.get(partitioning_type)

        if clustering_fields is not None:
            table.clustering_fields = clustering_fields

        client.create_table(table, exists_ok=True)
        table = client.get_table(table)
        logging.info("Table {} created successfully.".format(table_id))
        return table
    except Exception as error:
        raise error

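# A usage sketch for create_table with hourly time partitioning; the dataset
# and field names below are illustrative only:
client = bigquery.Client()
dataset = client.get_dataset("analytics")
schema = [
    bigquery.SchemaField("event_name", "STRING"),
    bigquery.SchemaField("event_ts", "TIMESTAMP"),
]
create_table(client, "events", schema, dataset=dataset,
             partitioning_type="time", partitioned_field="event_ts")
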
def download_race_pitstop(year, round_start, round_end, key_path):
    round_num = round_start

    # Obtain schema for upload
    bigquery_client = create_bigquery_client(key_path)
    dataset = bigquery_client.dataset('F1_Modelling_Raw')
    schema_for_upload = [
        bigquery.SchemaField("year", "INTEGER"),
        bigquery.SchemaField("round", "INTEGER"),
        bigquery.SchemaField("driverId", "STRING"),
        bigquery.SchemaField("stop", "INTEGER"),
        bigquery.SchemaField("lap", "INTEGER"),
        bigquery.SchemaField("time", "TIME"),
        bigquery.SchemaField("duration", "FLOAT"),
    ]

    clear_uploads_folder('bigquery_upload')

    while round_num <= round_end:
        print('***Attempting year {0}, round number {1}'.format(year, round_num))
        if len(str(round_num)) == 1:
            round_num_csv_suffix = '0' + str(round_num)
        else:
            round_num_csv_suffix = str(round_num)

        # Query the Ergast API
        print('Round {0}: Attempting to request Pitstops json from Ergast API'.format(round_num))
        response = rq.get('http://ergast.com/api/f1/{0}/{1}/pitstops.json?limit=100'.format(year, round_num))
        pitstops_dict = response.json()
        print('Round {0}: Successfully converted response into pitstops_dict'.format(round_num))

        # Reformat the resulting dataframe
        pitstops_df = pd.DataFrame(data=pitstops_dict['MRData']['RaceTable']['Races'][0]['PitStops'])
        pitstops_df['year'] = year
        pitstops_df['round'] = round_num
        pitstops_df = pitstops_df[['year', 'round', 'driverId', 'stop', 'lap', 'time', 'duration']]
        with open('bigquery_upload/pitstops_{0}{1}.csv'.format(year, round_num_csv_suffix), 'w', newline='') as csv_file:
            pitstops_df.to_csv(csv_file, index=False)
        print('Round {0}: Successfully saved into a csv file'.format(round_num))

        table = dataset.table('Race_Pitstops_{0}{1}'.format(year, round_num_csv_suffix))
        upload_to_bigquery('bigquery_upload/pitstops_{0}{1}.csv'.format(year, round_num_csv_suffix),
                           bigquery_client, schema_for_upload, table)
        print('***Successfully completed upload of year {0}, round {1}'.format(year, round_num))
        round_num += 1

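# download_race_pitstop uses create_bigquery_client, clear_uploads_folder and
# upload_to_bigquery, which live elsewhere. A minimal sketch of
# create_bigquery_client, assuming key_path points to a service-account JSON file:
from google.cloud import bigquery

def create_bigquery_client(key_path):
    # Build a client authenticated with the given service-account key.
    return bigquery.Client.from_service_account_json(key_path)
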
def up(client):
    migration = BigQueryMigration(client)
    dataset = migration.dataset(dataset_name)
    table = migration.client.get_table(dataset.table(table_name))
    orig_schema = table.schema
    new_schema = orig_schema.copy()

    if table.schema[-1].name == 'product_type':
        logging.warning(
            f'product_type already added to {table_name} in {dataset_name} dataset!'
        )
        return dataset

    new_schema.append(bigquery.SchemaField('product_type', 'STRING'))
    table.schema = new_schema
    migration.client.update_table(table, ['schema'])
    return dataset

def create_schema_object_from_json(table_name):
    with open("./{0}.json".format(table_name)) as f:
        json_data = json.loads(f.read())

    schema = []
    for e in json_data:
        if e['type'] == 'INTEGER':
            datatype = 'INT64'
        elif e['type'] == 'FLOAT':
            datatype = 'FLOAT64'
        else:
            datatype = e['type']
        schema.append(bigquery.SchemaField(e['name'], datatype))
    return schema

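# create_schema_object_from_json expects ./<table_name>.json to hold a list of
# {"name": ..., "type": ...} entries. An illustrative example (file contents
# and field names are made up):
#
#   orders.json
#   [
#     {"name": "order_id", "type": "INTEGER"},
#     {"name": "amount", "type": "FLOAT"},
#     {"name": "status", "type": "STRING"}
#   ]
#
# create_schema_object_from_json("orders") would then return SchemaFields
# typed INT64, FLOAT64 and STRING respectively.
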
def create_table(self, dest_dataset, dest_table, list_schema):
    dataset = self.client.dataset(dest_dataset)
    if not dataset.exists():
        print("Dataset {} does not exist.".format(dest_dataset))
        return

    table = dataset.table(dest_table)
    if table.exists():
        print("Table [{}.{}] already exists. Skipping creation.".format(
            dest_dataset, dest_table))
        return

    schemas = []
    for (item, type) in list_schema:
        schemas.append(bigquery.SchemaField(item, type))

    table.schema = schemas
    table.create()
    print("Empty Table [{}.{}] created.".format(dest_dataset, dest_table))

def load_data():
    client = bigquery.Client()
    dataset_id = 'landing'
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()

    # Allow new field additions when appending. Optional extra (required in this case).
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.schema_update_options = [
        bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
    ]
    job_config.schema = [
        bigquery.SchemaField("title", "STRING"),
        bigquery.SchemaField("subreddit", "STRING"),
        bigquery.SchemaField("score", "INTEGER"),
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("url", "STRING"),
        bigquery.SchemaField("comms_num", "INTEGER"),
        bigquery.SchemaField("created", "DATE"),
    ]
    job_config.skip_leading_rows = 1
    # The source format defaults to CSV, so the line below is optional.
    job_config.source_format = bigquery.SourceFormat.CSV
    uri = "gs://dubai_source_api/source-api"

    # API request
    try:
        load_job = client.load_table_from_uri(uri, dataset_ref.table("dubai_posts"),
                                              job_config=job_config)
    except Exception as e:
        print('Failed to load data due to an unexpected error.')
        print('System abort!')
        print(str(e))
        sys.exit()
    else:
        print("Starting job {}".format(load_job.job_id))
        load_job.result()  # Waits for table load to complete.
        print("Job finished.")
        destination_table = client.get_table(dataset_ref.table("dubai_posts"))
        table = "landing.dubai_posts"
        print("Loaded {} rows into {}.".format(destination_table.num_rows, table))

def BuildSchema(host, database, user, password, table):
    logging.debug('build schema for table %s in database %s' % (table, database))
    conn = Connect(host, database, user, password)
    cursor = conn.cursor()
    cursor.execute("DESCRIBE %s;" % table)
    tableDecorator = cursor.fetchall()
    schema = []

    for col in tableDecorator:
        colType = col[1].split("(")[0]
        if colType not in bqTypeDict:
            logging.warning("Unknown type detected, using string: %s", str(col[1]))

        field_mode = "NULLABLE" if col[2] == "YES" else "REQUIRED"
        field = bigquery.SchemaField(col[0], bqTypeDict.get(colType, "STRING"), mode=field_mode)
        schema.append(field)

    return tuple(schema)

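# BuildSchema references bqTypeDict and Connect, which are defined elsewhere.
# A plausible MySQL-to-BigQuery type map (an assumption, not the original):
bqTypeDict = {
    'int': 'INTEGER',
    'bigint': 'INTEGER',
    'smallint': 'INTEGER',
    'tinyint': 'INTEGER',
    'decimal': 'FLOAT',
    'float': 'FLOAT',
    'double': 'FLOAT',
    'char': 'STRING',
    'varchar': 'STRING',
    'text': 'STRING',
    'date': 'DATE',
    'datetime': 'TIMESTAMP',
    'timestamp': 'TIMESTAMP',
}
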
def resetOuputTable(bigquery_client, project, dataset, table_name):
    dataset_ref = bigquery_client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)
    try:
        bigquery_client.delete_table(table_ref)
    except NotFound:
        pass

    schema = [
        bigquery.SchemaField('asofdate', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('datasource', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('type', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('mau', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('low90', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('high90', 'FLOAT', mode='REQUIRED'),
    ] + [
        bigquery.SchemaField('p{}'.format(q), 'FLOAT', mode='REQUIRED')
        for q in range(10, 100, 10)
    ]

    table = bigquery.Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table)
    return table

def _create_schemafield_list(self, schema_file_path):
    """
    Takes the path to a schema JSON file and builds the list of SchemaField
    objects used for the BigQuery load.

    see: https://cloud.google.com/bigquery/docs/schemas?hl=ja#creating_a_json_schema_file
    """
    with open(schema_file_path, mode='r') as input_file:
        json_file = input_file.read()
    schema_json = json.loads(json_file)

    schema_field_list = []
    for schema in schema_json:
        # FIXME: support modes other than NULLABLE
        schema_field = bigquery.SchemaField(
            schema['name'],
            schema['type'],
            mode="NULLABLE",
            description=schema['description'])
        schema_field_list.append(schema_field)
    return schema_field_list

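# _create_schemafield_list expects a JSON schema file in the BigQuery format
# linked above, where each entry carries name, type and description. An
# illustrative schema file (field names are made up):
#
#   [
#     {"name": "user_id", "type": "STRING", "description": "Hashed user id"},
#     {"name": "event_ts", "type": "TIMESTAMP", "description": "Event time (UTC)"}
#   ]
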
def test_run_query_write_to_table(self, tmp_path, bigquery_client, project_id, temporary_dataset):
    query_file_path = tmp_path / "sql" / "test" / "query_v1"
    os.makedirs(query_file_path)
    query_file = query_file_path / "query.sql"
    query_file.write_text("SELECT 'foo' AS a")

    schema = [bigquery.SchemaField("a", "STRING", mode="NULLABLE")]
    table = bigquery.Table(f"{project_id}.{temporary_dataset}.query_v1", schema=schema)
    bigquery_client.create_table(table)

    try:
        result = subprocess.check_output(
            [
                ENTRYPOINT_SCRIPT,
                "query",
                "--dataset_id=" + temporary_dataset,
                "--destination_table=query_v1",
                "--project_id=" + project_id,
                "--replace",
                str(query_file),
            ],
            stderr=subprocess.STDOUT,
        )
        assert b"Current status: DONE" in result
        assert b"No metadata.yaml found for {}" in result

        result = bigquery_client.query(
            f"SELECT a FROM {project_id}.{temporary_dataset}.query_v1"
        ).result()
        assert result.total_rows == 1
        for row in result:
            assert row.a == "foo"
    except subprocess.CalledProcessError as e:
        assert b"No such file or directory: 'bq'" in e.output
        assert b"No metadata.yaml found for {}" in e.output
        assert (
            b'subprocess.check_call(["bq"] + query_arguments, stdin=query_stream)'
            in e.output
        )

def process(self, element):
    if self._initialized is False:
        logging.info('>>> load_sql_to_bq WriteToBigQuery - loading schema list')
        schema = []
        loaded = json.loads(self._schema.get())
        for f in loaded['fields']:
            schema.append(bigquery.SchemaField(f['name'], f['type'], f['mode']))
        self._schema = schema

        logging.info(
            '>>> load_sql_to_bq WriteToBigQuery - initializing client, refs, data set and table')
        dataset = 'pyr_{}_{}'.format(self._client.get(), self._env.get())
        self._client = bigquery.Client(project=self._project)
        self._dataset_ref = self._client.dataset(dataset)
        self._table_ref = self._dataset_ref.table(self._table.get())

        try:
            # Delete the table if it exists - jc 2/19/20
            self._client.delete_table(self._table_ref)
        except Exception:
            pass

        self._client.create_dataset(self._dataset_ref, exists_ok=True)
        self._client.create_table(bigquery.Table(self._table_ref, schema=self._schema))
        self._initialized = True

    logging.info('>>> load_sql_to_bq WriteToBigQuery - writing {} records to bigquery'
                 .format(len(element)))
    logging.info('>>> load_sql_to_bq WriteToBigQuery - element len is {}'
                 .format(len(json.dumps(element, default=str))))
    rs = self._client.insert_rows(self._table_ref, element, self._schema)
    logging.info('>>> load_sql_to_bq WriteToBigQuery - committed {} records to bigquery'
                 .format(len(element)))
    logging.info(rs)

def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats(
        bigquery_client, dataset_id):
    from google.cloud.bigquery.job import SourceFormat

    table_schema = (bigquery.SchemaField("float_col", "FLOAT"),)
    df_data = collections.OrderedDict([
        (
            "float_col",
            [
                0.14285714285714285,
                0.51428571485748,
                0.87128748,
                1.807960649,
                2.0679610649,
                2.4406779661016949,
                3.7148514257,
                3.8571428571428572,
                1.51251252e40,
            ],
        ),
    ])
    dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys())
    table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format(
        bigquery_client.project, dataset_id)
    job_config = bigquery.LoadJobConfig(schema=table_schema,
                                        source_format=SourceFormat.CSV)
    load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id,
                                                         job_config=job_config)
    load_job.result()

    table = bigquery_client.get_table(table_id)
    rows = bigquery_client.list_rows(table_id)
    floats = [r.values()[0] for r in rows]
    assert tuple(table.schema) == table_schema
    assert table.num_rows == 9
    assert floats == df_data["float_col"]

def store_to_bigquery_(context, target):
    from google.cloud import bigquery

    context = Context().context()
    client = bigquery.Client(project=Context().project_id)
    dataset_id, table_id, target = (context['global']['dataset_id'],
                                    context[target]['table_id_raw']['id'],
                                    target)
    bucket_path = '{}/'.format(context[target]["location"])
    dataset_ref = client.dataset(dataset_id)

    job_config = bigquery.LoadJobConfig()
    job_config.create_disposition = 'CREATE_IF_NEEDED'
    job_config.schema = [
        bigquery.SchemaField(element, 'STRING')
        for element in context[target]['table_id_raw']['schema']
    ]
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.skip_leading_rows = 1
    uri = 'gs://{}.csv'.format(context[target]['location'])

    # Try a semicolon delimiter first, then fall back to a comma.
    try:
        job_config.field_delimiter = ';'
        load_job = client.load_table_from_uri(uri, dataset_ref.table(table_id),
                                              job_config=job_config)
        load_job.result()
    except Exception:
        job_config.field_delimiter = ','
        load_job = client.load_table_from_uri(uri, dataset_ref.table(table_id),
                                              job_config=job_config)
        load_job.result()

    job_config = bigquery.QueryJobConfig()
    table_ref = client.dataset(dataset_id).table(target)
    job_config.destination = table_ref
    job_config.use_legacy_sql = True
    job_config.create_disposition = 'CREATE_IF_NEEDED'
    job_config.write_disposition = 'WRITE_APPEND'

    query = open_query(context[target]['table_id']['keygen'])
    query_job = client.query(query, job_config=job_config)
    results = query_job.result()

def test_client_library_upload_from_dataframe(temp_dataset):
    # [START bigquery_migration_client_library_upload_from_dataframe]
    from google.cloud import bigquery
    import pandas

    df = pandas.DataFrame(
        {
            'my_string': ['a', 'b', 'c'],
            'my_int64': [1, 2, 3],
            'my_float64': [4.0, 5.0, 6.0],
            'my_timestamp': [
                pandas.Timestamp("1998-09-04T16:03:14"),
                pandas.Timestamp("2010-09-13T12:03:45"),
                pandas.Timestamp("2015-10-02T16:00:00")
            ],
        }
    )
    client = bigquery.Client()
    table_id = 'my_dataset.new_table'
    # [END bigquery_migration_client_library_upload_from_dataframe]
    table_id = (
        temp_dataset.dataset_id + ".test_client_library_upload_from_dataframe"
    )
    # [START bigquery_migration_client_library_upload_from_dataframe]

    # Since string columns use the "object" dtype, pass in a (partial) schema
    # to ensure the correct BigQuery data type.
    job_config = bigquery.LoadJobConfig(schema=[
        bigquery.SchemaField("my_string", "STRING"),
    ])

    job = client.load_table_from_dataframe(
        df, table_id, job_config=job_config
    )

    # Wait for the load job to complete.
    job.result()
    # [END bigquery_migration_client_library_upload_from_dataframe]

    client = bigquery.Client()
    table = client.get_table(table_id)
    assert table.num_rows == 3

def load_gcs_to_bq(log_uri):
    dataset_id = get_env_var('DATASET_ID')
    target_table_name = get_env_var('TABLE_NAME')
    client = bigquery.Client()

    temp_table_name = table_name_generator()
    target_table_id = '{}.{}'.format(dataset_id, target_table_name)
    insert_sql = insert_table_query_generator(temp_table_name, target_table_id)

    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        log_uri,
    ]
    external_config.schema = [
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
    ]

    job_config = bigquery.QueryJobConfig(
        table_definitions={temp_table_name: external_config})
    query_job = client.query(insert_sql, job_config=job_config)
    if len(list(query_job)) == 0:
        print("Load {} to {} completed!".format(log_uri, target_table_id))

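# load_gcs_to_bq depends on get_env_var, table_name_generator and
# insert_table_query_generator, which are not shown. Minimal sketches; the
# generated temp-table name and SQL below are assumptions:
import os
import uuid

def get_env_var(name):
    return os.environ[name]

def table_name_generator():
    # Unique name for the temporary external table definition.
    return "temp_log_{}".format(uuid.uuid4().hex[:8])

def insert_table_query_generator(temp_table_name, target_table_id):
    # Copy every raw line from the external CSV into the target table.
    return "INSERT INTO `{}` (content) SELECT content FROM {}".format(
        target_table_id, temp_table_name)
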
def AddTable(self):
    try:
        schema_bq = []
        for field, datatype in self.settings['schema'][
                self.settings['table']].items():
            schema_bq.append(
                bigquery.SchemaField(field, datatype[0], mode=datatype[1]))

        table_ref = self.client.dataset(self.settings['dataset']).table(
            self.settings['table'])
        table = bigquery.Table(table_ref, schema=schema_bq)
        table = self.client.create_table(table)  # API request
        assert table.table_id == self.settings['table']
        logger.info("Table {} has been created".format(self.settings['table']))
    except Exception as e:
        logger.info("Failed adding table: {}".format(e))
