def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json(
        'my-beam-project-b2834963a4ae.json')
    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)

    SCHEMA = [
        SchemaField('Tweets', 'STRING', mode='NULLABLE'),
    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200

    with open('tweets.csv', 'rb') as readable:
        job = big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    job.result()  # wait for the load job to complete
    print('tweets file uploaded to big query')
def insertCSV(stock):
    client = bigquery.Client(project_id)
    SCHEMA = [
        SchemaField('symbol', 'STRING', mode='REQUIRED'),
        SchemaField('date', 'DATE', mode='REQUIRED'),
        SchemaField('close', 'FLOAT', mode='REQUIRED'),
        SchemaField('high', 'FLOAT', mode='REQUIRED'),
        SchemaField('low', 'FLOAT', mode='REQUIRED'),
        SchemaField('open', 'FLOAT', mode='REQUIRED'),
        SchemaField('volume', 'INTEGER', mode='REQUIRED'),
    ]
    table_ref = client.dataset(dataset_id).table(stock)

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = SCHEMA

    with open('Data/%s.csv' % stock, 'rb') as readable:
        # pass the file object directly; the client streams it to BigQuery
        client.load_table_from_file(readable, table_ref, job_config=load_config)
def load_task():
    client = Client()
    job_config = LoadJobConfig()
    schema_path = os.path.join(
        dags_folder,
        'resources/stages/raw/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = (SourceFormat.CSV if file_format == 'csv'
                                else SourceFormat.NEWLINE_DELIMITED_JSON)
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True

    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_location_uri,
        task=task,
        file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
def __create_load_job_config(
        self, ems_load_job_config: EmsLoadJobConfig) -> LoadJobConfig:
    config = LoadJobConfig()
    config.labels = ems_load_job_config.labels
    config.create_disposition = ems_load_job_config.create_disposition.value
    config.write_disposition = ems_load_job_config.write_disposition.value
    config.schema = _parse_schema_resource(ems_load_job_config.schema)
    config.skip_leading_rows = ems_load_job_config.skip_leading_rows
    return config
def create_table_from_csv(self, dataset, table_name, file_path, schema):
    table_ref = dataset.table(table_name)
    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = schema

    with open(file_path, 'rb') as readable:
        self.client.load_table_from_file(
            readable, table_ref, job_config=load_config)  # API request
    return
def DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef):
    """
    https://cloud.google.com/bigquery/docs/reference/data-transfer/partner/rpc/google.cloud.bigquery.datatransfer.v1#tabledefinition
    TO
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html#google.cloud.bigquery.job.LoadJob

    :param dts_tabledef:
    :return:
    """
    from bq_dts import rest_client

    job_config = LoadJobConfig()
    dts_schema = RPCRecordSchema_to_GCloudSchema(dts_tabledef['schema'])
    job_config.schema = dts_schema

    # BQ DTS does not provide controls for the following dispositions
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    if 'format' in dts_tabledef:
        dts_format = dts_tabledef['format']
        source_format = rest_client.BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP[dts_format]
        assert source_format is not None
        job_config.source_format = source_format

    if 'max_bad_records' in dts_tabledef:
        job_config.max_bad_records = dts_tabledef['max_bad_records']

    if 'encoding' in dts_tabledef:
        dts_encoding = dts_tabledef['encoding']
        job_config.encoding = rest_client.BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP[dts_encoding]

    if 'csv_options' in dts_tabledef:
        csv_opts = dts_tabledef['csv_options']
        if 'field_delimiter' in csv_opts:
            job_config.field_delimiter = csv_opts['field_delimiter']
        if 'allow_quoted_newlines' in csv_opts:
            job_config.allow_quoted_newlines = csv_opts['allow_quoted_newlines']
        if 'quote_char' in csv_opts:
            job_config.quote_character = csv_opts['quote_char']
        if 'skip_leading_rows' in csv_opts:
            job_config.skip_leading_rows = csv_opts['skip_leading_rows']

    return job_config
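# A hypothetical usage sketch for the converter above, kept entirely as a
# comment because the RPC RecordSchema shape and the bq_dts helper mappings
# are not shown here. The dict keys mirror the ones the function inspects
# ('schema', 'format', 'max_bad_records', 'csv_options'); the values are
# illustrative assumptions.
#
# example_tabledef = {
#     'schema': {...},  # BQ DTS RecordSchema, converted by RPCRecordSchema_to_GCloudSchema
#     'format': 'CSV',
#     'max_bad_records': 10,
#     'csv_options': {'field_delimiter': ',', 'skip_leading_rows': 1,
#                     'allow_quoted_newlines': True, 'quote_char': '"'},
# }
# job_config = DTSTableDefinition_to_BQLoadJobConfig(example_tabledef)
# bigquery.Client().load_table_from_uri(source_uri, table_ref, job_config=job_config)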
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name))
    table_blobs = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        table for table in VOCABULARY_TABLES if table not in table_blobs
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}')

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        # ignore any non-vocabulary files
        if table_name not in VOCABULARY_TABLES:
            continue
        destination = dst_dataset.table(table_name)
        safe_schema = safe_schema_for(table_name)
        job_config = LoadJobConfig()
        job_config.schema = safe_schema
        job_config.skip_leading_rows = 1
        job_config.field_delimiter = FIELD_DELIMITER
        job_config.max_bad_records = MAX_BAD_RECORDS
        job_config.source_format = 'CSV'
        job_config.quote_character = ''
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(source_uri,
                                                 destination,
                                                 job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
def create_table(self, path, table_from='uri'):
    bp = BQParser(path)
    dataset_name = bp.dataset_name
    table_name = bp.table_name
    skip_leading_rows = bp.skip_leading_rows
    schema = bp.schema

    table_ref = self.client.dataset(dataset_name).table(table_name)
    load_config = LoadJobConfig()
    load_config.skip_leading_rows = skip_leading_rows
    load_config.schema = schema
    file_source = bp.properties.get('inputPath')

    if table_from == 'uri':
        self.client.load_table_from_uri(source_uris=file_source,
                                        destination=table_ref,
                                        job_config=load_config)
    else:
        raise ValueError('Not supported')
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
def add_load_job_csv_config(unhandled_hints: Set[str],
                            hints: ValidatedRecordsHints,
                            fail_if_cant_handle_hint: bool,
                            config: bigquery.LoadJobConfig) -> None:
    # source_format: File format of the data.
    config.source_format = 'CSV'

    # encoding: The character encoding of the data.
    # "The supported values are UTF-8 or ISO-8859-1."
    if hints.encoding == 'UTF8':
        config.encoding = 'UTF-8'
    else:
        # Currently records hints don't support ISO-8859-1
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints)
    quiet_remove(unhandled_hints, 'encoding')

    # field_delimiter: The separator for fields in a CSV file.
    assert isinstance(hints.field_delimiter, str)
    config.field_delimiter = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    # allow_jagged_rows: Allow missing trailing optional columns (CSV only).

    # null_marker: Represents a null value (CSV only)
    #
    # (documentation is mangled for this one, but I assume the default is
    # '' or something sensible, so not messing with it)

    # quote_character: Character used to quote data sections (CSV only).
    #
    # [Optional] The value that is used to quote data sections in
    # a CSV file. BigQuery converts the string to ISO-8859-1
    # encoding, and then uses the first byte of the encoded string
    # to split the data in its raw, binary state. The default
    # value is a double-quote ('"'). If your data does not contain
    # quoted sections, set the property value to an empty string.
    # If your data contains quoted newline characters, you must
    # also set the allowQuotedNewlines property to true.
    #
    # @default "
    #
    # I tried a few combinations and found that when you leave
    # quote_character as the default:
    #
    # * Fields quoted with "" are loaded without the surrounding quotes in
    #   the string
    # * "" becomes " in a quoted field
    # * "" stays "" in a non-quoted field
    # * nonnumeric quoting works fine
    # * full quoting works fine
    if hints.quoting is None:
        config.quote_character = ''
    elif hints.quoting in ('all', 'minimal', 'nonnumeric'):
        # allow_quoted_newlines: Allow quoted data containing newline
        # characters (CSV only).
        config.allow_quoted_newlines = True

        assert isinstance(hints.quotechar, str)
        config.quote_character = hints.quotechar
        if hints.doublequote:
            pass
        else:
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)
    else:
        _assert_never(hints.quoting)
    quiet_remove(unhandled_hints, 'quoting')
    quiet_remove(unhandled_hints, 'quotechar')
    quiet_remove(unhandled_hints, 'doublequote')

    # No mention of escaping in BigQuery documentation, and in
    # practice backslashes come through without being interpreted.
    if hints.escape is None:
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints)
    quiet_remove(unhandled_hints, 'escape')

    # skip_leading_rows: Number of rows to skip when reading data (CSV only).
    if hints.header_row:
        config.skip_leading_rows = 1
    else:
        config.skip_leading_rows = 0
    quiet_remove(unhandled_hints, 'header-row')

    # "When you load CSV or JSON data, values in DATE columns must
    # use the dash (-) separator and the date must be in the
    # following format: YYYY-MM-DD (year-month-day)."
    if hints.dateformat == 'YYYY-MM-DD':
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
    quiet_remove(unhandled_hints, 'dateformat')

    # "When you load JSON or CSV data, values in TIMESTAMP columns
    # must use a dash (-) separator for the date portion of the
    # timestamp, and the date must be in the following format:
    # YYYY-MM-DD (year-month-day).  The hh:mm:ss
    # (hour-minute-second) portion of the timestamp must use a
    # colon (:) separator."
    #
    # To test, log into the BigQuery web console and try SQL like this
    # (the assumption being that the same timestamp parser is used during
    # CSV loads):
    #
    #   select TIMESTAMP("2000-01-02 16:34:56.789012US/Eastern") as a;
    #
    # Tests performed and result displayed on console query:
    #
    # DATE:
    # * 01-02-2019 (rejected)
    # * 01/02/19 (rejected)
    # * 2019-01-01 (accepted): 2019-01-01
    #
    # DATETIME:
    # * 2019-01-01 1:00pm (rejected)
    # * 2019-01-01 1:00:00pm (rejected)
    # * 2019-01-01 1:00PM (rejected)
    # * 2019-01-01 13:00 (rejected)
    # * 2019-01-01 13:00:00 (accepted): 2019-01-01T13:00:00
    # * 2019-01-01 1:00pm US/Eastern (rejected)
    # * 2019-01-01 1:00:00pm US/Eastern (rejected)
    # * 2019-01-01 13:00:00 US/Eastern (rejected)
    # * 2019-01-01 13:00:00 EST (rejected)
    # * 1997-12-17 07:37:16-08 (rejected)
    # * 2019-01-01T13:00:00 (accepted): 2019-01-01T13:00:00
    #
    # TIME:
    # * 1:00pm (rejected)
    # * 1:00:00pm (rejected)
    # * 13:00 (rejected)
    # * 13:00:00 (accepted): 13:00:00
    # * 1:00pm US/Eastern (rejected)
    # * 1:00pm EST (rejected)
    # * 07:37:16-08 (rejected)
    #
    # TIMESTAMP ("Required format is YYYY-MM-DD HH:MM[:SS[.SSSSSS]]",
    # which is BS, as it doesn't specify the timezone format):
    # * 2019-01-01 1:00pm (rejected)
    # * 2019-01-01 1:00:00pm (rejected)
    # * 2019-01-01 1:00PM (rejected)
    # * 2019-01-01 13:00 (rejected)
    # * 2019-01-01 13:00:00 (accepted): 2019-01-01T13:00:00
    # * 2019-01-01 1:00pm US/Eastern (rejected)
    # * 2019-01-01 1:00:00pm US/Eastern (rejected)
    # * 2019-01-01 13:00:00 US/Eastern (rejected)
    # * 2019-01-01 13:00:00 EST (rejected)
    # * 1997-12-17 07:37:16-08 (accepted): 1997-12-17 15:37:16 UTC
    # * 2019-01-01T13:00:00-08 (accepted): 2019-01-01 21:00:00 UTC
    # * 2000-01-02 16:34:56.789012+0000 (rejected)
    # * 2000-01-02 16:34:56.789012+00:00 (accepted)
    # * 2000-01-02 16:34:56.789012EST (rejected)
    # * 2000-01-02 16:34:56.789012US/Eastern (rejected)
    # * 2000-01-02 16:34:56.789012UTC (accepted): 2000-01-02 16:34:56.789012 UTC
    # * 2000-01-02 16:34:56.789012 UTC (accepted): 2000-01-02 16:34:56.789012 UTC
    #
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#examples
    # https://stackoverflow.com/questions/47466296/bigquery-datetime-format-csv-to-bigquery-yyyy-mm-dd-hhmmss-ssssss
    #
    # BigQuery supports exactly one format for ingesting timestamps with
    # timezones (what it calls 'TIMESTAMP'; timestamps without timezones
    # it calls 'DATETIME').
    #
    # The format it accepts is ISO 8601, which sounds all nice and
    # standardy.  Usable timestamps look like 2000-01-02 16:34:56.789012+00:00.
    #
    # The only issue is that Python's strftime doesn't actually provide a
    # way to add the ':' in the timezone offset.  The only timezone offset
    # code, %z, does not provide the colon.  Other implementations (GNU
    # libc) offer the %:z option, but that doesn't exist in Python and thus
    # in Pandas.
    #
    # So if you're using Python to export timestamps with timezones, you
    # should probably use the `YYYY-MM-DD HH24:MI:SS` format and express
    # them in UTC.
    #
    # https://stackoverflow.com/questions/44836581/does-python-time-strftime-process-timezone-options-correctly-for-rfc-3339
    # https://stackoverflow.com/questions/28729212/pandas-save-date-in-iso-format
    if hints.datetimeformat in ['YYYY-MM-DD HH24:MI:SS', 'YYYY-MM-DD HH:MI:SS']:
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
    quiet_remove(unhandled_hints, 'datetimeformat')

    if hints.datetimeformattz in ['YYYY-MM-DD HH:MI:SSOF',
                                  'YYYY-MM-DD HH24:MI:SSOF',
                                  'YYYY-MM-DD HH:MI:SS']:
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)
    quiet_remove(unhandled_hints, 'datetimeformattz')

    if hints.timeonlyformat in ['HH24:MI:SS', 'HH:MI:SS']:
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)
    quiet_remove(unhandled_hints, 'timeonlyformat')

    # No options to change this.  Tested with unix newlines, dos newlines
    # and mac newlines and all were understood.
    if hints.record_terminator in ['\n', '\r\n', '\r', None]:
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    quiet_remove(unhandled_hints, 'record-terminator')

    # No way to flag compression, but tested uncompressed and with gzip and
    # both work great.  .bz2 gives "400 Unsupported compression type".  Not
    # sure about .lzo, but pandas can't handle it regardless, so doubt it's
    # handled.
    if hints.compression is None or hints.compression == 'GZIP':
        pass
    else:
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    quiet_remove(unhandled_hints, 'compression')
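# A minimal sketch (not part of the function above) of the workaround described
# in its comments: because Python's strftime '%z' cannot emit the colon in a
# timezone offset, convert timestamps to UTC and write them without any offset,
# in the 'YYYY-MM-DD HH24:MI:SS' style BigQuery accepts. The sample value below
# is illustrative.
import pandas as pd

ts = pd.to_datetime(['2000-01-02 16:34:56.789012-05:00'])  # tz-aware input
out = ts.tz_convert('UTC').strftime('%Y-%m-%d %H:%M:%S.%f')  # drop the offset
print(out[0])  # 2000-01-02 21:34:56.789012 -- BigQuery parses this as UTC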
def create_bq_schema():
    schema = []
    for col in bus_res_keys:
        name = col.split(':')[-1]
        if col in date_cols:
            field_type = "TIMESTAMP"
        else:
            field_type = "STRING"
        schema.append(SchemaField(name, field_type))
    return schema


if __name__ == '__main__':
    client = bigquery.Client(project=PROJECT_ID)
    table_ref = client.dataset('bus').table('bus')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = create_bq_schema()

    bucket = storage.Client(project=PROJECT_ID).bucket(BUCKET)
    for blob in bucket.list_blobs():
        uri = "gs://{bucket}/{filename}".format(bucket=BUCKET, filename=blob.name)
        print("Loading {}".format(blob.name))
        job = client.load_table_from_uri(uri, table_ref, job_config=load_config)
        job.result()