def _load_to_bq(self, client, dataset, table_name, table_schema, table_config,
                key_props, metadata_columns, truncate, rows):
    """Load a file of newline-delimited JSON rows into a BigQuery table.

    A ``LoadJobConfig`` is built from ``table_schema`` and the optional
    ``partition_field``, ``cluster_fields`` and ``force_fields`` keys of
    ``table_config``; the load job is then started and awaited.

    Args:
        client: Object providing ``load_table_from_file``.
        dataset: Object providing ``table(name)`` for the destination.
        table_name: Name of the destination table.
        table_schema: Schema handed to ``build_schema``.
        table_config: Dict of per-table options (see above).
        key_props: Forwarded to ``build_schema`` as ``key_properties``.
        metadata_columns: Forwarded to ``build_schema`` as ``add_metadata``.
        truncate: Truthy -> ``WRITE_TRUNCATE``, otherwise ``WRITE_APPEND``.
        rows: File object with the rows; uploaded with ``rewind=True``.

    Returns:
        The value returned by ``load_job.result()``.

    Raises:
        google_exceptions.BadRequest: Re-raised after logging. When the job
            reports errors, ``err.message`` is replaced by a summary of them.
    """
    logger = self.logger
    partition_field = table_config.get("partition_field", None)
    cluster_fields = table_config.get("cluster_fields", None)
    force_fields = table_config.get("force_fields", {})

    schema = build_schema(table_schema,
                          key_properties=key_props,
                          add_metadata=metadata_columns,
                          force_fields=force_fields)

    load_config = LoadJobConfig()
    load_config.ignore_unknown_values = True
    load_config.schema = schema
    if partition_field:
        load_config.time_partitioning = bigquery.table.TimePartitioning(
            type_=bigquery.table.TimePartitioningType.DAY,
            field=partition_field)
    if cluster_fields:
        load_config.clustering_fields = cluster_fields
    load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

    if truncate:
        logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
        load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    else:
        logger.info(f"Appending to {table_name}")
        load_config.write_disposition = WriteDisposition.WRITE_APPEND

    logger.info("loading {} to BigQuery".format(table_name))

    # Bound before the try so the handler can tell "job never started"
    # apart from "job started and failed".
    load_job = None
    try:
        load_job = client.load_table_from_file(rows,
                                               dataset.table(table_name),
                                               job_config=load_config,
                                               rewind=True)
        logger.info("loading job {}".format(load_job.job_id))
        job = load_job.result()
        logger.info(job._properties)
        return job
    except google_exceptions.BadRequest as err:
        logger.error("failed to load table {} from file: {}".format(
            table_name, str(err)))
        if load_job and load_job.errors:
            # The exception's own error list may be empty or lack "reason";
            # a failed lookup here must not mask the original BadRequest.
            request_errors = getattr(err, "errors", None) or []
            first_error = request_errors[0] if request_errors else {}
            if isinstance(first_error, dict):
                reason = first_error.get("reason", "unknown")
            else:
                reason = "unknown"
            messages = [
                str(job_error.get("message", ""))
                for job_error in load_job.errors
            ]
            logger.error("reason: {reason}, errors:\n{e}".format(
                reason=reason, e="\n".join(messages)))
            err.message = f"reason: {reason}, errors: {';'.join(messages)}"
        raise
def load_task():
    """Load the exported files of one task from GCS into its raw table.

    Relies on names defined outside this function: ``task``,
    ``file_format``, ``dags_folder``, ``output_bucket``,
    ``dataset_name_raw`` and ``allow_quoted_newlines``.  The destination
    table is overwritten (``WRITE_TRUNCATE``).
    """
    client = Client()
    is_csv = file_format == 'csv'

    schema_file = os.path.join(
        dags_folder,
        'resources/stages/raw/schemas/{task}.json'.format(task=task))

    job_config = LoadJobConfig()
    job_config.schema = read_bigquery_schema_from_file(schema_file)
    if is_csv:
        job_config.source_format = SourceFormat.CSV
        # The CSV exports carry a header row that must not be loaded.
        job_config.skip_leading_rows = 1
    else:
        job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True

    export_root = 'gs://{bucket}/export'.format(bucket=output_bucket)
    source_uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_root,
        task=task,
        file_format=file_format)

    destination = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(source_uri, destination,
                                          job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
def process_response_rows_for_bigquery(self, rows: list,
                                       table_reference: TableReference):
    """Flatten response rows and append them to a day-partitioned table.

    The ``dimensions`` entry of every row is expanded into its own columns,
    ``date`` values are reduced via their ``.date()`` method, and the
    resulting frame is loaded with ``WRITE_APPEND``.  A ``BadRequest``
    raised while loading is not propagated; its ``errors`` are printed.
    """
    frame = DataFrame.from_records(rows)
    dimension_columns = frame['dimensions'].apply(Series)
    frame = concat([frame, dimension_columns], axis=1, join='inner')
    frame = frame.drop(['dimensions'], axis=1)
    frame['date'] = frame['date'].apply(lambda value: value.date())

    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')

    # One schema field per column, in column order.
    schema_fields = []
    for column_name in list(frame.columns.values):
        schema_fields.append(self._get_schema_for_field(column_name))
    job_config.schema = schema_fields

    try:
        pending_job = self.bigquery.client.load_table_from_dataframe(
            frame, table_reference, job_config=job_config)
        pending_job.result()
    except BadRequest as error:
        print(error.errors)
def __create_load_job_config(
        self, ems_load_job_config: EmsLoadJobConfig) -> LoadJobConfig:
    """Translate an ``EmsLoadJobConfig`` into a BigQuery ``LoadJobConfig``.

    Labels and ``skip_leading_rows`` are copied as-is, both dispositions
    are copied via their ``.value``, and the schema is run through
    ``_parse_schema_resource``.
    """
    source = ems_load_job_config

    load_job_config = LoadJobConfig()
    load_job_config.labels = source.labels
    load_job_config.create_disposition = source.create_disposition.value
    load_job_config.write_disposition = source.write_disposition.value
    load_job_config.schema = _parse_schema_resource(source.schema)
    load_job_config.skip_leading_rows = source.skip_leading_rows
    return load_job_config
def _process_data_for_bigquery(self, data: DataFrame,
                               output_tablereference: TableReference):
    """Append ``data`` to a table partitioned by day on its ``date`` field.

    A ``BadRequest`` raised while loading is not propagated; its
    ``errors`` are printed instead.
    """
    day_partitioning = TimePartitioning(type_=TimePartitioningType.DAY,
                                        field='date')

    append_config = LoadJobConfig()
    append_config.write_disposition = WriteDisposition.WRITE_APPEND
    append_config.time_partitioning = day_partitioning

    bq_client = self.bigquery.client
    try:
        pending_job = bq_client.load_table_from_dataframe(
            data, output_tablereference, job_config=append_config)
        pending_job.result()
    except BadRequest as error:
        print(error.errors)
def DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef):
    """
    Convert a BigQuery DTS table definition into a ``LoadJobConfig``.

    From:
    https://cloud.google.com/bigquery/docs/reference/data-transfer/partner/rpc/google.cloud.bigquery.datatransfer.v1#tabledefinition
    To:
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html#google.cloud.bigquery.job.LoadJob

    :param dts_tabledef: mapping with a required ``schema`` key and the
        optional keys ``format``, ``max_bad_records``, ``encoding`` and
        ``csv_options``
    :return: the populated ``LoadJobConfig``
    """
    from bq_dts import rest_client

    job_config = LoadJobConfig()
    job_config.schema = RPCRecordSchema_to_GCloudSchema(
        dts_tabledef['schema'])

    # BQ DTS does not provide controls for the following dispositions
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    if 'format' in dts_tabledef:
        format_map = rest_client.BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP
        source_format = format_map[dts_tabledef['format']]
        assert source_format is not None
        job_config.source_format = source_format

    if 'max_bad_records' in dts_tabledef:
        job_config.max_bad_records = dts_tabledef['max_bad_records']

    if 'encoding' in dts_tabledef:
        encoding_map = rest_client.BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP
        job_config.encoding = encoding_map[dts_tabledef['encoding']]

    if 'csv_options' in dts_tabledef:
        csv_opts = dts_tabledef['csv_options']
        # (key in the DTS csv options, attribute on LoadJobConfig)
        csv_option_attrs = (
            ('field_delimiter', 'field_delimiter'),
            ('allow_quoted_newlines', 'allow_quoted_newlines'),
            ('quote_char', 'quote_character'),
            ('skip_leading_rows', 'skip_leading_rows'),
        )
        for option_key, config_attr in csv_option_attrs:
            if option_key in csv_opts:
                setattr(job_config, config_attr, csv_opts[option_key])

    return job_config
def push_bq():
    """Load every buffered table into BigQuery and reset its buffer.

    Relies on names defined outside this function: ``rows`` (table name ->
    binary temp file), ``schemas``, ``bigquery_client``, ``dataset_id``,
    ``truncate`` and ``logger``.  Each buffer is rewound, loaded as
    newline-delimited JSON, awaited, and then replaced by a fresh temp
    file.  The spent file is closed explicitly so its handle is released
    right away instead of waiting for garbage collection.
    """
    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])

        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info('loading {} to Bigquery.\n'.format(table))
        load_job = bigquery_client.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info('loading job {}'.format(load_job.job_id))
        logger.info(load_job.result())

        # Only values are replaced (no keys added or removed), so mutating
        # ``rows`` while iterating its keys is safe.
        spent_buffer = rows[table]
        rows[table] = TemporaryFile(mode='w+b')
        spent_buffer.close()
def load_data(self, dataframe, dataset_id, table_id, chunksize):
    """Append ``dataframe`` to a table in chunks of ``chunksize`` rows.

    Rows are serialised to JSON one by one; each time a full chunk has
    accumulated, or the last row has been reached, the chunk is uploaded
    as a newline-delimited JSON load job and awaited.  Progress is
    reported through ``self._print``.  Exceptions of type
    ``self.http_error`` are handed to ``self.process_http_error``.
    """
    from google.cloud.bigquery import LoadJobConfig
    from six import BytesIO

    destination_table = self.client.dataset(dataset_id).table(table_id)
    job_config = LoadJobConfig()
    job_config.write_disposition = 'WRITE_APPEND'
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'

    total_rows = len(dataframe)
    pending = []
    self._print("\n\n")

    frame_rows = dataframe.reset_index(drop=True).iterrows()
    for done, (_, row) in enumerate(frame_rows, start=1):
        pending.append(
            row.to_json(force_ascii=False, date_unit='s', date_format='iso'))

        chunk_full = len(pending) % chunksize == 0
        last_row = done == total_rows
        if not (chunk_full or last_row):
            continue

        self._print("\rLoad is {0}% Complete".format(
            (done * 100) / total_rows))

        payload = '{}\n'.format('\n'.join(pending))
        if isinstance(payload, bytes):
            payload = payload.decode('utf-8')
        stream = BytesIO(payload.encode('utf-8'))

        try:
            self.client.load_table_from_file(
                stream, destination_table, job_config=job_config).result()
        except self.http_error as ex:
            self.process_http_error(ex)

        pending = []

    self._print("\n")
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False,
                      validate_records=True):
    """Buffer Singer records per stream and load them into BigQuery.

    Every SCHEMA message opens a binary temp file for its stream; RECORD
    messages are appended to that file as newline-delimited JSON.  After
    all lines are consumed, each buffer is loaded into the table named
    after its stream and the load job is awaited.

    Args:
        project_id: Project passed to ``bigquery.Client``.
        dataset_id: Dataset holding the destination tables.
        lines: Iterable of serialised Singer messages.
        truncate: When truthy, tables are loaded with ``WRITE_TRUNCATE``.
        validate_records: When truthy, each record is checked with
            ``validate`` against its stream's schema.

    Returns:
        The value of the last STATE message, or ``None`` if a RECORD
        message followed it (every record resets the pending state).

    Raises:
        json.decoder.JSONDecodeError: A line could not be parsed.
        Exception: A record arrived before its schema, or the message
            type is not recognised.
    """
    state = None
    schemas = {}
    rows = {}

    bigquery_client = bigquery.Client(project=project_id)

    try:
        for line in lines:
            try:
                msg = singer.parse_message(line)
            except json.decoder.JSONDecodeError:
                logger.error("Unable to parse:\n{}".format(line))
                raise

            if isinstance(msg, singer.RecordMessage):
                if msg.stream not in schemas:
                    raise Exception(
                        "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

                schema = schemas[msg.stream]

                if validate_records:
                    validate(msg.record, schema)

                # NEWLINE_DELIMITED_JSON expects literal JSON formatted
                # data, with a newline character splitting each row.
                dat = bytes(json.dumps(msg.record) + '\n', 'UTF-8')
                rows[msg.stream].write(dat)

                state = None

            elif isinstance(msg, singer.StateMessage):
                logger.debug('Setting state to {}'.format(msg.value))
                state = msg.value

            elif isinstance(msg, singer.SchemaMessage):
                table = msg.stream
                schemas[table] = msg.schema
                rows[table] = TemporaryFile(mode='w+b')

            elif isinstance(msg, singer.ActivateVersionMessage):
                # This is experimental and won't be used yet
                pass

            else:
                raise Exception("Unrecognized message {}".format(msg))

        for table in rows.keys():
            table_ref = bigquery_client.dataset(dataset_id).table(table)
            SCHEMA = build_schema(schemas[table])
            load_config = LoadJobConfig()
            load_config.schema = SCHEMA
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            if truncate:
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            rows[table].seek(0)
            logger.info("loading {} to Bigquery.\n".format(table))
            load_job = bigquery_client.load_table_from_file(
                rows[table], table_ref, job_config=load_config)
            logger.info("loading job {}".format(load_job.job_id))
            logger.info(load_job.result())
    finally:
        # Release the temp files on success and on failure alike.
        for buffered_rows in rows.values():
            buffered_rows.close()

    return state
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False,
                      validate_records=True):
    """Buffer Singer records per stream and load them into BigQuery.

    Schemas are passed through
    ``convert_schema_column_names_to_bigquery_format`` and records through
    ``convert_dict_keys_to_bigquery_format`` before validation.  Records
    are buffered per stream in binary temp files as newline-delimited
    JSON; after all lines are consumed each buffer is loaded into the
    table named after its stream.

    Each load job is awaited before moving on, so a failed load raises
    here instead of going unnoticed while state is returned as if the
    data had been persisted.

    Args:
        project_id: Project passed to ``bigquery.Client``.
        dataset_id: Dataset holding the destination tables.
        lines: Iterable of serialised Singer messages.
        truncate: When truthy, tables are loaded with ``WRITE_TRUNCATE``.
        validate_records: When truthy, each converted record is checked
            with ``validate`` against its stream's converted schema.

    Returns:
        The value of the last STATE message, or ``None`` if a RECORD
        message followed it (every record resets the pending state).

    Raises:
        json.decoder.JSONDecodeError: A line could not be parsed.
        Exception: A record arrived before its schema, or the message
            type is not recognised.  Errors raised by
            ``load_job.result()`` also propagate.
    """
    state = None
    schemas = {}
    rows = {}

    bigquery_client = bigquery.Client(project=project_id)

    try:
        for line in lines:
            try:
                msg = singer.parse_message(line)
            except json.decoder.JSONDecodeError:
                logger.error("Unable to parse:\n{}".format(line))
                raise

            if isinstance(msg, singer.RecordMessage):
                if msg.stream not in schemas:
                    log_message = ('A record for stream {} was encountered '
                                   'before a corresponding schema')
                    raise Exception(log_message.format(msg.stream))

                schema = schemas[msg.stream]

                msg.record = convert_dict_keys_to_bigquery_format(
                    record=msg.record)

                if validate_records:
                    validate(msg.record, schema)

                # NEWLINE_DELIMITED_JSON expects literal JSON formatted
                # data, with a newline character splitting each row.
                dat = bytes(simplejson.dumps(msg.record) + '\n', 'UTF-8')
                rows[msg.stream].write(dat)

                state = None

            elif isinstance(msg, singer.StateMessage):
                logger.debug('Setting state to {}'.format(msg.value))
                state = msg.value

            elif isinstance(msg, singer.SchemaMessage):
                table = msg.stream
                schema = convert_schema_column_names_to_bigquery_format(
                    schema=msg.schema)
                schemas[table] = schema
                rows[table] = TemporaryFile(mode='w+b')

            elif isinstance(msg, singer.ActivateVersionMessage):
                # This is experimental and won't be used yet
                pass

            else:
                raise Exception("Unrecognized message {}".format(msg))

        for table in rows.keys():
            table_ref = bigquery_client.dataset(dataset_id).table(table)
            SCHEMA = build_schema(schemas[table])
            load_config = LoadJobConfig()
            load_config.schema = SCHEMA
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            if truncate:
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            rows[table].seek(0)
            logger.info("loading {} to Bigquery.\n".format(table))
            load_job = bigquery_client.load_table_from_file(
                rows[table], table_ref, job_config=load_config)
            logger.info("loading job {}".format(load_job.job_id))
            # Block until the job finishes so failures surface before the
            # caller treats the returned state as persisted.
            logger.info(load_job.result())
    finally:
        # Release the temp files on success and on failure alike.
        for buffered_rows in rows.values():
            buffered_rows.close()

    return state
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False,
                      validate_records=True):
    """Buffer Singer records per stream and load them into BigQuery.

    Records are written as newline-delimited JSON to one binary temp file
    per stream.  Once all lines are consumed, each file is loaded into the
    table named after its stream: with ``WRITE_TRUNCATE`` when
    ``truncate`` is truthy, otherwise with field additions allowed.

    Returns the value of the last STATE message, or ``None`` if a RECORD
    message followed it.  If awaiting a load job raises, the error is
    logged and the function returns ``None`` without loading the
    remaining tables.
    """
    schemas = {}
    rows = {}
    state = None

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            message = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(message, singer.RecordMessage):
            stream = message.stream
            if stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(stream))

            if validate_records:
                validate(message.record, schemas[stream])

            # NEWLINE_DELIMITED_JSON expects JSON string data, with a
            # newline splitting each row.
            serialized = json.dumps(message.record) + "\n"
            rows[stream].write(bytes(serialized, "UTF-8"))

            state = None
            continue

        if isinstance(message, singer.StateMessage):
            logger.debug("Setting state to {}".format(message.value))
            state = message.value
            continue

        if isinstance(message, singer.SchemaMessage):
            schemas[message.stream] = message.schema
            rows[message.stream] = TemporaryFile(mode="w+b")
            continue

        if isinstance(message, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            continue

        raise Exception("Unrecognized message {}".format(message))

    for table, buffered_rows in rows.items():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        bq_schema = build_schema(schemas[table])
        log_extra = {"stream": table}

        load_config = LoadJobConfig()
        load_config.schema = bq_schema
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
        if not truncate:
            load_config.schema_update_options = [
                SchemaUpdateOption.ALLOW_FIELD_ADDITION
            ]
        else:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        load_job = bigquery_client.load_table_from_file(
            buffered_rows, table_ref, job_config=load_config, rewind=True)
        logger.info(
            f"Loading '{table}' to BigQuery as job '{load_job.job_id}'",
            extra=log_extra)

        try:
            load_job.result()
        except Exception as e:
            logger.error(f"Error on inserting to table '{table}': {str(e)}",
                         extra=log_extra)
            return

        logger.info(f"Loaded {load_job.output_rows} row(s) to '{table}'",
                    extra=log_extra)

    return state
def persist_lines_job(
        client,
        dataset,
        lines=None,
        truncate=False,
        forced_fulltables=[],
        validate_records=True,
        table_suffix=None,
):
    """Buffer Singer records per table, load them, then yield the state.

    This is a generator: nothing runs until it is iterated, and it yields
    exactly once, after every table has been loaded.

    Table names are ``msg.stream + table_suffix``.  Only the first SCHEMA
    message per table is honoured; later ones are skipped.  Records are
    passed through ``filter(schema, record)`` and written as
    newline-delimited JSON (encoded with ``DecimalEncoder``) to a binary
    temp file per table.

    Args:
        client: Object providing ``load_table_from_file``.
        dataset: Object providing ``table(name)`` for the destination.
        lines: Iterable of serialised Singer messages.
        truncate: When truthy, every table uses ``WRITE_TRUNCATE``.
        forced_fulltables: Table names that always use ``WRITE_TRUNCATE``.
            The default list is only read, never mutated.
        validate_records: When truthy, each record is checked with
            ``validate`` against its table's schema.
        table_suffix: Optional suffix appended to every stream name.

    Yields:
        The value of the last STATE message, or ``None`` if a RECORD
        message followed it.

    Raises:
        json.decoder.JSONDecodeError: A line could not be parsed.
        google_exceptions.BadRequest: Re-raised after logging the failure.
        Exception: A record arrived before its schema, or the message
            type is not recognised.
    """
    state = None
    schemas = {}
    key_properties = {}

    rows = {}
    errors = {}
    table_suffix = table_suffix or ""

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            table_name = msg.stream + table_suffix

            if table_name not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(table_name))

            schema = schemas[table_name]

            if validate_records:
                validate(msg.record, schema)

            # NOTE(review): ``filter`` is called as filter(schema, record),
            # so it is presumably a project helper shadowing the builtin.
            new_rec = filter(schema, msg.record)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
            # with a newline character splitting each row.
            data = bytes(
                json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")
            rows[table_name].write(data)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table_name = msg.stream + table_suffix

            if table_name in rows:
                continue

            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties
            rows[table_name] = TemporaryFile(mode="w+b")
            errors[table_name] = None

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        key_props = key_properties[table]
        SCHEMA = build_schema(schemas[table], key_properties=key_props)
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate or (table in forced_fulltables):
            logger.info(f"Load {table} by FULL_TABLE")
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        logger.info("loading {} to Bigquery.\n".format(table))

        # Reset per table: if starting the job itself raises, the handler
        # must not touch an unbound name or the previous table's job.
        load_job = None
        try:
            load_job = client.load_table_from_file(rows[table],
                                                   dataset.table(table),
                                                   job_config=load_config,
                                                   rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            logger.info(load_job.result())
        except google_exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table, str(err)))
            if load_job is not None and load_job.errors:
                messages = [
                    f"reason: {job_error['reason']}, message: {job_error['message']}"
                    for job_error in load_job.errors
                ]
                logger.error("errors:\n{}".format("\n".join(messages)))
            raise

    yield state
def persist_lines_job(project_id, dataset_id, lines=None, truncate=False,
                      validate_records=True):
    """Create tables, buffer Singer records per stream and load them.

    Every SCHEMA message creates the destination table (an existing table
    is tolerated) and opens a binary temp file for the stream.  RECORD
    messages are simplified (``Decimal`` -> ``float``, ``bool`` -> ``str``)
    and appended as newline-delimited JSON.  After all lines are consumed
    each buffer is loaded and the job awaited, then a per-table summary is
    printed.

    Args:
        project_id: Project passed to ``bigquery.Client``.
        dataset_id: Dataset holding the destination tables.
        lines: Iterable of serialised Singer messages.
        truncate: When truthy, tables are loaded with ``WRITE_TRUNCATE``.
        validate_records: When truthy, each record is checked with
            ``validate`` against its stream's schema.

    Returns:
        The value of the last STATE message, or ``None`` if a RECORD
        message followed it (every record resets the pending state).

    Raises:
        json.decoder.JSONDecodeError: A line could not be parsed.
        exceptions.GoogleAPIError: Re-raised after logging a failed load.
        Exception: A record arrived before its schema, or the message
            type is not recognised.
    """
    state = None
    schemas = {}
    tables = {}
    rows = {}
    row_counts = {}  # records buffered per table, for the final summary
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
            # with a newline character splitting each row.
            simplified = dict()
            for k in msg.record:
                v = msg.record[k]
                if isinstance(v, decimal.Decimal):
                    v = float(v)
                if isinstance(v, bool):
                    v = str(v)
                simplified[k] = v
            json_row = json.dumps(simplified)

            rows[msg.stream].write(bytes(json_row, "UTF-8"))
            rows[msg.stream].write(bytes("\n", "UTF-8"))
            row_counts[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            # NOTE(review): assumes msg.schema is a JSON string here -
            # confirm against the singer version in use.
            schemas[table] = json.loads(msg.schema)
            tables[table] = bigquery.Table(
                bigquery_client.dataset(dataset_id).table(table),
                schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode="w+b")
            row_counts[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                # The table already exists; keep the local Table object.
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        data = rows[table]
        logger.info(
            f"table_ref: {table_ref}, config: {load_config}, data: {data}")
        try:
            load_job = bigquery_client.load_table_from_file(
                data, table_ref, job_config=load_config)
            logger.info("loading job {}".format(load_job.job_id))
            logger.info(load_job.result())
        except exceptions.GoogleAPIError as e:
            logger.error(f"Exception in load job: {e}")
            # Not every GoogleAPIError carries ``errors``; a missing
            # attribute must not replace the exception being reported.
            for er in getattr(e, "errors", None) or []:
                logger.error(f"\t Error: {er}")
            raise

    for table in errors.keys():
        if not errors[table]:
            # Report the number of buffered records, not the file object.
            print(
                "Loaded {} row(s) into {}:{}".format(row_counts[table],
                                                     dataset_id, table),
                tables[table].path)
        else:
            print("Errors: {}".format(errors[table]))

    return state