def _process_responses_for_bigquery(self, data: Sequence[dict], table_reference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(type_=TimePartitioningType.DAY, field='date')
    job_config.schema = (
        SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('cluster', SqlTypeNames.STRING),
        SchemaField('name', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('query', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
        SchemaField('elements', SqlTypeNames.RECORD, 'REPEATED', fields=(
            SchemaField('content', SqlTypeNames.STRING),
            SchemaField('operation', SqlTypeNames.STRING),
            SchemaField('result', SqlTypeNames.STRING),
        )),
    )

    for data_item in data:
        data_item['date'] = data_item['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')
        for element in data_item['elements']:
            if element['result'] is not None:
                element['result'] = str(element['result'])

    load_job = self.bigquery.client.load_table_from_json(
        data, table_reference, job_config=job_config)
    load_job.result()

def _process_response_for_bigquery(self, table_reference: TableReference, rankings: Sequence[dict]):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = (
        SchemaField('date', SqlTypeNames.DATETIME, 'REQUIRED'),
        SchemaField('url', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('project_id', SqlTypeNames.INTEGER, 'REQUIRED'),
        SchemaField('prev_rank', SqlTypeNames.INTEGER),
        SchemaField('keyword', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('keyword_group', SqlTypeNames.STRING, 'REPEATED'),
        SchemaField('ranking', SqlTypeNames.INTEGER),
        SchemaField('ranking_change', SqlTypeNames.INTEGER, 'REQUIRED'),
        SchemaField('rank', SqlTypeNames.STRING, 'REQUIRED'),
        SchemaField('scalar_type', SqlTypeNames.STRING, 'REQUIRED'),
    )

    for ranking in rankings:
        ranking['date'] = ranking['date'].strftime('%Y-%m-%dT%H:%M:%S.%f')

    load_job = self.bigquery.client.load_table_from_json(
        rankings, table_reference, job_config=job_config)
    load_job.result()
def _execute_load_in_source(context, source, source_name):
    destination = context.solid_config.get('destination')
    load_job_config = _preprocess_config(
        context.solid_config.get('load_job_config', {}))
    cfg = LoadJobConfig(**load_job_config) if load_job_config else None
    context.log.info(
        'executing BQ load with config: %s for source %s'
        % (cfg.to_api_repr() if cfg else '(no config provided)', source))

    if source_name == BigQueryLoadSource.DataFrame:
        context.resources.bigquery.load_table_from_dataframe(
            source, destination, job_config=cfg).result()

    # Load from file. See: https://cloud.google.com/bigquery/docs/loading-data-local
    elif source_name == BigQueryLoadSource.File:
        with open(source, 'rb') as file_obj:
            context.resources.bigquery.load_table_from_file(
                file_obj, destination, job_config=cfg).result()

    # Load from GCS. See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage
    elif source_name == BigQueryLoadSource.GCS:
        context.resources.bigquery.load_table_from_uri(
            source, destination, job_config=cfg).result()
def import_csv(self, bucket_name: str, bucket_path: str, dataset: str,
               table: str, sep: str = "\t") -> bool:
    logging.info(
        f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} ...")
    client = self._get_client()

    config = LoadJobConfig()
    config.autodetect = True
    config.field_delimiter = sep

    bucket_url = f"gs://{self.config.lake_path}/{bucket_path}"
    load_job = client.load_table_from_uri(
        bucket_url, f"{dataset}.{table}", job_config=config)
    result = load_job.result()

    logging.info(
        f"DataWarehouse.import_csv {bucket_path} to {dataset}.{table} Complete!")
    return True

def test_schema_setter_invalid_field(self):
    from google.cloud.bigquery.job import LoadJobConfig
    from google.cloud.bigquery.schema import SchemaField

    config = LoadJobConfig()
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    with self.assertRaises(ValueError):
        config.schema = [full_name, object()]
def populate_table(self, table_path, schema, data=[], make_immediately_available=False,
                   replace_existing_table=False):
    # type: (str, List[SchemaField], Optional[List[Any]], Optional[bool], Optional[bool]) -> None
    """Creates a table and populates it with a list of rows.

    If make_immediately_available is False, the table will be created using streaming inserts.
    Note that streaming inserts are immediately available for querying, but not for exporting or
    copying, so if you need that capability you should set make_immediately_available to True.
    https://cloud.google.com/bigquery/streaming-data-into-bigquery

    If the table is already created, it will raise a RuntimeError, unless replace_existing_table
    is True.

    Args:
        table_path: A string of the form '<dataset id>.<table name>' or
            '<project id>.<dataset id>.<table name>'.
        schema: A list of SchemaFields to represent the table's schema.
        data: A list of rows, each of which corresponds to a row to insert into the table.
        make_immediately_available: If False, the table won't immediately be available for
            copying or exporting, but will be available for querying. If True, after this
            operation returns, it will be available for copying and exporting too.
        replace_existing_table: If set to True, the table at table_path will be deleted and
            recreated if it's already present.

    Raises:
        RuntimeError if the table at table_path is already there and replace_existing_table
        is False.
    """
    # Use the Table object so we can pass through the schema.
    table = Table(self.get_table_reference_from_path(table_path), schema)
    if self.table_exists(table):
        if replace_existing_table:
            self.delete_table(table)
        else:
            raise RuntimeError('The table {} already exists.'.format(table_path))
    self.create_table(table)

    if data:
        if make_immediately_available:
            output = cStringIO.StringIO()

            csv_out = csv.writer(output)
            for row in data:
                csv_out.writerow(row)

            job_config = LoadJobConfig()
            # BigQuery load jobs expect the source format enum value for CSV input.
            job_config.source_format = 'CSV'

            # By default this does six retries. It does not accept any other timeout or
            # retry parameters.
            job = self.gclient.load_table_from_file(output, table.reference,
                                                    job_config=job_config, rewind=True)
            job.result()

            output.close()
        else:
            self._stream_chunks_of_rows(table, data, schema)
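# Hedged usage sketch (not from the source corpus): how a wrapper instance exposing
# populate_table above might be called. The `client_wrapper` argument, the table path
# and the sample rows are assumptions for illustration; SchemaField is the real
# google.cloud.bigquery class.
from google.cloud.bigquery import SchemaField

def example_populate(client_wrapper):
    # `client_wrapper` is assumed to be an instance of the class that defines
    # populate_table; that class is not shown in this corpus.
    schema = [
        SchemaField('name', 'STRING', mode='REQUIRED'),
        SchemaField('score', 'INTEGER'),
    ]
    rows = [('alice', 10), ('bob', 20)]
    # make_immediately_available=True routes through a file-based load job, so the
    # rows can be copied/exported right away instead of using streaming inserts.
    client_wrapper.populate_table(
        'my_project.my_dataset.my_table',
        schema,
        data=rows,
        make_immediately_available=True,
        replace_existing_table=True,
    )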
def test_schema_setter(self):
    from google.cloud.bigquery.job import LoadJobConfig
    from google.cloud.bigquery.schema import SchemaField

    config = LoadJobConfig()
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    config.schema = [full_name, age]
    self.assertEqual(config.schema, [full_name, age])

def __build_load_config(self, gcs_uri, file_format, schema_file, append):
    gcs_file_ext = os.path.splitext(gcs_uri)[1]
    if gcs_file_ext:
        file_format = gcs_file_ext.lstrip(".")

    jc = LoadJobConfig()
    jc.source_format = self.__get_bq_format(file_format)
    jc.schema = self.__load_schema(schema_file)
    jc.write_disposition = "WRITE_APPEND" if append else "WRITE_TRUNCATE"
    return jc, file_format

def _execute_load_in_source(context, source, source_name):
    destination = context.solid_config.get('destination')
    load_job_config = _preprocess_config(context.solid_config.get('load_job_config', {}))
    cfg = LoadJobConfig(**load_job_config) if load_job_config else None
    context.log.info(
        'executing BQ load with config: %s for source %s'
        % (cfg.to_api_repr() if cfg else '(no config provided)', source)
    )
    context.resources.bigquery.load_table_from_source(
        source_name, source, destination, job_config=cfg
    ).result()

def _process_response_rows_for_bigquery(self, dataframe: DataFrame, table_reference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = [
        self._get_schema_for_field(column)
        for column in list(dataframe.columns.values)
    ]

    load_job = self.bigquery.client.load_table_from_dataframe(
        dataframe, table_reference, job_config=job_config)
    load_job.result()
def _compute_fn(context, inputs):
    destination = context.solid_config.get('destination')
    load_job_config = _preprocess_config(context.solid_config.get('load_job_config', {}))
    cfg = LoadJobConfig(**load_job_config) if load_job_config else None
    context.log.info(
        'executing BQ load with config: %s'
        % (cfg.to_api_repr() if cfg else '(no config provided)')
    )
    # Note: `source` is not a parameter of this function; it is presumably captured
    # from the enclosing solid-definition scope in which _compute_fn is defined.
    context.resources.bq.load_table_from_source(
        source, inputs, destination, job_config=cfg
    ).result()

    yield Result(None)
def test_change_skip_leading_rows():
    loader = DsvDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.skip_leading_rows is None
    loader = loader.skip_leading_rows(1)
    assert loader.load_job_config.skip_leading_rows == 1

def test_change_field_delimiter():
    loader = DsvDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.field_delimiter is None
    loader = loader.with_field_delimiter("\t")
    assert loader.load_job_config.field_delimiter == "\t"

def test_change_null_marker():
    loader = DsvDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.null_marker is None
    loader = loader.with_null_marker("NULL")
    assert loader.load_job_config.null_marker == "NULL"

def test_change_encoding():
    loader = JsonDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.encoding is None
    loader = loader.with_encoding("UTF-8")
    assert loader.load_job_config.encoding == "UTF-8"

def test_change_quote_character():
    loader = DsvDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.quote_character is None
    loader = loader.with_quote_character("'")
    assert loader.load_job_config.quote_character == "'"

def test_change_to_partition():
    loader = BaseDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.partition is None
    loader = loader.to_partition("20201023")
    assert loader.partition == "20201023"

def test_begin_w_autodetect(self):
    from google.cloud.bigquery.job import LoadJobConfig

    path = "/projects/{}/jobs".format(self.PROJECT)
    resource = self._make_resource()
    resource["configuration"]["load"]["autodetect"] = True
    # Ensure None for missing server-set props
    del resource["statistics"]["creationTime"]
    del resource["etag"]
    del resource["selfLink"]
    del resource["user_email"]
    conn = _make_connection(resource)
    client = _make_client(project=self.PROJECT, connection=conn)
    config = LoadJobConfig()
    config.autodetect = True
    job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client, config)
    with mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    ) as final_attributes:
        job._begin()

    final_attributes.assert_called_with({"path": path}, client, job)

    sent = {
        "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
        "configuration": {
            "load": {
                "sourceUris": [self.SOURCE1],
                "destinationTable": {
                    "projectId": self.PROJECT,
                    "datasetId": self.DS_ID,
                    "tableId": self.TABLE_ID,
                },
                "autodetect": True,
            }
        },
    }
    conn.api_request.assert_called_once_with(
        method="POST", path=path, data=sent, timeout=None)
    self._verifyResourceProperties(job, resource)
def _make_load_job_config(source_format,  # type: str
                          write_disposition,  # type: str
                          schema=None,  # type: Optional[List[SchemaField]]
                          skip_leading_row=False,  # type: bool
                          ):
    """Makes and returns a LoadJobConfig according to the passed-in parameters.

    Args:
        source_format: Should be a recognized BigQuery source format. See
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.sourceFormat
        write_disposition: Should be a recognized BigQuery write disposition. See
            https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.writeDisposition
        schema: A list of SchemaFields. If unset, BigQuery will try to infer a schema.
        skip_leading_row: If True, the first row of the file loaded in will be skipped.
    """
    job_config = LoadJobConfig()
    job_config.source_format = source_format
    job_config.write_disposition = write_disposition
    if schema:
        job_config.schema = schema
    else:
        job_config.autodetect = True
    if skip_leading_row:
        job_config.skip_leading_rows = 1
    return job_config
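# Hedged usage sketch (not from the source corpus): building a LoadJobConfig with the
# helper above and handing it to a load job. The bucket URI and destination table id
# are illustrative assumptions; SourceFormat, WriteDisposition, Client and
# load_table_from_uri are the real google.cloud.bigquery APIs.
from google.cloud import bigquery

def example_load(client: bigquery.Client):
    job_config = _make_load_job_config(
        source_format=bigquery.SourceFormat.CSV,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        schema=None,            # let BigQuery autodetect the schema
        skip_leading_row=True,  # skip the CSV header row
    )
    load_job = client.load_table_from_uri(
        'gs://example-bucket/data.csv',    # assumed source URI
        'my_project.my_dataset.my_table',  # assumed destination table
        job_config=job_config,
    )
    load_job.result()  # block until the load job completes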
def update_or_create_table_from_csv(
        client: bigquery.Client,
        table_name: str,
        source_file: str,
        dataset: str,
        source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name, [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(
            source_fp, destination=table_ref, job_config=job_config)

    # wait for job to complete
    load_job.result()

    LOGGER.info("updated config table: %s", table_ref.table_id)
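# Hedged usage sketch (not from the source corpus): invoking update_or_create_table_from_csv
# above with a default client. The project id, dataset name, file paths and schema file
# are illustrative assumptions; if the schema file is missing, the function falls back
# to schema autodetection.
from google.cloud import bigquery

def example_update_config_table():
    client = bigquery.Client(project='my-project')  # assumed project id
    update_or_create_table_from_csv(
        client,
        table_name='config_table',
        source_file='data/config_table.csv',           # assumed local CSV
        dataset='configuration',
        source_schema_file='schema/config_table.json',  # assumed schema file
    )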
def test_change_ignore_unknown_values():
    loader = BaseDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.ignore_unknown_values is None
    loader = loader.ignore_unknown_values()
    assert loader.load_job_config.ignore_unknown_values is True
    loader = loader.ignore_unknown_values(False)
    assert loader.load_job_config.ignore_unknown_values is False

def test_constructor():
    loader = DsvDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.table is None
    assert loader.from_ is None
    assert loader.load_job_config is not None
    assert loader.load_job_config.source_format == SourceFormat.CSV
    assert loader._bq_client is None

def test_change_allow_quoted_newlines():
    loader = DsvDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.allow_quoted_newlines is None
    loader = loader.allow_quoted_newlines()
    assert loader.load_job_config.allow_quoted_newlines is True
    loader = loader.allow_quoted_newlines(False)
    assert loader.load_job_config.allow_quoted_newlines is False

def test_constructor():
    loader = JsonDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.table is None
    assert loader.from_ is None
    assert loader.load_job_config is not None
    assert loader.load_job_config.source_format == SourceFormat.NEWLINE_DELIMITED_JSON
    assert loader._bq_client is None

def test_change_autodetect():
    loader = JsonDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.autodetect is None
    loader = loader.autodetect()
    assert loader.load_job_config.autodetect is True
    loader = loader.autodetect(False)
    assert loader.load_job_config.autodetect is False

def _process_responses_for_bigquery(self, responses: Sequence[dict],
                                    schema: Sequence[SchemaField],
                                    table_reference: TableReference):
    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='request_date')
    job_config.schema = schema

    for response in responses:
        if 'request_date' in response:
            response['request_date'] = response['request_date'].strftime(
                '%Y-%m-%dT%H:%M:%S.%f')
        if 'date' in response:
            response['date'] = response['date'].strftime(
                '%Y-%m-%dT%H:%M:%S.%f')

    load_job = self.bigquery.client.load_table_from_json(
        responses, table_reference, job_config=job_config)
    load_job.result()

def test_ctor_w_config(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.job import LoadJobConfig

    client = _make_client(project=self.PROJECT)
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    config = LoadJobConfig()
    config.schema = [full_name, age]
    job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client, config)
    self.assertEqual(job.schema, [full_name, age])

    config.destination_table_description = "Description"
    expected = {"description": "Description"}
    self.assertEqual(
        config._properties["load"]["destinationTableProperties"], expected)

    friendly_name = "Friendly Name"
    config._properties["load"]["destinationTableProperties"] = {
        "friendlyName": friendly_name
    }
    self.assertEqual(config.destination_table_friendly_name, friendly_name)
def test_change_write_disposition():
    loader = BaseDataLoader(table=None, from_=None, load_job_config=LoadJobConfig(), bq_client=None)
    assert loader.load_job_config.write_disposition is None
    loader = loader.overwrite()
    assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_TRUNCATE
    loader = loader.append()
    assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_APPEND
    loader = loader.error_if_exists()
    assert loader.load_job_config.write_disposition == WriteDisposition.WRITE_EMPTY
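# Hedged sketch (not from the source corpus): one plausible way the fluent setters
# exercised by the loader tests above could be implemented. The attribute names follow
# the DsvDataLoader constructor in this corpus; `_FluentWriteDispositionMixin` and
# `_replace_config` are hypothetical names introduced only for this illustration.
from copy import deepcopy
from google.cloud.bigquery.job import WriteDisposition

class _FluentWriteDispositionMixin:
    def _replace_config(self, **changes):
        # Return a new loader holding an updated copy of the load job config,
        # leaving the original loader untouched (the tests rebind `loader`).
        new_config = deepcopy(self.load_job_config)
        for key, value in changes.items():
            setattr(new_config, key, value)
        return type(self)(table=self.table, partition=self.partition, from_=self.from_,
                          bq_client=self._bq_client, load_job_config=new_config)

    def overwrite(self):
        return self._replace_config(write_disposition=WriteDisposition.WRITE_TRUNCATE)

    def append(self):
        return self._replace_config(write_disposition=WriteDisposition.WRITE_APPEND)

    def error_if_exists(self):
        return self._replace_config(write_disposition=WriteDisposition.WRITE_EMPTY)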
def _process_response_rows_for_bigquery(self, rows: Sequence[dict],
                                        methods: Sequence[dict],
                                        table_reference: TableReference):
    rows_dataframe = DataFrame.from_records(rows)
    rows_dataframe['date'] = rows_dataframe['date'].apply(lambda x: x.date())

    job_config = LoadJobConfig()
    job_config.write_disposition = WriteDisposition.WRITE_APPEND
    job_config.time_partitioning = TimePartitioning(
        type_=TimePartitioningType.DAY, field='date')
    job_config.schema = [
        self._get_schema_for_field(column, methods)
        for column in list(rows_dataframe.columns.values)
    ]

    try:
        load_job = self.bigquery.client.load_table_from_dataframe(
            rows_dataframe, table_reference, job_config=job_config)
        load_job.result()
    except BadRequest as error:
        print(error.errors)
def __init__(self, *, table, partition: Optional[str] = None, from_: PackageFileLoader,
             bq_client: Client, load_job_config: LoadJobConfig = LoadJobConfig()):
    """Constructor of DsvDataLoader.

    Args:
        table (Table): table to load data into.
        from_ (PackageFileLoader): specifies where data is.
        bq_client (Client): instance of bigquery client to use across the DSL.
        partition (Optional[str], optional): if you plan to load into a specific partition.
            Used as a table partition decorator. Defaults to None.
        load_job_config (LoadJobConfig, optional): Big Query load job config.
            This is the object updated by this DSL. Defaults to LoadJobConfig().
    """
    _load_job_config = deepcopy(load_job_config)
    _load_job_config.source_format = SourceFormat.CSV
    super().__init__(table=table, partition=partition, from_=from_,
                     bq_client=bq_client, load_job_config=_load_job_config)
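# Hedged usage sketch (not from the source corpus): chaining the DSL methods tested above
# to load a tab-separated file into a partition. The fixture path, the table object and
# the PackageFileLoader constructor arguments are assumptions about the surrounding
# test kit; only the chained methods themselves appear in this corpus.
def example_dsv_load(table, bq_client):
    loader = (
        DsvDataLoader(table=table,
                      from_=PackageFileLoader('tests/fixtures/users.csv'),  # assumed fixture
                      bq_client=bq_client)
        .skip_leading_rows(1)
        .with_field_delimiter('\t')
        .with_null_marker('NULL')
        .overwrite()
        .to_partition('20201023')
    )
    # The loader is assumed to expose a terminal method (e.g. `load()`) that submits
    # the configured load job; that method is not shown in this corpus.
    loader.load()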