def load_task():
    # Note: dags_folder, task, file_format, allow_quoted_newlines,
    # output_bucket, and dataset_name_raw are expected to be defined in the
    # enclosing scope; read_bigquery_schema_from_file and submit_bigquery_job
    # are project helpers.
    client = Client()
    job_config = LoadJobConfig()
    schema_path = os.path.join(
        dags_folder, 'resources/stages/raw/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = (SourceFormat.CSV if file_format == 'csv' else
                                SourceFormat.NEWLINE_DELIMITED_JSON)
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True
    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_location_uri, task=task,
        file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'
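# Usage sketch for load_task, which reads its configuration from the
# enclosing scope rather than from parameters. Everything below is
# illustrative: the bucket, dataset, and task names are placeholders, and the
# project helpers it calls are assumed to be defined elsewhere.
import os
from google.cloud.bigquery import Client, LoadJobConfig, SourceFormat

dags_folder = os.environ.get('DAGS_FOLDER', '/home/airflow/gcs/dags')
task = 'blocks'                      # placeholder task name
file_format = 'csv'                  # 'csv' or 'json'
allow_quoted_newlines = False
output_bucket = 'my-export-bucket'   # placeholder bucket
dataset_name_raw = 'crypto_raw'      # placeholder dataset

# load_task()  # submits the load job and blocks until it finishes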
def load_batches(gcs_client: storage.Client, bq_client: bigquery.Client,
                 gsurl: str, load_config: bigquery.LoadJobConfig, job_id: str,
                 table: bigquery.TableReference):
    """Orchestrate one or more load jobs based on the number of URIs and the
    total byte size of the objects at gsurl."""
    batches = get_batches_for_gsurl(gcs_client, gsurl)
    jobs: List[Tuple[bigquery.TableReference, bigquery.LoadJob]] = []
    for batch_num, batch in enumerate(batches):
        # None is passed to the destination parameter below because the
        # load_config object contains the destination information.
        # BigQuery job IDs must be unique per project, so suffix the job ID
        # with the batch number when there is more than one batch.
        batch_job_id = job_id if len(batches) == 1 else f'{job_id}-{batch_num}'
        job: bigquery.LoadJob = bq_client.load_table_from_uri(
            batch, None, job_config=load_config, job_id=batch_job_id)
        jobs.append((table, job))
        logging.log_bigquery_job(  # project-specific logging helper
            job, table,
            f"Submitted asynchronous bigquery load job: {job.job_id}")
    # Poll briefly so jobs that fail quickly surface an error here.
    start_poll_for_errors = time.monotonic()
    while time.monotonic() - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS:
        for table_ref, job in jobs:
            job.reload(client=bq_client)
            check_for_bq_job_and_children_errors(bq_client, job, table_ref)
        time.sleep(constants.JOB_POLL_INTERVAL_SECONDS)
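# Usage sketch for load_batches (hypothetical wiring): the function expects
# the destination to live inside load_config, so one way to build such a
# config is from an API representation that includes destinationTable. All
# identifiers below (project, dataset, table, bucket) are illustrative
# assumptions, as are the constants module values the function polls with.
from google.cloud import bigquery, storage

load_config = bigquery.LoadJobConfig.from_api_repr({
    'load': {
        'sourceFormat': 'NEWLINE_DELIMITED_JSON',
        'writeDisposition': 'WRITE_TRUNCATE',
        'destinationTable': {
            'projectId': 'my-project',
            'datasetId': 'my_dataset',
            'tableId': 'my_table',
        },
    }
})
table = bigquery.TableReference.from_string('my-project.my_dataset.my_table')
# load_batches(storage.Client(), bigquery.Client(), 'gs://my-bucket/prefix/',
#              load_config, 'my-job-id', table)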
def load_gcs_file(
    client: bigquery.Client,
    uri: str,
    table_ref: bigquery.table.TableReference,
    job_config: bigquery.LoadJobConfig,
    # job_id: str = str(datetime.datetime.now()).replace(' ', ''),
):
    """Load data from a GCS URI into a BigQuery table and wait for completion.

    Args:
        client: An authenticated BigQuery client.
        uri: A gs:// URI (wildcards allowed) pointing at the source file(s).
        table_ref: Reference to the destination table.
        job_config: Load job configuration (schema, source format, etc.).

    Returns:
        None. Raises if the load job fails.
    """
    load_job = client.load_table_from_uri(
        source_uris=uri,
        destination=table_ref,
        # job_id=job_id,
        job_id_prefix='lgs-',
        job_config=job_config,
    )
    tic = time.time()
    print('Starting job {}'.format(load_job.job_id))
    load_job.result()  # blocks until the job completes
    print('Job took {} seconds'.format(time.time() - tic))
    assert load_job.state == 'DONE'
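# Usage sketch for load_gcs_file; the project, dataset, table, and bucket
# names are placeholders, and schema autodetection is an assumption made for
# brevity here rather than part of the function above.
import time
from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,  # infer the schema from the data (sketch assumption)
)
table_ref = bigquery.TableReference.from_string('my-project.my_dataset.my_table')
# load_gcs_file(client, 'gs://my-bucket/data/*.csv', table_ref, job_config)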
def ingest_by_uri(cl: bigquery.Client, destination: str, uri: str,
                  schema: List[Any]) -> bigquery.LoadJob:
    c = bigquery.LoadJobConfig(
        source_format="NEWLINE_DELIMITED_JSON",
        write_disposition="WRITE_TRUNCATE",
        schema=schema,
    )
    return cl.load_table_from_uri(uri, destination=destination, job_config=c)
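# Usage sketch for ingest_by_uri (hypothetical table and schema). A plain
# "project.dataset.table" string is a valid destination for
# load_table_from_uri.
from typing import Any, List
from google.cloud import bigquery

schema: List[Any] = [
    bigquery.SchemaField('id', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('created_at', 'TIMESTAMP'),
]
# job = ingest_by_uri(bigquery.Client(), 'my-project.my_dataset.events',
#                     'gs://my-bucket/events/*.json', schema)
# job.result()  # wait for completion; raises on failure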
def load_bigquery_table_via_bq_apis(bq_client: bigquery.Client, dataset_id,
                                    table_name, imported_data_info, src_uris):
    """
    Load tables using BigQuery Load jobs, using the same configuration as
    BQ DTS ImportedDataInfo
    :return: the submitted bigquery.LoadJob
    """
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/client.html#Client.load_table_from_uri
    # Step 1 - Translate required fields for BigQuery Python SDK
    tgt_tabledef = imported_data_info['table_defs'][0]

    # Step 2 - Create target table if it doesn't exist
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    try:
        bq_client.get_table(table_ref)
    except exceptions.NotFound:
        # Step 2a - Attach schema
        tgt_schema = RPCRecordSchema_to_GCloudSchema(tgt_tabledef['schema'])
        tgt_table = bigquery.Table(table_ref, schema=tgt_schema)

        # Step 2b - Attach description
        tgt_table.description = imported_data_info[
            'destination_table_description']

        # Step 2c - Conditionally set partitioning type, stripping the
        # partition decorator (e.g. 'table$20240101') from the table ID
        if '$' in table_name:
            tgt_table.partitioning_type = 'DAY'
            tgt_table._properties['tableReference'][
                'tableId'], _, _ = table_name.partition('$')

        # Step 2d - Create BigQuery table
        bq_client.create_table(tgt_table)

    # Step 3a - Create BigQuery Load Job ID
    current_datetime = datetime.datetime.utcnow().isoformat()
    raw_job_id = f'{table_name}_{current_datetime}'
    clean_job_id = BQ_JOB_ID_MATCHER.sub('___', raw_job_id)

    # Step 3b - Create BigQuery Job Config
    job_config = DTSTableDefinition_to_BQLoadJobConfig(tgt_tabledef)

    # Step 4 - Execute BigQuery Load Job using Python SDK
    load_job = bq_client.load_table_from_uri(source_uris=src_uris,
                                             destination=table_ref,
                                             job_id=clean_job_id,
                                             job_config=job_config)
    return load_job
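# Usage sketch for load_bigquery_table_via_bq_apis. The exact shape of
# imported_data_info follows the BQ DTS ImportedDataInfo message, so the dict
# below is only a hypothetical skeleton with placeholder values; the
# translation helpers (RPCRecordSchema_to_GCloudSchema,
# DTSTableDefinition_to_BQLoadJobConfig) and BQ_JOB_ID_MATCHER are assumed to
# be defined elsewhere in the module.
import datetime
from google.cloud import bigquery
from google.cloud import exceptions

imported_data_info = {
    'destination_table_description': 'Example destination table',
    'table_defs': [{
        'schema': {'fields': []},  # placeholder RPC-style record schema
    }],
}
# job = load_bigquery_table_via_bq_apis(
#     bigquery.Client(), 'my_dataset', 'my_table$20240101',
#     imported_data_info, ['gs://my-bucket/my_table/*.csv'])
# job.result()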
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name))

    table_blobs = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        table for table in VOCABULARY_TABLES if table not in table_blobs
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}')

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)

        # ignore any non-vocabulary files
        if table_name not in VOCABULARY_TABLES:
            continue

        destination = dst_dataset.table(table_name)
        safe_schema = safe_schema_for(table_name)
        job_config = LoadJobConfig()
        job_config.schema = safe_schema
        job_config.skip_leading_rows = 1
        job_config.field_delimiter = FIELD_DELIMITER
        job_config.max_bad_records = MAX_BAD_RECORDS
        job_config.source_format = 'CSV'
        job_config.quote_character = ''
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(source_uri,
                                                 destination,
                                                 job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
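# Usage sketch for load_stage (illustrative identifiers). VOCABULARY_TABLES,
# FIELD_DELIMITER, MAX_BAD_RECORDS, safe_schema_for, and
# _filename_to_table_name are assumed to be module-level definitions.
from google.cloud import bigquery, storage

# dst_dataset = bigquery.Dataset('my-project.vocabulary')
# jobs = load_stage(dst_dataset, bigquery.Client(), 'my-vocab-bucket',
#                   storage.Client())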
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
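# Usage sketch for load_folder (illustrative identifiers). AOU_REQUIRED,
# get_table_schema, and _filename_to_table_name are assumed to be defined
# elsewhere; BQClient and GCSClient read as aliases for the BigQuery and
# Cloud Storage client classes.
from google.cloud import bigquery, storage

# jobs = load_folder('my_dataset', bigquery.Client(), 'my-archive-bucket',
#                    'site_uploads/2024-01-01/', storage.Client(), 'hpo_a')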