def main(event, context):
    """Entrypoint for Cloud Function"""
    data = base64.b64decode(event['data'])
    upstream_bq_dts_obj = json.loads(data)
    error = upstream_bq_dts_obj.get('errorStatus')
    if error:
        logging.error(
            RuntimeError(f"Error in upstream query job: {error['message']}."))
    else:
        project_id = get_env('PROJECT_ID')
        dataset_id = upstream_bq_dts_obj['destinationDatasetId']
        table_name = upstream_bq_dts_obj['params'][
            'destination_table_name_template']
        schedule_time = upstream_bq_dts_obj['scheduleTime']

        bq_client = bigquery.Client(client_info=CLIENT_INFO)

        dataset_ref = bigquery.DatasetReference.from_string(
            dataset_id, default_project=project_id)
        table_ref = dataset_ref.table(table_name)
        destination_uri = get_destination_uri(schedule_time)
        extract_config = bigquery.ExtractJobConfig(
            compression=get_env('COMPRESSION'),
            destination_format=get_env('DEST_FMT'),
            field_delimiter=get_env('FIELD_DELIMITER'),
            use_avro_logical_types=get_env('USE_AVRO_TYPES'))
        bq_client.extract_table(table_ref,
                                destination_uri,
                                job_id_prefix="email_export_",
                                job_config=extract_config)
        print(
            f"Exporting {project_id}:{dataset_id}.{table_name} to {destination_uri}"
        )
def _extract_bq_table(project_id, dataset_id, table_id, bucket_name,
                      facturation_project_id):
    work_directory = str(uuid.uuid4())
    facturation_project_id = facturation_project_id or project_id

    # Prepare extract job
    client = bigquery.Client(project=facturation_project_id)
    dataset_ref = client.dataset(dataset_id, project=project_id)
    table_ref = dataset_ref.table(table_id)
    gs_uri = "gs://{}/{}/part_*.csv.gz".format(bucket_name, work_directory)
    extract_conf = bigquery.ExtractJobConfig()
    extract_conf.compression = 'GZIP'
    extract_conf.destination_format = 'CSV'
    extract_conf.print_header = False

    # Ensure bucket exists
    location = client.get_dataset(dataset_ref).location
    _ensure_bucket(project_id, bucket_name, location)

    print('Extracting table %s to %s' % (table_ref, gs_uri))
    extract_job = client.extract_table(table_ref, gs_uri,
                                       job_config=extract_conf)
    extract_job.result()
    _check_job_status(extract_job)
    return work_directory
def _upload_table_to_gcs(
    table,
    bucket,
    gcs_path,
    experiment_slug,
    table_name,
    source_project,
    client,
    storage_client,
):
    """Export the provided table reference to GCS as JSON."""
    # add a random string to the identifier to prevent collision errors if there
    # happen to be multiple instances running that export data for the same experiment
    tmp = "".join(random.choices(string.ascii_lowercase, k=8))
    destination_uri = (
        f"gs://{bucket}/{gcs_path}/{experiment_slug}_{table_name}_{tmp}.ndjson"
    )

    print(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(table,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, gcs_path, experiment_slug, table_name,
                            storage_client, tmp)
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        """
    )
    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
def _publish_table_as_json(self, result_table):
    """Export the `result_table` data as JSON to Cloud Storage."""
    prefix = (f"api/{self.api_version}/tables/{self.dataset}/"
              f"{self.table}/{self.version}/files/")

    if self.date is not None:
        # if date exists, then query is incremental and newest results are exported
        prefix += f"{self.date}/"

    logging.info(
        f"""Export JSON for {result_table} to {self.stage_gcs_path}""")

    table_ref = self.client.get_table(result_table)

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"

    # "*" makes sure that files larger than 1GB get split up into JSON files
    # files are written to a stage directory first
    destination_uri = (f"gs://{self.target_bucket}/" + self.stage_gcs_path +
                       "*.ndjson")
    extract_job = self.client.extract_table(table_ref,
                                            destination_uri,
                                            location="US",
                                            job_config=job_config)
    extract_job.result()

    self._gcp_convert_ndjson_to_json(prefix)
def save_table_to_storage(bq_table_id,
                          bq_project_id='freestyle-libre-app',
                          bq_dataset_id='tmp',
                          cred_file=None):
    print('saving file to storage...')
    PROJECT_ID = bq_project_id
    CREDENTIALS = get_credentials(cred_file)

    yesterday_dt = datetime.datetime.today() - datetime.timedelta(days=1)
    yesterday_str = datetime.datetime.strftime(yesterday_dt, '%Y%m%d')

    gcs_bucket = 'adcpipeline.appspot.com/digital-exhibit/data/fsll/nfc_scan_success'
    gcs_filename = '{filename}.json'.format(filename='scan_data')
    gcs_destination_uri = 'gs://{}/{}'.format(gcs_bucket, gcs_filename)

    bq_client = bigquery.Client(project=PROJECT_ID, credentials=CREDENTIALS)
    dataset_ref = bq_client.dataset(bq_dataset_id)
    table_ref = dataset_ref.table(bq_table_id)

    extract_config = bigquery.ExtractJobConfig()
    # extract_config.compression = 'NONE'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'

    extract_job = bq_client.extract_table(table_ref,
                                          gcs_destination_uri,
                                          job_id_prefix='exhibit',
                                          job_config=extract_config,
                                          location='US')  # API request
    extract_job.result()  # Waits for job to complete.

    print('Exported {}.{}.{} to {}'.format(bq_project_id, bq_dataset_id,
                                           bq_table_id, gcs_destination_uri))
    return extract_job
def table_to_cloud_storage(self,
                           dataset_id,
                           table_id,
                           bucket_name,
                           filename,
                           job_config=None,
                           export_format="csv",
                           compression_format="gz",
                           location="US",
                           **kwargs):
    """Extract a table from BigQuery and send to GoogleStorage"""
    complete_filename = self._complete_filename(filename, export_format,
                                                compression_format)
    destination_uri = "gs://{}/{}".format(bucket_name, complete_filename)
    table = self._client.dataset(dataset_id).table(table_id)

    job_config = job_config if job_config else bigquery.ExtractJobConfig()
    job_config.compression = self.COMPRESSION_FORMATS.get(compression_format)
    job_config.destination_format = self.FILE_FORMATS.get(export_format)

    return self._client.extract_table(table,
                                      destination_uri,
                                      location=location,
                                      job_config=job_config,
                                      **kwargs).result()
def bq_to_bucket_tsv(src_table, project, dataset, bucket_name, bucket_file,
                     do_batch, do_header):
    """
    Get a BQ Result to a Bucket TSV file
    Export BQ table to a cloud bucket
    """
    client = bigquery.Client()
    destination_uri = "gs://{}/{}".format(bucket_name, bucket_file)
    dataset_ref = client.dataset(dataset, project=project)
    table_ref = dataset_ref.table(src_table)

    job_config = bigquery.ExtractJobConfig()
    if do_batch:
        job_config.priority = bigquery.QueryPriority.BATCH
    location = 'US'

    job_config.field_delimiter = '\t'
    job_config.print_header = do_header

    extract_job = client.extract_table(table_ref,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)  # Query

    job_state = 'NOT_STARTED'
    while job_state != 'DONE':
        extract_job = client.get_job(extract_job.job_id, location=location)
        print('Job {} is currently in state {}'.format(extract_job.job_id,
                                                       extract_job.state))
        job_state = extract_job.state
        if job_state != 'DONE':
            time.sleep(5)
    print('Job {} is done'.format(extract_job.job_id))

    extract_job = client.get_job(extract_job.job_id, location=location)
    if extract_job.error_result is not None:
        print('Error result!! {}'.format(extract_job.error_result))
        return False
    return True
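# A minimal usage sketch for bq_to_bucket_tsv above, assuming the function is
# importable from its module. The project, dataset, table, and bucket names are
# hypothetical placeholders; the function creates its own bigquery.Client, so
# no client needs to be passed in.
ok = bq_to_bucket_tsv(src_table="my_results",
                      project="my-project",
                      dataset="my_dataset",
                      bucket_name="my-bucket",
                      bucket_file="exports/my_results.tsv",
                      do_batch=False,
                      do_header=True)
print("Export succeeded" if ok else "Export failed")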
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._tblname, self._bucket, self._dest_dir]
    )
    valid()

    os.makedirs(self._dest_dir, exist_ok=True)

    if isinstance(self._credentials, str):
        self._logger.warning(
            (
                "DeprecationWarning: "
                "In the near future, "
                "the `credentials` will be changed to accept only dictionary types. "
            )
        )
        key_filepath = self._credentials
    else:
        key_filepath = self._source_path_reader(self._credentials)
    gbq_client = BigQuery.get_bigquery_client(key_filepath)
    gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

    gcs_client = Gcs.get_gcs_client(key_filepath)
    gcs_bucket = gcs_client.bucket(self._bucket)

    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    """
    gcs dir -> gs://{bucket_name}
                  /{dataset_name}/{table_name}
                  /{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
    """
    if self._filename:
        dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
    else:
        dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

    # job config settings
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV

    # Execute the extract job.
    job = gbq_client.extract_table(
        gbq_ref, dest_gcs, job_config=job_config, location=self._location
    )
    job.result()

    # Download from gcs
    for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary files
    for blob in gcs_client.list_blobs(gcs_bucket, prefix=prefix):
        blob.delete()
def GBQTableToGCS(self, **kwargs):
    self.bucket = kwargs.get('bucket', None)
    self.destination = kwargs.get('destination', None)
    self.dataset = kwargs.get('dataset', None)
    self.table = kwargs.get('table', None)
    self.field_delimiter = kwargs.get('field_delimiter', ',')

    client = bigquery.Client()
    self.destination_uri = f'gs://{self.bucket}/{self.destination}'
    self.dataset_ref = client.dataset(self.dataset)
    self.table_ref = self.dataset_ref.table(self.table)

    job_config = bigquery.ExtractJobConfig()
    job_config.field_delimiter = self.field_delimiter

    if helpers.isLocationArgVersion():
        extract_job = client.extract_table(self.table_ref,
                                           self.destination_uri,
                                           location='US',
                                           job_config=job_config)
        extract_job.result()
    else:
        extract_job = client.extract_table(self.table_ref,
                                           self.destination_uri,
                                           job_config=job_config)
        extract_job.result()
    return extract_job
def export_table(bq_client, table_ref, dest_uri, dest_fmt):
    """Run the extract job to export the given table to the given destination
    and wait for completion."""
    job_config = bigquery.ExtractJobConfig(destination_format=dest_fmt)
    extract_job = bq_client.extract_table(table_ref,
                                          dest_uri,
                                          location='US',
                                          job_config=job_config)
    extract_job.result()
    logging.info("Exported %s to %s", table_ref.table_id, dest_uri)
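# A minimal usage sketch for export_table above, assuming the function is in
# scope. The project, dataset, table, and bucket names are hypothetical
# placeholders; the wildcard URI lets BigQuery shard large exports.
from google.cloud import bigquery

bq_client = bigquery.Client()
table_ref = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
export_table(bq_client,
             table_ref,
             "gs://my-bucket/exports/my_table-*.csv",
             bigquery.DestinationFormat.CSV)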
def get_extract_config(file_name):
    jc = bq.ExtractJobConfig()
    compression = bq.job.Compression.GZIP if fnmatch(file_name, '*.gz') \
        else bq.job.Compression.NONE
    jc.compression = compression
    jc.destination_format = bq.job.DestinationFormat.CSV
    return jc
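# A short usage sketch for get_extract_config above: compression is driven
# entirely by the requested file name, so a ".gz" suffix yields a GZIP CSV
# extract and anything else an uncompressed one. The project, dataset, table,
# and bucket names are hypothetical placeholders, and `bq` is assumed to be the
# google.cloud.bigquery module, as in the snippet above.
from google.cloud import bigquery as bq

client = bq.Client()
file_name = "daily_report.csv.gz"
config = get_extract_config(file_name)  # Compression.GZIP because of the .gz suffix
client.extract_table("my-project.my_dataset.daily_report",
                     f"gs://my-bucket/{file_name}",
                     job_config=config).result()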
def config_job(self,
               destination_format='CSV',
               field_delimiter=',',
               print_header=True):
    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = destination_format
    job_config.field_delimiter = field_delimiter
    job_config.print_header = print_header
    return job_config
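# A hedged usage sketch for config_job above: the returned ExtractJobConfig can
# be passed straight to Client.extract_table. `exporter` stands in for an
# instance of whatever class defines config_job, and the table and bucket names
# are hypothetical placeholders.
from google.cloud import bigquery

client = bigquery.Client()
tsv_config = exporter.config_job(destination_format='CSV',
                                 field_delimiter='\t',
                                 print_header=False)
client.extract_table("my-project.my_dataset.my_table",
                     "gs://my-bucket/exports/my_table.tsv",
                     job_config=tsv_config).result()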
def read_file(bucket, storage_client=storage_client, bq_client=bq_client):
    ds_ref = bq_client.dataset('pysearchml')
    bq_client.create_dataset(ds_ref, exists_ok=True)
    table_id = 'es_docs'
    table_ref = ds_ref.table(table_id)
    bucket_obj = storage_client.bucket(bucket)
    if not bucket_obj.exists():
        bucket_obj.create()

    # Query GA data
    query_path = PATH / f'{args.model_name}' / 'ga_data.sql'
    query = open(str(query_path)).read()
    print(query)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    job_config.maximum_bytes_billed = 10 * (1024 ** 3)
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()

    # export BigQuery table to GCS
    destination_uri = f'gs://{bucket}/es_docs.gz'
    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref, destination_uri,
                                  job_config=extract_config)
    job.result()

    # Download data
    blob = bucket_obj.blob('es_docs.gz')
    file_obj = gzip.io.BytesIO()
    blob.download_to_file(file_obj)
    file_obj.seek(0)

    c = 0
    for row in gzip.GzipFile(fileobj=file_obj, mode='rb'):
        row = json.loads(row)
        yield {
            '_index': index,
            '_source': row,
            '_id': row['sku']
        }
        c += 1
        if not c % 1000:
            print(c)

    # Delete BQ Table
    bq_client.delete_table(table_ref)
def _dataset_to_bucket_job(self, dataset_to_bucket_config):
    config = dataset_to_bucket_config
    source = self._build_table_id(config.data_name)
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = 'GZIP'
    destination_uri = (self._blob_uri_prefix + config.data_name + '-*.csv.gz')
    job_config.field_delimiter = self._separator
    job = self._bq_client.extract_table(source=source,
                                        destination_uris=destination_uri,
                                        job_config=job_config)
    return job
def bqtable2gs(self,
               dataset_name,
               table_name,
               bucket,
               gspath,
               file_name,
               file_format=CSV,
               compression=False,
               ext='.gzip'):
    bq_client = self.client(self.project_id)
    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = file_format
    dataset_ref = bq_client.dataset(dataset_name, project=self.bq_project_id)
    if compression:
        job_config.compression = 'GZIP'
        file_name = file_name + ext
    destination_uri = 'gs://' + path.join(bucket, gspath, file_name)
    extract_job = \
        bq_client.extract_table(dataset_ref.table(table_name),
                                destination_uri,
                                job_config=job_config)
    return extract_job.result(), destination_uri
def _extract(self):
    job_config = bigquery.ExtractJobConfig(**{
        "compression": "GZIP",
        "destination_format": "CSV",
    })
    name = os.path.join(self.gcs_prefix, self.table_ref.table_id)
    job = self.bq_client.extract_table(self.table_ref,
                                       f'gs://{self.bucket}/{name}*.csv.gz',
                                       job_config=job_config)
    log.info("Waiting for BigQuery Table Extract job to finish...")
    result = job.result()
    # result.destination_uri_file_counts
    log.info("BigQuery job finished.")
def execute(self, *args):
    super().execute()
    valid = EssentialParameters(
        self.__class__.__name__,
        [self._tblname, self._bucket, self._dest_dir])
    valid()

    os.makedirs(self._dest_dir, exist_ok=True)

    gbq_client = bigquery.Client.from_service_account_json(self._credentials)
    gbq_ref = gbq_client.dataset(self._dataset).table(self._tblname)

    gcs_client = storage.Client.from_service_account_json(self._credentials)
    gcs_bucket = gcs_client.get_bucket(self._bucket)

    ymd_hms = datetime.now().strftime("%Y%m%d%H%M%S%f")
    path = "%s-%s" % ("".join(random.choices(string.ascii_letters, k=8)), ymd_hms)
    prefix = "%s/%s/%s" % (self._dataset, self._tblname, path)

    # gcs dir -> gs://{bucket_name}/{dataset_name}/{table_name}/{XXXXXXXX}-{yyyyMMddHHmmssSSS}/*.csv.gz
    if self._filename:
        dest_gcs = "gs://%s/%s/%s*.csv.gz" % (self._bucket, prefix, self._filename)
    else:
        dest_gcs = "gs://%s/%s/*.csv.gz" % (self._bucket, prefix)

    # job config settings
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.CSV

    # Execute the extract job.
    job = gbq_client.extract_table(gbq_ref,
                                   dest_gcs,
                                   job_config=job_config,
                                   location=self._location)
    job.result()

    # Download from gcs
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        dest = os.path.join(self._dest_dir, os.path.basename(blob.name))
        blob.download_to_filename(dest)

    # Cleanup temporary files
    for blob in gcs_bucket.list_blobs(prefix=prefix):
        blob.delete()
def store_table(table_ref, client, destination_format, compression,
                destination_uri):
    job_config = bq.ExtractJobConfig()
    job_config.compression = bq.Compression.GZIP if compression else bq.Compression.NONE
    job_config.destination_format = destination_format

    destination_format_ = destination_format.split("_")[-1]
    extension = "." + destination_format_.lower() + f"{'.gz' if compression else ''}"
    destination_uri = destination_uri + extension

    with msg.loading():
        client.extract_table(source=table_ref,
                             destination_uris=destination_uri,
                             job_config=job_config).result()
    msg.good("Table stored 🚀")
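# A hedged usage sketch for store_table above, assuming the function and its
# module-level `msg` printer are importable and that `bq` is the
# google.cloud.bigquery module, as implied by the snippet. The project,
# dataset, table, and bucket names are hypothetical. Note that the function
# appends the extension itself, so the URI is passed without one.
from google.cloud import bigquery as bq

client = bq.Client()
table_ref = bq.TableReference.from_string("my-project.my_dataset.events")
store_table(table_ref,
            client,
            destination_format="NEWLINE_DELIMITED_JSON",
            compression=True,
            destination_uri="gs://my-bucket/exports/events")
# Extracts to gs://my-bucket/exports/events.json.gz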
def main(validation_init_date, validation_end_date, bucket, destination):
    # Remove and recreate the destination folder so it can receive new files.
    rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    storage_client = storage.Client()
    bq_client = bigquery.Client()
    ds_ref = bq_client.dataset('pysearchml')
    table_id = str(uuid.uuid4().hex)
    table_ref = ds_ref.table(table_id)

    # Query GA data
    query_path = PATH / 'validation.sql'
    query = open(str(query_path)).read()
    query = query.format(validation_init_date=validation_init_date,
                         validation_end_date=validation_end_date)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = f'{bq_client.project}.pysearchml.{table_id}'
    job_config.maximum_bytes_billed = 10 * (1024**3)
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = bq_client.query(query, job_config=job_config)
    job.result()

    # export BigQuery table to GCS
    # bucket will be set in accordance to which validation dataset is referenced, i.e.,
    # whether regular validation or validation for the training dataset.
    destination_uri = f"gs://{bucket}/validation*.gz"
    extract_config = bigquery.ExtractJobConfig()
    extract_config.compression = 'GZIP'
    extract_config.destination_format = 'NEWLINE_DELIMITED_JSON'
    job = bq_client.extract_table(table_ref, destination_uri,
                                  job_config=extract_config)
    job.result()

    # Download data
    bucket_obj = storage_client.bucket(bucket.split('/')[0])
    blobs = bucket_obj.list_blobs(prefix=bucket.partition('/')[-1])
    for blob in blobs:
        blob.download_to_filename(f"{destination}/{blob.name.split('/')[-1]}")
        blob.delete()

    # delete BQ table
    bq_client.delete_table(table_ref)
def table_to_gcs(dataset: str, table: str, uri: str, gzip: bool = True,
                 delete_first: bool = True):
    """Export a BigQuery table to Google Cloud Storage

    Parameters
    ----------
    dataset: str
        The BigQuery dataset
    table: str
        The BigQuery table
    uri: str
        The google cloud storage uri (``gs://....``)
    gzip: bool
        Compress output with gzip or not
    """
    client = bigquery.Client()
    destination_uri = uri
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table)

    logging.info("Exporting {}.{} to {}".format(dataset, table, destination_uri))

    job_config = bigquery.ExtractJobConfig()
    if gzip:
        job_config.compression = bigquery.Compression.GZIP
    job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location="US",
        job_config=job_config)  # API request
    try:
        extract_job.result()  # Waits for the extract to complete.
        logging.info("Extract completed")
    except google.api_core.exceptions.BadRequest:
        logging.error("extract failed")
        logging.error(extract_job.errors)
    logging.info("Exported {}.{} to {}".format(dataset, table, destination_uri))
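# A minimal usage sketch for table_to_gcs above, assuming the function is in
# scope. The wildcard in the URI lets BigQuery shard the export across multiple
# files; the dataset, table, and bucket names are hypothetical placeholders.
table_to_gcs(dataset="analytics",
             table="events",
             uri="gs://my-bucket/exports/events-*.json.gz",
             gzip=True)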
def export_table_to_gcs(table_ref, table_name):
    """
    Export the dataset table to GCS
    :param table_ref: the table to export
    :param table_name: name of the table, used in the log message
    :return:
    """
    client = bigquery.Client()
    destination_uri = data_dir
    job_config = bigquery.ExtractJobConfig(print_header=False)
    extract_job = client.extract_table(table_ref,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()
    print("Exported {}:{}.{} to {}".format(project_id, dataset_id, table_name,
                                           destination_uri))
def export_to_gcs(self, table_ref, client, dst_uri):
    from google.cloud import bigquery

    job_config = bigquery.ExtractJobConfig()
    # use delimiter which never exists in data
    job_config.field_delimiter = '\t'
    job_config.print_header = False

    extract_job = client.extract_table(
        table_ref,
        # this naming rule has to be identical to datalake-hive for future usage efficiency
        dst_uri + "/000000_0",
        location='US',
        job_config=job_config)  # API request
    extract_job.result()  # Waits for job to complete.
def run(self):
    logging.info(
        'Started BigQueryToFileJob with SQL {} and output filename {}'.format(
            self.sql, self.output_filename))

    # Query
    # TODO: Make sure it works with different locations.
    random_name = random_string(10)
    destination_table = self.bigquery_client.dataset(
        self.temp_bigquery_dataset).table(random_name)

    query_job_config = bigquery.QueryJobConfig()
    query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
    query_job_config.destination = destination_table

    query_job = self.bigquery_client.query(self.sql,
                                           job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Export
    # TODO: Allow exporting to multiple files in case output is bigger than 1GB.
    bucket = self.temp_bucket
    filename = random_name + '.json'
    object = filename
    destination_uri = "gs://{}/{}".format(bucket, object)

    extract_job_config = bigquery.ExtractJobConfig()
    extract_job_config.priority = bigquery.QueryPriority.INTERACTIVE
    extract_job_config.destination_format = bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON

    extract_job = self.bigquery_client.extract_table(
        destination_table, destination_uri, job_config=extract_job_config)
    submit_bigquery_job(extract_job, extract_job_config)
    assert extract_job.state == 'DONE'

    # Delete the BigQuery table
    self.bigquery_client.delete_table(destination_table)

    # Download
    download_from_gcs(bucket, object, self.output_filename)
    delete_in_gcs(bucket, object)
def export_table(self, table, path, location='US'):
    if path[-1] != '/':
        path = path + '/'
    self.rm(path)
    destination_uri = "gs://" + str(Path(path, 'data-*.csv.gz'))
    dataset_id = table.split('.')[0]
    table_id = table.split('.')[-1]
    dataset_ref = bigquery.DatasetReference(project=self.project,
                                            dataset_id=dataset_id)
    table_ref = dataset_ref.table(table_id)
    job_config = bigquery.ExtractJobConfig()
    job_config.compression = bigquery.Compression.GZIP
    extract_job = self.client.extract_table(table_ref,
                                            destination_uri,
                                            location=location,
                                            job_config=job_config)
    extract_job.result()  # Waits for job to complete
    return self.ls(path)
def _extract_to_blobs(self, source_table):
    # Returns list of blobs
    # 1: EXTRACT
    extract_job_config = bigquery.ExtractJobConfig(
        compression="GZIP", destination_format="CSV"
    )
    extract_prefix = "staging/{}_{}".format(source_table, uuid.uuid4().hex)
    extract_destination_uri = "gs://{}/{}-*.csv.gz".format(
        self.staging_bucket, extract_prefix
    )
    extract_job = bq_client.extract_table(
        source_table, extract_destination_uri, job_config=extract_job_config
    )  # API request
    extract_job.result()  # Waits for job to complete.
    logger.info("Exported {} to {}".format(source_table, extract_destination_uri))

    # 2: LIST BLOBS
    storage_client = storage.Client()
    bucket = storage_client.bucket(self.staging_bucket)
    return bucket.list_blobs(prefix=extract_prefix)
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    target_path: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        WHERE analysis_basis = 'enrollments'
        """
    )  # todo: once experimenter supports different analysis_bases, remove filter
    job.result()

    # add a random string to the identifier to prevent collision errors if there
    # happen to be multiple instances running that export data for the same experiment
    tmp = "".join(random.choices(string.ascii_lowercase, k=8))
    destination_uri = f"gs://{bucket}/{target_path}/{table}-{tmp}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(table_ref,
                                       destination_uri,
                                       location="US",
                                       job_config=job_config)
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, target_path, table, storage_client, tmp)
def export_table_to_cloud_storage_async(
        self,
        source_table_dataset_ref: bigquery.DatasetReference,
        source_table_id: str,
        destination_uri: str,
        destination_format: bigquery.DestinationFormat
) -> Optional[bigquery.ExtractJob]:
    if not self.table_exists(source_table_dataset_ref, source_table_id):
        logging.error("Table [%s] does not exist in dataset [%s]",
                      source_table_id, str(source_table_dataset_ref))
        return None

    table_ref = source_table_dataset_ref.table(source_table_id)

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = destination_format

    return self.client.extract_table(
        table_ref,
        destination_uri,
        # Location must match that of the source table.
        location=self.LOCATION,
        job_config=job_config)
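# A hedged usage sketch for export_table_to_cloud_storage_async above. The
# method returns the ExtractJob without waiting, so callers block on .result()
# themselves when they need the export to finish. `bq_wrapper` stands in for an
# instance of the wrapper class this method belongs to, and the project,
# dataset, table, and bucket names are hypothetical placeholders.
from google.cloud import bigquery

dataset_ref = bigquery.DatasetReference("my-project", "my_dataset")
job = bq_wrapper.export_table_to_cloud_storage_async(
    dataset_ref,
    "my_table",
    "gs://my-bucket/exports/my_table.json",
    bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
if job is not None:
    job.result()  # Wait for the asynchronous extract to complete.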
def extract_tables(FROM_DATASET):
    # Extract all tables in a dataset to a Cloud Storage bucket.
    print('Extracting {}:{} to Cloud Storage bucket {}'.format(
        FROM_PROJECT, FROM_DATASET, FROM_BUCKET))

    tables = list(bq_client.list_tables(bq_client.dataset(FROM_DATASET)))
    extract_jobs = []
    for table in tables:
        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = bigquery.DestinationFormat.AVRO
        extract_job = bq_client.extract_table(
            table.reference,
            ['gs://{}/{}.avro'.format(FROM_BUCKET, table.table_id)],
            location=FROM_LOCATION,  # Available in 0.32.0 library.
            job_config=job_config)  # Starts the extract job.
        extract_jobs.append(extract_job)

    for job in extract_jobs:
        job.result()

    return tables
def export_to_cloud_storage(dataset_ref: bigquery.dataset.DatasetReference,
                            bucket: str,
                            view: bqview.BigQueryView,
                            state_code: str):
    """Exports the table corresponding to the given view to the bucket.

    Extracts the entire table and exports in JSON format to the given bucket in
    Cloud Storage.

    This is a synchronous function that waits for the query job to complete
    before returning.

    Args:
        dataset_ref: The dataset where the view and table exist.
        bucket: The bucket in Cloud Storage where the export should go.
        view: The view whose corresponding table to export.
        state_code: The state code of the data being exported.
    """
    source_tablename = _table_name_for_view(view, state_code)

    if table_exists(dataset_ref, source_tablename):
        destination_filename = _destination_filename_for_view(view, state_code)
        destination_uri = "gs://{}/{}".format(bucket, destination_filename)

        table_ref = dataset_ref.table(source_tablename)

        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = \
            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

        extract_job = client().extract_table(
            table_ref,
            destination_uri,
            # Location must match that of the source table.
            location=LOCATION,
            job_config=job_config)
        # Waits for job to complete
        extract_job.result()
    else:
        logging.error("Table [%s] does not exist in dataset [%s]",
                      source_tablename, str(dataset_ref))