def table_exists(dataset_ref: bigquery.dataset.DatasetReference,
                 table_id: str) -> bool:
    """Check whether or not a BigQuery Table or View exists in a Dataset."""
    table_ref = dataset_ref.table(table_id)

    try:
        client().get_table(table_ref)
        return True
    except exceptions.NotFound:
        logging.warning("Table [%s] does not exist in dataset [%s]",
                        table_id, str(dataset_ref))
        return False
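
# A minimal usage sketch for table_exists (illustrative only; the project and
# dataset names are hypothetical, and client() is assumed to return an
# authenticated bigquery.Client, as elsewhere in this module):
#
#   dataset_ref = bigquery.DatasetReference('my-project', 'my_dataset')
#   if not table_exists(dataset_ref, 'recidivism_rates'):
#       logging.info("Nothing to query yet.")
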
def create_or_update_view(dataset_ref: bigquery.dataset.DatasetReference,
                          view: bqview.BigQueryView):
    """Create a View if it does not exist, or update its query if it does.

    Args:
        dataset_ref: The BigQuery dataset to store the view in.
        view: The View to create or update.
    """
    view_ref = dataset_ref.table(view.view_id)
    bq_view = bigquery.Table(view_ref)
    bq_view.view_query = view.view_query

    if table_exists(dataset_ref, view.view_id):
        logging.info("Updating existing view [%s]", str(bq_view))
        client().update_table(bq_view, ['view_query'])
    else:
        logging.info("Creating view [%s]", str(bq_view))
        client().create_table(bq_view)
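
# A minimal usage sketch for create_or_update_view (illustrative only; it
# assumes bqview.BigQueryView can be constructed from a view_id and a
# view_query, matching the attributes read above, and the view name and
# query below are hypothetical):
#
#   view = bqview.BigQueryView(
#       view_id='active_persons',
#       view_query='SELECT person_id FROM `my-project.my_dataset.person` '
#                  'WHERE active = TRUE')
#   create_or_update_view(dataset_ref, view)
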
def export_to_cloud_storage(dataset_ref: bigquery.dataset.DatasetReference,
                            bucket: str,
                            view: bqview.BigQueryView,
                            state_code: str):
    """Exports the table corresponding to the given view to the bucket.

    Extracts the entire table and exports it in JSON format to the given
    bucket in Cloud Storage.

    This is a synchronous function that waits for the extract job to complete
    before returning.

    Args:
        dataset_ref: The dataset where the view and table exist.
        bucket: The bucket in Cloud Storage where the export should go.
        view: The view whose corresponding table to export.
        state_code: The state code of the data being exported.
    """
    source_tablename = _table_name_for_view(view, state_code)

    if table_exists(dataset_ref, source_tablename):
        destination_filename = _destination_filename_for_view(
            view, state_code)
        destination_uri = "gs://{}/{}".format(bucket, destination_filename)

        table_ref = dataset_ref.table(source_tablename)

        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = \
            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

        extract_job = client().extract_table(
            table_ref,
            destination_uri,
            # Location must match that of the source table.
            location=LOCATION,
            job_config=job_config)
        # Waits for the job to complete.
        extract_job.result()
    else:
        logging.error("Table [%s] does not exist in dataset [%s]",
                      source_tablename, str(dataset_ref))
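
# A minimal usage sketch for export_to_cloud_storage (illustrative only; the
# bucket name and state code are hypothetical). Because the call blocks on
# extract_job.result(), the exported file exists in GCS once it returns:
#
#   export_to_cloud_storage(
#       dataset_ref, 'my-export-bucket', view, state_code='US_XX')
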
def start_table_load(
        dataset_ref: bigquery.dataset.DatasetReference,
        table_name: str,
        schema_type: SchemaType) -> \
        Optional[Tuple[bigquery.job.LoadJob, bigquery.table.TableReference]]:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name, retrieves the export URI and schema from
    export_config, then loads the table into BigQuery.

    This starts the job, but does not wait until it completes.

    Tables are created if they do not exist, and overwritten if they do.
    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the
    table's data will be completely wiped and overwritten with the contents
    of the CSV.

    Args:
        dataset_ref: The BigQuery dataset to load the table into. Gets
            created if it does not already exist.
        table_name: Table to import. Table must be defined in the
            export_config.*_TABLES_TO_EXPORT for the given module.
        schema_type: The schema of the table being loaded, either
            SchemaType.JAILS or SchemaType.STATE.

    Returns:
        (load_job, table_ref) where load_job is the LoadJob object containing
        job details, and table_ref is the destination TableReference object.
        If the job fails to start, returns None.
    """
    if schema_type == SchemaType.JAILS:
        export_schema = export_config.COUNTY_TABLE_EXPORT_SCHEMA
    elif schema_type == SchemaType.STATE:
        export_schema = export_config.STATE_TABLE_EXPORT_SCHEMA
    else:
        # Not raised from an except block, so logging.error rather than
        # logging.exception.
        logging.error("Unknown schema type: %s", schema_type)
        return None

    bq_utils.create_dataset_if_necessary(dataset_ref)

    uri = export_config.gcs_export_uri(table_name)
    table_ref = dataset_ref.table(table_name)

    try:
        bq_schema = [
            bigquery.SchemaField(field['name'], field['type'], field['mode'])
            for field in export_schema[table_name]
        ]
    except KeyError:
        logging.exception(
            "Unknown table name '%s'. Is it listed in "
            "the TABLES_TO_EXPORT for the %s module?",
            table_name, schema_type)
        return None

    job_config = bigquery.LoadJobConfig()
    job_config.schema = bq_schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    load_job = bq_utils.client().load_table_from_uri(
        uri, table_ref, job_config=job_config)

    logging.info("Started load job %s for table %s.%s.%s",
                 load_job.job_id,
                 table_ref.project, table_ref.dataset_id, table_ref.table_id)

    return load_job, table_ref
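
# A minimal usage sketch for start_table_load (illustrative only; the table
# name is hypothetical). Since the function returns without waiting, a caller
# that needs the data loaded before proceeding can block on the returned job:
#
#   result = start_table_load(dataset_ref, 'person', SchemaType.STATE)
#   if result:
#       load_job, table_ref = result
#       load_job.result()  # Raises on failure; returns once the load is done.
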