def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)
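# A minimal, hypothetical call site for _recreate_table above, assuming the
# module-level DATASET_ID constant and logger are configured as the function
# expects. The table name and schema here are illustrative, not from the source.
from google.cloud import bigquery

client = bigquery.Client()
_recreate_table(
    client,
    "events",  # hypothetical table name
    [
        bigquery.SchemaField("id", "INTEGER"),
        bigquery.SchemaField("payload", "STRING"),
    ],
)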
def delete_views_or_table(client: bigquery.Client, view_or_table_name: str, dataset: str):
    LOGGER.debug("delete_views_or_table: %s", view_or_table_name)
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(view_or_table_name)
    client.delete_table(table_ref)
    LOGGER.info("deleted view or table: %s", view_or_table_name)
def clean_up_bq_tables(client: cloud_bigquery.Client, table_names: List[str]) -> None:
    for table_name in table_names:
        try:
            client.get_table(table_name)
            client.delete_table(table_name)
        except NotFound:
            pass
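# A sketch of an equivalent cleanup that leans on delete_table's not_found_ok
# flag instead of probing with get_table first; this saves one API round trip
# per table and avoids the race where a table disappears between the get and
# the delete. clean_up_bq_tables_v2 is a hypothetical name, assuming the same
# cloud_bigquery and List imports as above.
def clean_up_bq_tables_v2(client: cloud_bigquery.Client, table_names: List[str]) -> None:
    for table_name in table_names:
        client.delete_table(table_name, not_found_ok=True)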
def table_id_us_east1(bigquery_client: bigquery.Client, project_id: str, dataset_id_us_east1: str):
    table_id = prefixer.create_prefix()
    full_table_id = f"{project_id}.{dataset_id_us_east1}.{table_id}"
    table = bigquery.Table(
        full_table_id, schema=[bigquery.SchemaField("string_col", "STRING")]
    )
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(table, not_found_ok=True)
def random_table_id(bigquery_client: bigquery.Client, project_id: str, dataset_id: str):
    """Create a new table ID each time, so random_table_id can be used as a
    target for load jobs.
    """
    random_table_id = prefixer.create_prefix()
    full_table_id = f"{project_id}.{dataset_id}.{random_table_id}"
    yield full_table_id
    bigquery_client.delete_table(full_table_id, not_found_ok=True)
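# Sketch of a test using random_table_id as a load-job destination, the
# pattern the docstring describes. The GCS URI is a placeholder, and the
# fixture is assumed to be registered with pytest.
def test_load_csv_to_random_table(bigquery_client, random_table_id):
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        autodetect=True,
    )
    load_job = bigquery_client.load_table_from_uri(
        "gs://example-bucket/data.csv", random_table_id, job_config=job_config
    )
    load_job.result()  # Block until the load finishes.
    assert bigquery_client.get_table(random_table_id).num_rows is not None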
def writable_table(bigquery_client: bigquery.Client, project_id: str, random_dataset: bigquery.Dataset):
    full_table_id = f"{project_id}.{random_dataset.dataset_id}.writable_table_{random.randrange(1_000_000_000)}"
    table = bigquery.Table(full_table_id)
    table.schema = [
        bigquery.SchemaField("field1", "STRING"),
        bigquery.SchemaField("field2", "INTEGER"),
    ]
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
def scalars_extreme_table(bigquery_client: bigquery.Client, project_id: str, dataset_id: str):
    schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json")
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON
    full_table_id = f"{project_id}.{dataset_id}.scalars_extreme"
    with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file:
        job = bigquery_client.load_table_from_file(
            data_file, full_table_id, job_config=job_config
        )
    job.result()
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
def delete_table(bq_client: bigquery.Client, dataset_id: str, table_name: str) -> None:
    """Deletes a specified table in BigQuery.

    Args:
        bq_client: bigquery.Client object.
        dataset_id: String holding ID of dataset.
        table_name: String of table name to delete.

    Returns:
        None; deletes a table in BigQuery.
    """
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    bq_client.delete_table(table=table_ref)
def bigquery_empty_table(
    bigquery_dataset: str,
    bigquery_dml_dataset: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
):
    project_id = bigquery_client.project
    # Clean up the sample_dml table, if it exists.
    old_table_id = f"{project_id}.{bigquery_dataset}.sample_dml"
    bigquery_client.delete_table(old_table_id, not_found_ok=True)
    # Create a new table in its own dataset.
    dataset_id = bigquery_dml_dataset
    table_id = f"{project_id}.{dataset_id}.sample_dml_{temp_suffix()}"
    empty_table = bigquery.Table(table_id, schema=bigquery_schema)
    bigquery_client.create_table(empty_table)
    yield table_id
    bigquery_client.delete_table(empty_table)
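# temp_suffix is referenced above but not defined in this snippet. One
# plausible implementation, assuming any collision-resistant suffix is
# acceptable for unique table names; the real helper may differ.
import datetime
import random

def temp_suffix() -> str:
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    return f"{timestamp}_{random.randrange(1_000_000)}"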
def delete_table(client: bigquery.Client, dataset_id: str, table_id: str):
    """Deletes the specified table in the given project:dataset.

    Args:
        client: BQ API client.
        dataset_id: Destination dataset.
        table_id: Table to be deleted.

    Examples:
        delete_table(client, 'my_dataset', 'my_table')
    """
    dataset_ref = client.dataset(dataset_id=dataset_id)
    tables_list = [t.table_id for t in list(client.list_tables(dataset_ref))]
    if table_id not in tables_list:
        print("THIS TABLE DOES NOT EXIST IN {}:{}".format(client.project, dataset_id))
    else:
        table_ref = dataset_ref.table(table_id)
        client.delete_table(table_ref)
def delete_table_if_needed(client: bq.Client, dataset_id: str, table_id: str):
    table_full_id = get_full_table_name(client, dataset_id, table_id)
    client.delete_table(table_full_id, not_found_ok=True)
class BigQuery(BaseDb):
    """
    A Google BigQuery database client

    Kwargs:
        name : str - The canonical name to use for this instance
        creds_file : str - The filepath of the desired GOOGLE_APPLICATION_CREDENTIALS file
        conn_kwargs : Use in place of a query string to set individual attributes
                      of the connection defaults (project, etc.)
    """

    def __init__(self, name=None, creds_file=None, **conn_kwargs):
        if creds_file is None:
            creds_file = os.getenv('BIGQUERY_CREDS_FILE', None)
        self._bq_creds_file = creds_file
        self._conn_kwargs = dict(**BIGQUERY_DEFAULT_CONN_KWARGS)
        self._name = name
        for k, v in six.iteritems(conn_kwargs):
            if k in self._conn_kwargs:
                self._conn_kwargs[k] = v

    def __repr__(self):
        return '<{db.__class__.__name__}({project})>'.format(
            db=self, project=self._conn_kwargs['project'])

    @property
    def name(self):
        return self._name

    @property
    def project(self):
        return self._conn_kwargs['project']

    @project.setter
    def project(self, value):
        self._conn_kwargs['project'] = value

    def _connect(self):
        if self._bq_creds_file is not None:
            if Path(self._bq_creds_file).exists():
                os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', self._bq_creds_file)
            else:
                _log.warning('Path set by creds file does not exist: %s', self._bq_creds_file)
        self._conn = Client(**self._conn_kwargs)

    def _close(self):
        """
        This is a no-op because the bigquery Client doesn't have a close
        method. The BaseDb close method will handle setting self._conn to
        None and self._connected to False.
        """
        return

    def _query(self, query_string):
        self.connect()
        query_job = self._conn.query(query_string)
        return query_job.result()

    def query(self, query_string):
        from .result import QueryResult
        result = self._query(query_string)
        return QueryResult(result)

    def execute(self, query_string):
        self._query(query_string)

    def list_tables(self, dataset_id):
        """
        List all tables in the provided dataset

        Args:
            dataset_id : str - The dataset to query

        Returns:
            list of table names
        """
        self.connect()
        dataset_ref = self._conn.dataset(dataset_id)
        return [t.table_id for t in self._conn.list_tables(dataset_ref)]

    def delete_table(self, dataset_id, table_id):
        """
        Delete the given table in the given dataset

        Args:
            dataset_id : str - The dataset containing the table to delete
            table_id : str - The table to delete

        Returns:
            None
        """
        self.connect()
        table_ref = self._conn.dataset(dataset_id).table(table_id)
        self._conn.delete_table(table_ref)
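# Hypothetical usage of the BigQuery wrapper, assuming BIGQUERY_CREDS_FILE
# (or an explicit creds_file) points at a valid service-account key and
# 'project' is one of the keys in BIGQUERY_DEFAULT_CONN_KWARGS. The dataset
# and table names are placeholders.
db = BigQuery(name='analytics', project='my-project')
print(db)                            # <BigQuery(my-project)>
print(db.list_tables('my_dataset'))  # e.g. ['events', 'users']
db.delete_table('my_dataset', 'stale_table')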
def table_id(bigquery_client: bigquery.Client, project_id: str, dataset_id: str):
    table_id = f"{resource_prefix()}_update_with_dml"
    yield table_id
    full_table_id = f"{project_id}.{dataset_id}.{table_id}"
    bigquery_client.delete_table(full_table_id, not_found_ok=True)
def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE.

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster; query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'
    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(
        temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
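# submit_bigquery_job is an external helper used by both enrich_task variants
# here. A minimal sketch of what it must do for the job.state == 'DONE'
# assertions above to hold: block until the job finishes and surface
# failures. The real helper likely adds timeout and cost handling.
def submit_bigquery_job(job, job_config):
    try:
        job.result()  # Waits for completion; raises on job error.
    except Exception:
        logging.error('Job failed with errors: %s', job.errors)
        raise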
def clean_up_bq_table(client: cloud_bigquery.Client, table_name: str) -> None:
    try:
        client.get_table(table_name)
        client.delete_table(table_name)
    except NotFound:
        pass
def bigquery_table(bigquery_client: bigquery.Client) -> bigquery.Table:
    """Create a temporary table for the test; remove it on cleanup."""
    table = bigquery_client.create_table(TEST_TABLE, exists_ok=True)
    yield table
    bigquery_client.delete_table(table)
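# Sketch of a test consuming the bigquery_table fixture; TEST_TABLE is
# assumed to be a fully qualified table ID (or bigquery.Table) defined
# alongside the fixture, which is assumed to be registered with
# @pytest.fixture.
def test_table_is_created(bigquery_table: bigquery.Table):
    assert bigquery_table.table_id is not None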
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE.

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster; query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'
        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(
            temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster; query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE
        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)
        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template(
            '', merge_sql_template, template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)