def exist_table(client: bq.Client, dataset_id: str, table_id: str) -> bool:
    table_full_id = get_full_table_name(client, dataset_id, table_id)
    try:
        client.get_table(table_full_id)
    except NotFound:
        return False
    return True

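# Minimal usage sketch for exist_table, assuming get_full_table_name (not shown)
# returns a fully qualified "project.dataset.table" string; the dataset and
# table names below are illustrative.
from google.cloud import bigquery as bq

client = bq.Client()
if not exist_table(client, "analytics", "events"):
    print("analytics.events does not exist yet")
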
def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str):
    num_items = 7
    page_size = 3
    num_pages, num_last_page = divmod(num_items, page_size)

    to_insert = [{"string_col": "item%d" % i, "rowindex": i}
                 for i in range(num_items)]
    bigquery_client.load_table_from_json(to_insert, table_id).result()

    rows = bigquery_client.list_rows(
        table_id,
        selected_fields=[
            bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)
        ],
        page_size=page_size,
    )
    pages = rows.pages

    # Expect num_pages full pages followed by one partial page.
    for _ in range(num_pages):
        page = next(pages)
        assert page.num_items == page_size
    page = next(pages)
    assert page.num_items == num_last_page

def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table."""
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table

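# Hedged usage sketch for _upload_entity_df: the helpers block_until_done and
# InvalidEntityType (not shown) are assumed to be in scope, and the destination
# table name below is illustrative.
import pandas as pd
from google.cloud.bigquery import Client

client = Client()
entity_df = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": pd.to_datetime(["2021-06-01", "2021-06-02"]),
})
table = _upload_entity_df(client, "my_project.feast_tmp.entity_df_123", entity_df)
print(table.expires)  # expires roughly 30 minutes after upload
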
def load_task():
    client = Client()
    job_config = LoadJobConfig()
    schema_path = os.path.join(
        dags_folder, 'resources/stages/raw/schemas/{task}.json'.format(task=task))
    job_config.schema = read_bigquery_schema_from_file(schema_path)
    job_config.source_format = SourceFormat.CSV if file_format == 'csv' \
        else SourceFormat.NEWLINE_DELIMITED_JSON
    if file_format == 'csv':
        job_config.skip_leading_rows = 1
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.allow_quoted_newlines = allow_quoted_newlines
    job_config.ignore_unknown_values = True

    export_location_uri = 'gs://{bucket}/export'.format(bucket=output_bucket)
    uri = '{export_location_uri}/{task}/*.{file_format}'.format(
        export_location_uri=export_location_uri, task=task, file_format=file_format)
    table_ref = client.dataset(dataset_name_raw).table(task)
    load_job = client.load_table_from_uri(uri, table_ref, job_config=job_config)
    submit_bigquery_job(load_job, job_config)
    assert load_job.state == 'DONE'

def create_view_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name, project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))
    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name

def dataset_id(bigquery_client: bigquery.Client, project_id: str):
    dataset_id = prefixer.create_prefix()
    full_dataset_id = f"{project_id}.{dataset_id}"
    dataset = bigquery.Dataset(full_dataset_id)
    bigquery_client.create_dataset(dataset)
    yield dataset_id
    bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)

def dataset_id(client: bigquery.Client):
    project_id = client.project
    dataset_id = prefixer.create_prefix()
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = client.create_dataset(dataset)
    yield dataset_id
    client.delete_dataset(dataset_id, delete_contents=True)

def load_parquet(
    client: bigquery.Client,
    dataframe: pandas.DataFrame,
    destination_table_ref: bigquery.TableReference,
    location: Optional[str],
    schema: Optional[Dict[str, Any]],
    billing_project: Optional[str] = None,
):
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job_config.create_disposition = "CREATE_NEVER"
    job_config.source_format = "PARQUET"

    if schema is not None:
        schema = pandas_gbq.schema.remove_policy_tags(schema)
        job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
        dataframe = cast_dataframe_for_parquet(dataframe, schema)

    try:
        client.load_table_from_dataframe(
            dataframe,
            destination_table_ref,
            job_config=job_config,
            location=location,
            project=billing_project,
        ).result()
    except pyarrow.lib.ArrowInvalid as exc:
        raise exceptions.ConversionError(
            "Could not convert DataFrame to Parquet.") from exc

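# Hedged usage sketch for load_parquet. Because create_disposition is
# CREATE_NEVER, the destination table must already exist; the project, dataset,
# and table names below are illustrative. Passing schema=None skips the
# pandas-gbq schema helpers entirely.
import pandas
from google.cloud import bigquery

client = bigquery.Client()
dest = bigquery.TableReference.from_string("my-project.my_dataset.my_table")
df = pandas.DataFrame({"name": ["a", "b"], "value": [1, 2]})
load_parquet(client, df, dest, location="US", schema=None)
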
def main():
    args = get_sysargs(sys.argv[1:])
    exporter = Exporter()
    jinjafy = Jinjafy('../data/queries/marreco/datajet/')
    client = Client()
    dataset = client.dataset(args.dataset)
    table = dataset.table(args.table)

    for day in range(args.days_init, args.days_end - 1, -1):
        print('processing day: ', day)
        for idx, file_ in enumerate(['productview.sql', 'search.sql', 'purchase.sql']):
            query = build_query(jinjafy, file_,
                                {'dataset': '40663402',
                                 'days_interval': day,
                                 'days_interval_end': day})
            exporter.bq_to_gcs(client, query,
                               {'destination': table,
                                'maximum_bytes_billed': 1000000000000,
                                'write_disposition': 'WRITE_TRUNCATE'},
                               {'uri': args.uri.format(day=day, idx=idx),
                                'table': table,
                                'compression': 'GZIP',
                                'destination_format': 'NEWLINE_DELIMITED_JSON'})

def load_data_to_bq(df=None, table_name='CRY', dataset='price_data', project=None):
    client = Client()
    table = client.get_table(".".join([client.project, dataset, table_name]))
    if table.num_rows == 0 and df is not None:
        df.to_gbq(".".join([dataset, table_name]), if_exists='append')
    else:
        # Delete each symbol's most recent row so it can be re-appended with fresh data.
        delete_qry = f'''DELETE FROM `{dataset}.{table_name}` AS t2
            WHERE concat(symbol, cast(date as string)) IN (
                SELECT concat(symbol, cast(MAX(date) as string))
                FROM `{dataset}.{table_name}`
                GROUP BY symbol)
            AND symbol IN {'("' + '","'.join(df.symbol.unique()) + '")'}'''
        delete_DML = client.query(delete_qry)
        delete_DML.result()

        # Keep only rows newer than what already exists for each symbol.
        existing = pd.read_gbq(
            f'''select symbol, max(date) as max_date
                from {dataset}.{table_name}
                group by symbol''',
            dialect="legacy")
        df = df.merge(existing, on='symbol', how='left')
        df = df.loc[(df.date.dt.tz_localize('UTC') > df.max_date) | df.max_date.isnull(), :]
        df.drop('max_date', axis=1, inplace=True)
        df.to_gbq(".".join([dataset, table_name]), if_exists='append', project_id=project)

def _add_new_columns(client: bigquery.Client, table_id: str,
                     columns: List[str]) -> List[bigquery.SchemaField]:
    """Adds any missing columns to the table.

    Creates a new STRING column for each column name that is not already
    present in the table schema.

    Args:
        client: The BigQuery client.
        table_id: Table id.
        columns: List of column names.

    Returns:
        The updated table schema.
    """
    try:
        table = client.get_table(table_id)
    except google.api_core.exceptions.NotFound:
        logging.error(
            'Table: \'%s\' not found - please create the table. '
            'It is okay to create it with no columns.', table_id)
        raise
    new_fields = []
    for c in columns:
        field = bigquery.SchemaField(c, 'STRING')
        if field not in table.schema:
            new_fields.append(field)
    if new_fields:
        logging.info('Found new fields: %s', new_fields)
        table.schema += new_fields
        client.update_table(table, ['schema'])
    return table.schema

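# Hedged usage sketch for _add_new_columns; the table and column names are
# illustrative, and the target table is assumed to already exist (the function
# raises NotFound otherwise).
from google.cloud import bigquery

client = bigquery.Client()
schema = _add_new_columns(client, "my-project.my_dataset.events",
                          ["user_id", "event_name"])
print([f.name for f in schema])
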
def _connect(self):
    if self._bq_creds_file is not None:
        if Path(self._bq_creds_file).exists():
            os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', self._bq_creds_file)
        else:
            _log.warning('Credentials file path does not exist: %s', self._bq_creds_file)
    self._conn = Client(**self._conn_kwargs)

def update_recently_unixtime(client: bigquery.Client, df_unixtime):
    table_id = f'{project_id}.{dataset}.{recently_unixtime_table}'

    # Insert the unixtime dataframe into the unixtime tracking table
    client.insert_rows_from_dataframe(client.get_table(table_id), df_unixtime)

    # Remove rows with duplicate TABLE_NAME values from the tracking table,
    # keeping only the most recent unixtime per table
    duplicate_query = f"""
    SELECT * EXCEPT(rowNumber)
    FROM (
        SELECT *,
            ROW_NUMBER() OVER (
                PARTITION BY TABLE_NAME
                ORDER BY UNIX_TIME DESC
            ) as rowNumber
        FROM {table_id}
    )
    WHERE rowNumber = 1;
    """
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_id
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = client.query(duplicate_query, job_config=job_config)
    job.result()

def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    extra_field_for_jagged_row_test = bigquery.schema.SchemaField(
        "extra_jagged_row_test_column", "STRING")
    schema.append(extra_field_for_jagged_row_test)

    table: bigquery.Table = bigquery.Table(
        f"{os.getenv('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )
    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"
    table = bq.create_table(table)
    return table

def _table_exists(self, bq: bigquery.Client,
                  table_ref: bigquery.TableReference) -> bool:
    try:
        bq.get_table(table_ref)
        return True
    except NotFound:
        return False

def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )
    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"
    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table

def update_or_create_table_from_csv(
    client: bigquery.Client,
    table_name: str,
    source_file: str,
    dataset: str,
    source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name, [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(
            source_fp, destination=table_ref, job_config=job_config)

    # wait for job to complete
    load_job.result()
    LOGGER.info("updated config table: %s", table_ref.table_id)

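# Hedged usage sketch for update_or_create_table_from_csv; the file paths and
# dataset name are illustrative, and get_table_schema/LOGGER are assumed to be
# defined alongside the function. If the schema file is absent, the load falls
# back to schema autodetection.
from google.cloud import bigquery

client = bigquery.Client()
update_or_create_table_from_csv(
    client,
    table_name="config_table",
    source_file="data/config.csv",
    dataset="pipeline_config",
    source_schema_file="schemas/config.schema.json",
)
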
def retry_query(gcs_client: storage.Client, bq_client: bigquery.Client,
                lock_blob: storage.Blob, failed_job_id: str,
                table: bigquery.TableReference, retry_attempt_cnt):
    """Retry a query that failed."""
    if retry_attempt_cnt > 1:
        # If this is not the first retry, truncate the previous job_id's
        # retry-attempt suffix '_xx' (3 chars) before appending the new one.
        retry_job_id = f"{failed_job_id[:-3]}_{retry_attempt_cnt:02}"  # zero-padded
    else:
        retry_job_id = f"{failed_job_id}_{retry_attempt_cnt:02}"  # zero-padded

    failed_job: bigquery.QueryJob = bq_client.get_job(failed_job_id)
    job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig(
        table_definitions=failed_job.table_definitions, use_legacy_sql=False)
    retry_job = bq_client.query(failed_job.query,
                                job_config=job_config,
                                job_id=retry_job_id)

    # To keep track of retry attempts between cloud function invocations,
    # the retry count state is kept in the _bqlock lock file.
    utils.handle_bq_lock(gcs_client, lock_blob, retry_job_id, table,
                         retry_attempt_cnt=retry_attempt_cnt)
    logging.log_bigquery_job(
        retry_job, table,
        f"Submitted asynchronous query job: {retry_job_id}")

def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # Since views cannot be exported directly, write the data into a
    # temporary table first.
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
        """
    )
    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")
    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)

def delete_views_or_table(client: bigquery.Client, view_or_table_name: str, dataset: str):
    LOGGER.debug("delete_views_or_table: %s", view_or_table_name)
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(view_or_table_name)
    client.delete_table(table_ref)
    LOGGER.info("deleted view or table: %s", view_or_table_name)

def create_view_task():
    client = Client()

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name, project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))
    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name

def execute_query(bq_client: bigquery.Client, env_vars: Dict[str, str],
                  query_path: object, output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.

    Args:
        bq_client: bigquery.Client object.
        env_vars: Dictionary mapping keys to environment variable values.
        query_path: Object representing the location of the SQL query to execute.
        output_table_name: Name of the table that holds the output.
        time_partition: Whether to time-partition the output.
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time-partitioning the table is only needed for the final output query.
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)

    logging.info('Attempting query...')
    # Execute the query.
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)
    query_job.result()  # Waits for the query to finish

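# Hedged usage sketch for execute_query; render_template (not shown) is assumed
# to fill the SQL template from env_vars, and the dataset, query path, and
# table names below are illustrative.
from google.cloud import bigquery

bq_client = bigquery.Client()
env_vars = {'corrected_dataset_id': 'billing_corrected'}
execute_query(bq_client, env_vars, 'queries/distribute_commitments.sql',
              'corrected_usage', time_partition=False)
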
def exist_dataset(client: bq.Client, dataset_id: str) -> bool:
    dataset_full_id = get_full_dataset_name(client, dataset_id)
    try:
        client.get_dataset(dataset_full_id)
    except NotFound:
        return False
    return True

def get_or_create_table(client: bigquery.Client, dataset_id: str,
                        table_id: str) -> bigquery.Table:
    """
    Creates the BigQuery dataset and table, or fetches them if they already exist.
    """
    logging.info(f'Creating dataset {dataset_id} if not exists...')
    dataset = client.create_dataset(dataset_id, exists_ok=True)  # Create or fetch the dataset.

    logging.info(f'Creating table {dataset_id}.{table_id} if not exists...')
    table_ref = dataset.table(table_id)
    return client.create_table(  # Create or fetch the table.
        bigquery.Table(table_ref, schema=[
            bigquery.SchemaField('id', 'string', description='Tweet ID'),
            bigquery.SchemaField('lang', 'string', description='Tweet language'),
            bigquery.SchemaField('screen_name', 'string', description='User screen name'),
            bigquery.SchemaField('text', 'string', description='Tweet body'),
            bigquery.SchemaField('created_at', 'timestamp', description='Tweet timestamp'),
        ]),
        exists_ok=True)

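# Hedged usage sketch for get_or_create_table; the dataset and table names are
# illustrative. Because both create calls pass exists_ok=True, calling it a
# second time with the same arguments returns the existing table unchanged.
from google.cloud import bigquery

client = bigquery.Client()
table = get_or_create_table(client, 'twitter', 'tweets')
print(table.full_table_id)
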
def main():
    from google.cloud.bigquery import Client

    tables = {}
    with open("/vol/required_tables.txt") as rt:
        table_names = rt.read().split()

    bq_client = Client()
    for table_name in table_names:
        split_table_name = table_name.split(".")
        if len(split_table_name) == 3:
            dataset_ref = bq_client.dataset(split_table_name[1],
                                            project=split_table_name[0])
        else:
            dataset_ref = bq_client.dataset(split_table_name[0])
        table_ref = dataset_ref.table(split_table_name[-1])
        table = bq_client.get_table(table_ref)
        tables[table_name] = [field.to_api_repr() for field in table.schema]
        if table_name.endswith("*"):
            tables[table_name].append({
                "name": "_TABLE_SUFFIX",
                "type": "STRING",
                "mode": "REQUIRED"
            })

    with open("/vol/schema.json", mode="w") as schema:
        schema.write(json.dumps(tables))

def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)

def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table."""
    table_id = _get_table_id_for_new_entity(client, project, dataset_name)

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
        job.result()
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df, table_id, job_config=job_config)
        job.result()
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame or a "
            f"BigQuery SQL query, but we found: {type(entity_df)}")

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table

def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
) -> Generator[pd.Series, None, None]:
    """
    Page through the results of a query and yield each row as a pandas Series.

    Args:
        bigquery_client (bigquery.Client): The BigQuery client.
        query (str): The query to run.
        job_config (QueryJobConfig): The BigQuery job config.
        df_cleaner (Callable, optional): A function applied to each result
            DataFrame before its rows are yielded.

    Returns:
        Generator[pd.Series, None, None]: A generator of pandas Series.
    """
    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()

    # Get a reference to the destination table and page through it
    destination = bigquery_client.get_table(query_job.destination)
    rows = bigquery_client.list_rows(destination, page_size=10000)

    dfs = rows.to_dataframe_iterable()
    for df in dfs:
        if df_cleaner is not None:
            df = df_cleaner(df)
        for index, row in df.iterrows():
            yield row

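# Hedged usage sketch for iter_results against a real BigQuery public dataset;
# the cleaning function is illustrative.
import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

client = bigquery.Client()

def drop_nulls(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna()

query = ("SELECT name, number "
         "FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 100")
for row in iter_results(client, query, QueryJobConfig(), df_cleaner=drop_nulls):
    print(row["name"], row["number"])
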
def clean_up_bq_tables(client: cloud_bigquery.Client,
                       table_names: List[str]) -> None:
    for table_name in table_names:
        try:
            client.get_table(table_name)
            client.delete_table(table_name)
        except NotFound:
            pass

def load_views(bq: bigquery.Client, dataset: bigquery.Dataset,
               views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)
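
# Hedged usage sketch for load_views; the dataset and view definitions are
# illustrative, and the dataset is assumed to already exist. The {project} and
# {dataset} placeholders in each view query are filled in by load_views itself.
from google.cloud import bigquery

bq = bigquery.Client()
dataset = bq.get_dataset("my-project.test_dataset")
load_views(bq, dataset, {
    "active_users": "SELECT * FROM `{project}.{dataset}.users` WHERE active",
})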