def make_ntile_table(ntiles: int = 100,
                     truncate=False,
                     sql_path: Optional[Union[str, Path]] = None,
                     table_suffix: Optional[str] = None) -> QueryJob:
    """
    Create a table describing country share output for publications above the
    nth citation count percentile. The result has a name like
    ``country_share_99th`` or ``country_share_99th_arxiv``.

    :param ntiles: Included publications will have a citation percentile greater
        than this number, in the range 1-100 inclusive.
    :param truncate: If ``True``, replace the table if it exists.
    :param sql_path: Override the default SQL template.
    :param table_suffix: Append the given suffix to the table name.
    :return: Completed QueryJob.
    """
    params = [
        ScalarQueryParameter('ntiles', 'INT64', ntiles),
        ScalarQueryParameter('gt_ntile', 'INT64', ntiles - 1),
    ]
    if sql_path is None:
        sql = read_sql('country_share_template')
    else:
        sql = read_sql(sql_path)
    job = query(
        sql,
        f'country_share_{ntiles - 1}th{table_suffix if table_suffix else ""}',
        truncate=truncate,
        query_parameters=params)
    return job
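A brief usage sketch for make_ntile_table, based only on its signature and docstring above (it is not part of the original module); the helpers it relies on (read_sql, query) come from the surrounding module and are not shown here.

# Usage sketch (assumed, not from the original source): with the defaults this
# writes `country_share_99th`, and a suffix yields `country_share_99th_arxiv`.
job = make_ntile_table(truncate=True)
arxiv_job = make_ntile_table(table_suffix='_arxiv', truncate=True)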
def test_prepare_parameters():
    query = (
        'SELECT test, test2, test3 FROM `useful-citizen-322414.test.test` '
        'WHERE test = {{test_str}} AND test2 = {{test_float}} LIMIT 10'
    )
    new_query = GoogleBigQueryConnector._prepare_query(query)
    parameters = GoogleBigQueryConnector._prepare_parameters(
        new_query,
        {
            'test_str': str('tortank'),
            'test_int': int(1),
            'test_float': float(0.0),
            'test_bool': True,
        },
    )
    assert len(parameters) == 2
    assert parameters[0] == ScalarQueryParameter('test_str', 'STRING', 'tortank')
    assert parameters[1] == ScalarQueryParameter('test_float', 'FLOAT64', 0.0)
def run_query(client, baseline_table, date, dry_run, output_dir=None, views_only=False):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table)
    last_seen_table = tables["last_seen_table"]
    last_seen_view = tables["last_seen_view"]
    render_kwargs = dict(
        header="-- Generated via bigquery_etl.glean_usage\n",
        usage_types=USAGE_TYPES,
    )
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(QUERY_FILENAME, init=True, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    sql = query_sql

    try:
        client.get_table(last_seen_table)
    except NotFound:
        if views_only:
            logging.info("Skipping view for table which doesn't exist:" f" {last_seen_table}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {last_seen_table}")
        else:
            logging.info(f"Creating table: {last_seen_table}")
        sql = init_sql
    else:
        if views_only:
            write_sql(output_dir, last_seen_view, "view.sql", view_sql)
            return
        # Table exists, so we will run the incremental query.
        job_kwargs.update(
            destination=f"{last_seen_table}${date.strftime('%Y%m%d')}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[
                ScalarQueryParameter("submission_date", "DATE", date)
            ],
        )

    if not dry_run:
        logging.info(f"Running query for: {last_seen_table}")

    if output_dir:
        write_sql(output_dir, last_seen_view, "view.sql", view_sql)
        write_sql(output_dir, last_seen_table, "query.sql", query_sql)
        write_sql(output_dir, last_seen_table, "init.sql", init_sql)

    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {last_seen_view}")
        client.query(view_sql, bigquery.QueryJobConfig(use_legacy_sql=False)).result()
def process(self, app_numbers):
    sql = """
        SELECT application_number, application_kind, grant_date
        FROM `patents-public-data.patents.publications`
        WHERE country_code = @us_country_code
        AND application_number IN UNNEST(@application_numbers)
        AND IF (
            publication_date >= @wipo_kind_codes_from,
            kind_code IN UNNEST(@wipo_patent_publication_codes),
            kind_code = @uspto_patent_publication_code
        );
    """
    job_config = QueryJobConfig(query_parameters=[
        ScalarQueryParameter(
            'us_country_code',
            'STRING',
            US_COUNTRY_CODE,
        ),
        ArrayQueryParameter(
            'application_numbers',
            'STRING',
            app_numbers,
        ),
        ScalarQueryParameter(
            'wipo_kind_codes_from',
            'INT64',
            WIPO_KIND_CODES_FROM,
        ),
        ArrayQueryParameter(
            'wipo_patent_publication_codes',
            'STRING',
            WIPO_PATENT_PUBLICATION_CODES,
        ),
        ScalarQueryParameter(
            'uspto_patent_publication_code',
            'STRING',
            USPTO_PATENT_PUBLICATION_CODE,
        ),
    ])

    query = self.storage_client.query(sql, job_config=job_config)
    logging.info('Executing query for publications')
    iterator = query.result()
    return iterator
def parse_param(param: Parameter) -> BQParameter:
    value = param.value
    if isinstance(value, list):
        if param.type == "STRUCT":
            return StructQueryParameter(
                param.name, *[parse_param(Parameter(**p)) for p in value]
            )
        elif param.type == "ARRAY<STRUCT>":
            value = [
                StructQueryParameter("_", *[parse_param(Parameter(**p)) for p in v])
                for v in value
            ]
        array_type = param.type.strip("ARRAY<").strip(">")
        return ArrayQueryParameter(param.name, array_type, value)
    else:
        return ScalarQueryParameter(param.name, param.type, param.value)
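A hedged usage sketch for parse_param above: the Parameter container it consumes is not shown in the snippet, so a minimal stand-in with name/type/value fields is assumed here purely for illustration.

# Sketch only: `Parameter` below is a hypothetical stand-in for the real model,
# defined just so the calls are self-contained.
from dataclasses import dataclass
from typing import Any

@dataclass
class Parameter:
    name: str
    type: str
    value: Any

# A scalar value maps straight to ScalarQueryParameter...
scalar = parse_param(Parameter(name="submission_date", type="DATE", value="2021-01-01"))

# ...while a list value with an ARRAY<...> type maps to ArrayQueryParameter
# with the element type extracted from the type string.
array = parse_param(Parameter(name="states", type="ARRAY<STRING>", value=["TX", "CA"]))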
def client_run_sync_query_w_param(client, _):
    """Run a synchronous query using a query parameter"""
    QUERY_W_PARAM = (
        'SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` '
        'WHERE state = @state')
    LIMIT = 100
    LIMITED = '%s LIMIT %d' % (QUERY_W_PARAM, LIMIT)
    TIMEOUT_MS = 1000

    # [START client_run_sync_query_w_param]
    from google.cloud.bigquery import ScalarQueryParameter
    param = ScalarQueryParameter('state', 'STRING', 'TX')
    query = client.run_sync_query(LIMITED, query_parameters=[param])
    query.use_legacy_sql = False
    query.timeout_ms = TIMEOUT_MS
    query.run()  # API request

    assert query.complete
    assert len(query.rows) == LIMIT
    assert [field.name for field in query.schema] == ['name']
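The snippet above relies on the legacy run_sync_query API from early google-cloud-bigquery releases, which has since been removed. A rough modern equivalent (a sketch, not the original sample) passes the same parameter through QueryJobConfig:

# Sketch assuming a current google-cloud-bigquery client; same query and
# parameter as above, expressed with Client.query + QueryJobConfig.
from google.cloud import bigquery
from google.cloud.bigquery import ScalarQueryParameter

def run_query_w_param_modern(client: bigquery.Client):
    sql = (
        'SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` '
        'WHERE state = @state LIMIT 100'
    )
    job_config = bigquery.QueryJobConfig(
        query_parameters=[ScalarQueryParameter('state', 'STRING', 'TX')]
    )
    rows = client.query(sql, job_config=job_config).result()  # API request
    return [row['name'] for row in rows]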
def generate_query_params_for_date(date_param):
    return ScalarQueryParameter('update_timestamp', 'DATETIME', date_param)
def run_query(project_id, baseline_table, date, dry_run, output_dir=None, output_only=False):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table)
    last_seen_table = tables["last_seen_table"]
    last_seen_view = tables["last_seen_view"]
    render_kwargs = dict(
        header="-- Generated via bigquery_etl.glean_usage\n",
        usage_types=USAGE_TYPES,
    )
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(QUERY_FILENAME, init=True, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    view_metadata = render(VIEW_METADATA_FILENAME, format=False, **render_kwargs)
    sql = query_sql

    if not (referenced_table_exists(view_sql)):
        if output_only:
            logging.info("Skipping view for table which doesn't exist:" f" {last_seen_table}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {last_seen_table}")
        else:
            logging.info(f"Creating table: {last_seen_table}")
        sql = init_sql
    elif output_only:
        pass
    else:
        # Table exists, so we will run the incremental query.
        job_kwargs.update(
            destination=f"{last_seen_table}${date.strftime('%Y%m%d')}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[
                ScalarQueryParameter("submission_date", "DATE", date)
            ],
        )

    if not dry_run:
        logging.info(f"Running query for: {last_seen_table}")

    if output_dir:
        write_sql(output_dir, last_seen_view, "metadata.yaml", view_metadata)
        write_sql(output_dir, last_seen_view, "view.sql", view_sql)
        write_sql(output_dir, last_seen_table, "query.sql", query_sql)
        write_sql(output_dir, last_seen_table, "init.sql", init_sql)

    if output_only:
        # Return before we initialize the BQ client so that we can generate SQL
        # without having BQ credentials.
        return

    client = bigquery.Client(project_id)
    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {last_seen_view}")
        client.query(view_sql, bigquery.QueryJobConfig(use_legacy_sql=False)).result()
def run_query(
    project_id, baseline_table, date, dry_run, output_dir=None, output_only=False
):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table, include_project_id=False)
    table_id = tables["first_seen_table"]
    view_id = tables["first_seen_view"]
    render_kwargs = dict(
        header="-- Generated via bigquery_etl.glean_usage\n",
        project_id=project_id,
        # do not match on org_mozilla_firefoxreality
        fennec_id=any(
            (f"{app_id}_stable" in baseline_table)
            for app_id in [
                "org_mozilla_firefox",
                "org_mozilla_fenix_nightly",
                "org_mozilla_fennec_aurora",
                "org_mozilla_firefox_beta",
                "org_mozilla_fenix",
            ]
        ),
    )
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(INIT_FILENAME, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    view_metadata = render(VIEW_METADATA_FILENAME, format=False, **render_kwargs)
    sql = query_sql

    if not (referenced_table_exists(view_sql)):
        if output_only:
            logging.info("Skipping view for table which doesn't exist:" f" {table_id}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {table_id}")
        else:
            logging.info(f"Creating table: {table_id}")
        sql = init_sql
    elif output_only:
        pass
    else:
        # Table exists, so just overwrite the entire table with the day's results
        job_kwargs.update(
            destination=f"{project_id}.{table_id}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[ScalarQueryParameter("submission_date", "DATE", date)],
        )

    if not dry_run:
        logging.info(f"Running query for: {table_id}")

    if output_dir:
        write_sql(output_dir, view_id, "metadata.yaml", view_metadata)
        write_sql(output_dir, view_id, "view.sql", view_sql)
        write_sql(output_dir, table_id, "query.sql", query_sql)
        write_sql(output_dir, table_id, "init.sql", init_sql)

    if output_only:
        # Return before we initialize the BQ client so that we can generate SQL
        # without having BQ credentials.
        return

    client = bigquery.Client(project_id)
    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {view_id}")
        client.query(view_sql, bigquery.QueryJobConfig(use_legacy_sql=False)).result()
def generate_query_params_for_date(
    date_param: datetime.datetime,
) -> ScalarQueryParameter:
    return ScalarQueryParameter("update_timestamp", "DATETIME", date_param)
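The two generate_query_params_for_date helpers above return a single ScalarQueryParameter. A minimal sketch of how such a parameter is typically consumed, attached to a QueryJobConfig and referenced as @update_timestamp in the SQL; the client, project, dataset, and table names here are placeholders, not from the original code.

import datetime
from google.cloud import bigquery

def rows_updated_since(client: bigquery.Client, since: datetime.datetime):
    # Placeholder table path; the @update_timestamp reference matches the
    # parameter name produced by generate_query_params_for_date.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[generate_query_params_for_date(since)]
    )
    sql = (
        'SELECT * FROM `my_project.my_dataset.my_table` '
        'WHERE update_timestamp > @update_timestamp'
    )
    return client.query(sql, job_config=job_config).result()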