def make_ntile_table(ntiles: int = 100,
                     truncate: bool = False,
                     sql_path: Optional[Union[str, Path]] = None,
                     table_suffix: Optional[str] = None) -> QueryJob:
    """
    Create a table describing country share output for publications above the nth citation count percentile.

    The resulting table has a name like ``country_share_99th`` or ``country_share_99th_arxiv``.

    :param ntiles: Included publications have a citation percentile above ``ntiles - 1``, where ``ntiles``
        is in the range 1-100 inclusive.
    :param truncate: If ``True``, replace the table if it exists.
    :param sql_path: Override the default SQL template.
    :param table_suffix: Append the table name with a given suffix.
    :return: Completed QueryJob.
    """
    params = [
        ScalarQueryParameter('ntiles', 'INT64', ntiles),
        ScalarQueryParameter('gt_ntile', 'INT64', ntiles - 1),
    ]
    if sql_path is None:
        sql = read_sql('country_share_template')
    else:
        sql = read_sql(sql_path)
    job = query(
        sql,
        f'country_share_{ntiles - 1}th{table_suffix if table_suffix else ""}',
        truncate=truncate,
        query_parameters=params)
    return job
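
A hedged usage sketch (``read_sql``, ``query``, and the ``country_share_template`` SQL file are assumed to be provided by the surrounding module):

# Country shares for publications above the 99th citation percentile, written
# to a table named country_share_99th_arxiv.
job = make_ntile_table(ntiles=100, truncate=True, table_suffix='_arxiv')
job.result()  # block until the BigQuery job finishes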
Example #2
def test_prepare_parameters():
    query = 'SELECT test, test2, test3 FROM `useful-citizen-322414.test.test` WHERE test = {{test_str}} AND test2 = {{test_float}} LIMIT 10'
    new_query = GoogleBigQueryConnector._prepare_query(query)
    parameters = GoogleBigQueryConnector._prepare_parameters(
        new_query,
        {
            'test_str': str('tortank'),
            'test_int': int(1),
            'test_float': float(0.0),
            'test_bool': True,
        },
    )
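    # Only test_str and test_float are referenced in the query above, so the
    # other inputs are dropped.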
    assert len(parameters) == 2
    assert parameters[0] == ScalarQueryParameter('test_str', 'STRING', 'tortank')
    assert parameters[1] == ScalarQueryParameter('test_float', 'FLOAT64', 0.0)
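
The test implies that ``_prepare_parameters`` keeps only the inputs whose placeholders survive into the prepared query and maps Python types to BigQuery types. A minimal sketch of that logic, not necessarily the actual connector implementation:

from google.cloud.bigquery import ScalarQueryParameter

_BQ_TYPES = {str: 'STRING', bool: 'BOOL', int: 'INT64', float: 'FLOAT64'}

def prepare_parameters_sketch(query, parameters):
    # Keep only parameters actually referenced as @name in the prepared query.
    return [
        ScalarQueryParameter(name, _BQ_TYPES[type(value)], value)
        for name, value in (parameters or {}).items()
        if f'@{name}' in query
    ]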
Example #3
def run_query(client,
              baseline_table,
              date,
              dry_run,
              output_dir=None,
              views_only=False):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table)

    last_seen_table = tables["last_seen_table"]
    last_seen_view = tables["last_seen_view"]
    render_kwargs = dict(header="-- Generated via bigquery_etl.glean_usage\n",
                         usage_types=USAGE_TYPES)
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(QUERY_FILENAME, init=True, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    sql = query_sql

    try:
        client.get_table(last_seen_table)
    except NotFound:
        if views_only:
            logging.info("Skipping view for table which doesn't exist:"
                         f" {last_seen_table}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {last_seen_table}")
        else:
            logging.info(f"Creating table: {last_seen_table}")
        sql = init_sql
    else:
        if views_only:
            write_sql(output_dir, last_seen_view, "view.sql", view_sql)
            return
        # Table exists, so we will run the incremental query.
        job_kwargs.update(
            destination=f"{last_seen_table}${date.strftime('%Y%m%d')}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[
                ScalarQueryParameter("submission_date", "DATE", date)
            ],
        )
        if not dry_run:
            logging.info(f"Running query for: {last_seen_table}")

    if output_dir:
        write_sql(output_dir, last_seen_view, "view.sql", view_sql)
        write_sql(output_dir, last_seen_table, "query.sql", query_sql)
        write_sql(output_dir, last_seen_table, "init.sql", init_sql)

    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {last_seen_view}")
        client.query(view_sql,
                     bigquery.QueryJobConfig(use_legacy_sql=False)).result()
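
A hedged invocation sketch for the function above (project and table names are illustrative):

import datetime
from google.cloud import bigquery

client = bigquery.Client()
run_query(client, 'my-project.org_mozilla_fenix_stable.baseline_v1',
          datetime.date(2021, 6, 1), dry_run=True)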
Example #4
    def process(self, app_numbers):
        sql = """
            SELECT application_number, application_kind, grant_date
            FROM `patents-public-data.patents.publications`
            WHERE
            country_code = @us_country_code
            AND application_number IN UNNEST(@application_numbers)
            AND IF (
                publication_date >= @wipo_kind_codes_from,
                kind_code IN UNNEST(@wipo_patent_publication_codes),
                kind_code = @uspto_patent_publication_code
            );
        """

        job_config = QueryJobConfig(query_parameters=[
            ScalarQueryParameter(
                'us_country_code',
                'STRING',
                US_COUNTRY_CODE,
            ),
            ArrayQueryParameter(
                'application_numbers',
                'STRING',
                app_numbers,
            ),
            ScalarQueryParameter(
                'wipo_kind_codes_from',
                'INT64',
                WIPO_KIND_CODES_FROM,
            ),
            ArrayQueryParameter(
                'wipo_patent_publication_codes',
                'STRING',
                WIPO_PATENT_PUBLICATION_CODES,
            ),
            ScalarQueryParameter(
                'uspto_patent_publication_code',
                'STRING',
                USPTO_PATENT_PUBLICATION_CODE,
            ),
        ])
        query = self.storage_client.query(sql, job_config=job_config)

        logging.info('Executing query for publications')
        iterator = query.result()

        return iterator
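
The returned ``RowIterator`` can be consumed directly; a hedged sketch, where ``processor`` is a hypothetical instance of the surrounding class:

for row in processor.process(['US-1234567-A']):  # application number is illustrative
    print(row.application_number, row.application_kind, row.grant_date)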
Example #5
def parse_param(param: Parameter) -> BQParameter:
    value = param.value
    if isinstance(value, list):
        if param.type == "STRUCT":
            return StructQueryParameter(param.name, *[parse_param(Parameter(**p)) for p in value])
        elif param.type == "ARRAY<STRUCT>":
            value = [
                StructQueryParameter("_", *[parse_param(Parameter(**p)) for p in v]) for v in value
            ]
        # str.strip() removes a *set* of characters rather than a prefix, which
        # would mangle element types such as RECORD; slice off the wrapper instead.
        array_type = param.type[len("ARRAY<"):-1]
        return ArrayQueryParameter(param.name, array_type, value)
    else:
        return ScalarQueryParameter(param.name, param.type, param.value)
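
A hedged usage sketch, assuming ``Parameter`` is a simple model with ``name``, ``type``, and ``value`` fields:

scalar = parse_param(Parameter(name='state', type='STRING', value='TX'))
array = parse_param(Parameter(name='states', type='ARRAY<STRING>', value=['TX', 'CA']))
# scalar -> ScalarQueryParameter('state', 'STRING', 'TX')
# array  -> ArrayQueryParameter('states', 'STRING', ['TX', 'CA'])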
Example #6
def client_run_sync_query_w_param(client, _):
    """Run a synchronous query using a query parameter"""
    QUERY_W_PARAM = (
        'SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` '
        'WHERE state = @state')
    LIMIT = 100
    LIMITED = '%s LIMIT %d' % (QUERY_W_PARAM, LIMIT)
    TIMEOUT_MS = 1000

    # [START client_run_sync_query_w_param]
    from google.cloud.bigquery import ScalarQueryParameter
    param = ScalarQueryParameter('state', 'STRING', 'TX')
    query = client.run_sync_query(LIMITED, query_parameters=[param])
    query.use_legacy_sql = False
    query.timeout_ms = TIMEOUT_MS
    query.run()  # API request

    assert query.complete
    assert len(query.rows) == LIMIT
    assert [field.name for field in query.schema] == ['name']
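
``run_sync_query`` belongs to the legacy (pre-1.0) google-cloud-bigquery client. With the current client, the same parameterized query runs through ``QueryJobConfig``; a sketch:

from google.cloud import bigquery

job_config = bigquery.QueryJobConfig(
    query_parameters=[bigquery.ScalarQueryParameter('state', 'STRING', 'TX')]
)
rows = client.query(LIMITED, job_config=job_config).result()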
Example #7
def generate_query_params_for_date(date_param):
    return ScalarQueryParameter('update_timestamp', 'DATETIME', date_param)
Example #8
def run_query(project_id,
              baseline_table,
              date,
              dry_run,
              output_dir=None,
              output_only=False):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table)

    last_seen_table = tables["last_seen_table"]
    last_seen_view = tables["last_seen_view"]
    render_kwargs = dict(header="-- Generated via bigquery_etl.glean_usage\n",
                         usage_types=USAGE_TYPES)
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(QUERY_FILENAME, init=True, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    view_metadata = render(VIEW_METADATA_FILENAME,
                           format=False,
                           **render_kwargs)
    sql = query_sql

    if not referenced_table_exists(view_sql):
        if output_only:
            logging.info("Skipping view for table which doesn't exist:"
                         f" {last_seen_table}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {last_seen_table}")
        else:
            logging.info(f"Creating table: {last_seen_table}")
        sql = init_sql
    elif output_only:
        pass
    else:
        # Table exists, so we will run the incremental query.
        job_kwargs.update(
            destination=f"{last_seen_table}${date.strftime('%Y%m%d')}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[
                ScalarQueryParameter("submission_date", "DATE", date)
            ],
        )
        if not dry_run:
            logging.info(f"Running query for: {last_seen_table}")

    if output_dir:
        write_sql(output_dir, last_seen_view, "metadata.yaml", view_metadata)
        write_sql(output_dir, last_seen_view, "view.sql", view_sql)
        write_sql(output_dir, last_seen_table, "query.sql", query_sql)
        write_sql(output_dir, last_seen_table, "init.sql", init_sql)
    if output_only:
        # Return before we initialize the BQ client so that we can generate SQL
        # without having BQ credentials.
        return

    client = bigquery.Client(project_id)
    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {last_seen_view}")
        client.query(view_sql,
                     bigquery.QueryJobConfig(use_legacy_sql=False)).result()
Example #9
def run_query(
    project_id, baseline_table, date, dry_run, output_dir=None, output_only=False
):
    """Process a single table, potentially also writing out the generated queries."""
    tables = table_names_from_baseline(baseline_table, include_project_id=False)

    table_id = tables["first_seen_table"]
    view_id = tables["first_seen_view"]
    render_kwargs = dict(
        header="-- Generated via bigquery_etl.glean_usage\n",
        project_id=project_id,
        # do not match on org_mozilla_firefoxreality
        fennec_id=any(
            (f"{app_id}_stable" in baseline_table)
            for app_id in [
                "org_mozilla_firefox",
                "org_mozilla_fenix_nightly",
                "org_mozilla_fennec_aurora",
                "org_mozilla_firefox_beta",
                "org_mozilla_fenix",
            ]
        ),
    )
    render_kwargs.update(tables)
    job_kwargs = dict(use_legacy_sql=False, dry_run=dry_run)

    query_sql = render(QUERY_FILENAME, **render_kwargs)
    init_sql = render(INIT_FILENAME, **render_kwargs)
    view_sql = render(VIEW_FILENAME, **render_kwargs)
    view_metadata = render(VIEW_METADATA_FILENAME, format=False, **render_kwargs)
    sql = query_sql

    if not referenced_table_exists(view_sql):
        if output_only:
            logging.info("Skipping view for table which doesn't exist:" f" {table_id}")
            return
        elif dry_run:
            logging.info(f"Table does not yet exist: {table_id}")
        else:
            logging.info(f"Creating table: {table_id}")
        sql = init_sql
    elif output_only:
        pass
    else:
        # Table exists, so just overwrite the entire table with the day's results
        job_kwargs.update(
            destination=f"{project_id}.{table_id}",
            write_disposition=WriteDisposition.WRITE_TRUNCATE,
            query_parameters=[ScalarQueryParameter("submission_date", "DATE", date)],
        )
        if not dry_run:
            logging.info(f"Running query for: {table_id}")

    if output_dir:
        write_sql(output_dir, view_id, "metadata.yaml", view_metadata)
        write_sql(output_dir, view_id, "view.sql", view_sql)
        write_sql(output_dir, table_id, "query.sql", query_sql)
        write_sql(output_dir, table_id, "init.sql", init_sql)
    if output_only:
        # Return before we initialize the BQ client so that we can generate SQL
        # without having BQ credentials.
        return

    client = bigquery.Client(project_id)
    job_config = bigquery.QueryJobConfig(**job_kwargs)
    job = client.query(sql, job_config)
    if not dry_run:
        job.result()
        logging.info(f"Recreating view {view_id}")
        client.query(view_sql, bigquery.QueryJobConfig(use_legacy_sql=False)).result()
Example #10
def generate_query_params_for_date(
        date_param: datetime.datetime) -> ScalarQueryParameter:
    return ScalarQueryParameter("update_timestamp", "DATETIME", date_param)