Example #1
def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)
Example #2
def delete_views_or_table(client: bigquery.Client, view_or_table_name: str,
                          dataset: str):
    LOGGER.debug("delete_views_or_tables: %s", view_or_table_name)
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(view_or_table_name)
    client.delete_table(table_ref)
    LOGGER.info("deleted view or table: %s", view_or_table_name)
Example #3
def clean_up_bq_tables(client: cloud_bigquery.Client,
                       table_names: List[str]) -> None:
    for table_name in table_names:
        try:
            client.get_table(table_name)
            client.delete_table(table_name)
        except NotFound:
            pass
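The get_table probe above exists only to avoid a NotFound from delete_table, and it leaves a small race window (the table can vanish between the two calls). A minimal equivalent sketch using delete_table's not_found_ok flag, which google-cloud-bigquery supports for exactly this case:

from typing import List

from google.cloud import bigquery as cloud_bigquery

def clean_up_bq_tables(client: cloud_bigquery.Client,
                       table_names: List[str]) -> None:
    for table_name in table_names:
        # not_found_ok=True turns a missing table into a no-op, removing
        # the race between the get_table probe and the delete.
        client.delete_table(table_name, not_found_ok=True)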
Example #4
def table_id_us_east1(bigquery_client: bigquery.Client, project_id: str,
                      dataset_id_us_east1: str):
    table_id = prefixer.create_prefix()
    full_table_id = f"{project_id}.{dataset_id_us_east1}.{table_id}"
    table = bigquery.Table(
        full_table_id, schema=[bigquery.SchemaField("string_col", "STRING")])
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(table, not_found_ok=True)
Example #5
def random_table_id(bigquery_client: bigquery.Client, project_id: str,
                    dataset_id: str):
    """Create a new table ID each time, so random_table_id can be used as
    target for load jobs.
    """
    random_table_id = prefixer.create_prefix()
    full_table_id = f"{project_id}.{dataset_id}.{random_table_id}"
    yield full_table_id
    bigquery_client.delete_table(full_table_id, not_found_ok=True)
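Examples #4 and #5 read like pytest fixture bodies: the yield hands the table ID to a test, and the code after it runs as teardown. The @pytest.fixture decorator was presumably lost when the snippets were extracted. A minimal sketch of the full pattern, assuming pytest and reusing the prefixer helper from the original snippet's module; test_load_csv is a hypothetical consumer:

import pytest
from google.cloud import bigquery

@pytest.fixture
def random_table_id(bigquery_client: bigquery.Client, project_id: str,
                    dataset_id: str):
    """Yield a fresh table ID; drop the table (if any) on teardown."""
    full_table_id = f"{project_id}.{dataset_id}.{prefixer.create_prefix()}"
    yield full_table_id  # the fixture value handed to the test
    bigquery_client.delete_table(full_table_id, not_found_ok=True)

def test_load_csv(bigquery_client, random_table_id):
    ...  # hypothetical test: run a load job into random_table_id, then assert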
Example #6
def writable_table(bigquery_client: bigquery.Client, project_id: str,
                   random_dataset: bigquery.Dataset):
    full_table_id = f"{project_id}.{random_dataset.dataset_id}.writable_table_{random.randrange(1_000_000_000)}"
    table = bigquery.Table(full_table_id)
    table.schema = [
        bigquery.SchemaField("field1", "STRING"),
        bigquery.SchemaField("field2", "INTEGER"),
    ]
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
Example #7
def scalars_extreme_table(bigquery_client: bigquery.Client, project_id: str,
                          dataset_id: str):
    schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json")
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON
    full_table_id = f"{project_id}.{dataset_id}.scalars_extreme"
    with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file:
        job = bigquery_client.load_table_from_file(data_file,
                                                   full_table_id,
                                                   job_config=job_config)
    job.result()
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
Example #8
def delete_table(bq_client: bigquery.Client, dataset_id: str,
                 table_name: str) -> None:
    """Deletes a specified table in BigQuery.

    Args:
        bq_client: bigquery.Client object.
        dataset_id: ID of the dataset containing the table.
        table_name: Name of the table to delete.

    Returns:
        None; the table is deleted as a side effect.
    """
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    bq_client.delete_table(table=table_ref)
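client.dataset(), used here and in several other examples, is deprecated in newer releases of google-cloud-bigquery. A sketch of the same deletion without it, relying on the fact that delete_table also accepts a fully-qualified table ID string:

from google.cloud import bigquery

def delete_table(bq_client: bigquery.Client, dataset_id: str,
                 table_name: str) -> None:
    # delete_table accepts a "project.dataset.table" string directly,
    # so no DatasetReference/TableReference plumbing is needed.
    bq_client.delete_table(f"{bq_client.project}.{dataset_id}.{table_name}")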
Example #9
def bigquery_empty_table(
    bigquery_dataset: str,
    bigquery_dml_dataset: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
):
    project_id = bigquery_client.project
    # Cleanup the sample_dml table, if it exists.
    old_table_id = f"{project_id}.{bigquery_dataset}.sample_dml"
    bigquery_client.delete_table(old_table_id, not_found_ok=True)
    # Create new table in its own dataset.
    dataset_id = bigquery_dml_dataset
    table_id = f"{project_id}.{dataset_id}.sample_dml_{temp_suffix()}"
    empty_table = bigquery.Table(table_id, schema=bigquery_schema)
    bigquery_client.create_table(empty_table)
    yield table_id
    bigquery_client.delete_table(empty_table)
Example #10
def delete_table(client: bigquery.Client, dataset_id: str, table_id: str):
    """
    Deletes the specified table in the given project:dataset

    Args:
        client: BQ API client
        dataset_id: destination dataset
        table_id: table to be deleted

    Returns:
        None

    Examples:
        delete_table(client, 'my_dataset', 'my_table')
    """
    dataset_ref = client.dataset(dataset_id=dataset_id)
    tables_list = [t.table_id for t in list(client.list_tables(dataset_ref))]

    if table_id not in tables_list:
        print("THIS TABLE DOES NOT EXIST IN {}:{}".format(client.project, dataset_id))
    else:
        table_ref = dataset_ref.table(table_id)
        client.delete_table(table_ref)
Example #11
def delete_table_if_needed(client: bq.Client, dataset_id: str, table_id: str):
    table_full_id = get_full_table_name(client, dataset_id, table_id)
    client.delete_table(table_full_id, not_found_ok=True)
Example #12
class BigQuery(BaseDb):
    """
        A Google BigQuery database client

        Kwargs:
            name : str - The canonical name to use for this instance
            creds_file : str - The filepath of the desired GOOGLE_APPLICATION_CREDENTIALS file
            conn_kwargs : Use in place of a query string to set individual
                          attributes of the connection defaults (project, etc)
    """

    def __init__(self, name=None, creds_file=None, **conn_kwargs):
        if creds_file is None:
            creds_file = os.getenv('BIGQUERY_CREDS_FILE', None)
        self._bq_creds_file = creds_file

        self._conn_kwargs = dict(**BIGQUERY_DEFAULT_CONN_KWARGS)

        self._name = name
        for k, v in six.iteritems(conn_kwargs):
            if k in self._conn_kwargs:
                self._conn_kwargs[k] = v

    def __repr__(self):
        return '<{db.__class__.__name__}({project})>'.format(db=self, project=self._conn_kwargs['project'])

    @property
    def name(self):
        return self._name

    @property
    def project(self):
        return self._conn_kwargs['project']

    @project.setter
    def project(self, value):
        self._conn_kwargs['project'] = value

    def _connect(self):
        if self._bq_creds_file is not None:
            if Path(self._bq_creds_file).exists():
                os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', self._bq_creds_file)
            else:
                _log.warning('Path set by creds file does not exist: %s', self._bq_creds_file)
        self._conn = Client(**self._conn_kwargs)

    def _close(self):
        """
            This is a no-op because the bigquery Client doesn't have a close method.
            The BaseDb close method will handle setting self._conn to None and self._connected to False.
        """
        return

    def _query(self, query_string):
        self.connect()
        query_job = self._conn.query(query_string)
        return query_job.result()

    def query(self, query_string):
        from .result import QueryResult
        result = self._query(query_string)
        return QueryResult(result)

    def execute(self, query_string):
        self._query(query_string)

    def list_tables(self, dataset_id):
        """
            List all tables in the provided dataset

            Args:
                dataset_id : str - The dataset to query

            Returns:
                list of table names
        """
        self.connect()
        dataset_ref = self._conn.dataset(dataset_id)
        return [t.table_id for t in self._conn.list_tables(dataset_ref)]

    def delete_table(self, dataset_id, table_id):
        """
            Delete the given table in the given dataset

            Args:
                dataset_id : str - The dataset containing the table to delete
                table_id : str - The table to delete

            Returns:
                None
        """
        self.connect()
        table_ref = self._conn.dataset(dataset_id).table(table_id)
        self._conn.delete_table(table_ref)
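For orientation, a hypothetical usage sketch of this wrapper; the project name, dataset, and credentials path are invented, and it assumes 'project' is one of the keys in BIGQUERY_DEFAULT_CONN_KWARGS (as the __repr__ implies):

# Hypothetical usage; names are invented for illustration.
db = BigQuery(name='warehouse', creds_file='/path/to/creds.json',
              project='my-project')
print(db)                            # <BigQuery(my-project)>
print(db.list_tables('my_dataset'))  # connects lazily, lists table names
db.delete_table('my_dataset', 'stale_table')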
Example #13
def table_id(bigquery_client: bigquery.Client, project_id: str, dataset_id: str):
    table_id = f"{resource_prefix()}_update_with_dml"
    yield table_id
    full_table_id = f"{project_id}.{dataset_id}.{table_id}"
    bigquery_client.delete_table(full_table_id, not_found_ok=True)
Example #14
        def enrich_task():
            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
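One caveat in enrich_task: the final delete_table only runs if every job succeeds, so a failed assertion leaks the temporary table into the temp dataset. A minimal sketch of a context-manager guard that guarantees cleanup; temporary_table is a hypothetical helper, not part of the original DAG:

import contextlib

from google.cloud import bigquery

@contextlib.contextmanager
def temporary_table(client: bigquery.Client, table: bigquery.Table):
    """Create `table`, then guarantee its deletion even if the body raises."""
    client.create_table(table)
    try:
        yield table
    finally:
        client.delete_table(table, not_found_ok=True)

With it, the query and copy jobs would run inside a `with temporary_table(client, Table(temp_table_ref)) as table:` block.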
Example #15
def clean_up_bq_table(client: cloud_bigquery.Client, table_name: str) -> None:
  try:
    client.get_table(table_name)
    client.delete_table(table_name)
  except NotFound:
    pass
Example #16
def bigquery_table(bigquery_client: bigquery.Client) -> bigquery.Table:
    """ Create a temporary table for the test, remove on cleanup """
    table = bigquery_client.create_table(TEST_TABLE, exists_ok=True)
    yield table
    bigquery_client.delete_table(table)
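Note that this teardown raises NotFound if the test itself dropped the table. A slightly more defensive variant of the same fixture (TEST_TABLE assumed as above):

def bigquery_table(bigquery_client: bigquery.Client) -> bigquery.Table:
    """Create a temporary table for the test, remove on cleanup."""
    table = bigquery_client.create_table(TEST_TABLE, exists_ok=True)
    yield table
    # not_found_ok makes teardown idempotent if the test dropped the table.
    bigquery_client.delete_table(table, not_found_ok=True)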
Example #17
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            table.time_partitioning = TimePartitioning(
                field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'

                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge_{task}.sql'.format(
                        task=task))
                merge_sql_template = read_file(merge_sql_path)
                template_context['params']['source_table'] = temp_table_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
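For reference, the merge_{task}.sql template rendered above would follow the standard BigQuery MERGE shape linked in the comment. A hypothetical minimal example; the table and key names are invented:

# Hypothetical shape of a merge_{task}.sql template; table and key names
# are invented for illustration.
merge_sql = """
MERGE `my-project.my_dataset.transactions` AS dest
USING `my-project.my_dataset_temp.transactions_1650000000000` AS source
ON dest.id = source.id
WHEN MATCHED THEN
  UPDATE SET value = source.value
WHEN NOT MATCHED THEN
  INSERT ROW
"""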