def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery"
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset, exists_ok=True)
    sample_table_id = f"{project_id}.{dataset_id}.sample"
    try:
        # Since the data changes rarely and the tests are mostly read-only,
        # only create the tables if they don't already exist.
        # TODO: Create shared sample data tables in bigquery-public-data that
        #       include test values for all data types.
        bigquery_client.get_table(sample_table_id)
    except google.api_core.exceptions.NotFound:
        job1 = load_sample_data(sample_table_id, bigquery_client,
                                bigquery_schema)
        job1.result()
    one_row_table_id = f"{project_id}.{dataset_id}.sample_one_row"
    try:
        bigquery_client.get_table(one_row_table_id)
    except google.api_core.exceptions.NotFound:
        job2 = load_sample_data(
            one_row_table_id,
            bigquery_client,
            bigquery_schema,
            filename="sample_one_row.json",
        )
        job2.result()
    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view, exists_ok=True)
    return dataset_id
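A minimal sketch of how a pytest test might consume the fixture above (assuming `bigquery_client` and `bigquery_dataset` are both registered as fixtures; the query text is illustrative only):

def test_sample_view_has_rows(bigquery_client, bigquery_dataset):
    # The fixture returns the dataset ID that holds the `sample` table and `sample_view`.
    query = f"SELECT string FROM `{bigquery_dataset}.sample_view` LIMIT 5"
    rows = list(bigquery_client.query(query).result())
    assert len(rows) <= 5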
Example #2
def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)
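An illustrative call; the schema below is an assumption rather than the original project's, and `client` plus the module-level `DATASET_ID` are as in the snippet above:

events_schema = [
    bigquery.SchemaField("event_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("payload", "STRING"),
]
_recreate_table(client, "events", events_schema)  # drop the table if present, then recreate it empty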
Example #3
def load_views(bq: bigquery.Client, dataset: bigquery.Dataset,
               views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)
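A hedged usage sketch; the view name and query template are made up, but they use the `{project}`/`{dataset}` placeholders that load_views fills in from the Dataset object:

views = {
    "sample_view": "SELECT string FROM `{project}.{dataset}.sample`",
}
load_views(bq, dataset, views)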
Example #4
def table_id_us_east1(bigquery_client: bigquery.Client, project_id: str,
                      dataset_id_us_east1: str):
    table_id = prefixer.create_prefix()
    full_table_id = f"{project_id}.{dataset_id_us_east1}.{table_id}"
    table = bigquery.Table(
        full_table_id, schema=[bigquery.SchemaField("string_col", "STRING")])
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(table, not_found_ok=True)
Example #5
def create_table(
    client: bq.Client,
    dataset_id: str,
    table_id: str,
    schema: List[bq.SchemaField],
):
    table_full_id = get_full_table_name(client, dataset_id, table_id)
    table = bq.Table(table_full_id, schema=schema)
    client.create_table(table)
Example #6
def writable_table(bigquery_client: bigquery.Client, project_id: str,
                   random_dataset: bigquery.Dataset):
    full_table_id = f"{project_id}.{random_dataset.dataset_id}.writable_table_{random.randrange(1_000_000_000)}"
    table = bigquery.Table(full_table_id)
    table.schema = [
        bigquery.SchemaField("field1", "STRING"),
        bigquery.SchemaField("field2", "INTEGER"),
    ]
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(full_table_id)
def bigquery_empty_table(
    bigquery_dataset: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
):
    project_id = bigquery_client.project
    # Create new table in its own dataset.
    dataset_id = bigquery_dataset
    table_id = f"{project_id}.{dataset_id}.sample_dml_empty"
    empty_table = bigquery.Table(table_id, schema=bigquery_schema)
    bigquery_client.create_table(empty_table)
    return table_id
def load_bigquery_table_via_bq_apis(bq_client: bigquery.Client, dataset_id,
                                    table_name, imported_data_info, src_uris):
    """
    Load tables using BigQuery Load jobs, using the same configuration as BQ DTS ImportedDataInfo
    :return: the started BigQuery LoadJob (callers can wait on it with .result())
    """
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/client.html#Client.load_table_from_uri
    # Step 1 - Translate required fields for BigQuery Python SDK
    tgt_tabledef = imported_data_info['table_defs'][0]

    # Step 2 - Create target table if it doesn't exist
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    try:
        bq_client.get_table(table_ref)
    except exceptions.NotFound:
        # Step 2a - Attach schema
        tgt_schema = RPCRecordSchema_to_GCloudSchema(tgt_tabledef['schema'])
        tgt_table = bigquery.Table(table_ref, schema=tgt_schema)

        # Step 2b - Attach description
        tgt_table.description = imported_data_info[
            'destination_table_description']

        # Step 2c - Conditionally set partitioning type
        if '$' in table_name:
            tgt_table.partitioning_type = 'DAY'
            tgt_table._properties['tableReference'][
                'tableId'], _, _ = table_name.partition('$')

        # Step 2d - Create BigQuery table
        bq_client.create_table(tgt_table)

    # Step 3a - Create BigQuery Load Job ID
    current_datetime = datetime.datetime.utcnow().isoformat()
    raw_job_id = f'{table_name}_{current_datetime}'
    clean_job_id = BQ_JOB_ID_MATCHER.sub('___', raw_job_id)

    # Step 3b - Create BigQuery Job Config
    job_config = DTSTableDefinition_to_BQLoadJobConfig(tgt_tabledef)

    # Step 4 - Execute BigQuery Load Job using Python SDK
    load_job = bq_client.load_table_from_uri(source_uris=src_uris,
                                             destination=table_ref,
                                             job_id=clean_job_id,
                                             job_config=job_config)

    return load_job
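The function starts the load but does not wait for it; a minimal caller sketch (the dataset, table, DTS payload, and URIs are assumptions):

load_job = load_bigquery_table_via_bq_apis(
    bq_client, "my_dataset", "my_table", imported_data_info,
    ["gs://my-bucket/exports/*.csv"])
load_job.result()            # block until the load finishes; raises on failure
print(load_job.output_rows)  # rows written to the destination table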
    def create_external_table_hive_partitioning(
            bq_client: bigquery.Client, dataset: bigquery.Dataset,
            table_id: str, gcs_directory_path: str) -> bigquery.Table:
        """
            Creates an external table with AUTO hive partitioning in GCS
        :param bq_client: BigQuery client object
        :param dataset: dataset object. Check 'get_or_create_dataset' method
        :param table_id: Table to be created
        :param gcs_directory_path: Directory of GCS with the data. For example:
            If you have a structure like this:
            "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
            You should pass:
            "******"
        :return: the created external bigquery.Table
        """
        table = bigquery.Table(dataset.table(table_id))

        external_config = bigquery.ExternalConfig(
            bigquery.SourceFormat.PARQUET)
        external_config.source_uris = [f"{gcs_directory_path}/*"]
        hive_part_opt = HivePartitioningOptions()
        hive_part_opt.mode = "AUTO"
        hive_part_opt.source_uri_prefix = gcs_directory_path

        external_config.hive_partitioning = hive_part_opt
        table.external_data_configuration = external_config
        table = bq_client.create_table(table, exists_ok=True)

        return table
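An illustrative call, assuming the hive-partitioned GCS layout described in the docstring (bucket, table name, and path are placeholders):

table = create_external_table_hive_partitioning(
    bq_client, dataset, "images_metadata",
    "gs://bucket/images_metadata")
# Hive partition keys such as source_id and date become queryable columns of the external table.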
Example #10
def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-','_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table
def get_or_create_table(client: bigquery.Client, dataset_id: str,
                        table_id: str) -> bigquery.Table:
    """
    Create the BigQuery dataset and table; if they already exist, fetch them instead.
    """
    logging.info(f'Creating dataset {dataset_id} if not exists...')
    dataset = client.create_dataset(dataset_id,
                                    exists_ok=True)  # Create the dataset, or get it if it already exists.

    logging.info(f'Creating table {dataset_id}.{table_id} if not exists...')
    table_ref = dataset.table(table_id)
    return client.create_table(  # Create the table, or get it if it already exists.
        bigquery.Table(table_ref,
                       schema=[
                           bigquery.SchemaField('id',
                                                'string',
                                                description='Tweet ID'),
                           bigquery.SchemaField('lang',
                                                'string',
                                                description='Tweet language'),
                           bigquery.SchemaField('screen_name',
                                                'string',
                                                description='Screen name of the user'),
                           bigquery.SchemaField('text',
                                                'string',
                                                description='Tweet text'),
                           bigquery.SchemaField('created_at',
                                                'timestamp',
                                                description='Time the tweet was created'),
                       ]),
        exists_ok=True)
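A minimal sketch of writing one row to the returned table (the dataset/table names and the tweet values are made up):

table = get_or_create_table(client, "twitter", "tweets")
errors = client.insert_rows_json(table, [{
    "id": "1",
    "lang": "en",
    "screen_name": "example_user",
    "text": "hello",
    "created_at": "2021-01-01T00:00:00+00:00",
}])
assert not errors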
Example #12
def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    extra_field_for_jagged_row_test = bigquery.schema.SchemaField(
        "extra_jagged_row_test_column", "STRING")
    schema.append(extra_field_for_jagged_row_test)
    table: bigquery.Table = bigquery.Table(
        f"{os.getenv('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)
    return table
Example #13
        def create_view_task():

            client = Client()

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            table = Table(dest_table_ref)

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            table.view_query = sql

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

            try:
                table = client.create_table(table)
            except Conflict:
                # https://cloud.google.com/bigquery/docs/managing-views
                table = client.update_table(table, ['view_query'])
            assert table.table_id == dest_table_name
Example #14
def create_tables(client: bigquery.Client, tableSchemas: dict) -> dict:
    """Create empty BigQuery tables for the given partial Table schemas.

    Returns a dict mapping each `ftId` to the created BigQuery table.
    """
    ds = create_dataset(client, f'FusionTable_Autoimport_{datetime.now()}')

    def _create_field_schema(col_schema: dict) -> bigquery.SchemaField:
        """Create a SchemaField from the dict"""
        name = to_safe_name(col_schema['name'])
        return bigquery.SchemaField(name, col_schema.get('type'),
                                    col_schema.get('mode', 'NULLABLE'),
                                    col_schema.get('description', ''))

    def _table_from_ft(ft_schema: dict) -> bigquery.Table:
        """Create a local representation of a BigQuery table"""
        # A "TableSchema" is just a sequence of SchemaFields https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.table.Table.html
        schema = list(map(_create_field_schema, ft_schema['columns']))
        table = bigquery.Table(
            bigquery.TableReference(ds, to_safe_name(ft_schema['name'])),
            schema)
        table.description = ft_schema.get('description', '')
        return table

    return {
        ftId: client.create_table(_table_from_ft(ftSchema))
        for (ftId, ftSchema) in tableSchemas.items()
    }
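A hedged example of the expected input shape, mirroring the keys that _create_field_schema and _table_from_ft read above (the payload itself is invented):

tableSchemas = {
    "ft_abc123": {
        "name": "My Fusion Table",
        "description": "Imported from Fusion Tables",
        "columns": [
            {"name": "city", "type": "STRING"},
            {"name": "population", "type": "INTEGER", "mode": "NULLABLE"},
        ],
    },
}
created = create_tables(client, tableSchemas)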
Example #15
        def create_view_task(ds, **kwargs):

            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            table = Table(dest_table_ref)

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            table.view_query = sql

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

            try:
                table = client.create_table(table)
            except Conflict:
                # https://cloud.google.com/bigquery/docs/managing-views
                table = client.update_table(table, ['view_query'])
            assert table.table_id == dest_table_name
def bigquery_empty_table(
    bigquery_dataset: str,
    bigquery_dml_dataset: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
):
    project_id = bigquery_client.project
    # Cleanup the sample_dml table, if it exists.
    old_table_id = f"{project_id}.{bigquery_dataset}.sample_dml"
    bigquery_client.delete_table(old_table_id, not_found_ok=True)
    # Create new table in its own dataset.
    dataset_id = bigquery_dml_dataset
    table_id = f"{project_id}.{dataset_id}.sample_dml_{temp_suffix()}"
    empty_table = bigquery.Table(table_id, schema=bigquery_schema)
    bigquery_client.create_table(empty_table)
    yield table_id
    bigquery_client.delete_table(empty_table)
Example #17
def test_list_rows_empty_table(bigquery_client: bigquery.Client,
                               table_id: str):
    from google.cloud.bigquery.table import RowIterator

    table = bigquery_client.create_table(table_id)

    # It's a bit silly to list rows for an empty table, but this does
    # happen as the result of a DDL query from an IPython magic command.
    rows = bigquery_client.list_rows(table)
    assert isinstance(rows, RowIterator)
    assert tuple(rows) == ()
Example #18
def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    project_id = bigquery_client.project
    dataset_id = prefixer.create_prefix()
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset)
    sample_table_id = f"{project_id}.{dataset_id}.sample"
    job1 = load_sample_data(sample_table_id, bigquery_client, bigquery_schema)
    job1.result()
    one_row_table_id = f"{project_id}.{dataset_id}.sample_one_row"
    job2 = load_sample_data(
        one_row_table_id,
        bigquery_client,
        bigquery_schema,
        filename="sample_one_row.json",
    )
    job2.result()
    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view)
    yield dataset_id
    bigquery_client.delete_dataset(dataset_id, delete_contents=True)
Example #19
def create_table(client: bigquery.Client, dataset_id: str, table_id: str, schema: list):
    """
    Creates a table according to the given schema in the specified project:dataset

    Args:
        client: BQ API client
        dataset_id: destination dataset
        table_id: table to be created
        schema: schema of the table to be created

    Returns:

    Examples:
        create_table(client, 'my_dataset', 'my_table', my_schema)
    """
    dataset_ref = client.dataset(dataset_id=dataset_id)
    tables_list = [t.table_id for t in list(client.list_tables(dataset_ref))]

    if table_id in tables_list:
        print("THIS TABLE ALREADY EXISTS IN {}:{}".format(client.project, dataset_id))
    else:
        table_ref = dataset_ref.table(table_id)
        client.create_table(bigquery.Table(table_ref, schema))
Example #20
def create_table(client: bigquery.Client, dataset_id: str, table_id: str,
                 fields: List[Dict]):
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))
    table_ref = dataset.table(table_id)
    table = bigquery.Table(table_ref, schema=fields)
    table.time_partitioning = bigquery.table.TimePartitioning()  # partition by day (the default)

    try:
        table = client.create_table(table)
        print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                              table.table_id))
    except gexceptions.GoogleAPIError as e:
        print("Table {} could not be created: {}. Skipping...".format(
            table_id, e))
Example #21
def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    try:
        dataset = client.get_dataset("sensors")
    except NotFound as _:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    table_ref = bigquery.TableReference(dataset, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound as _:
        return client.create_table(
            bigquery.Table(
                table_ref,
                schema=[
                    bigquery.SchemaField(
                        "humidity",
                        "NUMERIC",
                        description="Sensor DHT22humidity in %"),
                    bigquery.SchemaField("max_micro",
                                         "NUMERIC",
                                         description=""),
                    bigquery.SchemaField("min_micro",
                                         "NUMERIC",
                                         description=""),
                    bigquery.SchemaField("samples", "NUMERIC", description=""),
                    bigquery.SchemaField(
                        "sds_p1",
                        "NUMERIC",
                        description="Sensor SDS011 PM10 in µg/m³"),
                    bigquery.SchemaField(
                        "sds_p2",
                        "NUMERIC",
                        description="Sensor SDS011 PM2.5 in µg/m³"),
                    bigquery.SchemaField(
                        "signal",
                        "NUMERIC",
                        description="WiFi signal strength in dBm"),
                    bigquery.SchemaField(
                        "temperature",
                        "NUMERIC",
                        description="Sensor DHT22 temperature in °C"),
                    bigquery.SchemaField("datetime",
                                         "DATETIME",
                                         description="Datetime of measurement",
                                         mode="REQUIRED"),
                ],
            ))
Example #22
def create_bq_table(
    client: bigquery.Client,
    dataset: bigquery.Dataset,
    table_id: str,
    table_schema: List[bigquery.SchemaField],
    table_description: str = None,
) -> bigquery.Table:
    """
    Create empty table.
    """
    # TODO: validate 'table_id'.

    # Note: the two-step construction (a dataset.table() reference, then bigquery.Table) is not intuitive.
    table = bigquery.Table(dataset.table(table_id),
                           schema=table_schema)  # type: bigquery.Table
    table.description = table_description

    # API request
    return client.create_table(table)  # type: bigquery.Table
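A short usage sketch; the schema and names here are assumptions:

schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("age", "INTEGER"),
]
table = create_bq_table(client, dataset, "people", schema,
                        table_description="Demo table")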
Example #23
def dest_table(request, bq: bigquery.Client, dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('TF_VAR_project_id', 'bqutil')}"
        f".{dest_dataset.dataset_id}.cf_e2e_test_nyc_311_"
        f"{os.getenv('SHORT_SHA', 'manual')}",
        schema=schema,
    )

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Stage files from a bucket to a dataset

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs
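An illustrative invocation (the dataset, bucket, prefix, and HPO ID are placeholders; AOU_REQUIRED and the schema helpers come from the surrounding project):

jobs = load_folder("staging_dataset", bq_client, "my-bucket",
                   "hpo_site/2021-01-01/", gcs_client, "hpo_site")
print(f"completed {len(jobs)} load jobs")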
Example #25
File: bq.py  Project: sheilsarda/banditml
    def create(
        cls,
        client: bigquery.Client,
        project_id: str,
        dataset_id: str,
        schema: List[bigquery.SchemaField] = None,
        partition: bool = False,
    ):
        if cls.name is None:
            raise NotImplementedError(
                "class attribute `name` must be set in order to create table")
        bq_table = bigquery.Table(
            _fulltable(project_id, dataset_id, cls.name),
            schema=schema,
        )
        bq_table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=cls.partition_field,
        )
        bq_table = client.create_table(bq_table)

        return cls(client, bq_table=bq_table)
Example #26
def apply_schema_differences(
    schema_diffs: _SchemaDiffs,
    bigquery_client: BigQueryClient,
) -> None:
    print("Applying changes...")
    for table_identifier, difference in schema_diffs.items():
        if isinstance(difference, MissingTable):
            print("Creating table...")
            table = Table(
                table_identifier,
                schema=difference.local_table.get_schema_fields(),
            )
            if difference.local_table.time_partitioning:
                table.time_partitioning = difference.local_table.time_partitioning
            remote_table = bigquery_client.create_table(table)
            print(remote_table)
        elif isinstance(difference, ExistingTable):
            difference.remote_table.schema = difference.local_table.get_schema_fields(
            )
            print(
                bigquery_client.update_table(difference.remote_table,
                                             ["schema"]))
Example #27
def create_bq_table(table_name='CRY', dataset_name='price_data'):
    '''Create table if not exists'''
    client = Client()
    tables = [
        i.table_id
        for i in client.list_tables(client.project + "." + dataset_name)
    ]
    if table_name not in tables:
        if table_name == 'CRY':
            schema = [
                SchemaField("open", "FLOAT64", mode="NULLABLE"),
                SchemaField("high", "FLOAT64", mode="NULLABLE"),
                SchemaField("low", "FLOAT64", mode="NULLABLE"),
                SchemaField("close", "FLOAT64", mode="NULLABLE"),
                SchemaField("volume", "FLOAT64", mode="NULLABLE"),
                SchemaField("market_cap", "FLOAT64", mode="NULLABLE"),
                SchemaField("symbol", "STRING", mode="NULLABLE"),
                SchemaField("date", "TIMESTAMP", mode="NULLABLE"),
            ]
        else:
            schema = [
                SchemaField("open", "FLOAT64", mode="NULLABLE"),
                SchemaField("high", "FLOAT64", mode="NULLABLE"),
                SchemaField("low", "FLOAT64", mode="NULLABLE"),
                SchemaField("close", "FLOAT64", mode="NULLABLE"),
                SchemaField("adjusted_close", "FLOAT64", mode="NULLABLE"),
                SchemaField("volume", "FLOAT64", mode="NULLABLE"),
                SchemaField("dividend_amount", "FLOAT64", mode="NULLABLE"),
                SchemaField("split_coefficient", "FLOAT64", mode="NULLABLE"),
                SchemaField("symbol", "STRING", mode="NULLABLE"),
                SchemaField("date", "TIMESTAMP", mode="NULLABLE"),
            ]
        table = Table(client.project + "." + dataset_name + "." + table_name,
                      schema=schema)
        table = client.create_table(table)
    else:
        print("Table already exists")
Example #28
        def enrich_task():
            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #29
def bigquery_table(bigquery_client: bigquery.Client) -> bigquery.Table:
    """ Create a temporary table for the test, remove on cleanup """
    table = bigquery_client.create_table(TEST_TABLE, exists_ok=True)
    yield table
    bigquery_client.delete_table(table)
Example #30
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            table.time_partitioning = TimePartitioning(
                field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'

                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge_{task}.sql'.format(
                        task=task))
                merge_sql_template = read_file(merge_sql_path)
                template_context['params']['source_table'] = temp_table_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)