def create_table(client, dataset_ref, table_name, is_partitioned=False):
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)

    if is_partitioned:
        time_partitioning = TimePartitioning()
        time_partitioning.field = 'partition_value'
        table_obj.time_partitioning = time_partitioning

    return client.create_table(table_obj)
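
A minimal call sketch for the helper above, assuming default credentials and that TABLE_SCHEMA lives in the same module as create_table. All names below are placeholders for illustration, not from the original source.

# Hypothetical usage of create_table; project, dataset, and schema are illustrative.
from google.cloud.bigquery import Client, DatasetReference, SchemaField

TABLE_SCHEMA = [
    SchemaField('partition_value', 'TIMESTAMP'),
    SchemaField('value', 'STRING'),
]
client = Client(project='my-project')
dataset_ref = DatasetReference('my-project', 'my_dataset')
# Creates 'my_table' day-partitioned on the 'partition_value' column.
created_table = create_table(client, dataset_ref, 'my_table', is_partitioned=True)
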
    def __create_test_table(self, table_name, dataset_id):
        table_schema = [
            SchemaField("int_data", "INT64"),
            SchemaField("str_data", "STRING")
        ]
        table_reference = TableReference(dataset_id, table_name)
        test_table = Table(table_reference, table_schema)
        test_table.time_partitioning = TimePartitioning("DAY")
        self.__delete_if_exists(test_table)
        self.GCP_BIGQUERY_CLIENT.create_table(test_table)
        return test_table
Example #3
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields, which do not fit into a dataframe.
    The data is loaded from a JSON file object, since that is one of the formats BigQuery accepts.
    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
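
To make the newline-delimited JSON handoff concrete, here is a small standalone sketch of the buffer the function builds before calling load_table_from_file. The two records are made up for illustration; real records must match the destination table's schema.

# Illustrative only: fake records serialized exactly the way the function body does it.
import json
from io import StringIO

json_data = [{"person_id": 1, "status": "SHARED"},
             {"person_id": 2, "status": "NOT_SHARED"}]
file_obj = StringIO()
for json_obj in json_data:
    json.dump(json_obj, file_obj)
    file_obj.write('\n')
# file_obj now holds one JSON object per line:
# {"person_id": 1, "status": "SHARED"}
# {"person_id": 2, "status": "NOT_SHARED"}
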
Example #4
    def setUp(self):
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.schema = [
            SchemaField("person_id", "INT64"),
            SchemaField("first_name", "STRING"),
            SchemaField("last_name", "STRING"),
            SchemaField("algorithm", "STRING")
        ]

        self.ps_api_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable')
        ]

        self.id_match_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable'),
            dict(name='algorithm', type='string', mode='nullable')
        ]

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

        # Create and populate the ps_values site table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = self.ps_values_table_id

        table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                      schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()
Example #5
    def test_should_create_table_from_table_object(self):
        # given
        table_id = f'{self.dataset_manager.project_id}.{self.dataset_manager.dataset_name}.example_test_table'
        table = Table(table_id,
                      schema=[
                          {
                              "mode": "NULLABLE",
                              "name": "example_field",
                              "type": "STRING"
                          },
                      ])
        table.time_partitioning = TimePartitioning()

        # when
        self.dataset_manager.create_table_from_schema('example_test_table',
                                                      schema=None,
                                                      table=table)

        # then
        self.table_should_exists()
Example #6
    def create_table_from_schema(self,
                                 table_id: str,
                                 schema: typing.Union[typing.List[dict], Path,
                                                      None] = None,
                                 table=None):
        from google.cloud.bigquery import Table, TimePartitioning

        if schema and table:
            raise ValueError(
                "You can't provide both schema and table, because the table you provide"
                "should already contain the schema.")
        if not schema and not table:
            raise ValueError("You must provide either schema or table.")

        if isinstance(schema, Path):
            schema = json.loads(schema.read_text())

        if table is None:
            table = Table(table_id, schema=schema)
            table.time_partitioning = TimePartitioning()

        self.logger.info(f'CREATING TABLE FROM SCHEMA: {table.schema}')

        self.bigquery_client.create_table(table)
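
Example #5 above exercised the table= branch of this method; for completeness, here is a hedged sketch of the other two call styles it accepts, an inline schema list and a Path to a JSON schema file. The dataset_manager instance, the fully qualified table id, and the file path are placeholders; when schema is given, table_id is passed straight to Table(), so a 'project.dataset.table' id is used here.

# Hypothetical calls; in both cases the method builds a Table itself and applies
# the default (DAY) TimePartitioning before creating it.
from pathlib import Path

dataset_manager.create_table_from_schema(
    'my-project.my_dataset.example_test_table',
    schema=[{"mode": "NULLABLE", "name": "example_field", "type": "STRING"}])

dataset_manager.create_table_from_schema(
    'my-project.my_dataset.example_test_table',
    schema=Path('schemas/example_test_table.json'))
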
Example #7
        def enrich_task():
            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #8
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            table.time_partitioning = TimePartitioning(
                field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'

                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge_{task}.sql'.format(
                        task=task))
                merge_sql_template = read_file(merge_sql_path)
                template_context['params']['source_table'] = temp_table_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #9
    def setUp(self):
        self.maxDiff = None
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'{PS_API_VALUES}_{self.hpo_id}'
        self.pii_address_table_id = f'{self.hpo_id}_pii_address'
        self.pii_email_table_id = f'{self.hpo_id}_pii_email'
        self.pii_phone_number_table_id = f'{self.hpo_id}_pii_phone_number'
        self.pii_name_table_id = f'{self.hpo_id}_pii_name'
        self.person_table_id = f'{self.hpo_id}_person'
        self.location_table_id = f'{self.hpo_id}_location'
        self.fq_concept_table = f'{self.project_id}.{self.dataset_id}.concept'

        # Create and populate the ps_values site table

        schema = resources.fields_for(f'{PS_API_VALUES}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.ps_values_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()

        # Create and populate the drc_id_match_table

        schema = resources.fields_for(f'{IDENTITY_MATCH_TABLE}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.id_match_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        populate_query = POPULATE_ID_MATCH.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            id_match_table_id=self.id_match_table_id)
        job = self.client.query(populate_query)
        job.result()

        # Create and populate pii_name, pii_email, pii_phone_number, and pii_address table

        schema = resources.fields_for(f'{PII_NAME}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_name_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        schema = resources.fields_for(f'{PII_EMAIL}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_email_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        schema = resources.fields_for(f'{PII_PHONE_NUMBER}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_phone_number_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        schema = resources.fields_for(f'{PII_ADDRESS}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_address_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        person_table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.person_table_id}',
            schema=person_schema)
        person_table = self.client.create_table(person_table, exists_ok=True)

        location_table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.location_table_id}',
            schema=location_schema)
        location_table = self.client.create_table(location_table,
                                                  exists_ok=True)

        concept_table = Table(f'{self.project_id}.{self.dataset_id}.concept',
                              schema=concept_schema)
        concept_table = self.client.create_table(concept_table, exists_ok=True)
Example #10
    def commit(self):
        # The commit is where the upload is actually done for BigQuery (special case).
        # The _create_or_update_table method can be called multiple times;
        # each time, data is appended to the .avro file. When "committing",
        # this .avro file is uploaded and, depending on the load strategy, used.
        if not hasattr(self, "avro_file_name"):
            # There was no data ever uploaded
            # Do nothing
            self.log.info("Nothing to upload!")
            return

        # Clean up after yourself first
        self.avro_writer.close()

        # Fetch the relevant configuration
        project_id = self.table_creation_config.get("database_name",
                                                    self.database_name)
        assert project_id, "Missing Project ID!"
        load_strategy = self.table_creation_config["load_strategy"]
        primary_key = self.table_creation_config["primary_key"]
        schema_name = self.table_creation_config["schema_name"]
        schema_suffix = self.table_creation_config["schema_suffix"]
        table_name_final = self.table_creation_config["table_name"]
        table_suffix = "__ewah_tmp"

        columns_definition = self.table_creation_config["columns_definition"]
        new_schema_name = schema_name + schema_suffix

        is_full_refresh = (load_strategy == EC.LS_INSERT_REPLACE
                           or not self.test_if_table_exists(
                               table_name=table_name_final,
                               schema_name=new_schema_name,
                               project_id=project_id,
                           ))

        conn = self.dwh_hook.dbconn
        ds_new = conn.get_dataset(new_schema_name)

        # Create temp table with .avro file
        if is_full_refresh:
            # temp table is also the final table for full refresh!
            table_name = table_name_final
        else:
            table_name = table_name_final + table_suffix

        # Drop temp table if it already exists
        if self.test_if_table_exists(
                table_name=table_name,
                schema_name=new_schema_name,
                project_id=project_id,
        ):
            # Drop table before re-creating it
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_new, table_id=table_name)))
        # Create temp table with .avro file
        table_obj = Table(".".join([project_id, new_schema_name, table_name]))
        if is_full_refresh and self.partition_field:
            table_obj.time_partitioning = bigquery.TimePartitioning(
                type_=self.partition_type,
                field=self.partition_field,
            )
            if self.require_partition_filter:
                table_obj.require_partition_filter = True
        self.log.info("Uploading data into table now...")
        with open(self.avro_file_name, "rb") as source_file:
            job = conn.load_table_from_file(
                file_obj=source_file,
                destination=table_obj,
                job_id_prefix="ewah_",
                rewind=True,
                job_config=LoadJobConfig(
                    autodetect=False,
                    source_format="AVRO",
                    schema=[
                        SchemaField(name=name, field_type=field["data_type"])
                        for name, field in columns_definition.items()
                    ],
                ),
            )
            try:
                job.result()
            except Exception:
                self.log.info("Errors occurred - job errors: {0}".format(
                    job.errors))
                raise
            assert job.state == "DONE", "Invalid job state: {0}".format(
                job.state)

        if not is_full_refresh:
            # Need to merge new rows into the existing table

            fields_pk = set(primary_key or [])
            fields_all = set(columns_definition.keys() or [])
            fields_non_pk = fields_all - fields_pk

            if load_strategy == EC.LS_UPSERT:
                assert fields_pk
            elif load_strategy == EC.LS_INSERT_ADD:
                fields_pk = []  # Ignore if set
            else:
                raise Exception("Not implemented!")

            merge_statement = """
                MERGE INTO `{target}` AS TARGET
                USING `{source}` AS SOURCE
                ON {condition}

                WHEN MATCHED THEN
                    UPDATE SET {update_fields}

                WHEN NOT MATCHED THEN
                    INSERT ({insert_fields})
                    VALUES ({insert_fields})
            """.format(
                target=".".join(
                    [project_id, new_schema_name, table_name_final]),
                source=".".join([project_id, new_schema_name, table_name]),
                condition=" AND ".join([
                    "TARGET.`{0}` = SOURCE.`{0}`".format(field)
                    for field in fields_pk
                ]) or "FALSE",
                insert_fields="`{0}`".format("`, `".join(fields_all)),
                update_fields=", ".join([
                    "`{0}` = SOURCE.`{0}`".format(field)
                    for field in fields_non_pk
                ]),
            )

            self.log.info(
                "Executing query:\n\n{0}\n\n".format(merge_statement))
            job = conn.query(
                query=merge_statement,
                job_id_prefix="ewah_",
            )
            try:
                job.result()
            except Exception:
                self.log.info("Errors occurred - job errors: {0}".format(
                    job.errors))
                raise
            assert job.state == "DONE", "Invalid job state: {0}".format(
                job.state)

            # Remove old temp table from dataset
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_new, table_id=table_name)))

        self.log.info("Done!")