Example #1
    def test_create_dataset(self):
        dataset = bq.create_dataset(self.project_id, self.dataset_id,
                                    self.description, self.label_or_tag)
        self.assertEqual(dataset.dataset_id, self.dataset_id)

        # Trying to create the same dataset again should fail, since it now exists
        self.assertRaises(RuntimeError, bq.create_dataset, self.project_id,
                          self.dataset_id, self.description, self.label_or_tag)

        dataset = bq.create_dataset(self.project_id,
                                    self.dataset_id,
                                    self.description,
                                    self.label_or_tag,
                                    overwrite_existing=True)
        self.assertEqual(dataset.dataset_id, self.dataset_id)
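The calls on this page suggest that bq.create_dataset accepts (project_id, dataset_id, description, label_or_tag) plus optional friendly_name and overwrite_existing keywords, and the test above expects RuntimeError on a duplicate dataset. A minimal sketch of a wrapper consistent with those calls, built on google-cloud-bigquery (an illustration under those assumptions, not the project's actual implementation):

from google.api_core.exceptions import Conflict
from google.cloud import bigquery


def create_dataset(project_id, dataset_id, description, label_or_tag,
                   friendly_name=None, overwrite_existing=False):
    # Sketch only: the signature is inferred from the example calls above.
    client = bigquery.Client(project=project_id)
    dataset = bigquery.Dataset(f'{project_id}.{dataset_id}')
    dataset.description = description
    dataset.friendly_name = friendly_name
    dataset.labels = label_or_tag or {}
    if overwrite_existing:
        # Drop any existing dataset (and its contents) before recreating it.
        client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
    try:
        return client.create_dataset(dataset)
    except Conflict as e:
        # Surface a duplicate as RuntimeError, as test_create_dataset expects.
        raise RuntimeError(
            f'Dataset {dataset_id} already exists in {project_id}') from e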
Example #2
def main(project_id: str, bucket_name: str, dst_dataset_id: str):
    """
    Load and transform vocabulary files in GCS to a BigQuery dataset

    :param project_id: identifies the project containing the destination dataset
    :param bucket_name: refers to the bucket containing the vocabulary files
    :param dst_dataset_id: final destination dataset for the vocabulary in BigQuery
    """
    bq_client = bq.get_client(project_id)
    gcs_client = storage.Client(project_id)
    sandbox_dataset_id = get_sandbox_dataset_id(dst_dataset_id)
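    # Stage the vocabulary in a sandbox dataset before touching the destination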
    sandbox_dataset = bq.create_dataset(
        project_id,
        sandbox_dataset_id,
        f'Vocabulary loaded from gs://{bucket_name}',
        label_or_tag={'type': 'vocabulary'},
        overwrite_existing=True)
    stage_jobs = load_stage(sandbox_dataset, bq_client, bucket_name, gcs_client)
    wait_jobs(stage_jobs)
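    # Load the staged vocabulary tables from the sandbox into the destination dataset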
    load_jobs = load(project_id,
                     bq_client,
                     sandbox_dataset_id,
                     dst_dataset_id,
                     overwrite_ok=True)
    wait_jobs(load_jobs)
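A minimal command-line entry point for main could look like the following sketch; the flag names here are assumptions for illustration.

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Load and transform vocabulary files in GCS to BigQuery')
    parser.add_argument('--project_id', required=True)
    parser.add_argument('--bucket_name', required=True)
    parser.add_argument('--dst_dataset_id', required=True)
    args = parser.parse_args()
    main(args.project_id, args.bucket_name, args.dst_dataset_id)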
Example #3
def create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function that creates a sandbox dataset if it does not already exist
    :param project_id: identifies the project
    :param dataset_id: identifies the dataset the sandbox is associated with
    :return: the sandbox dataset_id
    """
    sandbox_dataset_id = get_sandbox_dataset_id(dataset_id)
    friendly_name = f'Sandbox for {dataset_id}'
    description = f'Sandbox created for storing records affected by the cleaning rules applied to {dataset_id}'
    label_or_tag = {'label': '', 'tag': ''}
    create_dataset(project_id=project_id,
                   dataset_id=sandbox_dataset_id,
                   friendly_name=friendly_name,
                   description=description,
                   label_or_tag=label_or_tag,
                   overwrite_existing=False)

    return sandbox_dataset_id
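Assuming application-default credentials are configured, a hypothetical call looks like this ('my-project' and 'deid_base' are placeholder names):

# Hypothetical usage; both arguments below are placeholders.
sandbox_dataset_id = create_sandbox_dataset('my-project', 'deid_base')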
Example #4
    def test_query_sheet_linked_bq_table(self):
        dataset = bq.create_dataset(self.project_id, self.dataset_id,
                                    self.description, self.label_or_tag)
        # add Google Drive scope
        external_data_scopes = [
            "https://www.googleapis.com/auth/drive",
            "https://www.googleapis.com/auth/cloud-platform"
        ]
        client = bq.get_client(self.project_id, external_data_scopes)

        # Configure the external data source and query job.
        external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

        # Grant viewing access to the test sheet to BigQuery test accounts
        sheet_url = (
            "https://docs.google.com/spreadsheets"
            "/d/1JI-KyigmwZU9I2J6TZqVTPNoEAWVqiFeF8Y549-dvzM/edit#gid=0")
        schema = [
            bigquery.SchemaField("site_name", "STRING"),
            bigquery.SchemaField("hpo_id", "STRING"),
            bigquery.SchemaField("site_point_of_contact", "STRING"),
        ]
        external_config.source_uris = [sheet_url]
        external_config.schema = schema
        external_config.options.range = (
            "contacts!A1:C5"  # limit scope so that other items can be added to sheet
        )
        external_config.options.skip_leading_rows = 1  # Optionally skip header row.

        table_id = consts.HPO_ID_CONTACT_LIST_TABLE_ID
        table = bigquery.Table(dataset.table(table_id), schema=schema)
        table.external_data_configuration = external_config

        table = client.create_table(table)
        table_content_query = f'SELECT * FROM `{dataset.dataset_id}.{table.table_id}`'
        actual_df = bq.query_sheet_linked_bq_table(self.project_id,
                                                   table_content_query,
                                                   external_data_scopes)
        expected_dict = [{
            'site_name': 'Fake Site Name 1',
            'hpo_id': 'fake_1',
            'site_point_of_contact':
                'fake.email.1@site_1.fakedomain; fake.email.2@site_1.fakedomain'
        }, {
            'site_name': 'Fake Site Name 2',
            'hpo_id': 'fake_2',
            'site_point_of_contact': 'no data steward'
        }, {
            'site_name': 'Fake Site Name 3',
            'hpo_id': 'fake_3',
            'site_point_of_contact':
                'Fake.Email.1@site_3.fake_domain; Fake.Email.2@site_3.fake_domain'
        }, {
            'site_name': 'Fake Site Name 4',
            'hpo_id': 'fake_4',
            'site_point_of_contact':
                '[email protected]; [email protected]'
        }]
        expected_df = pd.DataFrame(
            expected_dict,
            columns=["site_name", "hpo_id", "site_point_of_contact"])
        pd.testing.assert_frame_equal(actual_df, expected_df)
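For context, the query_sheet_linked_bq_table helper used above can be approximated as a thin wrapper that builds a client carrying the Drive scope and returns the query result as a DataFrame. A minimal sketch under that assumption (not necessarily the project's actual implementation):

def query_sheet_linked_bq_table(project_id, query, external_data_scopes):
    # A client created with the Drive scope can read sheet-backed tables.
    client = bq.get_client(project_id, external_data_scopes)
    # Run the query and materialize the result as a pandas DataFrame.
    return client.query(query).to_dataframe()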