예제 #1
0
    def test_update_labels_and_tags(self):
        # Tests if dataset_id param is provided
        self.assertRaises(RuntimeError, update_labels_and_tags, None,
                          self.existing_labels_or_tags,
                          self.new_labels_or_tags)

        # Tests if new_labels_or_tags param is provided
        self.assertRaises(RuntimeError, update_labels_and_tags,
                          self.dataset_id, self.existing_labels_or_tags, None)

        # Pre-conditions
        results = update_labels_and_tags(self.dataset_id,
                                         self.existing_labels_or_tags,
                                         self.new_labels_or_tags, True)

        # Post conditions
        self.assertEqual(results, self.updated)
        with self.assertRaises(RuntimeError):
            update_labels_and_tags(self.dataset_id,
                                   existing_labels_or_tags={'label': 'apples'},
                                   new_labels_or_tags={'label': 'oranges'},
                                   overwrite_ok=False)
예제 #2
0
def create_datasets(client, name, input_dataset, tier, release_tag):
    """
    Creates backup, staging, sandbox, and final datasets with the proper descriptions
    and tag/labels applied

    :param client: an instantiated bigquery client object
    :param name: the base name of the datasets to be created
    :param input_dataset: name of the input dataset
    :param tier: tier parameter passed through from either a list or command line argument
    :param release_tag: release tag parameter passed through either the command line arguments
    :return: tuple of created dataset names
    """

    if not client:
        raise RuntimeError("Please specify BigQuery client object")
    if not name:
        raise RuntimeError(
            "Please specify the base name of the datasets to be created")
    if not input_dataset:
        raise RuntimeError("Please specify the name of the input dataset")
    if not tier:
        raise RuntimeError(
            "Please specify the tier intended for the output datasets")
    if not release_tag:
        raise RuntimeError(
            "Please specify the release tag for the dataset in the format of YYYY#q#r"
        )

    # Construct names of datasets need as part of the deid process
    final_dataset_id = name
    backup_dataset_id = f'{name}_{consts.BACKUP}'
    staging_dataset_id = f'{name}_{consts.STAGING}'
    sandbox_dataset_id = f'{name}_{consts.SANDBOX}'

    datasets = {
        consts.CLEAN: final_dataset_id,
        consts.BACKUP: backup_dataset_id,
        consts.STAGING: staging_dataset_id,
        consts.SANDBOX: sandbox_dataset_id
    }

    deid_datasets = [final_dataset_id, staging_dataset_id]

    # base labels and tags for the datasets
    base_labels_and_tags = {'release_tag': release_tag, 'data_tier': tier}

    description = f'dataset created from {input_dataset} for {tier}{release_tag} CDR run'

    # Creation of dataset objects and dataset label and description updates
    for phase, dataset_id in datasets.items():
        dataset_object = bq.define_dataset(client.project, dataset_id,
                                           description, base_labels_and_tags)
        client.create_dataset(dataset_object, exists_ok=True)
        dataset = bq.get_dataset(client.project, dataset_id)
        if dataset_id in deid_datasets:
            new_labels = bq.update_labels_and_tags(dataset_id,
                                                   base_labels_and_tags, {
                                                       'phase': phase,
                                                       'de-identified': 'true'
                                                   })
            dataset.labels = new_labels
            dataset.description = f'{phase} {description}'
            client.update_dataset(dataset, ["labels", "description"])
        else:
            new_labels = bq.update_labels_and_tags(dataset_id,
                                                   base_labels_and_tags, {
                                                       'phase': phase,
                                                       'de-identified': 'false'
                                                   })
            dataset.labels = new_labels
            dataset.description = f'{phase} {description}'
            client.update_dataset(dataset, ["labels", "description"])

    # Copy input dataset tables to backup and staging datasets
    tables = client.list_tables(input_dataset)
    for table in tables:
        backup_table = f'{backup_dataset_id}.{table.table_id}'
        staging_table = f'{staging_dataset_id}.{table.table_id}'
        client.copy_table(table, backup_table)
        client.copy_table(table, staging_table)

    return datasets