Example #1
    def test_create_sandbox_dataset(self):
        # Create sandbox dataset
        sandbox_dataset = sandbox.create_sandbox_dataset(
            self.project_id, self.dataset_id)
        all_datasets_obj = list_datasets(self.project_id)
        all_datasets = [d.dataset_id for d in all_datasets_obj]

        self.assertIn(sandbox_dataset, all_datasets)

        # Try to create same sandbox, which now already exists
        self.assertRaises(RuntimeError, sandbox.create_sandbox_dataset,
                          self.project_id, self.dataset_id)

        # Clean up: remove the sandbox dataset created above
        delete_dataset(self.project_id, sandbox_dataset)
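The sandbox helpers exercised by this test are not shown. A minimal sketch of what create_sandbox_dataset might look like on top of the google-cloud-bigquery client, assuming the '_sandbox' suffix naming convention implied by get_sandbox_dataset_id in Example #2 (this is an illustrative assumption, not the project's actual implementation):

from google.cloud import bigquery
from google.api_core.exceptions import Conflict


def create_sandbox_dataset(project_id, dataset_id):
    """Create <dataset_id>_sandbox in the project; raise RuntimeError if it already exists."""
    sandbox_dataset_id = f'{dataset_id}_sandbox'  # assumed naming convention
    client = bigquery.Client(project=project_id)
    dataset = bigquery.Dataset(f'{project_id}.{sandbox_dataset_id}')
    try:
        # exists_ok=False makes the client raise Conflict if the dataset is already there
        client.create_dataset(dataset, exists_ok=False)
    except Conflict:
        raise RuntimeError(
            f'Sandbox dataset {sandbox_dataset_id} already exists in project {project_id}')
    return sandbox_dataset_id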
Example #2
def check_and_create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function to check if the sandbox dataset exists. If it does not, it creates it.

    :param project_id: the project_id that the dataset is in
    :param dataset_id: the dataset_id to verify
    :return: the sandbox dataset_id, whether it already existed or was just created
    """
    sandbox_dataset = get_sandbox_dataset_id(dataset_id)
    dataset_objs = list_datasets(project_id)
    datasets = [d.dataset_id for d in dataset_objs]

    if sandbox_dataset not in datasets:
        create_sandbox_dataset(project_id, dataset_id)
    return sandbox_dataset
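Hypothetical usage, assuming get_sandbox_dataset_id simply appends a '_sandbox' suffix (project and dataset names below are made up):

# Idempotent: safe to call whether or not the sandbox already exists
sandbox_id = check_and_create_sandbox_dataset('my-project', 'combined2021')
print(sandbox_id)  # -> 'combined2021_sandbox' under the assumed naming convention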
Example #3
def get_datasets_list(project_id, dataset_ids_str):
    """
    Returns list of dataset_ids on which to perform retraction

    Includes rdr, ehr, unioned, combined, and deid dataset_ids; excludes sandbox and staging datasets
    :param project_id: identifies the project containing datasets to retract from
    :param dataset_ids_str: space-separated string of dataset_ids to retract from. If set to 'all_datasets',
        retracts from all datasets. If set to 'none', skips retraction from BigQuery datasets
    :return: List of dataset_ids
    :raises: AttributeError if dataset_ids_str does not allow .split()
    """
    all_dataset_ids = [
        dataset.dataset_id for dataset in bq.list_datasets(project_id)
    ]

    if not dataset_ids_str or dataset_ids_str == consts.NONE:
        dataset_ids = []
        LOGGER.info(
            "No datasets specified. Defaulting to empty list. Expect bucked only retraction."
        )
    elif dataset_ids_str == consts.ALL_DATASETS:
        dataset_ids = all_dataset_ids
        LOGGER.info(
            f"All datasets are specified. Setting dataset_ids to all datasets in project: {project_id}"
        )
    else:
        dataset_ids = dataset_ids_str.split()
        # only consider datasets that exist in the project
        dataset_ids = [
            dataset_id for dataset_id in dataset_ids
            if dataset_id in all_dataset_ids
        ]
        LOGGER.info(
            f"Datasets specified and existing in project {project_id}: {dataset_ids}"
        )

    # consider datasets containing PPI/EHR data, excluding sandbox/staging datasets
    dataset_ids = [
        dataset_id for dataset_id in dataset_ids
        if get_dataset_type(dataset_id) != common.OTHER
        and not is_sandbox_dataset(dataset_id)
        and not is_staging_dataset(dataset_id)
    ]

    LOGGER.info(f"Found datasets to retract from: {', '.join(dataset_ids)}")
    return dataset_ids
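Hypothetical invocations, assuming consts.NONE == 'none' and consts.ALL_DATASETS == 'all_datasets' (dataset names are made up):

get_datasets_list('my-project', 'none')                  # -> [], bucket-only retraction
get_datasets_list('my-project', 'all_datasets')          # -> every eligible dataset in the project
get_datasets_list('my-project', 'rdr2021 combined2021')  # -> only those that exist and are not sandbox/staging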
Example #4
def get_dataset_ids_to_target(project_id, dataset_ids=None):
    """
    Return dataset_ids that are found in the project based on BQ metadata

    :param project_id: Identifies the project to target
    :param dataset_ids: list identifying datasets or None for all datasets
    :return: List of dataset_ids in the project to target
    """
    all_datasets = bq.list_datasets(project_id)
    all_dataset_ids = [dataset.dataset_id for dataset in all_datasets]
    result_dataset_ids = []
    if dataset_ids is None:
        result_dataset_ids = all_dataset_ids
    else:
        for dataset_id in dataset_ids:
            if dataset_id not in all_dataset_ids:
                logging.info(
                    f"Dataset {dataset_id} not found in project {project_id}, skipping"
                )
            else:
                result_dataset_ids.append(dataset_id)
    return result_dataset_ids
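Hypothetical usage (dataset names are made up):

# No filter: target every dataset in the project
get_dataset_ids_to_target('my-project')
# Filtered: unknown ids are logged and skipped rather than raising
get_dataset_ids_to_target('my-project', ['ehr2021', 'does_not_exist'])  # -> ['ehr2021']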