def run_bq_retraction(project_id, sandbox_dataset_id, pid_project_id,
                      pid_table_id, hpo_id, dataset_ids_list, retraction_type):
    """
    Main function to perform retraction.

    The pid table must follow the schema described above in PID_TABLE_FIELDS
    and must reside in sandbox_dataset_id. This function removes rows from all
    tables containing person_ids if they exist in pid_table_id.

    :param project_id: project to retract from
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param pid_table_id: table containing the person_ids and research_ids
    :param hpo_id: hpo_id of the site to retract from
    :param dataset_ids_list: list of datasets to retract from separated by a
        space. If containing only 'all_datasets', retracts from all datasets.
        If containing only 'none', skips retraction from BigQuery datasets
    :param retraction_type: string indicating whether all data needs to be
        removed, including RDR, or if RDR data needs to be kept intact.
        Can take the values 'rdr_and_ehr' or 'only_ehr'
    :return: None
    """
    client = bq.get_client(project_id)
    dataset_ids = ru.get_datasets_list(project_id, dataset_ids_list)
    for dataset in dataset_ids:
        # BUGFIX: reset per iteration. Previously `queries` was initialized
        # once before the loop, so an iteration that produced no queries
        # (e.g. an EHR dataset skipped because hpo_id is 'none', or a dataset
        # matching no known type) would re-run the previous dataset's queries.
        queries = []
        if ru.is_deid_dataset(dataset):
            LOGGER.info(f"Retracting from DEID dataset {dataset}")
            # DEID datasets are keyed by research_id rather than person_id
            research_id_query = JINJA_ENV.from_string(PERSON_ID_QUERY).render(
                person_research_id=RESEARCH_ID,
                pid_project=pid_project_id,
                sandbox_dataset_id=sandbox_dataset_id,
                pid_table_id=pid_table_id)
            queries = queries_to_retract_from_dataset(client, project_id,
                                                      dataset,
                                                      research_id_query,
                                                      retraction_type)
        else:
            person_id_query = JINJA_ENV.from_string(PERSON_ID_QUERY).render(
                person_research_id=PERSON_ID,
                pid_project=pid_project_id,
                sandbox_dataset_id=sandbox_dataset_id,
                pid_table_id=pid_table_id)
            if ru.is_combined_dataset(dataset):
                LOGGER.info(f"Retracting from Combined dataset {dataset}")
                queries = queries_to_retract_from_dataset(
                    client, project_id, dataset, person_id_query)
            elif ru.is_unioned_dataset(dataset):
                LOGGER.info(f"Retracting from Unioned dataset {dataset}")
                queries = queries_to_retract_from_dataset(
                    client, project_id, dataset, person_id_query)
            elif ru.is_ehr_dataset(dataset):
                # EHR retraction is site-scoped; skip when no site is given
                if hpo_id == NONE_STR:
                    LOGGER.info(
                        f'"RETRACTION_HPO_ID" set to "{NONE_STR}", skipping retraction from {dataset}'
                    )
                else:
                    LOGGER.info(f"Retracting from EHR dataset {dataset}")
                    queries = queries_to_retract_from_ehr_dataset(
                        client, project_id, dataset, hpo_id, person_id_query)
        retraction_query_runner(client, queries)
    LOGGER.info('Retraction complete')
    return
def main(args=None):
    """
    CLI entry point: configure debug logging, parse arguments, and retract
    deactivated participants from the requested datasets.

    :param args: optional argument list; defaults to sys.argv when None
    """
    pipeline_logging.configure(logging.DEBUG, add_console_handler=True)
    parsed = get_parser().parse_args(args)
    bq_client = bq.get_client(parsed.project_id)
    dataset_ids = ru.get_datasets_list(parsed.project_id, parsed.dataset_ids)
    LOGGER.info(
        f"Datasets to retract deactivated participants from: {dataset_ids}")
    run_deactivation(bq_client, parsed.project_id, dataset_ids,
                     parsed.fq_deact_table, parsed.fq_pid_rid_table)
    LOGGER.info(
        f"Retraction of deactivated participants from {dataset_ids} complete")
def test_get_datasets_list(self, mock_get_client):
    """get_datasets_list handles 'all_datasets', an explicit id, None, and []."""
    # pre-conditions: datasets that the listing should filter out
    filtered_out = [
        data_ref('foo', 'vocabulary20201010'),
        data_ref('foo', 'R2019q4r1_deid_sandbox')
    ]
    # datasets expected to survive the filter
    kept = [
        data_ref('foo', '2021q1r1_rdr'),
        data_ref('foo', 'C2020q1r1_deid'),
        data_ref('foo', 'R2019q4r1_deid'),
        data_ref('foo', '2018q4r1_rdr')
    ]
    kept_ids = [ds.dataset_id for ds in kept]

    stub_client = mock.MagicMock()
    mock_get_client.return_value = stub_client
    stub_client.list_datasets.return_value = filtered_out + kept

    # the all_datasets flag returns every non-filtered dataset
    self.assertCountEqual(kept_ids,
                          ru.get_datasets_list('foo', ['all_datasets']))

    # a specific dataset id passes through unchanged
    self.assertEqual(['C2020q1r1_deid'],
                     ru.get_datasets_list('foo', ['C2020q1r1_deid']))

    # None yields an empty result
    self.assertEqual([], ru.get_datasets_list('foo', None))

    # an empty list also yields an empty result
    self.assertEqual([], ru.get_datasets_list('foo', []))
def run_bq_retraction(project_id, sandbox_dataset_id, pid_project_id,
                      pid_table_id, hpo_id, dataset_ids_str, retraction_type):
    """
    Main function to perform retraction.

    The pid table must follow the schema described above in PID_TABLE_FIELDS
    and must reside in sandbox_dataset_id. This function removes rows from all
    tables containing person_ids if they exist in pid_table_id.

    :param project_id: project to retract from
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param pid_table_id: table containing the person_ids and research_ids
    :param hpo_id: hpo_id of the site to retract from
    :param dataset_ids_str: string of datasets to retract from separated by a
        space. If set to 'all_datasets', retracts from all datasets.
        If set to 'none', skips retraction from BigQuery datasets
    :param retraction_type: string indicating whether all data needs to be
        removed, including RDR, or if RDR data needs to be kept intact.
        Can take the values 'rdr_and_ehr' or 'only_ehr'
    :return: None
    """
    dataset_ids = ru.get_datasets_list(project_id, dataset_ids_str)

    # Bucket each dataset by type; first matching predicate wins so a dataset
    # lands in exactly one bucket.
    deid, combined, unioned, ehr = [], [], [], []
    for ds in dataset_ids:
        if ru.is_deid_dataset(ds):
            deid.append(ds)
        elif ru.is_combined_dataset(ds):
            combined.append(ds)
        elif ru.is_unioned_dataset(ds):
            unioned.append(ds)
        elif ru.is_ehr_dataset(ds):
            ehr.append(ds)

    # skip ehr datasets if hpo_id is indicated as none
    if hpo_id == 'none':
        LOGGER.info(
            '"RETRACTION_HPO_ID" set to "none", skipping retraction from EHR datasets'
        )
        ehr = []

    LOGGER.info(f"Retracting from EHR datasets: {', '.join(ehr)}")
    for ds in ehr:
        mapping_qs, table_qs = queries_to_retract_from_ehr_dataset(
            project_id, ds, pid_project_id, sandbox_dataset_id, hpo_id,
            pid_table_id)
        # mapping tables are cleaned before the data tables
        retraction_query_runner(mapping_qs)
        retraction_query_runner(table_qs)
    LOGGER.info('Finished retracting from EHR datasets')

    LOGGER.info(f"Retracting from UNIONED datasets: {', '.join(unioned)}")
    for ds in unioned:
        mapping_qs, table_qs = queries_to_retract_from_unioned_dataset(
            project_id, ds, pid_project_id, sandbox_dataset_id, pid_table_id)
        retraction_query_runner(mapping_qs)
        retraction_query_runner(table_qs)
    LOGGER.info('Finished retracting from UNIONED datasets')

    LOGGER.info(f"Retracting from COMBINED datasets: {', '.join(combined)}")
    for ds in combined:
        mapping_qs, table_qs = queries_to_retract_from_combined_or_deid_dataset(
            project_id, ds, pid_project_id, sandbox_dataset_id, pid_table_id,
            retraction_type, deid_flag=False)
        retraction_query_runner(mapping_qs)
        retraction_query_runner(table_qs)
    LOGGER.info('Finished retracting from COMBINED datasets')

    # TODO ensure the correct research_ids for persons_ids are used for each
    # deid retraction
    LOGGER.info(f"Retracting from DEID datasets: {', '.join(deid)}")
    for ds in deid:
        mapping_qs, table_qs = queries_to_retract_from_combined_or_deid_dataset(
            project_id, ds, pid_project_id, sandbox_dataset_id, pid_table_id,
            retraction_type, deid_flag=True)
        retraction_query_runner(mapping_qs)
        retraction_query_runner(table_qs)
    LOGGER.info('Finished retracting from DEID datasets')
required=True) parser.add_argument('-b', '--sandbox_dataset_id', action='store', dest='sandbox_dataset_id', help='Identifies sandbox dataset to store records', required=True) args = parser.parse_args() pipeline_logging.configure(level=logging.DEBUG, add_console_handler=args.console_log) client = bq.get_client(args.project_id) # keep only datasets existing in project dataset_ids = ru.get_datasets_list(args.project_id, [args.dataset_id]) # dataset_ids should contain only one dataset (unioned_ehr) if len(dataset_ids) == 1: dataset_id = dataset_ids[0] else: raise RuntimeError(f'More than one dataset specified: {dataset_ids}') LOGGER.info( f"Dataset to retract deactivated participants from: {dataset_id}. " f"Using sandbox dataset: {args.sandbox_dataset_id}") deactivation_queries = remove_ehr_data_queries(client, args.api_project_id, args.project_id, dataset_id, args.sandbox_dataset_id)