def setUp(self):
    self.project_id = bq_utils.app_identity.get_application_id()
    self.dataset_id = bq_utils.get_combined_dataset_id()
    self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
    if not self.project_id or not self.dataset_id:
        # TODO: Fix handling of globals, push these assertions down if they are required.
        raise ValueError(
            f"missing configuration for project ('{self.project_id}') "
            f"and/or dataset ('{self.dataset_id}')")

    # TODO: Reconcile this with a consistent integration testing model. Ideally each test
    # should clean up after itself so that we don't need this defensive check.
    test_util.delete_all_tables(self.dataset_id)

    # drop concept table
    drop_concept_table(self.dataset_id)

    create_tables = ['person', 'observation']
    table_fields = {
        'person': 'post_deid_person',
        'observation': 'observation',
        'concept': 'concept'
    }
    for tbl in ['concept']:
        if not bq_utils.table_exists(tbl, dataset_id=self.dataset_id):
            create_tables.append(tbl)
    for tbl in create_tables:
        bq_utils.create_standard_table(table_fields[tbl],
                                       tbl,
                                       dataset_id=self.dataset_id,
                                       force_all_nullable=True)
def remove_ehr_data_queries(project_id, ticket_number, pids_project_id,
                            pids_dataset_id, tablename):
    """
    Creates sandboxes and drops all EHR data found for deactivated participants
    after their deactivation date

    :param project_id: BQ name of the project
    :param ticket_number: Jira ticket number to identify and title sandbox tables
    :param pids_project_id: BQ project_id containing the deactivated participants PIDs table
    :param pids_dataset_id: BQ dataset_id containing the deactivated participants PIDs table
    :param tablename: The name of the table to house the deactivated participant data
    """
    ehr_union_dataset = bq_utils.get_unioned_dataset_id()

    # fetch the deactivated participant data to ensure it's up-to-date
    df = psr.get_deactivated_participants(pids_project_id, pids_dataset_id,
                                          tablename,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)

    # store the dataframe in a BQ dataset table
    destination_table = pids_dataset_id + '.' + tablename
    psr.store_participant_data(df, project_id, destination_table)

    # create the sandbox and truncate queries to run for deactivated participant data drops
    queries = rdp.create_queries(
        project_id,
        ticket_number=ticket_number,
        # the deactivated participants table is stored in the same project
        # as the data being retracted
        pids_project_id=project_id,
        pids_dataset_id=pids_dataset_id,
        pids_table=tablename,
        datasets=[ehr_union_dataset])
    return queries
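# A minimal usage sketch for remove_ehr_data_queries, assuming the query dicts
# returned by rdp.create_queries carry their SQL under a 'query' key (an
# assumption based on how query dicts are used elsewhere in this codebase); all
# argument values below are illustrative placeholders, not real projects or tickets.
def run_remove_ehr_data_queries_example():
    queries = remove_ehr_data_queries(
        project_id='my-curation-project',      # placeholder BQ project
        ticket_number='DC-0000',               # placeholder Jira ticket
        pids_project_id='my-ops-project',      # placeholder PID project
        pids_dataset_id='deactivated_pids',    # placeholder PID dataset
        tablename='deactivated_participants')  # placeholder PID table
    for query in queries:
        # run each sandbox/truncate query in order
        bq_utils.query(query['query'])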
def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    self._empty_hpo_buckets()
    test_util.delete_all_tables(self.input_dataset_id)
    test_util.delete_all_tables(self.output_dataset_id)

    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    mapped_fields = []
    for table in cdm.tables_to_map():
        field = table + '_id'
        mapped_fields.append(field)
    self.mapped_fields = mapped_fields
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
        eu_constants.LOCATION_ID
    ]
def clean_unioned_ehr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the unioned ehr dataset.

    :param project_id: Name of the BigQuery project.
    :param dataset_id: Name of the dataset to clean
    """
    if project_id is None:
        project_id = app_identity.get_application_id()
        LOGGER.info('Project is unspecified. Using default value of:\t%s',
                    project_id)

    if dataset_id is None:
        dataset_id = bq_utils.get_unioned_dataset_id()
        LOGGER.info('Dataset is unspecified. Using default value of:\t%s',
                    dataset_id)

    sandbox_dataset_id = sandbox.create_sandbox_dataset(project_id=project_id,
                                                        dataset_id=dataset_id)

    query_list = _gather_unioned_ehr_queries(project_id, dataset_id,
                                             sandbox_dataset_id)

    LOGGER.info("Cleaning unioned_dataset")
    clean_engine.clean_dataset(project_id, query_list, stage.UNIONED)
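# A hedged example of invoking the cleaner above: with no arguments it falls
# back to the application default project and the unioned EHR dataset; the
# explicit names in the second call are placeholders.
def clean_unioned_ehr_dataset_example():
    # rely on the environment-derived defaults
    clean_unioned_ehr_dataset()
    # or clean a specific dataset in a specific project (placeholder names)
    clean_unioned_ehr_dataset(project_id='my-curation-project',
                              dataset_id='unioned_ehr20200101')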
def delete_records_for_non_matching_participants(project_id,
                                                 dataset_id,
                                                 sandbox_dataset_id=None,
                                                 ehr_dataset_id=None,
                                                 validation_dataset_id=None):
    """
    Generates the queries that delete participants and their corresponding
    data points when the participant_match data is missing and the DRC
    matching algorithm flags the participant as a non-match.

    :param project_id: BQ name of the project
    :param dataset_id: Name of the dataset from which to drop records
    :param ehr_dataset_id: Name of the unioned EHR dataset
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows
    #TODO use sandbox_dataset_id for CR
    :param validation_dataset_id: Name of the latest validation dataset
    :return: a list of query dicts
    """
    if ehr_dataset_id is None:
        ehr_dataset_id = bq_utils.get_unioned_dataset_id()

    if validation_dataset_id is None:
        validation_dataset_id = bq.get_latest_validation_dataset_id(project_id)

    non_matching_person_ids = []

    # Retrieve all hpo_ids
    for hpo_id in readers.get_hpo_site_names():
        if not exist_participant_match(ehr_dataset_id, hpo_id):
            LOGGER.info(
                'The hpo site {hpo_id} is missing the participant_match data'.
                format(hpo_id=hpo_id))

            non_matching_person_ids.extend(
                get_list_non_match_participants(project_id,
                                                validation_dataset_id, hpo_id))
        else:
            LOGGER.info(
                'The hpo site {hpo_id} submitted the participant_match data'.
                format(hpo_id=hpo_id))

    queries = []

    if non_matching_person_ids:
        LOGGER.info(
            'Participants: {person_ids} and their data will be dropped from {combined_dataset_id}'
            .format(person_ids=non_matching_person_ids,
                    combined_dataset_id=dataset_id))

        queries.extend(
            remove_pids.get_sandbox_queries(project_id, dataset_id,
                                            non_matching_person_ids,
                                            TICKET_NUMBER))
        queries.extend(
            remove_pids.get_remove_pids_queries(project_id, dataset_id,
                                                non_matching_person_ids))

    return queries
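# A sketch of running the generated deletion queries, assuming each item in
# the returned list is a query dict keyed by 'query' (consistent with the
# sandbox and remove_pids helpers); the project and dataset names are
# placeholders.
def run_non_matching_participant_deletions_example():
    queries = delete_records_for_non_matching_participants(
        project_id='my-curation-project',  # placeholder BQ project
        dataset_id='combined20200101')     # placeholder combined dataset
    for query in queries:
        bq_utils.query(query['query'])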
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.pid_table_id = 'pid_table'
    self.bq_dataset_id = bq_utils.get_unioned_dataset_id()
    self.dataset_ids = 'all_datasets'
    self.person_research_ids = [(1, 6890173), (2, 858761), (1234567, 4589763)]
def clean_unioned_ehr_dataset(project=None, dataset=None):
    if dataset is None or dataset == '' or dataset.isspace():
        dataset = bq_utils.get_unioned_dataset_id()
        LOGGER.info('Dataset is unspecified. Using default value of:\t%s',
                    dataset)

    query_list = _gather_unioned_ehr_queries(project, dataset)

    LOGGER.info("Cleaning unioned_dataset")
    clean_engine.clean_dataset(project, dataset, query_list)
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.ehr_dataset_id = 'ehr20190801_fake'
    self.unioned_dataset_id = 'unioned_ehr20190801'
    self.combined_dataset_id = 'combined20190801'
    self.bq_dataset_id = bq_utils.get_unioned_dataset_id()
    self.person_ids = [1, 2, 1234567]
    self.tables_to_retract_unioned = retract_data_bq.TABLES_FOR_RETRACTION | {
        common.FACT_RELATIONSHIP, common.PERSON
    }
    self.tables_to_retract_combined = retract_data_bq.TABLES_FOR_RETRACTION | {
        common.FACT_RELATIONSHIP
    }
    self.all_tables = resources.CDM_TABLES
def union_ehr():
    hpo_id = 'unioned_ehr'
    app_id = bq_utils.app_identity.get_application_id()
    input_dataset_id = bq_utils.get_dataset_id()
    output_dataset_id = bq_utils.get_unioned_dataset_id()
    ehr_union.main(input_dataset_id, output_dataset_id, app_id)

    run_achilles(hpo_id)

    now_date_string = datetime.datetime.now().strftime('%Y_%m_%d')
    folder_prefix = 'unioned_ehr_' + now_date_string + '/'
    run_export(datasource_id=hpo_id, folder_prefix=folder_prefix)

    logging.info("Uploading achilles index files")
    _upload_achilles_files(hpo_id, folder_prefix)

    return 'merge-and-achilles-done'
def setUp(self):
    self.hpo_id = test_util.FAKE_HPO_ID
    self.bucket = gcs_utils.get_hpo_bucket(self.hpo_id)
    self.site_bucket = 'test_bucket'
    self.folder_1 = '2019-01-01-v1/'
    self.folder_2 = '2019-02-02-v2/'
    self.folder_prefix_1 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_1
    self.folder_prefix_2 = self.hpo_id + '/' + self.site_bucket + '/' + self.folder_2
    self.pids = [17, 20]
    self.skip_pids = [10, 25]
    self.project_id = 'project_id'
    self.sandbox_dataset_id = bq_utils.get_unioned_dataset_id()
    self.pid_table_id = 'pid_table'
    self._empty_bucket()
def clean_unioned_ehr_dataset(project_id=None, dataset_id=None):
    """
    Run all clean rules defined for the unioned ehr dataset.

    :param project_id: Name of the BigQuery project.
    :param dataset_id: Name of the dataset to clean
    """
    if dataset_id is None or dataset_id == '' or dataset_id.isspace():
        dataset_id = bq_utils.get_unioned_dataset_id()
        LOGGER.info('Dataset is unspecified. Using default value of:\t%s',
                    dataset_id)

    query_list = _gather_unioned_ehr_queries(project_id, dataset_id)

    LOGGER.info("Cleaning unioned_dataset")
    clean_engine.clean_dataset(project_id, dataset_id, query_list)
def get_dataset_and_project_names():
    """
    Get project and dataset names from environment variables.

    :return: A dictionary of dataset names and project name
    """
    project_and_dataset_names = dict()
    project_and_dataset_names[
        clean_cdr_consts.EHR_DATASET] = bq_utils.get_dataset_id()
    project_and_dataset_names[
        clean_cdr_consts.UNIONED_EHR_DATASET] = bq_utils.get_unioned_dataset_id()
    project_and_dataset_names[
        clean_cdr_consts.RDR_DATASET] = bq_utils.get_rdr_dataset_id()
    project_and_dataset_names[
        clean_cdr_consts.EHR_RDR_DATASET] = bq_utils.get_ehr_rdr_dataset_id()
    project_and_dataset_names[
        clean_cdr_consts.
        EHR_RDR_DE_IDENTIFIED] = bq_utils.get_combined_deid_dataset_id()
    project_and_dataset_names[
        clean_cdr_consts.PROJECT] = app_identity.get_application_id()

    return project_and_dataset_names
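# A brief consumer sketch for the lookup above: the returned dictionary is
# keyed by the clean_cdr_consts constants, so callers can resolve the project
# and any dataset name in one call.
def lookup_names_example():
    names = get_dataset_and_project_names()
    project_id = names[clean_cdr_consts.PROJECT]
    unioned_ehr_dataset_id = names[clean_cdr_consts.UNIONED_EHR_DATASET]
    return project_id, unioned_ehr_dataset_id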
def setUp(self):
    super(EhrUnionTest, self).setUp()
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [CHS_HPO_ID, PITT_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    self._empty_hpo_buckets()
    test_util.delete_all_tables(self.input_dataset_id)
    test_util.delete_all_tables(self.output_dataset_id)
def test_execute_queries(self):
    project_id = bq_utils.app_identity.get_application_id()
    dataset_id = bq_utils.get_combined_dataset_id()
    sandbox_id = bq_utils.get_unioned_dataset_id()
    test_util.delete_all_tables(dataset_id)

    create_tables = (
        ['person'] + common.CLINICAL_DATA_TABLES +
        ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
    # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
    for tbl in ['concept', 'concept_ancestor']:
        if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
            create_tables.append(tbl)
    for tbl in create_tables:
        bq_utils.create_standard_table(tbl,
                                       tbl,
                                       dataset_id=dataset_id,
                                       force_all_nullable=True)

    for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
        resp = bq_utils.query(
            tmpl.render(project_id=project_id,
                        dataset_id=dataset_id,
                        rdr_basics_concept_id=123,
                        rdr_consent_concept_id=345,
                        ehr_obs_concept_id=567,
                        rdr_basics_module_concept_id=
                        drop_participants_without_ppi_or_ehr.
                        BASICS_MODULE_CONCEPT_ID))
        self.assertTrue(resp["jobComplete"])

    clean_cdr_engine.clean_dataset(
        project_id, dataset_id, sandbox_id,
        [(drop_participants_without_ppi_or_ehr.get_queries,)])

    def table_to_person_ids(t):
        rows = bq_utils.response2rows(
            bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                project_id, dataset_id, t)))
        return set([r["person_id"] for r in rows])

    # We expect participants 1, 5 to have been removed from all tables.
    self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
    self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
    self.assertEqual(set([3, 4]), table_to_person_ids("drug_exposure"))

    test_util.delete_all_tables(dataset_id)
def setUp(self):
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [PITT_HPO_ID, NYC_HPO_ID, EXCLUDED_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    self.storage_client = StorageClient(self.project_id)
    self.tearDown()

    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    mapped_fields = []
    for table in cdm.tables_to_map():
        field = table + '_id'
        mapped_fields.append(field)
    self.mapped_fields = mapped_fields
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.VISIT_DETAIL_ID,
        eu_constants.CARE_SITE_ID, eu_constants.LOCATION_ID
    ]
def setUp(self):
    self.project_id = bq_utils.app_identity.get_application_id()
    self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
    self.input_dataset_id = bq_utils.get_dataset_id()
    self.output_dataset_id = bq_utils.get_unioned_dataset_id()
    # Done in tearDown(); this is redundant.
    self._empty_hpo_buckets()
    test_util.delete_all_tables(self.input_dataset_id)
    test_util.delete_all_tables(self.output_dataset_id)

    # TODO Generalize to work for all foreign key references
    # Collect all primary key fields in CDM tables
    mapped_fields = []
    for table in cdm.tables_to_map():
        field = table + '_id'
        mapped_fields.append(field)
    self.mapped_fields = mapped_fields
    self.implemented_foreign_keys = [
        eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
        eu_constants.LOCATION_ID
    ]
def setUp(self):
    self.bq_project_id = app_identity.get_application_id()
    self.bq_dataset_id = bq_utils.get_unioned_dataset_id()