def test_queries_to_retract_from_combined_or_deid_dataset(
        self, mock_list_existing_tables):
    """Retraction must target every existing retractable CDM/mapping table,
    and the death query must not apply the RDR constant factor."""
    # every CDM table "exists"; those not slated for retraction are ignored
    existing_table_ids = list(resources.CDM_TABLES)
    ignored_tables = [
        cdm_table for cdm_table in resources.CDM_TABLES
        if cdm_table not in self.tables_to_retract_combined
    ]
    # a mapping table follows the retraction status of its domain table
    for mapped_table in cdm.tables_to_map():
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        if mapped_table not in self.tables_to_retract_combined:
            ignored_tables.append(mapping_table)
    mock_list_existing_tables.return_value = existing_table_ids

    mqs, qs = retract_data_bq.queries_to_retract_from_combined_or_deid_dataset(
        self.project_id, self.combined_dataset_id, self.person_ids)
    actual_dest_tables = set(
        q[retract_data_bq.DEST_TABLE] for q in qs + mqs)
    expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)

    # death query should use person_id as-is (no constant factor)
    constant_factor = common.RDR_ID_CONSTANT + common.ID_CONSTANT_FACTOR
    for q in qs:
        if q[retract_data_bq.DEST_TABLE] is common.DEATH:
            self.assertNotIn(str(constant_factor), q[retract_data_bq.QUERY])
def get_mapping_table_update_queries(project_id, dataset_id):
    """
    Generates a list of query dicts for adding newly generated rows to corresponding mapping_tables

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :return: list of query dicts for updating mapping_tables
    """
    queries = []
    for domain_table in DOMAIN_TABLE_NAMES:
        mapping_table = mapping_table_for(domain_table)
        # each mapping table is fully rewritten (WRITE_TRUNCATE) in place
        queries.append({
            cdr_consts.QUERY:
                parse_mapping_table_update_query(project_id, dataset_id,
                                                 domain_table, mapping_table),
            cdr_consts.DESTINATION_TABLE: mapping_table,
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: dataset_id,
        })
    return queries
def test_queries_to_retract_from_ehr_dataset(self, mock_list_existing_tables):
    """EHR retraction must target site tables, unioned tables, and both
    current and legacy (unioned_ehr_-prefixed) mapping tables."""
    hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
    hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

    # hpo tables
    existing_table_ids = [hpo_person, hpo_death]
    for table in self.tables_to_retract_unioned:
        table_id = bq_utils.get_table_id(self.hpo_id, table)
        existing_table_ids.append(table_id)

    # unioned tables
    ignored_tables = []
    for cdm_table in resources.CDM_TABLES:
        unioned_table_id = retract_data_bq.UNIONED_EHR + cdm_table
        existing_table_ids.append(unioned_table_id)
        if cdm_table not in self.tables_to_retract_unioned:
            ignored_tables.append(unioned_table_id)

    mapped_tables = cdm.tables_to_map()

    # fact_relationship does not have pid, is handled separate from other mapped tables
    for mapped_table in mapped_tables:
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        legacy_mapping_table = retract_data_bq.UNIONED_EHR + mapping_table
        existing_table_ids.append(legacy_mapping_table)
        if mapped_table not in self.tables_to_retract_unioned:
            ignored_tables.append(mapping_table)
            ignored_tables.append(legacy_mapping_table)

    mock_list_existing_tables.return_value = existing_table_ids
    mqs, qs = retract_data_bq.queries_to_retract_from_ehr_dataset(
        self.project_id, self.ehr_dataset_id, self.project_id,
        self.sandbox_dataset_id, self.hpo_id, self.pid_table_id)
    actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs + mqs)
    # NOTE(review): set(hpo_person) is a set of the string's *characters*, so
    # subtracting it here is effectively a no-op (table ids are multi-char
    # strings); presumably {hpo_person} was intended -- confirm whether the
    # site person table should really be excluded before changing this.
    expected_dest_tables = set(existing_table_ids) - set(hpo_person) - set(
        ignored_tables)
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)
def get_mapping_table_update_queries(self):
    """
    Generates a list of query dicts for adding newly generated rows to corresponding mapping_tables

    :return: list of query dicts for updating mapping_tables
    """
    queries = []
    for domain_table in self.affected_tables:
        mapping_table = mapping_table_for(domain_table)
        # build the query dict field-by-field; mapping tables are rewritten
        # in place via WRITE_TRUNCATE
        query = dict()
        query[cdr_consts.QUERY] = self.parse_mapping_table_update_query(
            domain_table, mapping_table)
        query[cdr_consts.DESTINATION_TABLE] = mapping_table
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
        query[cdr_consts.DESTINATION_DATASET] = self.dataset_id
        queries.append(query)
    return queries
def test_queries_to_retract_from_unioned_dataset(self,
                                                 mock_list_existing_tables):
    """Every existing retractable CDM and mapping table must be targeted."""
    # all CDM tables "exist"; those not slated for retraction are ignored
    existing_table_ids = list(resources.CDM_TABLES)
    ignored_tables = [
        cdm_table for cdm_table in resources.CDM_TABLES
        if cdm_table not in self.tables_to_retract_unioned
    ]
    # a mapping table follows the retraction status of its domain table
    for mapped_table in cdm.tables_to_map():
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        if mapped_table not in self.tables_to_retract_unioned:
            ignored_tables.append(mapping_table)
    mock_list_existing_tables.return_value = existing_table_ids

    mqs, qs = retract_data_bq.queries_to_retract_from_unioned_dataset(
        self.project_id, self.unioned_dataset_id, self.person_ids)
    actual_dest_tables = set(
        q[retract_data_bq.DEST_TABLE] for q in qs + mqs)
    expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)
def test_union_ehr(self):
    """End-to-end check of ehr_union.main: verifies the set of output tables,
    mapping table schemas and row counts, per-table row counts, clustering on
    person_id, and that output person_ids match the input sites'."""
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in ehr_union.tables_to_map()
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in common.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # mapping tables
    tables_to_map = ehr_union.tables_to_map()
    for table_to_map in tables_to_map:
        # each mapping table must expose exactly the src/dest id columns
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map,
            'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # mapping table row count should equal its unioned output table's
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in common.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = test_util.response2rows(query_response)

        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)

    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{chs_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        chs_person_table_id=chs_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = test_util.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = test_util.response2rows(response)
    self.assertListEqual(expected_rows, actual_rows)
def queries_to_retract_from_combined_or_deid_dataset(project_id, dataset_id,
                                                     ids):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param ids: list of ids
    :return: tuple (mapping table queries, data table queries); each query is
        a dict with keys query, dataset, table, delete_flag
    """
    # If fewer pids, use DELETE statements instead of SELECT
    delete_flag = bool(len(ids) < THRESHOLD_FOR_DML)
    pids = int_list_to_bq(ids)
    logger.debug('Checking existing tables for %s.%s' %
                 (project_id, dataset_id))
    existing_tables = list_existing_tables(project_id, dataset_id)
    combined_mapping_queries = []
    combined_queries = []
    for table in TABLES_FOR_RETRACTION:
        # death has no mapping table
        # BUGFIX: compare table names with != / == rather than `is not` / `is`;
        # identity of equal strings is a CPython interning detail
        if table != common.DEATH:
            q_combined_mapping = dict()
            q_combined_mapping[DEST_DATASET] = dataset_id
            q_combined_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            q_combined_mapping[DELETE_FLAG] = delete_flag
            if q_combined_mapping[DEST_TABLE] in existing_tables:
                if q_combined_mapping[DELETE_FLAG]:
                    q_combined_mapping[
                        QUERY] = DELETE_RETRACT_MAPPING_DATA_COMBINED_QUERY.format(
                            project=project_id,
                            dataset=q_combined_mapping[DEST_DATASET],
                            mapping_table=q_combined_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=table,
                            pids=pids,
                            CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                            common.ID_CONSTANT_FACTOR)
                else:
                    q_combined_mapping[
                        QUERY] = SELECT_RETRACT_MAPPING_DATA_COMBINED_QUERY.format(
                            project=project_id,
                            dataset=q_combined_mapping[DEST_DATASET],
                            mapping_table=q_combined_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=table,
                            pids=pids,
                            CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                            common.ID_CONSTANT_FACTOR)
                combined_mapping_queries.append(q_combined_mapping)

        q_combined = dict()
        q_combined[DEST_DATASET] = dataset_id
        q_combined[DEST_TABLE] = table
        q_combined[DELETE_FLAG] = delete_flag
        if q_combined[DEST_TABLE] in existing_tables:
            if q_combined[DELETE_FLAG]:
                q_combined[QUERY] = DELETE_RETRACT_DATA_COMBINED_QUERY.format(
                    project=project_id,
                    dataset=q_combined[DEST_DATASET],
                    table=q_combined[DEST_TABLE],
                    pids=pids,
                    table_id=get_table_id(table),
                    CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                    common.ID_CONSTANT_FACTOR)
            else:
                q_combined[QUERY] = SELECT_RETRACT_DATA_COMBINED_QUERY.format(
                    project=project_id,
                    dataset=q_combined[DEST_DATASET],
                    table=q_combined[DEST_TABLE],
                    pids=pids,
                    table_id=get_table_id(table),
                    CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                    common.ID_CONSTANT_FACTOR)
            combined_queries.append(q_combined)

    # fix death query to exclude constant: death rows keep raw person_ids,
    # so reuse the unioned-dataset (no constant factor) query templates
    for q in combined_queries:
        if q[DEST_TABLE] == common.DEATH:
            if q[DELETE_FLAG]:
                q[QUERY] = DELETE_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q[DEST_DATASET],
                    table=q[DEST_TABLE],
                    pids=pids)
            else:
                q[QUERY] = SELECT_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q[DEST_DATASET],
                    table=q[DEST_TABLE],
                    pids=pids)

    # fact_relationship has no person_id column; it is retracted via the
    # person-domain query
    q_combined_fact_relationship = dict()
    q_combined_fact_relationship[DEST_DATASET] = dataset_id
    q_combined_fact_relationship[DEST_TABLE] = common.FACT_RELATIONSHIP
    q_combined_fact_relationship[DELETE_FLAG] = delete_flag
    if q_combined_fact_relationship[DEST_TABLE] in existing_tables:
        if q_combined_fact_relationship[DELETE_FLAG]:
            q_combined_fact_relationship[
                QUERY] = DELETE_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_combined_fact_relationship[DEST_DATASET],
                    table=q_combined_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        else:
            q_combined_fact_relationship[
                QUERY] = SELECT_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_combined_fact_relationship[DEST_DATASET],
                    table=q_combined_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        combined_queries.append(q_combined_fact_relationship)

    return combined_mapping_queries, combined_queries
def queries_to_retract_from_ehr_dataset(project_id, dataset_id, hpo_id, ids):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param hpo_id: identifies the HPO site
    :param ids: list of ids
    :return: tuple (mapping table queries, data table queries); each query is
        a dict with keys query, dataset, table, delete_flag
    """
    # If fewer pids, use DELETE statements instead of SELECT
    delete_flag = bool(len(ids) < THRESHOLD_FOR_DML)
    logger.debug('Checking existing tables for %s.%s' %
                 (project_id, dataset_id))
    pids = int_list_to_bq(ids)
    existing_tables = list_existing_tables(project_id, dataset_id)
    site_queries = []
    unioned_mapping_queries = []
    unioned_mapping_legacy_queries = []
    unioned_queries = []
    for table in TABLES_FOR_RETRACTION:
        # site-submitted (hpo-prefixed) table
        q_site = dict()
        q_site[DEST_DATASET] = dataset_id
        q_site[DEST_TABLE] = get_site_table(hpo_id, table)
        q_site[DELETE_FLAG] = delete_flag
        if q_site[DEST_TABLE] in existing_tables:
            if q_site[DELETE_FLAG]:
                q_site[QUERY] = DELETE_RETRACT_DATA_SITE_QUERY.format(
                    project=project_id,
                    dataset=q_site[DEST_DATASET],
                    table=q_site[DEST_TABLE],
                    pids=pids)
            else:
                q_site[QUERY] = SELECT_RETRACT_DATA_SITE_QUERY.format(
                    project=project_id,
                    dataset=q_site[DEST_DATASET],
                    table=q_site[DEST_TABLE],
                    pids=pids)
            site_queries.append(q_site)

        # death does not have mapping table
        # BUGFIX: compare table names with != rather than `is not`; identity
        # of equal strings is a CPython interning detail
        if table != common.DEATH:
            q_unioned_mapping = dict()
            q_unioned_mapping[DEST_DATASET] = dataset_id
            q_unioned_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            q_unioned_mapping[DELETE_FLAG] = delete_flag
            if q_unioned_mapping[DEST_TABLE] in existing_tables:
                if q_unioned_mapping[DELETE_FLAG]:
                    q_unioned_mapping[
                        QUERY] = DELETE_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping[DEST_DATASET],
                            mapping_table=q_unioned_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                else:
                    q_unioned_mapping[
                        QUERY] = SELECT_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping[DEST_DATASET],
                            mapping_table=q_unioned_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                unioned_mapping_queries.append(q_unioned_mapping)

            # legacy mapping tables carry the unioned_ehr_ prefix
            q_unioned_mapping_legacy = dict()
            q_unioned_mapping_legacy[DEST_DATASET] = dataset_id
            q_unioned_mapping_legacy[
                DEST_TABLE] = UNIONED_EHR + ehr_union.mapping_table_for(table)
            q_unioned_mapping_legacy[DELETE_FLAG] = delete_flag
            if q_unioned_mapping_legacy[DEST_TABLE] in existing_tables:
                if q_unioned_mapping_legacy[DELETE_FLAG]:
                    q_unioned_mapping_legacy[
                        QUERY] = DELETE_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping_legacy[DEST_DATASET],
                            mapping_table=q_unioned_mapping_legacy[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                else:
                    q_unioned_mapping_legacy[
                        QUERY] = SELECT_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping_legacy[DEST_DATASET],
                            mapping_table=q_unioned_mapping_legacy[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                unioned_mapping_legacy_queries.append(q_unioned_mapping_legacy)

        # unioned (unioned_ehr_-prefixed) table
        q_unioned = dict()
        q_unioned[DEST_DATASET] = dataset_id
        q_unioned[DEST_TABLE] = UNIONED_EHR + table
        q_unioned[DELETE_FLAG] = delete_flag
        if q_unioned[DEST_TABLE] in existing_tables:
            if q_unioned[DELETE_FLAG]:
                q_unioned[QUERY] = DELETE_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q_unioned[DEST_DATASET],
                    table=q_unioned[DEST_TABLE],
                    pids=pids)
            else:
                q_unioned[QUERY] = SELECT_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q_unioned[DEST_DATASET],
                    table=q_unioned[DEST_TABLE],
                    pids=pids)
            unioned_queries.append(q_unioned)

    # Remove from person table
    q_site_person = dict()
    q_site_person[DEST_DATASET] = dataset_id
    q_site_person[DEST_TABLE] = get_site_table(hpo_id, common.PERSON)
    q_site_person[DELETE_FLAG] = delete_flag
    if q_site_person[DEST_TABLE] in existing_tables:
        if q_site_person[DELETE_FLAG]:
            q_site_person[QUERY] = DELETE_RETRACT_DATA_SITE_QUERY.format(
                project=project_id,
                dataset=q_site_person[DEST_DATASET],
                table=q_site_person[DEST_TABLE],
                pids=pids)
        else:
            q_site_person[QUERY] = SELECT_RETRACT_DATA_SITE_QUERY.format(
                project=project_id,
                dataset=q_site_person[DEST_DATASET],
                table=q_site_person[DEST_TABLE],
                pids=pids)
        site_queries.append(q_site_person)

    q_unioned_person = dict()
    q_unioned_person[DEST_DATASET] = dataset_id
    q_unioned_person[DEST_TABLE] = UNIONED_EHR + common.PERSON
    q_unioned_person[DELETE_FLAG] = delete_flag
    if q_unioned_person[DEST_TABLE] in existing_tables:
        if q_unioned_person[DELETE_FLAG]:
            q_unioned_person[QUERY] = DELETE_RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                dataset=q_unioned_person[DEST_DATASET],
                table=q_unioned_person[DEST_TABLE],
                pids=pids)
        else:
            q_unioned_person[QUERY] = SELECT_RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                dataset=q_unioned_person[DEST_DATASET],
                table=q_unioned_person[DEST_TABLE],
                pids=pids)
        unioned_queries.append(q_unioned_person)

    # Remove fact_relationship records referencing retracted person_ids
    q_site_fact_relationship = dict()
    q_site_fact_relationship[DEST_DATASET] = dataset_id
    q_site_fact_relationship[DEST_TABLE] = get_site_table(
        hpo_id, common.FACT_RELATIONSHIP)
    q_site_fact_relationship[DELETE_FLAG] = delete_flag
    if q_site_fact_relationship[DEST_TABLE] in existing_tables:
        if q_site_fact_relationship[DELETE_FLAG]:
            q_site_fact_relationship[
                QUERY] = DELETE_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_site_fact_relationship[DEST_DATASET],
                    table=q_site_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        else:
            q_site_fact_relationship[
                QUERY] = SELECT_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_site_fact_relationship[DEST_DATASET],
                    table=q_site_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        site_queries.append(q_site_fact_relationship)

    q_unioned_fact_relationship = dict()
    q_unioned_fact_relationship[DEST_DATASET] = dataset_id
    q_unioned_fact_relationship[
        DEST_TABLE] = UNIONED_EHR + common.FACT_RELATIONSHIP
    q_unioned_fact_relationship[DELETE_FLAG] = delete_flag
    if q_unioned_fact_relationship[DEST_TABLE] in existing_tables:
        if q_unioned_fact_relationship[DELETE_FLAG]:
            q_unioned_fact_relationship[
                QUERY] = DELETE_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_unioned_fact_relationship[DEST_DATASET],
                    table=q_unioned_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        else:
            q_unioned_fact_relationship[
                QUERY] = SELECT_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_unioned_fact_relationship[DEST_DATASET],
                    table=q_unioned_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        unioned_queries.append(q_unioned_fact_relationship)

    return unioned_mapping_legacy_queries + unioned_mapping_queries, \
        unioned_queries + site_queries
def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out):
    """
    Parse the hpo subquery generated for `table` and return a failure
    message if it is malformed, or None if all checks pass.

    :param table: CDM table name whose subquery is being validated
    :param dataset_in: dataset containing the input (site) tables
    :param dataset_out: dataset containing the mapping tables
    :return: a SUBQUERY_FAIL_MSG-formatted string on the first failed check,
        otherwise None (implicit)
    """
    subquery = ehr_union.table_hpo_subquery(table, NYC_HPO_ID, dataset_in,
                                            dataset_out)

    # moz-sql-parser doesn't support the ROW_NUMBER() OVER() a analytical function of sql we are removing
    # that statement from the returned query for the parser be able to parse out the query without erroring out.
    subquery = re.sub(
        r",\s+ROW_NUMBER\(\) OVER \(PARTITION BY nm\..+?_id\) AS row_num",
        " ", subquery)
    stmt = moz_sql_parser.parse(subquery)

    # Sanity check it is a select statement
    if 'select' not in stmt:
        return SUBQUERY_FAIL_MSG.format(expr='query type',
                                        table=table,
                                        expected='select',
                                        actual=str(stmt),
                                        subquery=subquery)

    # Input table should be first in FROM expression
    actual_from = first_or_none(
        dpath.util.values(stmt, 'from/0/value/from/value') or
        dpath.util.values(stmt, 'from'))
    expected_from = dataset_in + '.' + bq_utils.get_table_id(
        NYC_HPO_ID, table)
    if expected_from != actual_from:
        return SUBQUERY_FAIL_MSG.format(expr='first object in FROM',
                                        table=table,
                                        expected=expected_from,
                                        actual=actual_from,
                                        subquery=subquery)

    # Ensure all key fields (primary or foreign) yield joins with their associated mapping tables
    # Note: ordering of joins in the subquery is assumed to be consistent with field order in the json file
    fields = resources.fields_for(table)
    id_field = table + '_id'
    key_ind = 0
    expected_join = None
    actual_join = None
    for field in fields:
        if field['name'] in self.mapped_fields:
            # key_ind += 1  # TODO use this increment when we generalize solution for all foreign keys
            if field['name'] == id_field:
                # Primary key, mapping table associated with this one should be INNER joined
                key_ind += 1
                expr = 'inner join on primary key'
                actual_join = first_or_none(
                    dpath.util.values(stmt, 'from/%s/join/value' % key_ind))
                expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                    table)
            elif field['name'] in self.implemented_foreign_keys:
                # Foreign key, mapping table associated with the referenced table should be LEFT joined
                key_ind += 1
                expr = 'left join on foreign key'
                actual_join = first_or_none(
                    dpath.util.values(stmt,
                                      'from/%s/left join/value' % key_ind))
                joined_table = field['name'].replace('_id', '')
                expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                    joined_table)
            # compare after either branch; a mismatch reports the last expr
            if expected_join != actual_join:
                return SUBQUERY_FAIL_MSG.format(expr=expr,
                                                table=table,
                                                expected=expected_join,
                                                actual=actual_join,
                                                subquery=subquery)
def test_union_ehr(self):
    """End-to-end check of ehr_union.main: verifies the set of output tables,
    fact_relationship id remapping with hpo offsets, mapping table schemas
    and row counts, per-table row counts, clustering on person_id, and that
    output person_ids match the input sites'."""
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in resources.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # fact_relationship from pitt
    hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
    pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
    q = '''SELECT fact_id_1, fact_id_2
           FROM `{input_dataset}.{hpo_id}_fact_relationship`
           where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
        input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    # unioned fact ids should be the site ids shifted by the site's offset
    expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
    expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset

    q = '''SELECT fr.fact_id_1, fr.fact_id_2
           FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
           join `{dataset_id}._mapping_measurement` mm
           on fr.fact_id_1 = mm.measurement_id
           and mm.src_hpo_id = "{hpo_id}"'''.format(
        dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
        "fact_id_2"]
    self.assertEqual(expected_fact_id_1, actual_fact_id_1)
    self.assertEqual(expected_fact_id_2, actual_fact_id_2)

    # mapping tables
    tables_to_map = cdm.tables_to_map()
    for table_to_map in tables_to_map:
        # each mapping table must expose exactly the src/dest id columns
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map,
            'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # mapping table row count should equal its unioned output table's
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in resources.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = bq_utils.response2rows(query_response)

        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)

    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        nyc_person_table_id=nyc_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = bq_utils.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = bq_utils.response2rows(response)
    self.assertCountEqual(expected_rows, actual_rows)
def queries_to_retract_from_combined_or_deid_dataset(
        project_id, dataset_id, pid_project_id, sandbox_dataset_id,
        pid_table_id, retraction_type, deid_flag):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_table_id: table containing the person_ids and research_ids
    :param retraction_type: string indicating whether all data needs to be removed, including RDR,
        or if RDR data needs to be kept intact. Can take the values 'rdr_and_ehr' or 'only_ehr'
    :param deid_flag: flag indicating if running on a deid dataset
    :return: tuple (mapping table queries, data table queries); each query is
        a dict with keys query, dataset, table
    :raises ValueError: if retraction_type is not 'rdr_and_ehr' or 'only_ehr'
    """
    # lazy %-args so formatting is deferred to the logging framework
    logging.info('Checking existing tables for %s.%s', project_id, dataset_id)
    existing_tables = list_existing_tables(project_id, dataset_id)

    # retract from ehr and rdr or only ehr; the constant factor distinguishes
    # RDR-sourced rows (id >= constant) from EHR-sourced rows
    if retraction_type == 'rdr_and_ehr':
        logging.info('Retracting from RDR and EHR data for %s', dataset_id)
        constant_factor_rdr = 0
    elif retraction_type == 'only_ehr':
        logging.info('Retracting from EHR data while retaining RDR for %s',
                     dataset_id)
        constant_factor_rdr = common.RDR_ID_CONSTANT + common.ID_CONSTANT_FACTOR
    else:
        raise ValueError('%s is not a valid retraction type' % retraction_type)

    combined_mapping_queries = []
    combined_queries = []
    for table in TABLES_FOR_RETRACTION:
        # death has no mapping table
        # BUGFIX: compare table names with != / == rather than `is not` / `is`;
        # identity of equal strings is a CPython interning detail
        if table != common.DEATH:
            q_combined_mapping = dict()
            q_combined_mapping[DEST_DATASET] = dataset_id
            q_combined_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            if q_combined_mapping[DEST_TABLE] in existing_tables:
                q_combined_mapping[
                    QUERY] = RETRACT_MAPPING_DATA_COMBINED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_combined_mapping[DEST_DATASET],
                        mapping_table=q_combined_mapping[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        CONSTANT_FACTOR=constant_factor_rdr,
                        person_research_id=RESEARCH_ID
                        if deid_flag else PERSON_ID)
                combined_mapping_queries.append(q_combined_mapping)

        q_combined = dict()
        q_combined[DEST_DATASET] = dataset_id
        q_combined[DEST_TABLE] = table
        if q_combined[DEST_TABLE] in existing_tables:
            q_combined[QUERY] = RETRACT_DATA_COMBINED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_combined[DEST_DATASET],
                table=q_combined[DEST_TABLE],
                pid_table_id=pid_table_id,
                table_id=get_table_id(table),
                sandbox_dataset_id=sandbox_dataset_id,
                CONSTANT_FACTOR=constant_factor_rdr,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)
            combined_queries.append(q_combined)

    if retraction_type == 'rdr_and_ehr':
        # retract from person
        q_combined_person = dict()
        q_combined_person[DEST_DATASET] = dataset_id
        q_combined_person[DEST_TABLE] = common.PERSON
        if q_combined_person[DEST_TABLE] in existing_tables:
            q_combined_person[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_combined_person[DEST_DATASET],
                table=q_combined_person[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)
            combined_queries.append(q_combined_person)

    # fix death query to exclude constant: death rows keep raw person_ids,
    # so reuse the unioned-dataset (no constant factor) query template
    for q in combined_queries:
        if q[DEST_TABLE] == common.DEATH:
            q[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q[DEST_DATASET],
                table=q[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)

    # fact_relationship has no person_id column; it is retracted via the
    # person-domain query
    q_combined_fact_relationship = dict()
    q_combined_fact_relationship[DEST_DATASET] = dataset_id
    q_combined_fact_relationship[DEST_TABLE] = common.FACT_RELATIONSHIP
    if q_combined_fact_relationship[DEST_TABLE] in existing_tables:
        q_combined_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_combined_fact_relationship[DEST_DATASET],
                table=q_combined_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)
        combined_queries.append(q_combined_fact_relationship)

    return combined_mapping_queries, combined_queries
def queries_to_retract_from_unioned_dataset(project_id, dataset_id,
                                            pid_project_id, sandbox_dataset_id,
                                            pid_table_id):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_table_id: table containing the person_ids and research_ids
    :return: list of dict with keys query, dataset, table
    """
    # Lazy %-args: the message is only rendered if the record is emitted
    logging.info('Checking existing tables for %s.%s', project_id, dataset_id)
    # A set makes the repeated membership checks below O(1) instead of O(n)
    existing_tables = set(list_existing_tables(project_id, dataset_id))
    unioned_mapping_queries = []
    unioned_queries = []
    for table in TABLES_FOR_RETRACTION:
        # death has no mapping table; compare by value (`!=`), not identity
        # (`is not`), which only worked by string-interning accident
        if table != common.DEATH:
            q_unioned_mapping = dict()
            q_unioned_mapping[DEST_DATASET] = dataset_id
            q_unioned_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            if q_unioned_mapping[DEST_TABLE] in existing_tables:
                q_unioned_mapping[
                    QUERY] = RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_unioned_mapping[DEST_DATASET],
                        mapping_table=q_unioned_mapping[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        person_research_id=PERSON_ID)
                unioned_mapping_queries.append(q_unioned_mapping)

        q_unioned = dict()
        q_unioned[DEST_DATASET] = dataset_id
        q_unioned[DEST_TABLE] = table
        if q_unioned[DEST_TABLE] in existing_tables:
            q_unioned[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned[DEST_DATASET],
                table=q_unioned[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
            unioned_queries.append(q_unioned)

    # retract from person
    q_unioned_person = dict()
    q_unioned_person[DEST_DATASET] = dataset_id
    q_unioned_person[DEST_TABLE] = common.PERSON
    if q_unioned_person[DEST_TABLE] in existing_tables:
        q_unioned_person[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
            project=project_id,
            pid_project=pid_project_id,
            dataset=q_unioned_person[DEST_DATASET],
            table=q_unioned_person[DEST_TABLE],
            pid_table_id=pid_table_id,
            sandbox_dataset_id=sandbox_dataset_id,
            person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_person)

    # fact_relationship has no person_id column, so it gets its own template
    q_unioned_fact_relationship = dict()
    q_unioned_fact_relationship[DEST_DATASET] = dataset_id
    q_unioned_fact_relationship[DEST_TABLE] = common.FACT_RELATIONSHIP
    if q_unioned_fact_relationship[DEST_TABLE] in existing_tables:
        q_unioned_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned_fact_relationship[DEST_DATASET],
                table=q_unioned_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_fact_relationship)

    return unioned_mapping_queries, unioned_queries
def queries_to_retract_from_ehr_dataset(project_id, dataset_id, pid_project_id,
                                        sandbox_dataset_id, hpo_id,
                                        pid_table_id):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param hpo_id: identifies the HPO site
    :param pid_table_id: table containing the person_ids and research_ids
    :return: list of dict with keys query, dataset, table, delete_flag
    """
    # Lazy %-args: the message is only rendered if the record is emitted
    logging.info('Checking existing tables for %s.%s', project_id, dataset_id)
    # A set makes the repeated membership checks below O(1) instead of O(n)
    existing_tables = set(list_existing_tables(project_id, dataset_id))
    site_queries = []
    unioned_mapping_queries = []
    unioned_mapping_legacy_queries = []
    unioned_queries = []
    for table in TABLES_FOR_RETRACTION:
        q_site = dict()
        q_site[DEST_DATASET] = dataset_id
        q_site[DEST_TABLE] = get_site_table(hpo_id, table)
        if q_site[DEST_TABLE] in existing_tables:
            q_site[QUERY] = RETRACT_DATA_SITE_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_site[DEST_DATASET],
                table=q_site[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
            site_queries.append(q_site)

        # death does not have mapping table; compare by value (`!=`), not
        # identity (`is not`), which only worked by string-interning accident
        if table != common.DEATH:
            q_unioned_mapping = dict()
            q_unioned_mapping[DEST_DATASET] = dataset_id
            q_unioned_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            if q_unioned_mapping[DEST_TABLE] in existing_tables:
                q_unioned_mapping[
                    QUERY] = RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_unioned_mapping[DEST_DATASET],
                        mapping_table=q_unioned_mapping[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=UNIONED_EHR + table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        person_research_id=PERSON_ID)
                unioned_mapping_queries.append(q_unioned_mapping)

            # legacy datasets prefix the mapping tables with unioned_ehr_
            q_unioned_mapping_legacy = dict()
            q_unioned_mapping_legacy[DEST_DATASET] = dataset_id
            q_unioned_mapping_legacy[
                DEST_TABLE] = UNIONED_EHR + ehr_union.mapping_table_for(table)
            if q_unioned_mapping_legacy[DEST_TABLE] in existing_tables:
                q_unioned_mapping_legacy[
                    QUERY] = RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_unioned_mapping_legacy[DEST_DATASET],
                        mapping_table=q_unioned_mapping_legacy[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=UNIONED_EHR + table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        person_research_id=PERSON_ID)
                unioned_mapping_legacy_queries.append(q_unioned_mapping_legacy)

        q_unioned = dict()
        q_unioned[DEST_DATASET] = dataset_id
        q_unioned[DEST_TABLE] = UNIONED_EHR + table
        if q_unioned[DEST_TABLE] in existing_tables:
            q_unioned[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned[DEST_DATASET],
                table=q_unioned[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
            unioned_queries.append(q_unioned)

    # Remove from person table
    q_site_person = dict()
    q_site_person[DEST_DATASET] = dataset_id
    q_site_person[DEST_TABLE] = get_site_table(hpo_id, common.PERSON)
    if q_site_person[DEST_TABLE] in existing_tables:
        q_site_person[QUERY] = RETRACT_DATA_SITE_QUERY.format(
            project=project_id,
            pid_project=pid_project_id,
            dataset=q_site_person[DEST_DATASET],
            table=q_site_person[DEST_TABLE],
            pid_table_id=pid_table_id,
            sandbox_dataset_id=sandbox_dataset_id,
            person_research_id=PERSON_ID)
        site_queries.append(q_site_person)

    q_unioned_person = dict()
    q_unioned_person[DEST_DATASET] = dataset_id
    q_unioned_person[DEST_TABLE] = UNIONED_EHR + common.PERSON
    if q_unioned_person[DEST_TABLE] in existing_tables:
        q_unioned_person[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
            project=project_id,
            pid_project=pid_project_id,
            dataset=q_unioned_person[DEST_DATASET],
            table=q_unioned_person[DEST_TABLE],
            pid_table_id=pid_table_id,
            sandbox_dataset_id=sandbox_dataset_id,
            person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_person)

    # Remove fact_relationship records referencing retracted person_ids
    # (fact_relationship has no person_id column, so it uses its own template)
    q_site_fact_relationship = dict()
    q_site_fact_relationship[DEST_DATASET] = dataset_id
    q_site_fact_relationship[DEST_TABLE] = get_site_table(
        hpo_id, common.FACT_RELATIONSHIP)
    if q_site_fact_relationship[DEST_TABLE] in existing_tables:
        q_site_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_site_fact_relationship[DEST_DATASET],
                table=q_site_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
        site_queries.append(q_site_fact_relationship)

    q_unioned_fact_relationship = dict()
    q_unioned_fact_relationship[DEST_DATASET] = dataset_id
    q_unioned_fact_relationship[
        DEST_TABLE] = UNIONED_EHR + common.FACT_RELATIONSHIP
    if q_unioned_fact_relationship[DEST_TABLE] in existing_tables:
        q_unioned_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned_fact_relationship[DEST_DATASET],
                table=q_unioned_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_fact_relationship)

    return unioned_mapping_legacy_queries + unioned_mapping_queries, unioned_queries + site_queries
def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out):
    """
    Parse the HPO subquery generated for `table` and verify its structure.

    Returns a formatted failure message (SUBQUERY_FAIL_MSG) describing the
    first structural problem found, or falls through (returning None) when
    the subquery looks correct.

    :param table: name of the CDM table the subquery targets
    :param dataset_in: dataset the subquery reads from
    :param dataset_out: dataset containing the mapping tables joined against
    """
    subquery = ehr_union.table_hpo_subquery(table, NYC_HPO_ID, dataset_in,
                                            dataset_out)
    # moz-sql-parser doesn't support the ROW_NUMBER() OVER() analytical
    # function, so strip that clause from the query before parsing to keep
    # the parser from erroring out.
    subquery = re.sub(
        r",\s+ROW_NUMBER\(\) OVER \(PARTITION BY nm\..+?_id\) AS row_num",
        " ", subquery)
    # `offset` is used as a column name in the note_nlp table. BigQuery
    # accepts it, but moz_sql_parser treats it as a SQL keyword, so quote it
    # here as a test-only workaround.
    if 'offset,' in subquery:
        subquery = subquery.replace('offset,', '"offset",')
    stmt = moz_sql_parser.parse(subquery)
    # Sanity check it is a select statement
    if 'select' not in stmt:
        return SUBQUERY_FAIL_MSG.format(expr='query type',
                                        table=table,
                                        expected='select',
                                        actual=str(stmt),
                                        subquery=subquery)
    # Input table should be first in FROM expression
    actual_from = first_or_none(
        dpath.util.values(stmt, 'from/0/value/from/value') or
        dpath.util.values(stmt, 'from'))
    expected_from = dataset_in + '.' + bq_utils.get_table_id(
        NYC_HPO_ID, table)
    if expected_from != actual_from:
        return SUBQUERY_FAIL_MSG.format(expr='first object in FROM',
                                        table=table,
                                        expected=expected_from,
                                        actual=actual_from,
                                        subquery=subquery)
    # Ensure all key fields (primary or foreign) yield joins with their
    # associated mapping tables.
    # Note: ordering of joins in the subquery is assumed to be consistent
    # with field order in the json file
    fields = resources.fields_for(table)
    id_field = table + '_id'
    # key_ind tracks the position of the expected join within stmt['from']
    key_ind = 0
    expected_join = None
    actual_join = None
    for field in fields:
        if field['name'] in self.mapped_fields:
            # key_ind += 1  # TODO use this increment when we generalize solution for all foreign keys
            if field['name'] == id_field:
                # Primary key, mapping table associated with this one should be INNER joined
                key_ind += 1
                expr = 'inner join on primary key'
                actual_join = first_or_none(
                    dpath.util.values(stmt, 'from/%s/join/value' % key_ind))
                expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                    table)
            elif field['name'] in self.implemented_foreign_keys:
                # Foreign key, mapping table associated with the referenced table should be LEFT joined
                key_ind += 1
                expr = 'left join on foreign key'
                # visit_detail has its 'visit_occurrence' column after
                # 'care_site', unlike other cdm tables where
                # 'visit_occurrence' comes before other foreign keys. The
                # expected query assumes the common ordering, so swap the
                # two parsed FROM entries to match the actual query.
                if table == 'visit_detail' and key_ind == 2:
                    stmt['from'][2], stmt['from'][3] = stmt['from'][
                        3], stmt['from'][2]
                actual_join = first_or_none(
                    dpath.util.values(stmt,
                                      'from/%s/left join/value' % key_ind))
                joined_table = field['name'].replace('_id', '')
                expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                    joined_table)
            if expected_join != actual_join:
                return SUBQUERY_FAIL_MSG.format(expr=expr,
                                                table=table,
                                                expected=expected_join,
                                                actual=actual_join,
                                                subquery=subquery)