def test_ehr_person_to_observation_counts(self, mock_tables_map):
    """The union should emit exactly four EHR-typed observation rows per person."""
    self._load_datasets()
    mock_tables_map.return_value = [
        common.OBSERVATION, common.LOCATION, common.CARE_SITE,
        common.VISIT_OCCURRENCE
    ]

    # run the ehr union over the loaded test datasets
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    q_person = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_person AS p
        '''.format(output_dataset_id=self.output_dataset_id)
    person_rows = bq_utils.response2rows(bq_utils.query(q_person))

    q_observation = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_observation
        WHERE observation_type_concept_id = 38000280
        '''.format(output_dataset_id=self.output_dataset_id)
    # observation should contain 4 records per person of type EHR
    expected = len(person_rows) * 4
    actual = len(bq_utils.response2rows(bq_utils.query(q_observation)))

    self.assertEqual(
        actual, expected,
        'Expected %s EHR person records in observation but found %s' %
        (expected, actual))
def test_ehr_person_to_observation_counts(self):
    """Each unioned person should contribute four EHR-typed observation rows."""
    self._load_datasets()

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    person_query = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_person AS p
        '''.format(output_dataset_id=self.output_dataset_id)
    num_persons = len(bq_utils.response2rows(bq_utils.query(person_query)))

    observation_query = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_observation
        WHERE observation_type_concept_id = 38000280
        '''.format(output_dataset_id=self.output_dataset_id)
    num_observations = len(
        bq_utils.response2rows(bq_utils.query(observation_query)))

    # observation should contain 4 records per person of type EHR
    expected = num_persons * 4
    actual = num_observations
    self.assertEqual(
        actual, expected,
        'Expected %s EHR person records in observation but found %s' %
        (expected, actual))
def test_ehr_person_to_observation(self, mock_tables_map):
    """Demographics in the unioned person table should appear as observation rows."""
    # ehr person table converts to observation records
    self._load_datasets()
    mock_tables_map.return_value = [
        common.OBSERVATION, common.LOCATION, common.CARE_SITE,
        common.VISIT_OCCURRENCE
    ]

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    person_query = '''
        SELECT
            person_id,
            gender_concept_id,
            gender_source_value,
            race_concept_id,
            race_source_value,
            CAST(birth_datetime AS STRING) AS birth_datetime,
            ethnicity_concept_id,
            ethnicity_source_value,
            EXTRACT(DATE FROM birth_datetime) AS birth_date
        FROM {output_dataset_id}.unioned_ehr_person
        '''.format(output_dataset_id=self.output_dataset_id)
    person_rows = bq_utils.response2rows(bq_utils.query(person_query))

    # each person row expands into one expected observation dict per
    # demographic attribute (gender, race, dob, ethnicity)
    expected = [
        obs_row for person_row in person_rows
        for obs_row in self.convert_ehr_person_to_observation(person_row)
    ]

    # query for observation table records
    query = '''
        SELECT person_id,
            observation_concept_id,
            value_as_concept_id,
            value_as_string,
            observation_source_value,
            observation_date
        FROM {output_dataset_id}.unioned_ehr_observation AS obs
        WHERE obs.observation_concept_id IN ({gender_concept_id},{race_concept_id},{dob_concept_id},
        {ethnicity_concept_id})
        '''
    obs_query = query.format(
        output_dataset_id=self.output_dataset_id,
        gender_concept_id=eu_constants.GENDER_CONCEPT_ID,
        race_concept_id=eu_constants.RACE_CONCEPT_ID,
        dob_concept_id=eu_constants.DOB_CONCEPT_ID,
        ethnicity_concept_id=eu_constants.ETHNICITY_CONCEPT_ID)
    actual = bq_utils.response2rows(bq_utils.query(obs_query))

    # order-insensitive comparison of expected vs. actual observation dicts
    self.assertCountEqual(expected, actual)
def union_ehr():
    """Union all HPO submissions into the unioned EHR dataset and export reports.

    Runs ``ehr_union.main`` over the current EHR dataset, computes achilles
    results for the combined submission, exports them under a date-stamped
    folder prefix, and uploads the achilles index files.

    :return: completion message string
    """
    hpo_id = 'unioned_ehr'
    app_id = bq_utils.app_identity.get_application_id()
    input_dataset_id = bq_utils.get_dataset_id()
    output_dataset_id = bq_utils.get_unioned_dataset_id()
    ehr_union.main(input_dataset_id, output_dataset_id, app_id)
    run_achilles(hpo_id)
    # exports are grouped under a folder named for the run date,
    # e.g. unioned_ehr_2024_01_31/
    now_date_string = datetime.datetime.now().strftime('%Y_%m_%d')
    folder_prefix = 'unioned_ehr_' + now_date_string + '/'
    run_export(datasource_id=hpo_id, folder_prefix=folder_prefix)
    # fix: dropped the f-prefix — the message has no placeholders (ruff F541)
    logging.info("Uploading achilles index files")
    _upload_achilles_files(hpo_id, folder_prefix)
    return 'merge-and-achilles-done'
def test_excluded_hpo_ids(self, mock_hpo_info, mock_create_std_tbl,
                          mock_mapping, mock_load, mock_client,
                          mock_map_person, mock_move_person):
    """HPO ids passed via hpo_ids_ex must be dropped before mapping queries run."""
    hpo_info = [{'hpo_id': hpo_id} for hpo_id in self.hpo_ids]
    mock_hpo_info.return_value = hpo_info
    mock_client.return_value = 'client'

    # run the union while excluding FAKE_SITE_2
    eu.main("input_dataset_id",
            "output_dataset_id",
            "project_id",
            hpo_ids_ex=[self.FAKE_SITE_2])

    # only the non-excluded site should reach the mapping step
    mock_mapping.assert_called_with(ANY, [self.FAKE_SITE_1],
                                    "input_dataset_id", "output_dataset_id",
                                    "project_id", 'client')
def test_union_ehr(self):
    # End-to-end check of ehr_union.main over the loaded test datasets:
    # verifies the produced table set, mapping-table schemas and row counts,
    # CDM output row counts, clustering on person_id, and that person_ids
    # survive the union unchanged.
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in ehr_union.tables_to_map()
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in common.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # mapping tables: each mapped table gets a _mapping_* companion whose
    # schema links source rows (per HPO) to their new unioned ids
    tables_to_map = ehr_union.tables_to_map()
    for table_to_map in tables_to_map:
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # the mapping table should have one row per row of the output table
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in common.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = test_util.response2rows(query_response)

        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)

    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{chs_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        chs_person_table_id=chs_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = test_util.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = test_util.response2rows(response)
    self.assertListEqual(expected_rows, actual_rows)
def test_union_ehr(self):
    # End-to-end check of ehr_union.main: verifies the produced table set,
    # per-HPO id offsetting in fact_relationship, mapping-table schemas and
    # row counts, CDM output row counts, clustering on person_id, and that
    # person_ids survive the union unchanged.
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in resources.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # fact_relationship from pitt: fact ids in the unioned table are shifted
    # by a per-HPO offset so they stay globally unique across submissions
    hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
    pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
    q = '''SELECT fact_id_1, fact_id_2
           FROM `{input_dataset}.{hpo_id}_fact_relationship`
           where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
        input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
    expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset
    # join through the measurement mapping table to pick out pitt's rows
    q = '''SELECT fr.fact_id_1, fr.fact_id_2
           FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
           join `{dataset_id}._mapping_measurement` mm
           on fr.fact_id_1 = mm.measurement_id
           and mm.src_hpo_id = "{hpo_id}"'''.format(
        dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
    response = bq_utils.query(q)
    result = bq_utils.response2rows(response)
    actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
        "fact_id_2"]
    self.assertEqual(expected_fact_id_1, actual_fact_id_1)
    self.assertEqual(expected_fact_id_2, actual_fact_id_2)

    # mapping tables: each mapped table gets a _mapping_* companion whose
    # schema links source rows (per HPO) to their new unioned ids
    tables_to_map = cdm.tables_to_map()
    for table_to_map in tables_to_map:
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map, 'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = set([f['name'] for f in mapping_table_fields])
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)
        # the mapping table should have one row per row of the output table
        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in resources.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        actual_count = int(table_info.get('numRows'))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)
        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = bq_utils.response2rows(query_response)

        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path, table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)

    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
           ) ORDER BY person_id ASC'''.format(
        dataset_id=self.input_dataset_id,
        nyc_person_table_id=nyc_person_table_id,
        pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = bq_utils.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = '''SELECT DISTINCT person_id
           FROM {dataset_id}.{table_id}
           ORDER BY person_id ASC'''.format(
        dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = bq_utils.response2rows(response)
    self.assertCountEqual(expected_rows, actual_rows)