def test_consented_person_id(self):
    """
    Check that ehr_consent() creates the consent table with the right persons.

    Test observation data has seven (7) persons with consent records
    as described below
        1: No
        2: Yes
        3: NULL
        4: No followed by Yes
        5: Yes followed by No
        6: Yes followed by NULL
        7: NULL and Yes with same date/time

    Only persons 2 and 4 are expected in the resulting table.
    """
    dataset_id = self.combined_dataset_id
    table_ref = dict(dataset=dataset_id, table=EHR_CONSENT_TABLE_ID)

    # sanity check: the table must not exist before ehr_consent() runs
    exists_before = bq_utils.table_exists(EHR_CONSENT_TABLE_ID, dataset_id)
    self.assertFalse(exists_before)

    ehr_consent()

    exists_after = bq_utils.table_exists(EHR_CONSENT_TABLE_ID, dataset_id)
    created_msg = 'Table {dataset}.{table} created by consented_person'.format(
        **table_ref)
    self.assertTrue(exists_after, created_msg)

    select_all = 'SELECT * FROM {dataset}.{table}'.format(**table_ref)
    consent_rows = test_util.response2rows(bq_utils.query(select_all))

    expected = {2, 4}
    actual = {consent_row['person_id'] for consent_row in consent_rows}
    records_msg = 'Records in {dataset}.{table}'.format(**table_ref)
    self.assertSetEqual(expected, actual, records_msg)
def _all_rdr_records_included(self):
    """
    All rdr records are included whether or not there is corresponding ehr record

    For each domain table, the query returns the rdr rows that have no
    mapping row or whose mapped id is absent from the combined domain
    table; the assertion requires that it returns nothing.
    """
    unmapped_query_template = (
        'SELECT rt.{domain_table}_id as id '
        'FROM {rdr_dataset_id}.{domain_table} rt '
        'LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m '
        'ON rt.{domain_table}_id = m.src_{domain_table}_id '
        'WHERE m.{domain_table}_id IS NULL '
        'OR NOT EXISTS (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t '
        'WHERE t.{domain_table}_id = m.{domain_table}_id)')
    failure_msg = "RDR records should map to records in mapping and combined tables"

    for domain_table in DOMAIN_TABLES:
        mapping_table = mapping_table_for(domain_table)
        # dataset ids are looked up per table, after the mapping table name
        rdr_dataset_id = bq_utils.get_rdr_dataset_id()
        ehr_rdr_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
        unmapped_query = unmapped_query_template.format(
            domain_table=domain_table,
            rdr_dataset_id=rdr_dataset_id,
            ehr_rdr_dataset_id=ehr_rdr_dataset_id,
            mapping_table=mapping_table)
        unmapped_rows = test_util.response2rows(bq_utils.query(unmapped_query))
        self.assertEqual(0, len(unmapped_rows), failure_msg)
def _ehr_only_records_excluded(self):
    """
    EHR person records which are missing from RDR are excluded from combined

    The query selects person_ids present in the EHR person table but not
    in the RDR person table, then left joins them to the combined person
    table. Every such row must have a NULL combined person_id.
    """
    q = (
        ' WITH ehr_only AS '
        '(SELECT person_id FROM {ehr_dataset_id}.person ep '
        'WHERE NOT EXISTS '
        '(SELECT 1 FROM {rdr_dataset_id}.person rp '
        'WHERE rp.person_id = ep.person_id) ) '
        'SELECT ehr_only.person_id AS ehr_person_id, '
        'p.person_id AS combined_person_id '
        'FROM ehr_only '
        'LEFT JOIN {ehr_rdr_dataset_id}.person p '
        'ON ehr_only.person_id = p.person_id ').format(
            ehr_dataset_id=self.ehr_dataset_id,
            rdr_dataset_id=self.rdr_dataset_id,
            ehr_rdr_dataset_id=self.combined_dataset_id)
    response = bq_utils.query(q)
    rows = test_util.response2rows(response)

    # without EHR-only persons in the test data the loop below checks nothing
    self.assertGreater(len(rows), 0, 'Test data is missing EHR-only records')

    for row in rows:
        combined_person_id = row['combined_person_id']
        # fill in the placeholder so a failure names the offending person
        message = 'EHR-only person_id `{ehr_person_id}` found in combined when it should be excluded'.format(
            ehr_person_id=row['ehr_person_id'])
        self.assertIsNone(combined_person_id, message)
def test_copy_rdr_tables(self):
    """
    Each table in RDR_TABLES_TO_COPY is copied into the combined dataset.

    For every table: it must be absent beforehand, present after
    copy_rdr_table(), and hold the same number of rows as the rdr table.
    """
    count_query_template = (
        ' WITH rdr AS '
        '(SELECT COUNT(1) n FROM {rdr_dataset_id}.{table}), '
        'combined AS '
        '(SELECT COUNT(1) n FROM {combined_dataset_id}.{table}) '
        'SELECT rdr.n AS rdr_count, combined.n AS combined_count '
        'FROM rdr, combined ')
    msg_fmt = 'Table {table} has {rdr_count} in rdr and {combined_count} in combined (expected to be equal)'

    for table in RDR_TABLES_TO_COPY:
        # sanity check
        present_before = bq_utils.table_exists(table, self.combined_dataset_id)
        self.assertFalse(present_before)

        copy_rdr_table(table)

        present_after = bq_utils.table_exists(table, self.combined_dataset_id)
        copied_msg = 'RDR table {table} should be copied'.format(table=table)
        self.assertTrue(present_after, msg=copied_msg)

        # Check that row count in combined is same as rdr
        count_query = count_query_template.format(
            rdr_dataset_id=self.rdr_dataset_id,
            combined_dataset_id=self.combined_dataset_id,
            table=table)
        count_rows = test_util.response2rows(bq_utils.query(count_query))
        self.assertTrue(len(count_rows) == 1)  # sanity check

        counts = count_rows[0]
        rdr_count = counts['rdr_count']
        combined_count = counts['combined_count']
        mismatch_msg = msg_fmt.format(table=table,
                                      rdr_count=rdr_count,
                                      combined_count=combined_count)
        self.assertEqual(rdr_count, combined_count, mismatch_msg)
def _check_ehr_person_observation(self):
    """
    Combined observation holds four rows of the queried type per EHR person.

    Counts the rows of the EHR person table and compares four times that
    number with the count of combined observation rows whose
    observation_type_concept_id is 38000280.
    """
    person_query = '''SELECT * FROM {dataset_id}.person'''.format(
        dataset_id=self.ehr_dataset_id)
    ehr_persons = test_util.response2rows(bq_utils.query(person_query))

    observation_query = (
        'SELECT * FROM {ehr_rdr_dataset_id}.observation '
        'WHERE observation_type_concept_id = 38000280').format(
            ehr_rdr_dataset_id=self.combined_dataset_id)

    # observation should contain 4 records per person of type EHR
    expected = 4 * len(ehr_persons)

    ehr_observations = test_util.response2rows(
        bq_utils.query(observation_query))
    # TODO check row content is as expected
    actual = len(ehr_observations)

    failure_msg = 'Expected %s EHR person records in observation but found %s' % (
        expected, actual)
    self.assertEqual(actual, expected, failure_msg)
def test_union_ehr(self):
    """
    Run ehr_union.main and check the resulting output dataset.

    Checks performed:
      * the set of tables in the input dataset is unchanged
      * each mapping table has the expected fields and row count
      * each output CDM table has the expected row count and, when its
        schema has a person_id field, is checked for clustering
      * the output dataset holds its prior tables plus the mapping and
        output CDM tables, and nothing else
      * distinct person_ids in the output person table equal the
        distinct person_ids of the CHS and PITT input person tables
    """
    self._load_datasets()
    input_tables_before = set(self._dataset_tables(self.input_dataset_id))

    # output should be mapping tables and cdm tables
    output_tables_before = self._dataset_tables(self.output_dataset_id)
    mapping_tables = [
        ehr_union.mapping_table_for(table)
        for table in ehr_union.tables_to_map()
    ]
    output_cdm_tables = [
        ehr_union.output_table_for(table) for table in common.CDM_TABLES
    ]
    expected_output = set(output_tables_before + mapping_tables +
                          output_cdm_tables)

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    # input dataset should be unchanged
    input_tables_after = set(self._dataset_tables(self.input_dataset_id))
    self.assertSetEqual(input_tables_before, input_tables_after)

    # mapping tables
    tables_to_map = ehr_union.tables_to_map()
    for table_to_map in tables_to_map:
        mapping_table = ehr_union.mapping_table_for(table_to_map)
        expected_fields = {
            'src_table_id',
            'src_%s_id' % table_to_map,
            '%s_id' % table_to_map,
            'src_hpo_id'
        }
        mapping_table_info = bq_utils.get_table_info(
            mapping_table, dataset_id=self.output_dataset_id)
        mapping_table_fields = mapping_table_info.get('schema', dict()).get(
            'fields', [])
        actual_fields = {f['name'] for f in mapping_table_fields}
        message = 'Table %s has fields %s when %s expected' % (
            mapping_table, actual_fields, expected_fields)
        self.assertSetEqual(expected_fields, actual_fields, message)

        result_table = ehr_union.output_table_for(table_to_map)
        expected_num_rows = len(self.expected_tables[result_table])
        actual_num_rows = int(mapping_table_info.get('numRows', -1))
        message = 'Table %s has %s rows when %s expected' % (
            mapping_table, actual_num_rows, expected_num_rows)
        self.assertEqual(expected_num_rows, actual_num_rows, message)

    # check for each output table
    for table_name in common.CDM_TABLES:
        # output table exists and row count is sum of those submitted by hpos
        result_table = ehr_union.output_table_for(table_name)
        expected_rows = self.expected_tables[result_table]
        expected_count = len(expected_rows)
        table_info = bq_utils.get_table_info(
            result_table, dataset_id=self.output_dataset_id)
        # default to -1, as in the mapping table check above, so a missing
        # numRows fails the assertion below instead of raising TypeError
        actual_count = int(table_info.get('numRows', -1))
        msg = 'Unexpected row count in table {result_table} after ehr union'.format(
            result_table=result_table)
        self.assertEqual(expected_count, actual_count, msg)

        # TODO Compare table rows to expected accounting for the new ids and ignoring field types
        # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
        # query_response = bq_utils.query(q)
        # actual_rows = test_util.response2rows(query_response)

        # output table has clustering on person_id where applicable
        fields_file = os.path.join(resources.fields_path,
                                   table_name + '.json')
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self._table_has_clustering(table_info)

    actual_output = set(self._dataset_tables(self.output_dataset_id))
    self.assertSetEqual(expected_output, actual_output)

    # explicit check that output person_ids are same as input
    chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
    pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
    q = (
        'SELECT DISTINCT person_id FROM ( '
        'SELECT person_id FROM {dataset_id}.{chs_person_table_id} '
        'UNION ALL '
        'SELECT person_id FROM {dataset_id}.{pitt_person_table_id} '
        ') ORDER BY person_id ASC').format(
            dataset_id=self.input_dataset_id,
            chs_person_table_id=chs_person_table_id,
            pitt_person_table_id=pitt_person_table_id)
    response = bq_utils.query(q)
    expected_rows = test_util.response2rows(response)
    person_table_id = ehr_union.output_table_for('person')
    q = (
        'SELECT DISTINCT person_id '
        'FROM {dataset_id}.{table_id} '
        'ORDER BY person_id ASC').format(
            dataset_id=self.output_dataset_id, table_id=person_table_id)
    response = bq_utils.query(q)
    actual_rows = test_util.response2rows(response)
    self.assertListEqual(expected_rows, actual_rows)