def test_merge_with_good_data(self):
    """Merging the person tables of two HPOs yields the union of their person_ids."""
    running_jobs = []
    source_files = [('nyc', NYC_FIVE_PERSONS_PERSON_CSV),
                    ('pitt', PITT_FIVE_PERSONS_PERSON_CSV)]
    # stage each HPO's person file in its bucket and kick off a load job
    for hpo_id, csv_path in source_files:
        with open(csv_path, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                    'person.csv', fp)
        load_result = bq_utils.load_cdm_csv(hpo_id, 'person')
        running_jobs.append(load_result['jobReference']['jobId'])

    # the merged table should contain exactly the ids from both source files
    expected_result = sorted(
        int(row['person_id'])
        for _, csv_path in source_files
        for row in resources._csv_to_list(csv_path))

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_person', 'pitt_person'))

    dataset_id = bq_utils.get_dataset_id()
    merged_table_id = 'merged_nyc_pitt'
    success_flag, error = bq_utils.merge_tables(dataset_id,
                                                ['nyc_person', 'pitt_person'],
                                                dataset_id, merged_table_id)
    self.assertTrue(success_flag)
    self.assertEqual(error, "")

    query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
        dataset_id=dataset_id, table_id=merged_table_id)
    merged_query_job_result = bq_utils.query(query_string)
    self.assertIsNone(merged_query_job_result.get('errors', None))
    actual_result = sorted(
        int(row['f'][0]['v']) for row in merged_query_job_result['rows'])
    self.assertListEqual(expected_result, actual_result)
def _load_datasets(self):
    """
    Load five persons data for each test hpo.

    Populates self.expected_tables for checking output: it maps each output
    table name to the list of records expected in it,
    e.g. "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    """
    expected = dict()
    job_ids = []
    for cdm_table in common.CDM_TABLES:
        csv_path = os.path.join(test_util.FIVE_PERSONS_PATH,
                                cdm_table + '.csv')
        target = ehr_union.output_table_for(cdm_table)
        expected[target] = []
        have_csv = os.path.exists(csv_path)
        for hpo_id in self.hpo_ids:
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            if have_csv:
                # upload the csv into this hpo's bucket
                test_util.write_cloud_file(bucket, csv_path)
                rows = resources._csv_to_list(csv_path)
            else:
                # a dummy file results in an empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                          'dummy\n')
                rows = []
            # load the table from the uploaded csv
            job = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            job_ids.append(job['jobReference']['jobId'])
            expected[target].extend(rows)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected
def test_load_ehr_observation(self):
    """Observation rows loaded for pitt should match the source csv's observation_ids."""
    hpo_id = 'pitt'
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    # stage the csv in the hpo bucket, then load it into bigquery
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                'observation.csv', fp)
    load_result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = load_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    # the completed job must also report no load errors
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_errors = load_job_result['status'].get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=bq_utils.get_dataset_id(),
        table_id=bq_utils.get_table_id(hpo_id, table_name='observation'))
    query_results_response = bq_utils.query(q)
    self.assertIsNone(query_results_response.get('errors'))
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertListEqual(actual_result, expected_observation_ids)
def test_pii_files_loaded(self, mock_check_cron):
    """Submitted pii files should be reported as found/parsed/loaded by validate_submission."""
    test_file_paths = [
        test_util.PII_NAME_FILE, test_util.PII_MRN_BAD_PERSON_ID_FILE
    ]
    test_file_names = [os.path.basename(f) for f in test_file_paths]
    # stage both pii files under the submission folder
    for pii_file in test_file_paths:
        test_util.write_cloud_file(self.hpo_bucket,
                                   pii_file,
                                   prefix=self.folder_prefix)

    # expected per-file outcomes come from a fixture csv
    rs = resources._csv_to_list(test_util.PII_FILE_LOAD_RESULT_CSV)
    expected_results = [(r['file_name'], int(r['found']), int(r['parsed']),
                         int(r['loaded'])) for r in rs]
    # every other submission file should be reported as absent
    expected_results.extend(
        (f, 0, 0, 0) for f in common.SUBMISSION_FILES
        if f not in test_file_names)

    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, bucket_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(expected_results), set(r['results']))
def _load_datasets(self):
    """Load five persons data into both chs and pitt; record the rows the merge should produce."""
    load_jobs = []
    self.expected_tables = {}
    for cdm_table in common.CDM_TABLES:
        csv_path = os.path.join(test_util.FIVE_PERSONS_PATH,
                                cdm_table + '.csv')
        result_table = ehr_merge.result_table_for(cdm_table)
        if os.path.exists(csv_path):
            # one copy for chs, the other for pitt
            rows = resources._csv_to_list(csv_path)
            self.expected_tables[result_table] = rows + list(rows)
            test_util.write_cloud_file(self.chs_bucket, csv_path)
            test_util.write_cloud_file(self.pitt_bucket, csv_path)
        else:
            # a dummy file yields an empty table
            self.expected_tables[result_table] = []
            test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                      'dummy\n')
            test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                      'dummy\n')
        # start a load job per hpo: chs first, then pitt
        for hpo_id in (CHS_HPO_ID, PITT_HPO_ID):
            load_result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            load_jobs.append(load_result['jobReference']['jobId'])
    incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
    if len(incomplete_jobs) > 0:
        raise RuntimeError('BigQuery jobs %s failed to complete' %
                           incomplete_jobs)
def test_validate_five_persons_success(self, mock_check_cron):
    """End-to-end validation of a complete five-person submission, including clustering checks."""
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)

    expected_tables = [
        'person', 'visit_occurrence', 'condition_occurrence',
        'procedure_occurrence', 'drug_exposure', 'measurement'
    ]
    cdm_files = [table + '.csv' for table in expected_tables]

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in the bucket
        expected_object_names = (cdm_files + common.IGNORE_LIST +
                                 json_export_files)
        expected_objects = [prefix + item for item in expected_object_names]
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_objects = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_objects), set(actual_objects))

        # result says file found, parsed, loaded
        actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                                  prefix + common.RESULT_CSV)
        actual_result_items = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        self.assertListEqual(sorted(expected_result_items),
                             sorted(actual_result_items))
        self.assertTrue(
            main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                           folder_prefix=prefix))

        # check tables exist and are clustered as expected
        for table in expected_tables:
            fields_file = os.path.join(resources.fields_path,
                                       table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            # only tables carrying a person_id are expected to be clustered
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)
def get_nyc_cu_cols():
    """Return the NYC/CU test columns that map to a standard OMOP table.

    Each returned column dict has the standard table name filled in under
    consts.OMOP_TABLE_NAME; columns with no standard mapping are dropped.
    """
    annotated = []
    for col in resources._csv_to_list(test_util.TEST_NYC_CU_COLS_CSV):
        standard_name = completeness.get_standard_table_name(
            col[consts.TABLE_NAME])
        if standard_name:
            col[consts.OMOP_TABLE_NAME] = standard_name
            annotated.append(col)
    return annotated
def test_validate_five_persons_success(self, mock_check_cron):
    """End-to-end validation of a complete five-person submission."""
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in the bucket; to keep this test
        # the same as before, also expect all the old required files
        legacy_files = [
            'measurement.csv', 'procedure_occurrence.csv',
            'drug_exposure.csv', 'condition_occurrence.csv',
            'visit_occurrence.csv'
        ]
        expected_bucket_items = (common.REQUIRED_FILES + common.IGNORE_LIST +
                                 json_export_files + legacy_files)
        expected_bucket_items = [
            prefix + item for item in expected_bucket_items
        ]
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items),
                            set(actual_bucket_items))

        # result says file found, parsed, loaded
        actual_result = test_util.read_cloud_file(self.hpo_bucket,
                                                  prefix + common.RESULT_CSV)
        actual_result_items = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        self.assertListEqual(sorted(expected_result_items),
                             sorted(actual_result_items))
        self.assertTrue(
            main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                           folder_prefix=prefix))
def load_test_data(self, hpo_id=None):
    """
    Load to bq test achilles heel results data from csv file

    :param hpo_id: if specified, prefix to use on csv test file and bq table,
        otherwise no prefix is used
    :return: contents of the file as list of objects
    :raises RuntimeError: if the bq load job does not complete in time
    """
    schema_path = os.path.join(resources.fields_path,
                               common.ACHILLES_HEEL_RESULTS + '.json')
    table_id = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id = bq_utils.get_table_id(hpo_id, common.ACHILLES_HEEL_RESULTS)
    test_file_name = table_id + '.csv'
    test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
    test_util.write_cloud_file(self.bucket, test_file_path)
    gcs_path = 'gs://' + self.bucket + '/' + test_file_name
    load_results = bq_utils.load_csv(schema_path, gcs_path, self.app_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    # Fail fast if the load job times out; previously the result of
    # wait_on_jobs was silently discarded (unlike the other loaders in this
    # suite), letting downstream queries fail confusingly on a missing table.
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    if len(incomplete_jobs) > 0:
        raise RuntimeError('BigQuery job %s failed to complete' %
                           incomplete_jobs)
    return resources._csv_to_list(test_file_path)
def test_pii_files_loaded(self, mock_check_cron):
    """Pii files staged in the bucket should appear in the validation result csv."""
    folder_prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.PII_FILE_LOAD_RESULT_CSV)
    # stage both pii files under the submission folder
    for pii_file in (test_util.PII_NAME_FILE,
                     test_util.PII_MRN_BAD_PERSON_ID_FILE):
        test_util.write_cloud_file(self.hpo_bucket,
                                   pii_file,
                                   prefix=folder_prefix)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.RESULT_CSV)
        actual_result_items = resources._csv_file_to_list(
            StringIO.StringIO(actual_result))
        # sort in order to compare
        self.assertListEqual(sorted(expected_result_items),
                             sorted(actual_result_items))