def test_copy_five_persons(self, mock_check_cron):
    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=self.folder_prefix)
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=self.folder_prefix +
                                   self.folder_prefix)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.COPY_HPO_FILES_URL)

        prefix = test_util.FAKE_HPO_ID + '/' + self.hpo_bucket + '/' + self.folder_prefix
        expected_bucket_items = [
            prefix + item.split(os.sep)[-1]
            for item in test_util.FIVE_PERSONS_FILES
        ]
        expected_bucket_items.extend([
            prefix + self.folder_prefix + item.split(os.sep)[-1]
            for item in test_util.FIVE_PERSONS_FILES
        ])

        list_bucket_result = gcs_utils.list_bucket(gcs_utils.get_drc_bucket())
        actual_bucket_items = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_bucket_items),
                            set(actual_bucket_items))
def test_pii_files_loaded(self, mock_check_cron):
    # tests if pii files are loaded
    test_file_paths = [
        test_util.PII_NAME_FILE, test_util.PII_MRN_BAD_PERSON_ID_FILE
    ]
    test_file_names = [os.path.basename(f) for f in test_file_paths]
    test_util.write_cloud_file(self.hpo_bucket,
                               test_util.PII_NAME_FILE,
                               prefix=self.folder_prefix)
    test_util.write_cloud_file(self.hpo_bucket,
                               test_util.PII_MRN_BAD_PERSON_ID_FILE,
                               prefix=self.folder_prefix)

    rs = resources.csv_to_list(test_util.PII_FILE_LOAD_RESULT_CSV)
    expected_results = [(r['file_name'], int(r['found']), int(r['parsed']),
                         int(r['loaded'])) for r in rs]
    for f in common.SUBMISSION_FILES:
        if f not in test_file_names:
            expected_result = (f, 0, 0, 0)
            expected_results.append(expected_result)

    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(expected_results), set(r['results']))
def test_integration_five_person_data_retraction(self, mock_hpo_bucket,
                                                 mock_bucket,
                                                 mock_extract_pids):
    mock_hpo_bucket.return_value = self.site_bucket
    mock_bucket.return_value = self.bucket
    mock_extract_pids.return_value = self.pids
    lines_to_remove = {}
    total_lines_prior = {}
    for file_path in test_util.FIVE_PERSONS_FILES:
        # generate results files
        file_name = file_path.split('/')[-1]
        table_name = file_name.split('.')[0]
        lines_to_remove[file_name] = 0
        total_lines_prior[file_name] = 0
        with open(file_path) as f:
            # skip header
            next(f)
            for line in f:
                line = line.strip()
                if line != '':
                    if (table_name in rd.PID_IN_COL1 and
                            int(line.split(",")[0]) in self.pids) or \
                        (table_name in rd.PID_IN_COL2 and
                         int(line.split(",")[1]) in self.pids):
                        lines_to_remove[file_name] += 1
                    total_lines_prior[file_name] += 1

        # write file to cloud for testing
        test_util.write_cloud_file(self.bucket,
                                   file_path,
                                   prefix=self.folder_prefix_1)
        test_util.write_cloud_file(self.bucket,
                                   file_path,
                                   prefix=self.folder_prefix_2)

    retract_result = rd.run_gcs_retraction(self.project_id,
                                           self.sandbox_dataset_id,
                                           self.pid_table_id,
                                           self.hpo_id,
                                           folder='all_folders',
                                           force_flag=True)

    total_lines_post = {}
    for file_path in test_util.FIVE_PERSONS_FILES:
        file_name = file_path.split('/')[-1]
        actual_result_contents = test_util.read_cloud_file(
            self.bucket, self.folder_prefix_1 + file_name)
        # convert to list and remove header and last list item since it is a newline
        total_lines_post[file_name] = len(
            actual_result_contents.split('\n')[1:-1])

    for key in total_lines_prior.keys():
        if key in lines_to_remove:
            self.assertEqual(lines_to_remove[key],
                             total_lines_prior[key] - total_lines_post[key])
        else:
            self.assertEqual(total_lines_prior[key], total_lines_post[key])

    # metadata for each updated file is returned
    self.assertEqual(len(retract_result[self.folder_prefix_1]),
                     len(lines_to_remove.keys()))
def test_validate_five_persons_success(self, mock_check_cron):
    expected_results = []
    test_file_names = [
        os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
    ]

    for cdm_file in common.SUBMISSION_FILES:
        if cdm_file in test_file_names:
            expected_result = (cdm_file, 1, 1, 1)
            test_file = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_file)
            test_util.write_cloud_file(self.hpo_bucket,
                                       test_file,
                                       prefix=self.folder_prefix)
        else:
            expected_result = (cdm_file, 0, 0, 0)
        expected_results.append(expected_result)

    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(r['results']), set(expected_results))

    # check tables exist and are clustered as expected
    for table in resources.CDM_TABLES + common.PII_TABLES:
        fields_file = os.path.join(resources.fields_path, table + '.json')
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)
def _test_html_report_five_person(self, mock_check_cron):
    # Not sure this test is still relevant (see hpo_report module and tests)
    # TODO refactor or remove this test
    folder_prefix = '2019-01-01/'
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=folder_prefix)

    # achilles sometimes fails due to rate limits.
    # using both success and failure cases allows it to fail gracefully
    # until there is a fix for achilles
    with open(test_util.FIVE_PERSON_RESULTS_FILE, 'r') as f:
        expected_result_achilles_success = self._remove_timestamp_tags_from_results(
            f.read())
    with open(test_util.FIVE_PERSON_RESULTS_ACHILLES_ERROR_FILE, 'r') as f:
        expected_result_achilles_failure = self._remove_timestamp_tags_from_results(
            f.read())
    expected_results = [
        expected_result_achilles_success, expected_result_achilles_failure
    ]

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, folder_prefix + common.RESULTS_HTML)
        actual_result_file = self._remove_timestamp_tags_from_results(
            StringIO(actual_result).getvalue())
        self.assertIn(actual_result_file, expected_results)
def _load_dataset(self):
    for cdm_table in resources.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                     cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                      'dummy\n')
        bq_utils.load_cdm_csv(FAKE_HPO_ID, cdm_table)
def _load_datasets(self):
    """
    Load five persons data for each test hpo.

    expected_tables is used to verify the output: it maps each output table
    name to the list of records expected there,
    e.g. "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
    """
    expected_tables = dict()
    running_jobs = []
    for cdm_table in resources.CDM_TABLES:
        output_table = ehr_union.output_table_for(cdm_table)
        expected_tables[output_table] = []
        for hpo_id in self.hpo_ids:
            # upload csv into hpo bucket
            if hpo_id == NYC_HPO_ID:
                cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            else:
                cdm_file_name = os.path.join(test_util.PITT_FIVE_PERSONS_PATH,
                                             cdm_table + '.csv')
            bucket = gcs_utils.get_hpo_bucket(hpo_id)
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(bucket, cdm_file_name)
                csv_rows = resources.csv_to_list(cdm_file_name)
            else:
                # results in empty table
                test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                          'dummy\n')
                csv_rows = []
            # load table from csv
            result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
            running_jobs.append(result['jobReference']['jobId'])
            expected_tables[output_table] += list(csv_rows)

    # ensure person to observation output is as expected
    output_table_person = ehr_union.output_table_for(
        combine_ehr_rdr.PERSON_TABLE)
    output_table_observation = ehr_union.output_table_for(
        combine_ehr_rdr.OBSERVATION_TABLE)
    expected_tables[output_table_observation] += 4 * expected_tables[
        output_table_person]

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    self.expected_tables = expected_tables
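# A hedged sketch of how the expected_tables built above might be consumed in
# an assertion. `self.output_dataset_id` and the count-only comparison are
# illustrative assumptions, not part of this helper:
#
#     for table_id, expected_rows in self.expected_tables.items():
#         q = 'SELECT COUNT(1) AS n FROM {dataset}.{table}'.format(
#             dataset=self.output_dataset_id, table=table_id)
#         response = bq_utils.query(q)
#         # jobs.query responses nest values as rows -> f -> v
#         actual_count = int(response['rows'][0]['f'][0]['v'])
#         self.assertEqual(len(expected_rows), actual_count)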
def _load_dataset(self, hpo_id):
    for cdm_table in resources.CDM_TABLES:
        cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                     cdm_table + '.csv')
        if os.path.exists(cdm_file_name):
            test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
        else:
            test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                      'dummy\n')
        bq_utils.load_cdm_csv(hpo_id, cdm_table)

    # ensure concept table exists
    if not bq_utils.table_exists(common.CONCEPT):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
        q = """INSERT INTO {dataset}.concept
               SELECT * FROM {vocab}.concept""".format(
            dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def load_test_data(self, hpo_id=None):
    """
    Load test achilles heel results data from a csv file into bq

    :param hpo_id: if specified, prefix to use on the csv test file and bq
        table, otherwise no prefix is used
    :return: contents of the file as a list of objects
    """
    table_name = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
    else:
        table_id = table_name
    test_file_name = table_id + '.csv'
    test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
    test_util.write_cloud_file(self.bucket, test_file_path)
    gcs_path = 'gs://' + self.bucket + '/' + test_file_name
    load_results = bq_utils.load_csv(table_name, gcs_path, self.app_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    bq_utils.wait_on_jobs([job_id])
    return resources.csv_to_list(test_file_path)
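# A minimal usage sketch for load_test_data; the hpo_id value and the
# follow-up assertion are illustrative assumptions:
#
#     expected_rows = self.load_test_data(hpo_id=test_util.FAKE_HPO_ID)
#     # the helper waits on the load job, so the bq table is queryable here
#     self.assertTrue(len(expected_rows) > 0)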
def test_html_report_five_person(self, mock_check_cron, mock_first_run,
                                 mock_rdr_date, mock_required_files_loaded):
    mock_required_files_loaded.return_value = False
    mock_first_run.return_value = False
    rdr_date = '2020-01-01'
    mock_rdr_date.return_value = rdr_date
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket,
                                   cdm_file,
                                   prefix=self.folder_prefix)

    # load person table in RDR
    bq_utils.load_table_from_csv(self.project_id, self.rdr_dataset_id,
                                 common.PERSON,
                                 test_util.FIVE_PERSONS_PERSON_CSV)

    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.bigquery_dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.bigquery_dataset_id)

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, self.folder_prefix + common.RESULTS_HTML)

    # ensure emails are not sent
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    self.assertFalse(main.is_first_validation_run(folder_items))

    # parse html
    soup = bs(actual_result, parser="lxml", features="lxml")

    missing_pii_html_table = soup.find('table', id='missing_pii')
    table_headers = missing_pii_html_table.find_all('th')
    self.assertEqual('Missing Participant Record Type',
                     table_headers[0].get_text())
    self.assertEqual('Count', table_headers[1].get_text())

    table_rows = missing_pii_html_table.find_next('tbody').find_all('tr')
    missing_record_types = [
        table_row.find('td').text for table_row in table_rows
    ]
    self.assertIn(main_consts.EHR_NO_PII, missing_record_types)
    self.assertIn(main_consts.PII_NO_EHR, missing_record_types)
    self.assertIn(main_consts.EHR_NO_RDR.format(date=rdr_date),
                  missing_record_types)
    self.assertIn(main_consts.EHR_NO_PARTICIPANT_MATCH, missing_record_types)

    required_lab_html_table = soup.find('table', id='required-lab')
    table_headers = required_lab_html_table.find_all('th')
    self.assertEqual(3, len(table_headers))
    self.assertEqual('Ancestor Concept ID', table_headers[0].get_text())
    self.assertEqual('Ancestor Concept Name', table_headers[1].get_text())
    self.assertEqual('Found', table_headers[2].get_text())

    table_rows = required_lab_html_table.find_next('tbody').find_all('tr')
    table_rows_last_column = [
        table_row.find_all('td')[-1] for table_row in table_rows
    ]
    submitted_labs = [
        row for row in table_rows_last_column
        if 'result-1' in row.attrs['class']
    ]
    missing_labs = [
        row for row in table_rows_last_column
        if 'result-0' in row.attrs['class']
    ]
    self.assertTrue(len(table_rows) > 0)
    self.assertTrue(len(submitted_labs) > 0)
    self.assertTrue(len(missing_labs) > 0)