Example #1
    def test_merge_with_good_data(self):
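        # Load the nyc and pitt five-person person.csv files, merge the two
        # person tables, and verify the merged table holds all person_ids.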
        running_jobs = []
        with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        nyc_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
        ]
        pitt_person_ids = [
            int(row['person_id'])
            for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
        ]
        expected_result = nyc_person_ids + pitt_person_ids
        expected_result.sort()

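        # block until both load jobs finish before attempting the merge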
        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_person',
                                                    'pitt_person'))

        dataset_id = bq_utils.get_dataset_id()
        table_ids = ['nyc_person', 'pitt_person']
        merged_table_id = 'merged_nyc_pitt'
        success_flag, error = bq_utils.merge_tables(dataset_id, table_ids,
                                                    dataset_id,
                                                    merged_table_id)

        self.assertTrue(success_flag)
        self.assertEqual(error, "")

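        # query the merged table and compare its person_ids to the expected list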
        query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
            dataset_id=dataset_id, table_id=merged_table_id)
        merged_query_job_result = bq_utils.query(query_string)

        self.assertIsNone(merged_query_job_result.get('errors', None))
        actual_result = [
            int(row['f'][0]['v']) for row in merged_query_job_result['rows']
        ]
        actual_result.sort()
        self.assertListEqual(expected_result, actual_result)
Example #2
 def _load_datasets(self):
     """
     Load five persons data for each test hpo
     """
     # expected_tables is for testing output
     # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
     expected_tables = dict()
     running_jobs = []
     for cdm_table in common.CDM_TABLES:
         cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                      cdm_table + '.csv')
         output_table = ehr_union.output_table_for(cdm_table)
         expected_tables[output_table] = []
         for hpo_id in self.hpo_ids:
             # upload csv into hpo bucket
             bucket = gcs_utils.get_hpo_bucket(hpo_id)
             if os.path.exists(cdm_file_name):
                 test_util.write_cloud_file(bucket, cdm_file_name)
                 csv_rows = resources._csv_to_list(cdm_file_name)
             else:
                 # results in empty table
                 test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                           'dummy\n')
                 csv_rows = []
             # load table from csv
             result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
             running_jobs.append(result['jobReference']['jobId'])
             expected_tables[output_table] += list(csv_rows)
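     # wait for every load job to finish; raise if any did not complete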
     incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
     if len(incomplete_jobs) > 0:
         message = "Job id(s) %s failed to complete" % incomplete_jobs
         raise RuntimeError(message)
     self.expected_tables = expected_tables
Example #3
 def test_load_ehr_observation(self):
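     # Load the pitt five-person observation csv into its hpo table and
     # verify the loaded rows carry the expected observation_ids.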
     hpo_id = 'pitt'
     dataset_id = bq_utils.get_dataset_id()
     table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
     q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
         dataset_id=dataset_id, table_id=table_id)
     expected_observation_ids = [
         int(row['observation_id']) for row in resources._csv_to_list(
             PITT_FIVE_PERSONS_OBSERVATION_CSV)
     ]
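     # upload the csv to the pitt bucket and kick off the CDM load job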
     with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
         gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                 'observation.csv', fp)
     result = bq_utils.load_cdm_csv(hpo_id, 'observation')
     job_id = result['jobReference']['jobId']
     incomplete_jobs = bq_utils.wait_on_jobs([job_id])
     self.assertEqual(len(incomplete_jobs), 0,
                      'pitt_observation load job did not complete')
     load_job_result = bq_utils.get_job_details(job_id)
     load_job_result_status = load_job_result['status']
     load_job_errors = load_job_result_status.get('errors')
     self.assertIsNone(load_job_errors,
                       msg='pitt_observation load job failed: ' +
                       str(load_job_errors))
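     # query the loaded table and compare against the ids read from the csv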
     query_results_response = bq_utils.query(q)
     query_job_errors = query_results_response.get('errors')
     self.assertIsNone(query_job_errors)
     actual_result = [
         int(row['f'][0]['v']) for row in query_results_response['rows']
     ]
     self.assertListEqual(actual_result, expected_observation_ids)
Example #4
    def test_pii_files_loaded(self, mock_check_cron):
        # tests if pii files are loaded
        test_file_paths = [
            test_util.PII_NAME_FILE, test_util.PII_MRN_BAD_PERSON_ID_FILE
        ]
        test_file_names = [os.path.basename(f) for f in test_file_paths]
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_NAME_FILE,
                                   prefix=self.folder_prefix)
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_MRN_BAD_PERSON_ID_FILE,
                                   prefix=self.folder_prefix)

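        # expected results come from the canned load-result csv; submission
        # files that were not uploaded are expected as (found=0, parsed=0, loaded=0)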
        rs = resources._csv_to_list(test_util.PII_FILE_LOAD_RESULT_CSV)
        expected_results = [(r['file_name'], int(r['found']), int(r['parsed']),
                             int(r['loaded'])) for r in rs]
        for f in common.SUBMISSION_FILES:
            if f not in test_file_names:
                expected_result = (f, 0, 0, 0)
                expected_results.append(expected_result)

        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket,
                                     bucket_items, self.folder_prefix)
        self.assertSetEqual(set(expected_results), set(r['results']))
Example #5
 def _load_datasets(self):
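     # load the five-person test data into both the chs and pitt hpo tables;
     # tables without a csv are loaded from a dummy file and come out empty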
     load_jobs = []
     self.expected_tables = dict()
     for cdm_table in common.CDM_TABLES:
         cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                      cdm_table + '.csv')
         result_table = ehr_merge.result_table_for(cdm_table)
         if os.path.exists(cdm_file_name):
             # one copy for chs, the other for pitt
             csv_rows = resources._csv_to_list(cdm_file_name)
             self.expected_tables[result_table] = csv_rows + list(csv_rows)
             test_util.write_cloud_file(self.chs_bucket, cdm_file_name)
             test_util.write_cloud_file(self.pitt_bucket, cdm_file_name)
         else:
             self.expected_tables[result_table] = []
             test_util.write_cloud_str(self.chs_bucket, cdm_table + '.csv',
                                       'dummy\n')
             test_util.write_cloud_str(self.pitt_bucket, cdm_table + '.csv',
                                       'dummy\n')
         chs_load_results = bq_utils.load_cdm_csv(CHS_HPO_ID, cdm_table)
         pitt_load_results = bq_utils.load_cdm_csv(PITT_HPO_ID, cdm_table)
         chs_load_job_id = chs_load_results['jobReference']['jobId']
         pitt_load_job_id = pitt_load_results['jobReference']['jobId']
         load_jobs.append(chs_load_job_id)
         load_jobs.append(pitt_load_job_id)
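     # wait for every load job; raise if any failed to complete in time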
     incomplete_jobs = bq_utils.wait_on_jobs(load_jobs)
     if len(incomplete_jobs) > 0:
         raise RuntimeError('BigQuery jobs %s failed to complete' %
                            incomplete_jobs)
Example #6
    def test_validate_five_persons_success(self, mock_check_cron):
        prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
        json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

        # upload all five_persons files
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=prefix)

        expected_tables = [
            'person', 'visit_occurrence', 'condition_occurrence',
            'procedure_occurrence', 'drug_exposure', 'measurement'
        ]
        cdm_files = [table + '.csv' for table in expected_tables]

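        # hit the validation endpoint, then check the objects written to the bucket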
        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in the bucket
            expected_object_names = cdm_files + common.IGNORE_LIST + json_export_files
            expected_objects = [
                prefix + item for item in expected_object_names
            ]

            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            actual_objects = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_objects), set(actual_objects))

            # result says file found, parsed, loaded
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, prefix + common.RESULT_CSV)
            actual_result_file = StringIO.StringIO(actual_result)
            actual_result_items = resources._csv_file_to_list(
                actual_result_file)

            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
            self.assertTrue(
                main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                               folder_prefix=prefix))

        # check tables exist and are clustered as expected
        for table in expected_tables:
            fields_file = os.path.join(resources.fields_path, table + '.json')
            table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
            table_info = bq_utils.get_table_info(table_id)
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self.table_has_clustering(table_info)
Example #7
 def get_nyc_cu_cols():
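     # Return the NYC CU column records, keeping only columns that map to a
     # standard OMOP table and attaching that table name to each record.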
     result = []
     cols = resources._csv_to_list(test_util.TEST_NYC_CU_COLS_CSV)
     for col in cols:
         omop_table_name = completeness.get_standard_table_name(
             col[consts.TABLE_NAME])
         if omop_table_name:
             col[consts.OMOP_TABLE_NAME] = omop_table_name
             result.append(col)
     return result
Example #8
    def test_validate_five_persons_success(self, mock_check_cron):
        prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
        json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

        # upload all five_persons files
        for cdm_file in test_util.FIVE_PERSONS_FILES:
            test_util.write_cloud_file(self.hpo_bucket,
                                       cdm_file,
                                       prefix=prefix)

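        # trigger validation, then inspect the bucket contents and result csv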
        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)

            # check the result file was put in the bucket
            expected_bucket_items = common.REQUIRED_FILES + common.IGNORE_LIST + json_export_files
            # keep this test the same, so add all the old required files
            expected_bucket_items = expected_bucket_items + [
                'measurement.csv', 'procedure_occurrence.csv',
                'drug_exposure.csv', 'condition_occurrence.csv',
                'visit_occurrence.csv'
            ]
            expected_bucket_items = [
                prefix + item for item in expected_bucket_items
            ]

            list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
            actual_bucket_items = [item['name'] for item in list_bucket_result]
            self.assertSetEqual(set(expected_bucket_items),
                                set(actual_bucket_items))

            # result says file found, parsed, loaded
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, prefix + common.RESULT_CSV)
            actual_result_file = StringIO.StringIO(actual_result)
            actual_result_items = resources._csv_file_to_list(
                actual_result_file)

            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)
            self.assertTrue(
                main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                               folder_prefix=prefix))
Example #9
    def load_test_data(self, hpo_id=None):
        """
        Load test achilles heel results data from csv file into bq

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """
        schema_path = os.path.join(resources.fields_path,
                                   common.ACHILLES_HEEL_RESULTS + '.json')
        table_id = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id = bq_utils.get_table_id(hpo_id,
                                             common.ACHILLES_HEEL_RESULTS)
        test_file_name = table_id + '.csv'
        test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
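        # upload the csv and load it into bq using the achilles heel results schema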
        test_util.write_cloud_file(self.bucket, test_file_path)
        gcs_path = 'gs://' + self.bucket + '/' + test_file_name
        load_results = bq_utils.load_csv(schema_path, gcs_path, self.app_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources._csv_to_list(test_file_path)
Example #10
    def test_pii_files_loaded(self, mock_check_cron):
        # tests if pii files are loaded
        folder_prefix = 'dummy-prefix-2018-03-22/'
        expected_result_items = resources._csv_to_list(
            test_util.PII_FILE_LOAD_RESULT_CSV)
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_NAME_FILE,
                                   prefix=folder_prefix)
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_MRN_BAD_PERSON_ID_FILE,
                                   prefix=folder_prefix)

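        # run validation and compare the generated result csv with the expected rows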
        main.app.testing = True
        with main.app.test_client() as c:
            c.get(test_util.VALIDATE_HPO_FILES_URL)
            actual_result = test_util.read_cloud_file(
                self.hpo_bucket, folder_prefix + common.RESULT_CSV)
            actual_result_file = StringIO.StringIO(actual_result)
            actual_result_items = resources._csv_file_to_list(
                actual_result_file)
            # sort in order to compare
            expected_result_items.sort()
            actual_result_items.sort()
            self.assertListEqual(expected_result_items, actual_result_items)