Example #1
    def test_merge_with_good_data(self):
        running_jobs = []
        with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('nyc', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                    'person.csv', fp)
        result = bq_utils.load_cdm_csv('pitt', 'person')
        running_jobs.append(result['jobReference']['jobId'])

        nyc_person_ids = [
            int(row['person_id'])
            for row in resources.csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
        ]
        pitt_person_ids = [
            int(row['person_id'])
            for row in resources.csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
        ]
        expected_result = nyc_person_ids + pitt_person_ids
        expected_result.sort()

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        self.assertEqual(
            len(incomplete_jobs), 0,
            'loading tables {},{} timed out'.format('nyc_person',
                                                    'pitt_person'))

        dataset_id = self.dataset_id
        table_ids = ['nyc_person', 'pitt_person']
        merged_table_id = 'merged_nyc_pitt'
        success_flag, error = bq_utils.merge_tables(dataset_id, table_ids,
                                                    dataset_id,
                                                    merged_table_id)

        self.assertTrue(success_flag)
        self.assertEqual(error, "")

        query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
            dataset_id=dataset_id, table_id=merged_table_id)
        merged_query_job_result = bq_utils.query(query_string)

        self.assertIsNone(merged_query_job_result.get('errors', None))
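        # rows come back in the BigQuery REST response format: each row is
        # {'f': [{'v': <value>}, ...]}, one entry per selected column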
        actual_result = [
            int(row['f'][0]['v']) for row in merged_query_job_result['rows']
        ]
        actual_result.sort()
        self.assertCountEqual(expected_result, actual_result)
Example #2
    def test_pii_files_loaded(self, mock_check_cron):
        # tests if pii files are loaded
        test_file_paths = [
            test_util.PII_NAME_FILE, test_util.PII_MRN_BAD_PERSON_ID_FILE
        ]
        test_file_names = [os.path.basename(f) for f in test_file_paths]
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_NAME_FILE,
                                   prefix=self.folder_prefix)
        test_util.write_cloud_file(self.hpo_bucket,
                                   test_util.PII_MRN_BAD_PERSON_ID_FILE,
                                   prefix=self.folder_prefix)

        rs = resources.csv_to_list(test_util.PII_FILE_LOAD_RESULT_CSV)
        expected_results = [(r['file_name'], int(r['found']), int(r['parsed']),
                             int(r['loaded'])) for r in rs]
        for f in common.SUBMISSION_FILES:
            if f not in test_file_names:
                expected_result = (f, 0, 0, 0)
                expected_results.append(expected_result)

        bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
        folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
        r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                     self.folder_prefix)
        self.assertSetEqual(set(expected_results), set(r['results']))
Example #3
    def test_load_ehr_observation(self):
        hpo_id = 'pitt'
        dataset_id = self.dataset_id
        table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
        q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
            dataset_id=dataset_id, table_id=table_id)
        expected_observation_ids = [
            int(row['observation_id'])
            for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
        ]
        sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
        bucket_blob = sc_bucket.blob('observation.csv')
        with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
            bucket_blob.upload_from_file(fp)
        result = bq_utils.load_cdm_csv(hpo_id, 'observation')
        job_id = result['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'pitt_observation load job did not complete')
        load_job_result = bq_utils.get_job_details(job_id)
        load_job_result_status = load_job_result['status']
        load_job_errors = load_job_result_status.get('errors')
        self.assertIsNone(load_job_errors,
                          msg='pitt_observation load job failed: ' +
                          str(load_job_errors))
        query_results_response = bq_utils.query(q)
        query_job_errors = query_results_response.get('errors')
        self.assertIsNone(query_job_errors)
        actual_result = [
            int(row['f'][0]['v']) for row in query_results_response['rows']
        ]
        self.assertCountEqual(actual_result, expected_observation_ids)
Example #4
    def test_create_dose_form_route_mappings_table_with_dataset_id(
            self, mock_query, mock_create_table):
        # pre conditions
        route_mappings_csv = os.path.join(
            resources.resource_files_path,
            populate_route_ids.DOSE_FORM_ROUTES_FILE + ".csv")
        dose_form_route_mappings = resources.csv_to_list(route_mappings_csv)
        mapping_list = populate_route_ids.get_mapping_list(
            dose_form_route_mappings)
        query_params = dict(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            routes_table_id=populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
            mapping_list=mapping_list)
        expected_query = populate_route_ids.INSERT_ROUTES_QUERY.format(
            **query_params)

        # test
        populate_route_ids.create_dose_form_route_mappings_table(
            self.project_id, self.dataset_id)

        # post conditions
        mock_query.assert_called_with(expected_query)
        mock_create_table.assert_called_with(
            populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
            populate_route_ids.DOSE_FORM_ROUTE_FIELDS,
            drop_existing=True,
            dataset_id=self.dataset_id)
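
The two mock arguments are presumably injected by stacked mock.patch decorators; decorators apply bottom-up, so the bottom patch supplies the first parameter. An illustrative (assumed) setup, with assumed patch targets:

    @mock.patch('populate_route_ids.bq_utils.create_table')  # assumed target
    @mock.patch('populate_route_ids.bq_utils.query')  # assumed target
    def test_create_dose_form_route_mappings_table_with_dataset_id(
            self, mock_query, mock_create_table):
        ...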
Example #5
def create_dose_form_route_mappings_table(project_id, dataset_id=None):
    """
    Creates "_logging_dose_form_route_mappings" table with only id columns from resources/dose_form_route_mappings.csv

    :param project_id:
    :param dataset_id: BQ dataset_id
    :return: upload metadata for created table
    """
    if dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        dataset_id = bq_utils.get_dataset_id()

    LOGGER.info("Creating %s.%s", dataset_id, DOSE_FORM_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DOSE_FORM_ROUTES_TABLE_ID,
                          DOSE_FORM_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    dose_form_route_mappings_csv = os.path.join(resources.resource_path,
                                                DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings_list = resources.csv_to_list(
        dose_form_route_mappings_csv)
    dose_form_routes_populate_query = INSERT_ROUTES_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        routes_table_id=DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=get_mapping_list(dose_form_route_mappings_list))
    result = bq_utils.query(dose_form_routes_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, dose_form_routes_table_id)
    return result
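
A minimal usage sketch, assuming placeholder project and dataset ids (not values from the source):

project_id = 'my-project'  # placeholder
dataset_id = 'my_dataset'  # placeholder
# creates and populates <dataset_id>._logging_dose_form_route_mappings from the csv
response = create_dose_form_route_mappings_table(project_id, dataset_id)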
Example #6
    def test_measurement_concept_sets_table(self):
        query = sql_wrangle.qualify_tables(
            '''SELECT * FROM {dataset_id}.{table_id}'''.format(
                dataset_id=self.dataset_id,
                table_id=MEASUREMENT_CONCEPT_SETS_TABLE))
        response = bq_utils.query(query)

        actual_fields = [{
            'name': field['name'].lower(),
            'type': field['type'].lower()
        } for field in response['schema']['fields']]

        expected_fields = [{
            'name': field['name'].lower(),
            'type': field['type'].lower()
        } for field in resources.fields_for(MEASUREMENT_CONCEPT_SETS_TABLE)]

        self.assertListEqual(expected_fields, actual_fields)

        measurement_concept_sets_table_path = os.path.join(
            resources.resource_path, MEASUREMENT_CONCEPT_SETS_TABLE + '.csv')
        expected_total_rows = len(
            resources.csv_to_list(measurement_concept_sets_table_path))
        self.assertEqual(expected_total_rows, int(response['totalRows']))
Example #7
    def load_test_data(self, hpo_id: str = None):
        """
        Load to bq test achilles heel results data from csv file

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """

        table_name: str = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id: str = bq_utils.get_table_id(hpo_id, table_name)
        else:
            table_id: str = table_name
        test_file_name: str = f'{table_id}.csv'
        test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                           test_file_name)

        target_bucket = self.storage_client.get_bucket(self.bucket)
        test_blob = target_bucket.blob(test_file_name)
        test_blob.upload_from_filename(test_file_path)

        gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
        load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources.csv_to_list(test_file_path)
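
A sketch of how a test might use this helper (the hpo id is illustrative):

        expected_rows = self.load_test_data(hpo_id='nyc')
        # expected_rows can then be compared against the rows loaded into bq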
Example #8
    def _load_datasets(self):
        """
        Load five persons data for nyc and pitt test hpo and rdr data for the excluded_hpo
        # expected_tables is for testing output
        # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables: dict = {}
        running_jobs: list = []
        for cdm_table in resources.CDM_TABLES:
            output_table: str = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                cdm_filename: str = f'{cdm_table}.csv'
                # avoid reusing a stale path from a previous loop iteration
                cdm_filepath: str = ''
                if hpo_id == NYC_HPO_ID:
                    cdm_filepath: str = os.path.join(
                        test_util.FIVE_PERSONS_PATH, cdm_filename)
                elif hpo_id == PITT_HPO_ID:
                    cdm_filepath: str = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_filename)
                elif hpo_id == EXCLUDED_HPO_ID:
                    if cdm_table in [
                            'observation', 'person', 'visit_occurrence'
                    ]:
                        cdm_filepath: str = os.path.join(
                            test_util.RDR_PATH, cdm_filename)
                bucket: str = gcs_utils.get_hpo_bucket(hpo_id)
                gcs_bucket = self.storage_client.get_bucket(bucket)
                if os.path.exists(cdm_filepath):
                    csv_rows = resources.csv_to_list(cdm_filepath)
                    cdm_blob = gcs_bucket.blob(cdm_filename)
                    cdm_blob.upload_from_filename(cdm_filepath)
                else:
                    # results in empty table
                    cdm_blob = gcs_bucket.blob(cdm_filename)
                    cdm_blob.upload_from_string('dummy\n')
                    csv_rows: list = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                if hpo_id != EXCLUDED_HPO_ID:
                    expected_tables[output_table] += list(csv_rows)
        # ensure person to observation output is as expected
        output_table_person: str = ehr_union.output_table_for(common.PERSON)
        output_table_observation: str = ehr_union.output_table_for(
            common.OBSERVATION)
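        # the person-to-observation step is expected to yield 4 observation rows
        # per person row, hence the 4x multiplier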
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs: list = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message: str = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
Example #9
def get_nyc_cu_cols():
    """
    Return the test NYC CU columns whose table names map to a standard OMOP table
    """
    result = []
    cols = resources.csv_to_list(test_util.TEST_NYC_CU_COLS_CSV)
    for col in cols:
        omop_table_name = completeness.get_standard_table_name(
            col[consts.TABLE_NAME])
        if omop_table_name:
            col[consts.OMOP_TABLE_NAME] = omop_table_name
            result.append(col)
    return result
Example #10
    def _load_datasets(self):
        """
        Load five persons data for each test hpo
        # expected_tables is for testing output
        # it maps table name to list of expected records ex: "unioned_ehr_visit_occurrence" -> [{}, {}, ...]
        """
        expected_tables = dict()
        running_jobs = []
        for cdm_table in resources.CDM_TABLES:
            output_table = ehr_union.output_table_for(cdm_table)
            expected_tables[output_table] = []
            for hpo_id in self.hpo_ids:
                # upload csv into hpo bucket
                if hpo_id == NYC_HPO_ID:
                    cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                                 cdm_table + '.csv')
                else:
                    cdm_file_name = os.path.join(
                        test_util.PITT_FIVE_PERSONS_PATH, cdm_table + '.csv')
                bucket = gcs_utils.get_hpo_bucket(hpo_id)
                if os.path.exists(cdm_file_name):
                    test_util.write_cloud_file(bucket, cdm_file_name)
                    csv_rows = resources.csv_to_list(cdm_file_name)
                else:
                    # results in empty table
                    test_util.write_cloud_str(bucket, cdm_table + '.csv',
                                              'dummy\n')
                    csv_rows = []
                # load table from csv
                result = bq_utils.load_cdm_csv(hpo_id, cdm_table)
                running_jobs.append(result['jobReference']['jobId'])
                expected_tables[output_table] += list(csv_rows)
        # ensure person to observation output is as expected
        output_table_person = ehr_union.output_table_for(
            combine_ehr_rdr.PERSON_TABLE)
        output_table_observation = ehr_union.output_table_for(
            combine_ehr_rdr.OBSERVATION_TABLE)
        expected_tables[output_table_observation] += 4 * expected_tables[
            output_table_person]

        incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
        if len(incomplete_jobs) > 0:
            message = "Job id(s) %s failed to complete" % incomplete_jobs
            raise RuntimeError(message)
        self.expected_tables = expected_tables
Example #11
def load_table_from_csv(project_id,
                        dataset_id,
                        table_name,
                        csv_path=None,
                        fields=None):
    """
    Loads a BQ table from a csv file without making use of GCS buckets

    :param project_id: project containing the dataset
    :param dataset_id: dataset where the table needs to be created
    :param table_name: name of the table to be created
    :param csv_path: path to the csv file which needs to be loaded into BQ.
                     If None, assumes that the file exists in the resource_files folder with the name table_name.csv
    :param fields: fields in list of dicts format. If set to None, assumes that
                   the fields are stored in a json file in resource_files/fields named table_name.json
    :return: BQ response for the insert query used to populate the table
    """
    if csv_path is None:
        csv_path = os.path.join(resources.resource_files_path,
                                table_name + ".csv")
    table_list = resources.csv_to_list(csv_path)

    if fields is None:
        fields_filename = os.path.join(resources.fields_path,
                                       table_name + '.json')
        with open(fields_filename, 'r') as f:
            fields = json.load(f)
    field_names = ', '.join([field['name'] for field in fields])
    row_exprs = [csv_line_to_sql_row_expr(t, fields) for t in table_list]
    formatted_mapping_list = ', '.join(row_exprs)

    create_table(table_id=table_name,
                 fields=fields,
                 drop_existing=True,
                 dataset_id=dataset_id)

    table_populate_query = bq_consts.INSERT_QUERY.format(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_name,
        columns=field_names,
        mapping_list=formatted_mapping_list)
    result = query(table_populate_query)
    return result
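
A hedged usage sketch (the table name, schema, and paths below are illustrative, not from the source):

fields = [{'name': 'concept_id', 'type': 'integer',
           'mode': 'required', 'description': 'illustrative field'}]
response = load_table_from_csv('my-project', 'my_dataset', 'my_table',
                               csv_path='/tmp/my_table.csv', fields=fields)
# with csv_path=None and fields=None, the csv and field definitions are read
# from the resource_files folders instead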
Example #12
    def load_test_data(self, hpo_id=None):
        """
        Load to bq test achilles heel results data from csv file

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """

        table_name = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id = bq_utils.get_table_id(hpo_id, table_name)
        else:
            table_id = table_name
        test_file_name = table_id + '.csv'
        test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
        test_util.write_cloud_file(self.bucket, test_file_path)
        gcs_path = 'gs://' + self.bucket + '/' + test_file_name
        load_results = bq_utils.load_csv(table_name, gcs_path, self.app_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources.csv_to_list(test_file_path)
Example #13
def create_unit_mapping_table(project_id, dataset_id):
    """
    This function creates the unit_mapping table and populate it with the values from resources/unit_mapping.csv
    :param project_id:
    :param dataset_id:
    :return:
    """
    bq_utils.create_table(table_id=UNIT_MAPPING_TABLE,
                          fields=UNIT_MAPPING_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    unit_mappings_csv = os.path.join(resources.resource_path,
                                     UNIT_MAPPING_FILE + ".csv")
    unit_mappings_list = resources.csv_to_list(unit_mappings_csv)
    unit_mappings_populate_query = INSERT_UNITS_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        units_table_id=UNIT_MAPPING_TABLE,
        mapping_list=get_mapping_list(unit_mappings_list))
    result = bq_utils.query(unit_mappings_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, UNIT_MAPPING_TABLE)
    return result
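
As with the route-mapping helper above, a minimal sketch with placeholder ids:

response = create_unit_mapping_table('my-project', 'my_dataset')
# creates and populates <dataset>.unit_mapping from resources/unit_mapping.csv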