Example #1
    def test_merge_EHR(self, mock_check_cron):
        self._load_datasets()
        # enable exception propagation as described at https://goo.gl/LqDgnj
        old_dataset_items = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        expected_items = ['visit_id_mapping_table']
        expected_items.extend(
            ['unioned_ehr_' + table_name for table_name in common.CDM_TABLES])

        ehr_merge.merge(bq_utils.get_dataset_id(), self.project_id)
        # check the merged tables were created and contain the expected number of rows
        dataset_items = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        for table_name in common.CDM_TABLES:
            cmd = 'SELECT COUNT(1) FROM unioned_ehr_{}'.format(table_name)
            result = bq_utils.query(cmd)
            self.assertEqual(
                int(result['rows'][0]['f'][0]['v']),
                2 * globals().get(table_name.upper() + '_COUNT', 0),
                msg='failed for table unioned_ehr_{}'.format(table_name))
        self.assertSetEqual(set(old_dataset_items + expected_items),
                            set(dataset_items))

        table_name = 'condition_occurrence'
        cmd_union = 'SELECT * FROM unioned_ehr_{}'.format(table_name)
        cmd_pitt = 'SELECT * FROM pitt_{}'.format(table_name)
        cmd_visit_mapping = "SELECT global_visit_id, mapping_visit_id FROM visit_id_mapping_table where hpo='pitt'"
        qr_union = bq_utils.query(cmd_union)
        qr_pitt = bq_utils.query(cmd_pitt)
        qr_visit_mapping = bq_utils.query(cmd_visit_mapping)

        union_result = query_result_to_payload(qr_union)
        pitt_result = query_result_to_payload(qr_pitt)
        visit_mapping_result = query_result_to_payload(qr_visit_mapping)

        def get_element_from_list_of_lists(index, list_of_lists):
            return [list_item[index] for list_item in list_of_lists]

        for ind, pitt_visit_id in enumerate(
                pitt_result['VISIT_OCCURRENCE_ID']):
            if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
                continue
            global_visit_id_index = visit_mapping_result[
                'MAPPING_VISIT_ID'].index(pitt_visit_id)
            global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
                global_visit_id_index]
            union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
                global_visit_id)
            pitt_cols_without_id = [
                values for key, values in pitt_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            union_cols_without_id = [
                values for key, values in union_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            self.assertListEqual(
                get_element_from_list_of_lists(ind, pitt_cols_without_id),
                get_element_from_list_of_lists(union_visit_id_index,
                                               union_cols_without_id))
Example #2
def merge(dataset_id, project_id):
    """merge hpo ehr data

    :dataset_id: source and target dataset
    :project_id: project in which everything happens
    :returns: list of tables generated successfully

    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)
    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)
    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # drop and (re)create all result tables up front so they are set up properly before the merge queries run
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable'][
                'tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' %
                              (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
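
A minimal driver sketch for the merge entry point above. The dataset lookup mirrors the tests elsewhere in this listing; the project id is a placeholder, not something taken from the source.

import logging

import bq_utils
import ehr_merge

logging.basicConfig(level=logging.INFO)

# bq_utils.get_dataset_id() is how the tests above select the source/target dataset;
# 'my-gcp-project' is a placeholder project id.
dataset_id = bq_utils.get_dataset_id()
tables_created = ehr_merge.merge(dataset_id, 'my-gcp-project')
logging.info('Tables written by merge: %s' % tables_created)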
Example #3
def assert_tables_in(dataset_id):
    """
    Raise RuntimeError if any CDM tables are missing from a dataset
    :param dataset_id: dataset to check for tables in
    """
    tables = bq_utils.list_dataset_contents(dataset_id)
    logger.debug('Dataset {dataset_id} has tables: {tables}'.format(dataset_id=dataset_id, tables=tables))
    for table in TABLES_TO_PROCESS:
        if table not in tables:
            raise RuntimeError(
                'Dataset {dataset} is missing table {table}. Aborting.'.format(dataset=dataset_id, table=table))
Example #4
    def tearDown(self):
        delete_list = ['visit_id_mapping_table'] + [
            'unioned_ehr_' + table_name for table_name in common.CDM_TABLES
        ]
        existing_tables = bq_utils.list_dataset_contents(
            bq_utils.get_dataset_id())
        for table_id in delete_list:
            if table_id not in common.VOCABULARY_TABLES and table_id in existing_tables:
                bq_utils.delete_table(table_id)
        self._empty_bucket(self.hpo_bucket)
        self.testbed.deactivate()
Example #5
def get_output_tables(input_dataset, known_tables, skip_tables, only_tables):
    """
    Get list of output tables deid should produce.

    Specifically excludes table names that start with underscores, pii, or
    are explicitly suppressed.

    :param input_dataset:  dataset to read when gathering all possible table names.
    :param known_tables:  list of tables known to curation.  If a table exists in
        the input dataset but is not known to curation, it is skipped.
    :param skip_tables:  command line csv string of tables to skip for deid.
        Useful to perform deid on a subset of tables.
    :param only_tables:  command line csv string of tables to deid exclusively.
        An empty string applies no restriction.

    :return: a list of table names to execute deid over.
    """
    tables = bq_utils.list_dataset_contents(input_dataset)
    skip_tables = [table.strip() for table in skip_tables.split(',')]
    only_tables = [table.strip() for table in only_tables.split(',')]

    allowed_tables = []
    for table in tables:
        if table.startswith('_'):
            continue
        if table.startswith('pii'):
            continue
        if table in SUPPRESSED_TABLES:
            continue
        # doing this to eliminate the 'deid_map' table and any other non-OMOP table
        if table not in known_tables:
            continue
        if table in skip_tables:
            continue

        if (only_tables == ['']
                or table in only_tables) and table in DEID_TABLES:
            allowed_tables.append(table)

    return allowed_tables
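
A hypothetical call showing how the CSV-style arguments are interpreted; the dataset id and skip list are placeholders, and known_tables is assumed here to be the CDM table list used elsewhere in this listing.

tables = get_output_tables(
    input_dataset='unioned_ehr_deid',        # placeholder dataset id
    known_tables=common.CDM_TABLES,          # assumed source of "known" tables
    skip_tables='observation_period, note',  # skipped even if present and known
    only_tables='')                          # '' -> no whitelist; keep every DEID table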
Example #6
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: results of the field comparison for each hpo
    """
    LOGGER.info(f"Calling match_participants with:\n"
                f"project:\t{project}\n"
                f"rdr_dataset:\t{rdr_dataset}\n"
                f"ehr_dataset:\t{ehr_dataset}\n"
                f"dest_dataset_id:\t{dest_dataset_id}\n")

    ehr_tables = bq_utils.list_dataset_contents(ehr_dataset)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info(
        f"Created new validation results dataset:\t{validation_dataset}")

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    # validate first names
    for site in hpo_sites:
        LOGGER.info(f"Beginning identity validation for site: {site}")
        results = {}

        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_FIRST,
                                                consts.FIRST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.FIRST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.FIRST_NAME_FIELD)
            LOGGER.info(f"Validated first names for: {site}")

        # validate last names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_LAST,
                                                consts.LAST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.LAST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.LAST_NAME_FIELD)
            LOGGER.info(f"Validated last names for: {site}")

        # validate middle names
        try:
            match_values = None
            # match_values = _compare_name_fields(
            #     project,
            #     validation_dataset,
            #     ehr_dataset,
            #     site,
            #     consts.OBS_PII_NAME_MIDDLE,
            #     consts.MIDDLE_NAME_FIELD,
            #     ehr_tables
            # )
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.MIDDLE_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            # results = _add_matches_to_results(results, match_values, consts.MIDDLE_NAME_FIELD)
            LOGGER.info("Not validating middle names")

        # validate zip codes
        try:
            match_values = None
            match_values = _compare_zip_codes(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD,
                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.ZIP_CODE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.ZIP_CODE_FIELD)
            LOGGER.info(f"Validated zip codes for: {site}")

        # validate city
        try:
            match_values = None
            match_values = _compare_cities(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_CITY,
                                           consts.CITY_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.CITY_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.CITY_FIELD)
            LOGGER.info(f"Validated city names for: {site}")

        # validate state
        try:
            match_values = None
            match_values = _compare_states(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_STATE,
                                           consts.STATE_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.STATE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.STATE_FIELD)
            LOGGER.info(f"Validated states for: {site}")

        # validate street addresses
        try:
            address_one_matches = None
            address_two_matches = None
            match_values = None
            address_one_matches, address_two_matches = _compare_street_addresses(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ONE,
                consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
                consts.ADDRESS_TWO_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for fields: {consts.ADDRESS_ONE_FIELD}, {consts.ADDRESS_TWO_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, address_one_matches,
                                              consts.ADDRESS_ONE_FIELD)
            results = _add_matches_to_results(results, address_two_matches,
                                              consts.ADDRESS_TWO_FIELD)
            LOGGER.info(f"Validated street addresses for: {site}")

        # validate email addresses
        try:
            match_values = None
            match_values = _compare_email_addresses(
                project, validation_dataset, ehr_dataset, site,
                consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.EMAIL_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.EMAIL_FIELD)
            LOGGER.info(f"Validated email addresses for: {site}")

        # validate phone numbers
        try:
            match_values = None
            match_values = _compare_phone_numbers(project, validation_dataset,
                                                  ehr_dataset, site,
                                                  consts.OBS_PII_PHONE,
                                                  consts.PHONE_NUMBER_FIELD,
                                                  ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.PHONE_NUMBER_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.PHONE_NUMBER_FIELD)
            LOGGER.info(f"Validated phone numbers for: {site}")

        # validate genders
        try:
            match_values = None
            match_values = _compare_genders(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_SEX, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.SEX_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.SEX_FIELD)
            LOGGER.info(f"Validated genders for: {site}")

        # validate birth dates
        try:
            match_values = None
            match_values = _compare_birth_dates(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_BIRTH_DATETIME,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.BIRTH_DATETIME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.BIRTH_DATE_FIELD)
            LOGGER.info(f"Validated birth dates for: {site}")

        LOGGER.info(f"Writing results to BQ table")
        # write dictionary to a table
        try:
            writers.write_to_result_table(project, validation_dataset, site,
                                          results)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                f"Did not write site information to validation dataset:  {site}"
            )
            write_errors += 1

        LOGGER.info(f"Wrote validation results for site: {site}")

    LOGGER.info(f"FINISHED: Validation dataset created:  {validation_dataset}")

    if read_errors > 0:
        LOGGER.error(
            f"Encountered {read_errors} read errors creating validation dataset:\t{validation_dataset}"
        )

    if write_errors > 0:
        LOGGER.error(
            f"Encountered {write_errors} write errors creating validation dataset:\t{validation_dataset}"
        )

    return read_errors + write_errors
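
A hedged invocation sketch for the entry point above; every identifier is a placeholder. The function appends a date suffix to dest_dataset_id when one is not already present and returns the combined read/write error count.

errors = match_participants(
    project='my-project',              # placeholder GCP project
    rdr_dataset='rdr_export',          # placeholder RDR results dataset
    ehr_dataset='unioned_ehr',         # placeholder EHR dataset holding PII tables
    dest_dataset_id='identity_match')  # date string appended automatically
if errors:
    raise RuntimeError('%d read/write errors during participant matching' % errors)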
Example #7
File: aou.py  Project: dcarbone/curation
def create_person_id_src_hpo_map(input_dataset, credentials):
    """
    Create a table containing person_ids and src_hpo_ids

    :param input_dataset:  the input dataset to deid
    :param credentials:  the credentials needed to create a new table.
    """
    map_tablename = "_mapping_person_src_hpos"
    sql = ("select person_id, src_hpo_id "
           "from {input_dataset}._mapping_{table} "
           "join {input_dataset}.{table} "
           "using ({table}_id) "
           "where src_hpo_id not like 'rdr'")

    # list dataset contents
    dataset_tables = bq_utils.list_dataset_contents(input_dataset)
    mapping_tables = []
    mapped_tables = []
    for table in dataset_tables:
        if table.startswith('_mapping_'):
            mapping_tables.append(table)
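            # 9 == len('_mapping_'); keep the name of the table being mapped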
            mapped_tables.append(table[9:])

    # make sure mapped tables all exist
    check_tables = []
    for table in mapped_tables:
        if table in dataset_tables:
            check_tables.append(table)

    # make sure check_tables contain person_id fields
    person_id_tables = []
    for table in check_tables:
        info = bq_utils.get_table_info(table, dataset_id=input_dataset)
        schema = info.get('schema', {})
        for field_info in schema.get('fields', []):
            if 'person_id' in field_info.get('name'):
                person_id_tables.append(table)

    # revamp mapping tables to contain only mapping tables for tables
    # with person_id fields
    mapping_tables = ['_mapping_' + table for table in person_id_tables]

    sql_statement = []
    for table in person_id_tables:
        sql_statement.append(
            sql.format(table=table, input_dataset=input_dataset))

    final_query = ' UNION ALL '.join(sql_statement)

    # create the mapping table
    if map_tablename not in dataset_tables:
        fields = [{
            "type": "integer",
            "name": "person_id",
            "mode": "required",
            "description": "the person_id of someone with an ehr record"
        }, {
            "type": "string",
            "name": "src_hpo_id",
            "mode": "required",
            "description": "the src_hpo_id of an ehr record"
        }]
        bq_utils.create_table(map_tablename, fields, dataset_id=input_dataset)

    bq_utils.query(final_query,
                   destination_table_id=map_tablename,
                   destination_dataset_id=input_dataset,
                   write_disposition=bq_consts.WRITE_TRUNCATE)
    LOGGER.info(f"Created mapping table:\t{input_dataset}.{map_tablename}")
Example #8
    def test_merge_EHR(self, mock_check_cron):
        self._load_datasets()
        dataset_id = bq_utils.get_dataset_id()
        old_dataset_items = bq_utils.list_dataset_contents(dataset_id)
        expected_items = ['visit_id_mapping_table']
        expected_items.extend([
            ehr_merge.result_table_for(table_name)
            for table_name in common.CDM_TABLES
        ])

        ehr_merge.merge(dataset_id, self.project_id)

        # Check row counts for each output table
        dataset_items = bq_utils.list_dataset_contents(dataset_id)
        for table_name in common.CDM_TABLES:
            result_table = ehr_merge.result_table_for(table_name)
            expected_rows = self.expected_tables[result_table]
            expected_count = len(expected_rows)
            table_info = bq_utils.get_table_info(result_table)
            actual_count = int(table_info.get('numRows'))
            msg = 'Unexpected row count in table {result_table} after ehr union'.format(
                result_table=result_table)
            self.assertEqual(actual_count, expected_count, msg)

            # Check for clustering if table has person_id
            fields_file = os.path.join(resources.fields_path,
                                       table_name + '.json')
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self._table_has_clustering(table_info)
        self.assertSetEqual(set(old_dataset_items + expected_items),
                            set(dataset_items))

        table_name = 'condition_occurrence'
        hpo_id = 'pitt'
        result_table = ehr_merge.result_table_for(table_name)
        pitt_table = bq_utils.get_table_id(hpo_id, table_name)
        cmd_union = 'SELECT * FROM ' + result_table
        cmd_pitt = 'SELECT * FROM ' + pitt_table
        cmd_visit_mapping = """
          SELECT global_visit_id, 
                 mapping_visit_id 
          FROM visit_id_mapping_table 
          WHERE hpo='{hpo_id}'""".format(hpo_id=hpo_id)
        qr_union = bq_utils.query(cmd_union)
        qr_pitt = bq_utils.query(cmd_pitt)
        qr_visit_mapping = bq_utils.query(cmd_visit_mapping)

        union_result = query_result_to_payload(qr_union)
        pitt_result = query_result_to_payload(qr_pitt)
        visit_mapping_result = query_result_to_payload(qr_visit_mapping)

        def get_element_from_list_of_lists(index, list_of_lists):
            return [list_item[index] for list_item in list_of_lists]

        for ind, pitt_visit_id in enumerate(
                pitt_result['VISIT_OCCURRENCE_ID']):
            if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
                continue
            global_visit_id_index = visit_mapping_result[
                'MAPPING_VISIT_ID'].index(pitt_visit_id)
            global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
                global_visit_id_index]
            union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
                global_visit_id)
            pitt_cols_without_id = [
                values for key, values in pitt_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            union_cols_without_id = [
                values for key, values in union_result.items() if key not in
                [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
            ]
            self.assertListEqual(
                get_element_from_list_of_lists(ind, pitt_cols_without_id),
                get_element_from_list_of_lists(union_visit_id_index,
                                               union_cols_without_id))