Example #1
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_app_identity_stub()
        self.testbed.init_memcache_stub()
        self.testbed.init_urlfetch_stub()
        self.testbed.init_blobstore_stub()
        self.testbed.init_datastore_v3_stub()
        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        self._empty_hpo_buckets()
        test_util.delete_all_tables(self.input_dataset_id)
        test_util.delete_all_tables(self.output_dataset_id)

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
            eu_constants.LOCATION_ID
        ]
Example #2
    def test_queries_to_retract_from_combined_or_deid_dataset(self, mock_list_existing_tables):
        existing_table_ids = []
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            existing_table_ids.append(cdm_table)
            if cdm_table not in self.tables_to_retract_combined:
                ignored_tables.append(cdm_table)

        mapped_tables = cdm.tables_to_map()
        for mapped_table in mapped_tables:
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            if mapped_table not in self.tables_to_retract_combined:
                ignored_tables.append(mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_combined_or_deid_dataset(self.project_id,
                                                                                   self.combined_dataset_id,
                                                                                   self.person_ids)
        actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs+mqs)
        expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)

        # death query should use person_id as-is (no constant factor)
        constant_factor = common.RDR_ID_CONSTANT + common.ID_CONSTANT_FACTOR
        for q in qs:
            if q[retract_data_bq.DEST_TABLE] == common.DEATH:
                self.assertNotIn(str(constant_factor), q[retract_data_bq.QUERY])
Example #3
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    Get the queries required to remove rows with duplicate primary keys (ids) from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        if 'unioned' in dataset_id:
            table_name = 'unioned_ehr_{table}'.format(table=table)
        else:
            table_name = table
        if bq_utils.table_exists(table_name, dataset_id):
            fields = resources.fields_for(table)
            # Generate column expressions for select
            col_exprs = [field['name'] for field in fields]
            cols = ',\n        '.join(col_exprs)
            query = ID_DE_DUP_QUERY.format(columns=cols,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           domain_table=table,
                                           table_name=table_name)
            queries.append(query)
    return queries
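Note: ID_DE_DUP_QUERY is referenced above but not defined in this snippet. A minimal sketch of what such a template might contain, assuming a ROW_NUMBER-based de-duplication keyed on the table's primary key (the exact SQL is an assumption, not the project's actual constant):

# Hypothetical template: keep one row per primary key by ranking
# duplicates with ROW_NUMBER and selecting only the first-ranked row.
ID_DE_DUP_QUERY = (
    'SELECT {columns} '
    'FROM (SELECT *, '
    '  ROW_NUMBER() OVER (PARTITION BY {domain_table}_id) AS row_num '
    '  FROM `{project_id}.{dataset_id}.{table_name}`) '
    'WHERE row_num = 1')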
Example #4
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    Get the queries required to remove rows with duplicate primary keys (ids) from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        table_name = table
        fields = resources.fields_for(table)
        # Generate column expressions for select
        col_exprs = [field['name'] for field in fields]
        cols = ', '.join(col_exprs)
        query = dict()
        query[cdr_consts.QUERY] = ID_DE_DUP_QUERY.format(columns=cols,
                                                         project_id=project_id,
                                                         dataset_id=dataset_id,
                                                         domain_table=table,
                                                         table_name=table_name)

        query[cdr_consts.DESTINATION_TABLE] = table
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        queries.append(query)
    return queries
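These query dicts pair SQL with a destination table and a write disposition. A sketch of how a caller might execute them with the standard google-cloud-bigquery client (the runner below is illustrative, not the project's actual clean-rules engine, and assumes bq_consts.WRITE_TRUNCATE is the string 'WRITE_TRUNCATE'):

from google.cloud import bigquery

def run_queries(client, queries):
    # Hypothetical runner: write each query's result to its destination
    # table using the disposition recorded in the dict.
    for q in queries:
        dest = (f"{client.project}.{q[cdr_consts.DESTINATION_DATASET]}"
                f".{q[cdr_consts.DESTINATION_TABLE]}")
        job_config = bigquery.QueryJobConfig(
            destination=bigquery.TableReference.from_string(dest),
            write_disposition=q[cdr_consts.DISPOSITION])
        client.query(q[cdr_consts.QUERY], job_config=job_config).result()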
Example #5
def get_query_result(hpo_id,
                     query_string,
                     table_id,
                     query_wrapper,
                     is_subquery,
                     app_id=None,
                     dataset_id=None):
    """
    :param hpo_id: the name of the hpo_id for which validation is being done
    :param query_string: variable name of the query string stored in the constants
    :param table_id: Name of the table running analysis on
    :param query_wrapper: wrapper over the unioned query if required
    :param is_subquery: binary flag(true/false) to indicate if parsing is needed or not.
    :param app_id: name of the big query application id
    :param dataset_id: name of the big query dataset id
    :return: returns dictionary of rows
    """
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    query = None
    result = None
    if is_subquery:
        sub_queries = []
        for table in cdm.tables_to_map():
            hpo_table = '{hpo_id}_{table_name}'.format(hpo_id=hpo_id,
                                                       table_name=table)
            if bq_utils.table_exists(hpo_table):
                sub_query = query_string.format(hpo_id=hpo_id,
                                                app_id=app_id,
                                                dataset_id=dataset_id,
                                                domain_table=table)
                sub_queries.append(sub_query)
        unioned_query = main_constants.UNION_ALL.join(sub_queries)
        if unioned_query and query_wrapper is not None:
            query = query_wrapper.format(union_of_subqueries=unioned_query)
        else:
            query = unioned_query
    else:
        table_name = '{hpo_name}_{results_table}'.format(
            hpo_name=hpo_id, results_table=table_id)
        if bq_utils.table_exists(table_name):
            query = query_string.format(application=app_id,
                                        dataset=dataset_id,
                                        table_id=table_name)
    if query:
        # Found achilles_heel_results table(s), run the query
        response = bq_utils.query(query)
        result = bq_utils.response2rows(response)
    if result is None:
        result = []
    return result
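A usage sketch for the non-subquery path, assuming a query template constant and an achilles heel results table (the constant name HEEL_ERRORS_QUERY and the hpo id are illustrative only):

# Hypothetical call: fetch heel results for one HPO; returns [] if the
# site's achilles_heel_results table does not exist.
rows = get_query_result(hpo_id='pitt',
                        query_string=main_constants.HEEL_ERRORS_QUERY,
                        table_id='achilles_heel_results',
                        query_wrapper=None,
                        is_subquery=False)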
Example #6
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids_ex=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids_ex: (optional) list of HPO ids to exclude; by default all HPOs are processed
    :returns: list of tables generated successfully
    """
    client = get_client(project_id)

    logging.info('EHR union started')
    # Get all hpo_ids.
    hpo_ids = [item['hpo_id'] for item in bq_utils.get_hpo_info()]
    if hpo_ids_ex:
        hpo_ids = [hpo_id for hpo_id in hpo_ids if hpo_id not in hpo_ids_ex]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in resources.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info(f'Creating {output_dataset_id}.{result_table}...')
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in cdm.tables_to_map():
        logging.info(f'Mapping {domain_table}...')
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id, client)

    # Load all tables with union of submitted tables
    for table_name in resources.CDM_TABLES:
        logging.info(f'Creating union of table {table_name}...')
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)

    logging.info('Creation of Unioned EHR complete')

    # create person mapping table
    domain_table = common.PERSON
    logging.info(f'Mapping {domain_table}...')
    mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
            project_id, client)

    logging.info('Starting process for Person to Observation')
    # Map and move EHR person records into four rows in observation, one each for race, ethnicity, dob and gender
    map_ehr_person_to_observation(output_dataset_id)
    move_ehr_person_to_observation(output_dataset_id)

    logging.info('Completed Person to Observation')
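mapping_table_for and output_table_for are not defined in this snippet. Sketches consistent with the table names that appear elsewhere in this section (_mapping_measurement and unioned_ehr_fact_relationship in Example #14); the exact prefixes are assumptions:

def mapping_table_for(domain_table):
    # e.g. 'measurement' -> '_mapping_measurement'
    return '_mapping_' + domain_table

def output_table_for(table_name):
    # e.g. 'fact_relationship' -> 'unioned_ehr_fact_relationship'
    return 'unioned_ehr_' + table_name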
Example #7
    def get_query_specs(self, *args, **keyword_args) -> query_spec_list:

        sandbox_queries = []
        # iterate through the list of CDM tables with an id column
        for table_name in cdm.tables_to_map():
            sandbox_queries.append({
                cdr_consts.QUERY:
                ID_DE_DUP_SANDBOX_QUERY_TEMPLATE.render(
                    project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table_name=table_name),
                cdr_consts.DESTINATION_TABLE:
                self.sandbox_table_for(table_name),
                cdr_consts.DISPOSITION:
                WRITE_TRUNCATE,
                cdr_consts.DESTINATION_DATASET:
                self.sandbox_dataset_id
            })

        queries = []
        # iterate through the list of CDM tables with an id column
        for table_name in cdm.tables_to_map():
            queries.append({
                cdr_consts.QUERY:
                ID_DE_DUP_QUERY_TEMPLATE.render(project_id=self.project_id,
                                                dataset_id=self.dataset_id,
                                                table_name=table_name),
                cdr_consts.DESTINATION_TABLE:
                table_name,
                cdr_consts.DISPOSITION:
                WRITE_TRUNCATE,
                cdr_consts.DESTINATION_DATASET:
                self.dataset_id
            })

        return sandbox_queries + queries
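The two Jinja templates above are rendered with project, dataset, and table names. A sketch of what the sandbox template might contain, assuming the sandbox query captures the duplicate rows that the cleaning query then drops (the template body is an assumption):

from jinja2 import Template

# Hypothetical: the sandbox keeps every row ranked after the first for
# its id; the cleaning query would keep only the first-ranked row.
ID_DE_DUP_SANDBOX_QUERY_TEMPLATE = Template(
    'SELECT * FROM ('
    ' SELECT *, ROW_NUMBER() OVER (PARTITION BY {{ table_name }}_id) AS row_num'
    ' FROM `{{ project_id }}.{{ dataset_id }}.{{ table_name }}`)'
    ' WHERE row_num > 1')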
Example #8
def get_duplicate_counts_query(hpo_id):
    """
    Query to retrieve count of duplicate primary keys in domain tables for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    sub_queries = []
    all_table_ids = bq_utils.list_all_table_ids()
    for table_name in cdm.tables_to_map():
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id in all_table_ids:
            sub_query = render_query(consts.DUPLICATE_IDS_SUBQUERY,
                                     table_name=table_name,
                                     table_id=table_id)
            sub_queries.append(sub_query)
    unioned_query = consts.UNION_ALL.join(sub_queries)
    return consts.DUPLICATE_IDS_WRAPPER.format(
        union_of_subqueries=unioned_query)
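DUPLICATE_IDS_SUBQUERY and DUPLICATE_IDS_WRAPPER are referenced but not shown. A sketch under the assumption that each subquery counts primary-key values occurring more than once in its table:

# Hypothetical templates assumed by get_duplicate_counts_query above.
DUPLICATE_IDS_SUBQUERY = (
    "SELECT '{table_name}' AS table_name, COUNT(*) AS duplicate_count"
    ' FROM (SELECT {table_name}_id'
    '       FROM {table_id}'
    '       GROUP BY {table_name}_id HAVING COUNT(*) > 1)')
DUPLICATE_IDS_WRAPPER = 'SELECT * FROM ({union_of_subqueries})'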
Example #9
    def __init__(self, project_id, dataset_id, sandbox_dataset_id):
        """
        Initialize the class with proper information.

        Set the issue numbers, description and affected datasets. As other tickets may affect
        this SQL, append them to the list of Jira Issues.
        DO NOT REMOVE ORIGINAL JIRA ISSUE NUMBERS!
        """
        desc = (
            'Remove rows with duplicate ids from OMOP tables that have an ID column '
            'in a given dataset')

        super().__init__(issue_numbers=JIRA_ISSUE_NUMBERS,
                         description=desc,
                         affected_datasets=[cdr_consts.UNIONED],
                         affected_tables=cdm.tables_to_map(),
                         project_id=project_id,
                         dataset_id=dataset_id,
                         sandbox_dataset_id=sandbox_dataset_id)
Example #10
    def setUp(self):
        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [PITT_HPO_ID, NYC_HPO_ID, EXCLUDED_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        self.storage_client = StorageClient(self.project_id)
        self.tearDown()

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.VISIT_DETAIL_ID,
            eu_constants.CARE_SITE_ID, eu_constants.LOCATION_ID
        ]
Example #11
    def test_queries_to_retract_from_ehr_dataset(self,
                                                 mock_list_existing_tables):
        hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
        hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

        # hpo tables
        existing_table_ids = [hpo_person, hpo_death]
        for table in self.tables_to_retract_unioned:
            table_id = bq_utils.get_table_id(self.hpo_id, table)
            existing_table_ids.append(table_id)

        # unioned tables
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            unioned_table_id = retract_data_bq.UNIONED_EHR + cdm_table
            existing_table_ids.append(unioned_table_id)

            if cdm_table not in self.tables_to_retract_unioned:
                ignored_tables.append(unioned_table_id)

        mapped_tables = cdm.tables_to_map()

        # fact_relationship has no person_id column and is handled separately
        # from the other mapped tables
        for mapped_table in mapped_tables:
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            legacy_mapping_table = retract_data_bq.UNIONED_EHR + mapping_table
            existing_table_ids.append(legacy_mapping_table)
            if mapped_table not in self.tables_to_retract_unioned:
                ignored_tables.append(mapping_table)
                ignored_tables.append(legacy_mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_ehr_dataset(
            self.project_id, self.ehr_dataset_id, self.project_id,
            self.sandbox_dataset_id, self.hpo_id, self.pid_table_id)
        actual_dest_tables = set(q[retract_data_bq.DEST_TABLE]
                                 for q in qs + mqs)
        expected_dest_tables = set(existing_table_ids) - {hpo_person} - set(
            ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)
Example #12
    def setUp(self):

        self.project_id = bq_utils.app_identity.get_application_id()
        self.hpo_ids = [NYC_HPO_ID, PITT_HPO_ID]
        self.input_dataset_id = bq_utils.get_dataset_id()
        self.output_dataset_id = bq_utils.get_unioned_dataset_id()
        # Also done in tearDown(), so this is redundant.
        self._empty_hpo_buckets()
        test_util.delete_all_tables(self.input_dataset_id)
        test_util.delete_all_tables(self.output_dataset_id)

        # TODO Generalize to work for all foreign key references
        # Collect all primary key fields in CDM tables
        mapped_fields = []
        for table in cdm.tables_to_map():
            field = table + '_id'
            mapped_fields.append(field)
        self.mapped_fields = mapped_fields
        self.implemented_foreign_keys = [
            eu_constants.VISIT_OCCURRENCE_ID, eu_constants.CARE_SITE_ID,
            eu_constants.LOCATION_ID
        ]
Example #13
    def test_queries_to_retract_from_unioned_dataset(self, mock_list_existing_tables):
        existing_table_ids = []
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            existing_table_ids.append(cdm_table)
            if cdm_table not in self.tables_to_retract_unioned:
                ignored_tables.append(cdm_table)

        mapped_tables = cdm.tables_to_map()
        for mapped_table in mapped_tables:
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            if mapped_table not in self.tables_to_retract_unioned:
                ignored_tables.append(mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_unioned_dataset(self.project_id,
                                                                          self.unioned_dataset_id,
                                                                          self.person_ids)
        actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs+mqs)
        expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)
Example #14
    def test_union_ehr(self):
        self._load_datasets()
        input_tables_before = set(self._dataset_tables(self.input_dataset_id))

        # output should be mapping tables and cdm tables
        output_tables_before = self._dataset_tables(self.output_dataset_id)
        mapping_tables = [
            ehr_union.mapping_table_for(table)
            for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
        ]
        output_cdm_tables = [
            ehr_union.output_table_for(table) for table in resources.CDM_TABLES
        ]
        expected_output = set(output_tables_before + mapping_tables +
                              output_cdm_tables)

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        # input dataset should be unchanged
        input_tables_after = set(self._dataset_tables(self.input_dataset_id))
        self.assertSetEqual(input_tables_before, input_tables_after)

        # fact_relationship from pitt
        hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
        pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
        q = '''SELECT fact_id_1, fact_id_2
               FROM `{input_dataset}.{hpo_id}_fact_relationship`
               WHERE domain_concept_id_1 = 21 AND domain_concept_id_2 = 21'''.format(
            input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
        response = bq_utils.query(q)
        result = bq_utils.response2rows(response)

        expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
        expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset

        q = '''SELECT fr.fact_id_1, fr.fact_id_2 FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
            JOIN `{dataset_id}._mapping_measurement` mm ON fr.fact_id_1 = mm.measurement_id
            AND mm.src_hpo_id = "{hpo_id}"'''.format(
            dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
        response = bq_utils.query(q)
        result = bq_utils.response2rows(response)
        actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
            "fact_id_2"]
        self.assertEqual(expected_fact_id_1, actual_fact_id_1)
        self.assertEqual(expected_fact_id_2, actual_fact_id_2)

        # mapping tables
        tables_to_map = cdm.tables_to_map()
        for table_to_map in tables_to_map:
            mapping_table = ehr_union.mapping_table_for(table_to_map)
            expected_fields = {
                'src_table_id',
                'src_%s_id' % table_to_map,
                '%s_id' % table_to_map, 'src_hpo_id'
            }
            mapping_table_info = bq_utils.get_table_info(
                mapping_table, dataset_id=self.output_dataset_id)
            mapping_table_fields = mapping_table_info.get(
                'schema', dict()).get('fields', [])
            actual_fields = set([f['name'] for f in mapping_table_fields])
            message = 'Table %s has fields %s when %s expected' % (
                mapping_table, actual_fields, expected_fields)
            self.assertSetEqual(expected_fields, actual_fields, message)
            result_table = ehr_union.output_table_for(table_to_map)
            expected_num_rows = len(self.expected_tables[result_table])
            actual_num_rows = int(mapping_table_info.get('numRows', -1))
            message = 'Table %s has %s rows when %s expected' % (
                mapping_table, actual_num_rows, expected_num_rows)
            self.assertEqual(expected_num_rows, actual_num_rows, message)

        # check for each output table
        for table_name in resources.CDM_TABLES:
            # output table exists and row count is sum of those submitted by hpos
            result_table = ehr_union.output_table_for(table_name)
            expected_rows = self.expected_tables[result_table]
            expected_count = len(expected_rows)
            table_info = bq_utils.get_table_info(
                result_table, dataset_id=self.output_dataset_id)
            actual_count = int(table_info.get('numRows'))
            msg = 'Unexpected row count in table {result_table} after ehr union'.format(
                result_table=result_table)
            self.assertEqual(expected_count, actual_count, msg)
            # TODO Compare table rows to expected accounting for the new ids and ignoring field types
            # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
            # query_response = bq_utils.query(q)
            # actual_rows = bq_utils.response2rows(query_response)

            # output table has clustering on person_id where applicable
            fields_file = os.path.join(resources.fields_path,
                                       table_name + '.json')
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self._table_has_clustering(table_info)

        actual_output = set(self._dataset_tables(self.output_dataset_id))
        self.assertSetEqual(expected_output, actual_output)

        # explicit check that output person_ids are same as input
        nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
        pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
        q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
        ) ORDER BY person_id ASC'''.format(
            dataset_id=self.input_dataset_id,
            nyc_person_table_id=nyc_person_table_id,
            pitt_person_table_id=pitt_person_table_id)
        response = bq_utils.query(q)
        expected_rows = bq_utils.response2rows(response)
        person_table_id = ehr_union.output_table_for('person')
        q = '''SELECT DISTINCT person_id 
               FROM {dataset_id}.{table_id} 
               ORDER BY person_id ASC'''.format(
            dataset_id=self.output_dataset_id, table_id=person_table_id)
        response = bq_utils.query(q)
        actual_rows = bq_utils.response2rows(response)
        self.assertCountEqual(expected_rows, actual_rows)
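The assertions above depend on ehr_union.get_hpo_offsets assigning each HPO a disjoint id block. A sketch of how such offsets might be derived, assuming consecutive multiples of common.ID_CONSTANT_FACTOR (the starting multiplier is an assumption):

def get_hpo_offsets(hpo_ids):
    # Hypothetical: each HPO gets its own id block so that row ids from
    # different sites cannot collide after the union.
    return {
        hpo_id: (i + 1) * common.ID_CONSTANT_FACTOR
        for i, hpo_id in enumerate(hpo_ids)
    }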
Example #15
import cdm

SOURCE_VALUE_EHR_CONSENT = 'EHRConsentPII_ConsentPermission'
CONCEPT_ID_CONSENT_PERMISSION_YES = 1586100  # ConsentPermission_Yes
EHR_CONSENT_TABLE_ID = '_ehr_consent'
PERSON_TABLE = 'person'
PERSON_ID = 'person_id'
OBSERVATION_TABLE = 'observation'
FOREIGN_KEYS_FIELDS = [
    'visit_occurrence_id', 'location_id', 'care_site_id', 'provider_id'
]
RDR_TABLES_TO_COPY = ['person']
EHR_TABLES_TO_COPY = ['death']
DOMAIN_TABLES = list(
    set(cdm.tables_to_map()) - set(RDR_TABLES_TO_COPY + EHR_TABLES_TO_COPY))
TABLES_TO_PROCESS = RDR_TABLES_TO_COPY + EHR_TABLES_TO_COPY + DOMAIN_TABLES
LEFT_JOIN = (
    ' LEFT JOIN'
    ' ('
    ' SELECT *'
    ' FROM ('
    ' SELECT'
    ' *,'
    ' row_number() OVER (PARTITION BY {prefix}.{field}, {prefix}.src_hpo_id ) '
    ' AS row_num'
    ' FROM {dataset_id}.{table} {prefix}'
    ' )'
    ' WHERE row_num = 1'
    ' ) {prefix} ON t.{field} = {prefix}.src_{field}'
    ' AND m.src_dataset_id = {prefix}.src_dataset_id')
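A rendering sketch for the LEFT_JOIN template with illustrative placeholder values (the dataset and table names below are hypothetical):

# Hypothetical rendering: join t to the de-duplicated mapping rows for
# visit_occurrence_id, keeping one row per (id, src_hpo_id) pair.
join_clause = LEFT_JOIN.format(prefix='v',
                               field='visit_occurrence_id',
                               dataset_id='unioned_ehr',
                               table='_mapping_visit_occurrence')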