Пример #1
0
    def test_queries_to_retract_from_combined_or_deid_dataset(self, mock_list_existing_tables):
        """
        Retraction queries must target exactly the existing tables slated for
        retraction, and the death-table query must use person_ids as-is
        (without the RDR/EHR id constant factor).
        """
        existing_table_ids = []
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            existing_table_ids.append(cdm_table)
            if cdm_table not in self.tables_to_retract_combined:
                ignored_tables.append(cdm_table)

        mapped_tables = cdm.tables_to_map()
        for mapped_table in mapped_tables:
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            if mapped_table not in self.tables_to_retract_combined:
                ignored_tables.append(mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_combined_or_deid_dataset(self.project_id,
                                                                                   self.combined_dataset_id,
                                                                                   self.person_ids)
        actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs+mqs)
        expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)

        # death query should use person_id as-is (no constant factor)
        constant_factor = common.RDR_ID_CONSTANT + common.ID_CONSTANT_FACTOR
        for q in qs:
            # Compare strings with ==, not `is`: identity of equal strings
            # is an interning artifact and is not guaranteed.
            if q[retract_data_bq.DEST_TABLE] == common.DEATH:
                self.assertNotIn(str(constant_factor), q[retract_data_bq.QUERY])
def get_mapping_table_update_queries(project_id, dataset_id):
    """
    Generates a list of query dicts for adding newly generated rows to corresponding mapping_tables

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :return: list of query dicts for updating mapping_tables
    """
    update_queries = []
    for table in DOMAIN_TABLE_NAMES:
        mapping_table = mapping_table_for(table)
        # One WRITE_TRUNCATE query per domain table, aimed at its mapping table.
        update_queries.append({
            cdr_consts.QUERY:
                parse_mapping_table_update_query(project_id, dataset_id, table,
                                                 mapping_table),
            cdr_consts.DESTINATION_TABLE:
                mapping_table,
            cdr_consts.DISPOSITION:
                bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET:
                dataset_id,
        })
    return update_queries
Пример #3
0
    def test_queries_to_retract_from_ehr_dataset(self,
                                                 mock_list_existing_tables):
        """
        Retraction queries for an EHR dataset must target exactly the existing
        site, unioned, mapping, and legacy mapping tables that are slated for
        retraction.
        """
        hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
        hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

        # hpo tables
        existing_table_ids = [hpo_person, hpo_death]
        for table in self.tables_to_retract_unioned:
            table_id = bq_utils.get_table_id(self.hpo_id, table)
            existing_table_ids.append(table_id)

        # unioned tables
        ignored_tables = []
        for cdm_table in resources.CDM_TABLES:
            unioned_table_id = retract_data_bq.UNIONED_EHR + cdm_table
            existing_table_ids.append(unioned_table_id)

            if cdm_table not in self.tables_to_retract_unioned:
                ignored_tables.append(unioned_table_id)

        mapped_tables = cdm.tables_to_map()

        # fact_relationship does not have pid, is handled separate from other mapped tables
        for mapped_table in mapped_tables:
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            legacy_mapping_table = retract_data_bq.UNIONED_EHR + mapping_table
            existing_table_ids.append(legacy_mapping_table)
            if mapped_table not in self.tables_to_retract_unioned:
                ignored_tables.append(mapping_table)
                ignored_tables.append(legacy_mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_ehr_dataset(
            self.project_id, self.ehr_dataset_id, self.project_id,
            self.sandbox_dataset_id, self.hpo_id, self.pid_table_id)
        actual_dest_tables = set(q[retract_data_bq.DEST_TABLE]
                                 for q in qs + mqs)
        # NOTE(review): the original expression subtracted `set(hpo_person)`,
        # which is the set of the table id's *characters* — a no-op against
        # full table ids. The site person table is a legitimate retraction
        # destination, so the dead subtraction is removed rather than
        # "fixed" to {hpo_person}.
        expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)
Пример #4
0
    def get_mapping_table_update_queries(self):
        """
        Generates a list of query dicts for adding newly generated rows to corresponding
        mapping_tables
        :return: list of query dicts for updating mapping_tables
        """
        update_queries = []
        for table in self.affected_tables:
            mapping_table = mapping_table_for(table)
            # Build one WRITE_TRUNCATE query dict per affected domain table.
            query = dict()
            query[cdr_consts.QUERY] = self.parse_mapping_table_update_query(
                table, mapping_table)
            query[cdr_consts.DESTINATION_TABLE] = mapping_table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = self.dataset_id
            update_queries.append(query)
        return update_queries
Пример #5
0
    def test_queries_to_retract_from_unioned_dataset(self, mock_list_existing_tables):
        """
        Retraction queries for a unioned dataset must target exactly the
        existing CDM and mapping tables that are slated for retraction.
        """
        # CDM tables all "exist"; those not slated for retraction are ignored.
        existing_table_ids = list(resources.CDM_TABLES)
        ignored_tables = [
            cdm_table for cdm_table in resources.CDM_TABLES
            if cdm_table not in self.tables_to_retract_unioned
        ]

        # Same bookkeeping for the mapping table of each mapped CDM table.
        for mapped_table in cdm.tables_to_map():
            mapping_table = ehr_union.mapping_table_for(mapped_table)
            existing_table_ids.append(mapping_table)
            if mapped_table not in self.tables_to_retract_unioned:
                ignored_tables.append(mapping_table)

        mock_list_existing_tables.return_value = existing_table_ids
        mqs, qs = retract_data_bq.queries_to_retract_from_unioned_dataset(
            self.project_id, self.unioned_dataset_id, self.person_ids)
        actual_dest_tables = {q[retract_data_bq.DEST_TABLE] for q in qs + mqs}
        expected_dest_tables = set(existing_table_ids) - set(ignored_tables)
        self.assertSetEqual(expected_dest_tables, actual_dest_tables)
Пример #6
0
    def test_union_ehr(self):
        """
        End-to-end check of ehr_union.main: the input dataset is untouched,
        the output dataset contains the expected mapping and CDM tables with
        the expected schemas and row counts, and output person_ids match the
        union of the input sites' person_ids.
        """
        self._load_datasets()
        input_tables_before = set(self._dataset_tables(self.input_dataset_id))

        # output should be mapping tables and cdm tables
        output_tables_before = self._dataset_tables(self.output_dataset_id)
        mapping_tables = [
            ehr_union.mapping_table_for(table)
            for table in ehr_union.tables_to_map()
        ]
        output_cdm_tables = [
            ehr_union.output_table_for(table) for table in common.CDM_TABLES
        ]
        expected_output = set(output_tables_before + mapping_tables +
                              output_cdm_tables)

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        # input dataset should be unchanged
        input_tables_after = set(self._dataset_tables(self.input_dataset_id))
        self.assertSetEqual(input_tables_before, input_tables_after)

        # mapping tables: each must have exactly the expected columns and one
        # row per row of the corresponding output table
        tables_to_map = ehr_union.tables_to_map()
        for table_to_map in tables_to_map:
            mapping_table = ehr_union.mapping_table_for(table_to_map)
            expected_fields = {
                'src_table_id',
                'src_%s_id' % table_to_map,
                '%s_id' % table_to_map, 'src_hpo_id'
            }
            mapping_table_info = bq_utils.get_table_info(
                mapping_table, dataset_id=self.output_dataset_id)
            mapping_table_fields = mapping_table_info.get('schema',
                                                          dict()).get(
                                                              'fields', [])
            actual_fields = set([f['name'] for f in mapping_table_fields])
            message = 'Table %s has fields %s when %s expected' % (
                mapping_table, actual_fields, expected_fields)
            self.assertSetEqual(expected_fields, actual_fields, message)
            result_table = ehr_union.output_table_for(table_to_map)
            expected_num_rows = len(self.expected_tables[result_table])
            actual_num_rows = int(mapping_table_info.get('numRows', -1))
            message = 'Table %s has %s rows when %s expected' % (
                mapping_table, actual_num_rows, expected_num_rows)
            self.assertEqual(expected_num_rows, actual_num_rows, message)

        # check for each output table
        for table_name in common.CDM_TABLES:
            # output table exists and row count is sum of those submitted by hpos
            result_table = ehr_union.output_table_for(table_name)
            expected_rows = self.expected_tables[result_table]
            expected_count = len(expected_rows)
            table_info = bq_utils.get_table_info(
                result_table, dataset_id=self.output_dataset_id)
            actual_count = int(table_info.get('numRows'))
            msg = 'Unexpected row count in table {result_table} after ehr union'.format(
                result_table=result_table)
            self.assertEqual(expected_count, actual_count, msg)
            # TODO Compare table rows to expected accounting for the new ids and ignoring field types
            # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
            # query_response = bq_utils.query(q)
            # actual_rows = test_util.response2rows(query_response)

            # output table has clustering on person_id where applicable
            fields_file = os.path.join(resources.fields_path,
                                       table_name + '.json')
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self._table_has_clustering(table_info)

        actual_output = set(self._dataset_tables(self.output_dataset_id))
        self.assertSetEqual(expected_output, actual_output)

        # explicit check that output person_ids are same as input
        chs_person_table_id = bq_utils.get_table_id(CHS_HPO_ID, 'person')
        pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
        q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{chs_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
        ) ORDER BY person_id ASC'''.format(
            dataset_id=self.input_dataset_id,
            chs_person_table_id=chs_person_table_id,
            pitt_person_table_id=pitt_person_table_id)
        response = bq_utils.query(q)
        expected_rows = test_util.response2rows(response)
        person_table_id = ehr_union.output_table_for('person')
        q = '''SELECT DISTINCT person_id 
               FROM {dataset_id}.{table_id} 
               ORDER BY person_id ASC'''.format(
            dataset_id=self.output_dataset_id, table_id=person_table_id)
        response = bq_utils.query(q)
        actual_rows = test_util.response2rows(response)
        self.assertListEqual(expected_rows, actual_rows)
Пример #7
0
def queries_to_retract_from_combined_or_deid_dataset(project_id, dataset_id,
                                                     ids):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param ids: list of ids
    :return: tuple (mapping-table query dicts, data-table query dicts);
        each dict has keys for query, dataset, table and delete flag
    """
    # If fewer pids, use DELETE statements instead of SELECT
    delete_flag = bool(len(ids) < THRESHOLD_FOR_DML)
    pids = int_list_to_bq(ids)
    logger.debug('Checking existing tables for %s.%s' %
                 (project_id, dataset_id))
    existing_tables = list_existing_tables(project_id, dataset_id)
    combined_mapping_queries = []
    combined_queries = []
    for table in TABLES_FOR_RETRACTION:
        # death has no mapping table; compare with !=, not `is not`
        # (identity of equal strings is an interning artifact)
        if table != common.DEATH:
            q_combined_mapping = dict()
            q_combined_mapping[DEST_DATASET] = dataset_id
            q_combined_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            q_combined_mapping[DELETE_FLAG] = delete_flag
            # only build a query if the mapping table actually exists
            if q_combined_mapping[DEST_TABLE] in existing_tables:
                if q_combined_mapping[DELETE_FLAG]:
                    q_combined_mapping[
                        QUERY] = DELETE_RETRACT_MAPPING_DATA_COMBINED_QUERY.format(
                            project=project_id,
                            dataset=q_combined_mapping[DEST_DATASET],
                            mapping_table=q_combined_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=table,
                            pids=pids,
                            CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                            common.ID_CONSTANT_FACTOR)
                else:
                    q_combined_mapping[
                        QUERY] = SELECT_RETRACT_MAPPING_DATA_COMBINED_QUERY.format(
                            project=project_id,
                            dataset=q_combined_mapping[DEST_DATASET],
                            mapping_table=q_combined_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=table,
                            pids=pids,
                            CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                            common.ID_CONSTANT_FACTOR)
                combined_mapping_queries.append(q_combined_mapping)

        q_combined = dict()
        q_combined[DEST_DATASET] = dataset_id
        q_combined[DEST_TABLE] = table
        q_combined[DELETE_FLAG] = delete_flag
        if q_combined[DEST_TABLE] in existing_tables:
            if q_combined[DELETE_FLAG]:
                q_combined[QUERY] = DELETE_RETRACT_DATA_COMBINED_QUERY.format(
                    project=project_id,
                    dataset=q_combined[DEST_DATASET],
                    table=q_combined[DEST_TABLE],
                    pids=pids,
                    table_id=get_table_id(table),
                    CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                    common.ID_CONSTANT_FACTOR)
            else:
                q_combined[QUERY] = SELECT_RETRACT_DATA_COMBINED_QUERY.format(
                    project=project_id,
                    dataset=q_combined[DEST_DATASET],
                    table=q_combined[DEST_TABLE],
                    pids=pids,
                    table_id=get_table_id(table),
                    CONSTANT_FACTOR=common.RDR_ID_CONSTANT +
                    common.ID_CONSTANT_FACTOR)
            combined_queries.append(q_combined)

    # fix death query to exclude constant: death rows key on raw person_id,
    # so re-render its query with the unioned (no constant factor) templates
    for q in combined_queries:
        # == instead of `is`: string identity is not guaranteed
        if q[DEST_TABLE] == common.DEATH:
            if q[DELETE_FLAG]:
                q[QUERY] = DELETE_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q[DEST_DATASET],
                    table=q[DEST_TABLE],
                    pids=pids)
            else:
                q[QUERY] = SELECT_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q[DEST_DATASET],
                    table=q[DEST_TABLE],
                    pids=pids)

    # fact_relationship has no person_id column; retract via domain concept
    q_combined_fact_relationship = dict()
    q_combined_fact_relationship[DEST_DATASET] = dataset_id
    q_combined_fact_relationship[DEST_TABLE] = common.FACT_RELATIONSHIP
    q_combined_fact_relationship[DELETE_FLAG] = delete_flag
    if q_combined_fact_relationship[DEST_TABLE] in existing_tables:
        if q_combined_fact_relationship[DELETE_FLAG]:
            q_combined_fact_relationship[
                QUERY] = DELETE_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_combined_fact_relationship[DEST_DATASET],
                    table=q_combined_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        else:
            q_combined_fact_relationship[
                QUERY] = SELECT_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_combined_fact_relationship[DEST_DATASET],
                    table=q_combined_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        combined_queries.append(q_combined_fact_relationship)

    return combined_mapping_queries, combined_queries
Пример #8
0
def queries_to_retract_from_ehr_dataset(project_id, dataset_id, hpo_id, ids):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param hpo_id: identifies the HPO site
    :param ids: list of ids
    :return: tuple (mapping-table query dicts, data-table query dicts);
        each dict has keys query, dataset, table, delete_flag
    """
    # If fewer pids, use DELETE statements instead of SELECT
    delete_flag = bool(len(ids) < THRESHOLD_FOR_DML)
    logger.debug('Checking existing tables for %s.%s' %
                 (project_id, dataset_id))
    pids = int_list_to_bq(ids)
    existing_tables = list_existing_tables(project_id, dataset_id)
    site_queries = []
    unioned_mapping_queries = []
    unioned_mapping_legacy_queries = []
    unioned_queries = []
    for table in TABLES_FOR_RETRACTION:
        # site-submitted copy of the table
        q_site = dict()
        q_site[DEST_DATASET] = dataset_id
        q_site[DEST_TABLE] = get_site_table(hpo_id, table)
        q_site[DELETE_FLAG] = delete_flag
        if q_site[DEST_TABLE] in existing_tables:
            if q_site[DELETE_FLAG]:
                q_site[QUERY] = DELETE_RETRACT_DATA_SITE_QUERY.format(
                    project=project_id,
                    dataset=q_site[DEST_DATASET],
                    table=q_site[DEST_TABLE],
                    pids=pids)
            else:
                q_site[QUERY] = SELECT_RETRACT_DATA_SITE_QUERY.format(
                    project=project_id,
                    dataset=q_site[DEST_DATASET],
                    table=q_site[DEST_TABLE],
                    pids=pids)
            site_queries.append(q_site)

        # death does not have mapping table; compare with !=, not `is not`
        # (identity of equal strings is an interning artifact)
        if table != common.DEATH:
            q_unioned_mapping = dict()
            q_unioned_mapping[DEST_DATASET] = dataset_id
            q_unioned_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            q_unioned_mapping[DELETE_FLAG] = delete_flag
            if q_unioned_mapping[DEST_TABLE] in existing_tables:
                if q_unioned_mapping[DELETE_FLAG]:
                    q_unioned_mapping[
                        QUERY] = DELETE_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping[DEST_DATASET],
                            mapping_table=q_unioned_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                else:
                    q_unioned_mapping[
                        QUERY] = SELECT_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping[DEST_DATASET],
                            mapping_table=q_unioned_mapping[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                unioned_mapping_queries.append(q_unioned_mapping)

            # legacy naming scheme: unioned_ehr_-prefixed mapping table
            q_unioned_mapping_legacy = dict()
            q_unioned_mapping_legacy[DEST_DATASET] = dataset_id
            q_unioned_mapping_legacy[
                DEST_TABLE] = UNIONED_EHR + ehr_union.mapping_table_for(table)
            q_unioned_mapping_legacy[DELETE_FLAG] = delete_flag
            if q_unioned_mapping_legacy[DEST_TABLE] in existing_tables:
                if q_unioned_mapping_legacy[DELETE_FLAG]:
                    q_unioned_mapping_legacy[
                        QUERY] = DELETE_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping_legacy[DEST_DATASET],
                            mapping_table=q_unioned_mapping_legacy[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                else:
                    q_unioned_mapping_legacy[
                        QUERY] = SELECT_RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                            project=project_id,
                            dataset=q_unioned_mapping_legacy[DEST_DATASET],
                            mapping_table=q_unioned_mapping_legacy[DEST_TABLE],
                            table_id=get_table_id(table),
                            table=UNIONED_EHR + table,
                            pids=pids)
                unioned_mapping_legacy_queries.append(q_unioned_mapping_legacy)

        # unioned copy of the table
        q_unioned = dict()
        q_unioned[DEST_DATASET] = dataset_id
        q_unioned[DEST_TABLE] = UNIONED_EHR + table
        q_unioned[DELETE_FLAG] = delete_flag
        if q_unioned[DEST_TABLE] in existing_tables:
            if q_unioned[DELETE_FLAG]:
                q_unioned[QUERY] = DELETE_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q_unioned[DEST_DATASET],
                    table=q_unioned[DEST_TABLE],
                    pids=pids)
            else:
                q_unioned[QUERY] = SELECT_RETRACT_DATA_UNIONED_QUERY.format(
                    project=project_id,
                    dataset=q_unioned[DEST_DATASET],
                    table=q_unioned[DEST_TABLE],
                    pids=pids)
            unioned_queries.append(q_unioned)

    # Remove from person table
    q_site_person = dict()
    q_site_person[DEST_DATASET] = dataset_id
    q_site_person[DEST_TABLE] = get_site_table(hpo_id, common.PERSON)
    q_site_person[DELETE_FLAG] = delete_flag
    if q_site_person[DEST_TABLE] in existing_tables:
        if q_site_person[DELETE_FLAG]:
            q_site_person[QUERY] = DELETE_RETRACT_DATA_SITE_QUERY.format(
                project=project_id,
                dataset=q_site_person[DEST_DATASET],
                table=q_site_person[DEST_TABLE],
                pids=pids)
        else:
            q_site_person[QUERY] = SELECT_RETRACT_DATA_SITE_QUERY.format(
                project=project_id,
                dataset=q_site_person[DEST_DATASET],
                table=q_site_person[DEST_TABLE],
                pids=pids)
        site_queries.append(q_site_person)

    q_unioned_person = dict()
    q_unioned_person[DEST_DATASET] = dataset_id
    q_unioned_person[DEST_TABLE] = UNIONED_EHR + common.PERSON
    q_unioned_person[DELETE_FLAG] = delete_flag
    if q_unioned_person[DEST_TABLE] in existing_tables:
        if q_unioned_person[DELETE_FLAG]:
            q_unioned_person[QUERY] = DELETE_RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                dataset=q_unioned_person[DEST_DATASET],
                table=q_unioned_person[DEST_TABLE],
                pids=pids)
        else:
            q_unioned_person[QUERY] = SELECT_RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                dataset=q_unioned_person[DEST_DATASET],
                table=q_unioned_person[DEST_TABLE],
                pids=pids)
        unioned_queries.append(q_unioned_person)

    # Remove fact_relationship records referencing retracted person_ids
    # (fact_relationship has no person_id column; keyed via domain concept)
    q_site_fact_relationship = dict()
    q_site_fact_relationship[DEST_DATASET] = dataset_id
    q_site_fact_relationship[DEST_TABLE] = get_site_table(
        hpo_id, common.FACT_RELATIONSHIP)
    q_site_fact_relationship[DELETE_FLAG] = delete_flag
    if q_site_fact_relationship[DEST_TABLE] in existing_tables:
        if q_site_fact_relationship[DELETE_FLAG]:
            q_site_fact_relationship[
                QUERY] = DELETE_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_site_fact_relationship[DEST_DATASET],
                    table=q_site_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        else:
            q_site_fact_relationship[
                QUERY] = SELECT_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_site_fact_relationship[DEST_DATASET],
                    table=q_site_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        site_queries.append(q_site_fact_relationship)

    q_unioned_fact_relationship = dict()
    q_unioned_fact_relationship[DEST_DATASET] = dataset_id
    q_unioned_fact_relationship[
        DEST_TABLE] = UNIONED_EHR + common.FACT_RELATIONSHIP
    q_unioned_fact_relationship[DELETE_FLAG] = delete_flag
    if q_unioned_fact_relationship[DEST_TABLE] in existing_tables:
        if q_unioned_fact_relationship[DELETE_FLAG]:
            q_unioned_fact_relationship[
                QUERY] = DELETE_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_unioned_fact_relationship[DEST_DATASET],
                    table=q_unioned_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        else:
            q_unioned_fact_relationship[
                QUERY] = SELECT_RETRACT_DATA_FACT_RELATIONSHIP.format(
                    project=project_id,
                    dataset=q_unioned_fact_relationship[DEST_DATASET],
                    table=q_unioned_fact_relationship[DEST_TABLE],
                    PERSON_DOMAIN=PERSON_DOMAIN,
                    pids=pids)
        unioned_queries.append(q_unioned_fact_relationship)

    return unioned_mapping_legacy_queries + unioned_mapping_queries, unioned_queries + site_queries
Пример #9
0
    def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out):
        """
        Parse the generated HPO subquery for `table` and verify its structure.

        Returns a formatted failure message (SUBQUERY_FAIL_MSG) describing the
        first structural problem found, or None implicitly when all checks pass.

        :param table: name of the CDM table the subquery selects from
        :param dataset_in: dataset containing the site-submitted table
        :param dataset_out: dataset containing the mapping tables
        """
        subquery = ehr_union.table_hpo_subquery(table, NYC_HPO_ID, dataset_in,
                                                dataset_out)

        # moz-sql-parser doesn't support the ROW_NUMBER() OVER() analytical
        # function, so we strip that clause from the query so the parser can
        # process the rest without erroring out.

        subquery = re.sub(
            r",\s+ROW_NUMBER\(\) OVER \(PARTITION BY nm\..+?_id\) AS row_num",
            " ", subquery)
        stmt = moz_sql_parser.parse(subquery)

        # Sanity check it is a select statement
        if 'select' not in stmt:
            return SUBQUERY_FAIL_MSG.format(expr='query type',
                                            table=table,
                                            expected='select',
                                            actual=str(stmt),
                                            subquery=subquery)

        # Input table should be first in FROM expression
        actual_from = first_or_none(
            dpath.util.values(stmt, 'from/0/value/from/value')
            or dpath.util.values(stmt, 'from'))
        expected_from = dataset_in + '.' + bq_utils.get_table_id(
            NYC_HPO_ID, table)
        if expected_from != actual_from:
            return SUBQUERY_FAIL_MSG.format(expr='first object in FROM',
                                            table=table,
                                            expected=expected_from,
                                            actual=actual_from,
                                            subquery=subquery)

        # Ensure all key fields (primary or foreign) yield joins with their associated mapping tables
        # Note: ordering of joins in the subquery is assumed to be consistent with field order in the json file
        fields = resources.fields_for(table)
        id_field = table + '_id'
        key_ind = 0
        expected_join = None
        actual_join = None
        for field in fields:
            if field['name'] in self.mapped_fields:
                # key_ind += 1  # TODO use this increment when we generalize solution for all foreign keys
                if field['name'] == id_field:
                    # Primary key, mapping table associated with this one should be INNER joined
                    key_ind += 1
                    expr = 'inner join on primary key'
                    actual_join = first_or_none(
                        dpath.util.values(stmt,
                                          'from/%s/join/value' % key_ind))
                    expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                        table)
                elif field['name'] in self.implemented_foreign_keys:
                    # Foreign key, mapping table associated with the referenced table should be LEFT joined
                    key_ind += 1
                    expr = 'left join on foreign key'
                    actual_join = first_or_none(
                        dpath.util.values(stmt,
                                          'from/%s/left join/value' % key_ind))
                    joined_table = field['name'].replace('_id', '')
                    expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                        joined_table)
                if expected_join != actual_join:
                    return SUBQUERY_FAIL_MSG.format(expr=expr,
                                                    table=table,
                                                    expected=expected_join,
                                                    actual=actual_join,
                                                    subquery=subquery)
Пример #10
0
    def test_union_ehr(self):
        """Integration test of ehr_union.main end to end.

        Loads HPO submissions, runs the union, then verifies: the input
        dataset is unchanged; fact_relationship ids are shifted by the
        HPO-specific offset; mapping tables have the expected schema and
        row counts; output CDM tables have the expected row counts and
        clustering on person_id; and person_ids survive the union intact.
        """
        self._load_datasets()
        input_tables_before = set(self._dataset_tables(self.input_dataset_id))

        # output should be mapping tables and cdm tables
        output_tables_before = self._dataset_tables(self.output_dataset_id)
        mapping_tables = [
            ehr_union.mapping_table_for(table)
            for table in cdm.tables_to_map() + [combine_ehr_rdr.PERSON_TABLE]
        ]
        output_cdm_tables = [
            ehr_union.output_table_for(table) for table in resources.CDM_TABLES
        ]
        expected_output = set(output_tables_before + mapping_tables +
                              output_cdm_tables)

        # perform ehr union
        ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                       self.project_id, self.hpo_ids)

        # input dataset should be unchanged
        input_tables_after = set(self._dataset_tables(self.input_dataset_id))
        self.assertSetEqual(input_tables_before, input_tables_after)

        # fact_relationship from pitt
        # domain_concept_id 21 appears to select measurement facts (the rows
        # are later joined to _mapping_measurement) -- TODO confirm
        hpo_unique_identifiers = ehr_union.get_hpo_offsets(self.hpo_ids)
        pitt_offset = hpo_unique_identifiers[PITT_HPO_ID]
        q = '''SELECT fact_id_1, fact_id_2 
               FROM `{input_dataset}.{hpo_id}_fact_relationship`
               where domain_concept_id_1 = 21 and domain_concept_id_2 = 21'''.format(
            input_dataset=self.input_dataset_id, hpo_id=PITT_HPO_ID)
        response = bq_utils.query(q)
        result = bq_utils.response2rows(response)

        # the union assigns globally-unique ids by adding the HPO offset
        expected_fact_id_1 = result[0]["fact_id_1"] + pitt_offset
        expected_fact_id_2 = result[0]["fact_id_2"] + pitt_offset

        q = '''SELECT fr.fact_id_1, fr.fact_id_2 FROM `{dataset_id}.unioned_ehr_fact_relationship` fr
            join `{dataset_id}._mapping_measurement` mm on fr.fact_id_1 = mm.measurement_id
            and mm.src_hpo_id = "{hpo_id}"'''.format(
            dataset_id=self.output_dataset_id, hpo_id=PITT_HPO_ID)
        response = bq_utils.query(q)
        result = bq_utils.response2rows(response)
        actual_fact_id_1, actual_fact_id_2 = result[0]["fact_id_1"], result[0][
            "fact_id_2"]
        self.assertEqual(expected_fact_id_1, actual_fact_id_1)
        self.assertEqual(expected_fact_id_2, actual_fact_id_2)

        # mapping tables
        tables_to_map = cdm.tables_to_map()
        for table_to_map in tables_to_map:
            mapping_table = ehr_union.mapping_table_for(table_to_map)
            expected_fields = {
                'src_table_id',
                'src_%s_id' % table_to_map,
                '%s_id' % table_to_map, 'src_hpo_id'
            }
            mapping_table_info = bq_utils.get_table_info(
                mapping_table, dataset_id=self.output_dataset_id)
            mapping_table_fields = mapping_table_info.get('schema',
                                                          dict()).get(
                                                              'fields', [])
            actual_fields = set([f['name'] for f in mapping_table_fields])
            message = 'Table %s has fields %s when %s expected' % (
                mapping_table, actual_fields, expected_fields)
            self.assertSetEqual(expected_fields, actual_fields, message)
            # each mapping table should carry one row per output-table row
            result_table = ehr_union.output_table_for(table_to_map)
            expected_num_rows = len(self.expected_tables[result_table])
            actual_num_rows = int(mapping_table_info.get('numRows', -1))
            message = 'Table %s has %s rows when %s expected' % (
                mapping_table, actual_num_rows, expected_num_rows)
            self.assertEqual(expected_num_rows, actual_num_rows, message)

        # check for each output table
        for table_name in resources.CDM_TABLES:
            # output table exists and row count is sum of those submitted by hpos
            result_table = ehr_union.output_table_for(table_name)
            expected_rows = self.expected_tables[result_table]
            expected_count = len(expected_rows)
            table_info = bq_utils.get_table_info(
                result_table, dataset_id=self.output_dataset_id)
            actual_count = int(table_info.get('numRows'))
            msg = 'Unexpected row count in table {result_table} after ehr union'.format(
                result_table=result_table)
            self.assertEqual(expected_count, actual_count, msg)
            # TODO Compare table rows to expected accounting for the new ids and ignoring field types
            # q = 'SELECT * FROM {dataset}.{table}'.format(dataset=self.output_dataset_id, table=result_table)
            # query_response = bq_utils.query(q)
            # actual_rows = bq_utils.response2rows(query_response)

            # output table has clustering on person_id where applicable
            fields_file = os.path.join(resources.fields_path,
                                       table_name + '.json')
            with open(fields_file, 'r') as fp:
                fields = json.load(fp)
                field_names = [field['name'] for field in fields]
                if 'person_id' in field_names:
                    self._table_has_clustering(table_info)

        actual_output = set(self._dataset_tables(self.output_dataset_id))
        self.assertSetEqual(expected_output, actual_output)

        # explicit check that output person_ids are same as input
        nyc_person_table_id = bq_utils.get_table_id(NYC_HPO_ID, 'person')
        pitt_person_table_id = bq_utils.get_table_id(PITT_HPO_ID, 'person')
        q = '''SELECT DISTINCT person_id FROM (
           SELECT person_id FROM {dataset_id}.{nyc_person_table_id}
           UNION ALL
           SELECT person_id FROM {dataset_id}.{pitt_person_table_id}
        ) ORDER BY person_id ASC'''.format(
            dataset_id=self.input_dataset_id,
            nyc_person_table_id=nyc_person_table_id,
            pitt_person_table_id=pitt_person_table_id)
        response = bq_utils.query(q)
        expected_rows = bq_utils.response2rows(response)
        person_table_id = ehr_union.output_table_for('person')
        q = '''SELECT DISTINCT person_id 
               FROM {dataset_id}.{table_id} 
               ORDER BY person_id ASC'''.format(
            dataset_id=self.output_dataset_id, table_id=person_table_id)
        response = bq_utils.query(q)
        actual_rows = bq_utils.response2rows(response)
        self.assertCountEqual(expected_rows, actual_rows)
Пример #11
0
def queries_to_retract_from_combined_or_deid_dataset(
        project_id, dataset_id, pid_project_id, sandbox_dataset_id,
        pid_table_id, retraction_type, deid_flag):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_table_id: table containing the person_ids and research_ids
    :param retraction_type: string indicating whether all data needs to be removed, including RDR,
        or if RDR data needs to be kept intact. Can take the values 'rdr_and_ehr' or 'only_ehr'
    :param deid_flag: flag indicating if running on a deid dataset
    :return: tuple (mapping table query dicts, data table query dicts); each
        dict has keys DEST_DATASET, DEST_TABLE and, when the table exists in
        the dataset, QUERY
    :raises ValueError: if retraction_type is not 'rdr_and_ehr' or 'only_ehr'
    """
    # lazy %-style args so formatting is skipped when INFO is disabled
    logging.info('Checking existing tables for %s.%s', project_id, dataset_id)
    existing_tables = list_existing_tables(project_id, dataset_id)

    # retract from ehr and rdr or only ehr
    if retraction_type == 'rdr_and_ehr':
        logging.info('Retracting from RDR and EHR data for %s', dataset_id)
        constant_factor_rdr = 0
    elif retraction_type == 'only_ehr':
        logging.info('Retracting from EHR data while retaining RDR for %s',
                     dataset_id)
        # ids offset by this constant denote EHR-sourced rows; RDR rows are kept
        constant_factor_rdr = common.RDR_ID_CONSTANT + common.ID_CONSTANT_FACTOR
    else:
        raise ValueError('%s is not a valid retraction type' % retraction_type)

    combined_mapping_queries = []
    combined_queries = []
    for table in TABLES_FOR_RETRACTION:
        # death has no mapping table; compare by value (== / !=), since `is`
        # on strings depends on interning and is not guaranteed
        if table != common.DEATH:
            q_combined_mapping = dict()
            q_combined_mapping[DEST_DATASET] = dataset_id
            q_combined_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            if q_combined_mapping[DEST_TABLE] in existing_tables:
                q_combined_mapping[
                    QUERY] = RETRACT_MAPPING_DATA_COMBINED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_combined_mapping[DEST_DATASET],
                        mapping_table=q_combined_mapping[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        CONSTANT_FACTOR=constant_factor_rdr,
                        person_research_id=RESEARCH_ID
                        if deid_flag else PERSON_ID)
                combined_mapping_queries.append(q_combined_mapping)

        q_combined = dict()
        q_combined[DEST_DATASET] = dataset_id
        q_combined[DEST_TABLE] = table
        if q_combined[DEST_TABLE] in existing_tables:
            q_combined[QUERY] = RETRACT_DATA_COMBINED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_combined[DEST_DATASET],
                table=q_combined[DEST_TABLE],
                pid_table_id=pid_table_id,
                table_id=get_table_id(table),
                sandbox_dataset_id=sandbox_dataset_id,
                CONSTANT_FACTOR=constant_factor_rdr,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)
            combined_queries.append(q_combined)

    if retraction_type == 'rdr_and_ehr':
        # retract from person
        q_combined_person = dict()
        q_combined_person[DEST_DATASET] = dataset_id
        q_combined_person[DEST_TABLE] = common.PERSON
        if q_combined_person[DEST_TABLE] in existing_tables:
            q_combined_person[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_combined_person[DEST_DATASET],
                table=q_combined_person[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)
            combined_queries.append(q_combined_person)

    # death stores person_id as-is, so rebuild its query without the
    # RDR constant factor (value equality, not identity comparison)
    for q in combined_queries:
        if q[DEST_TABLE] == common.DEATH:
            q[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q[DEST_DATASET],
                table=q[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)

    q_combined_fact_relationship = dict()
    q_combined_fact_relationship[DEST_DATASET] = dataset_id
    q_combined_fact_relationship[DEST_TABLE] = common.FACT_RELATIONSHIP
    if q_combined_fact_relationship[DEST_TABLE] in existing_tables:
        q_combined_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_combined_fact_relationship[DEST_DATASET],
                table=q_combined_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=RESEARCH_ID if deid_flag else PERSON_ID)
        combined_queries.append(q_combined_fact_relationship)

    return combined_mapping_queries, combined_queries
Пример #12
0
def queries_to_retract_from_unioned_dataset(project_id, dataset_id,
                                            pid_project_id, sandbox_dataset_id,
                                            pid_table_id):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param pid_table_id: table containing the person_ids and research_ids
    :return: tuple (mapping table query dicts, data table query dicts); each
        dict has keys DEST_DATASET, DEST_TABLE and, when the table exists in
        the dataset, QUERY
    """
    # lazy %-style args so formatting is skipped when INFO is disabled
    logging.info('Checking existing tables for %s.%s', project_id, dataset_id)
    existing_tables = list_existing_tables(project_id, dataset_id)
    unioned_mapping_queries = []
    unioned_queries = []
    for table in TABLES_FOR_RETRACTION:
        # death has no mapping table; compare by value (!=), since `is`
        # on strings depends on interning and is not guaranteed
        if table != common.DEATH:
            q_unioned_mapping = dict()
            q_unioned_mapping[DEST_DATASET] = dataset_id
            q_unioned_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            if q_unioned_mapping[DEST_TABLE] in existing_tables:
                q_unioned_mapping[
                    QUERY] = RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_unioned_mapping[DEST_DATASET],
                        mapping_table=q_unioned_mapping[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        person_research_id=PERSON_ID)
                unioned_mapping_queries.append(q_unioned_mapping)

        q_unioned = dict()
        q_unioned[DEST_DATASET] = dataset_id
        q_unioned[DEST_TABLE] = table
        if q_unioned[DEST_TABLE] in existing_tables:
            q_unioned[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned[DEST_DATASET],
                table=q_unioned[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
            unioned_queries.append(q_unioned)

    # retract from person
    q_unioned_person = dict()
    q_unioned_person[DEST_DATASET] = dataset_id
    q_unioned_person[DEST_TABLE] = common.PERSON
    if q_unioned_person[DEST_TABLE] in existing_tables:
        q_unioned_person[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
            project=project_id,
            pid_project=pid_project_id,
            dataset=q_unioned_person[DEST_DATASET],
            table=q_unioned_person[DEST_TABLE],
            pid_table_id=pid_table_id,
            sandbox_dataset_id=sandbox_dataset_id,
            person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_person)

    # fact_relationship rows referencing retracted person_ids
    q_unioned_fact_relationship = dict()
    q_unioned_fact_relationship[DEST_DATASET] = dataset_id
    q_unioned_fact_relationship[DEST_TABLE] = common.FACT_RELATIONSHIP
    if q_unioned_fact_relationship[DEST_TABLE] in existing_tables:
        q_unioned_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned_fact_relationship[DEST_DATASET],
                table=q_unioned_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_fact_relationship)

    return unioned_mapping_queries, unioned_queries
Пример #13
0
def queries_to_retract_from_ehr_dataset(project_id, dataset_id, pid_project_id,
                                        sandbox_dataset_id, hpo_id,
                                        pid_table_id):
    """
    Get list of queries to remove all records in all tables associated with supplied ids

    :param project_id: identifies associated project
    :param dataset_id: identifies associated dataset
    :param pid_project_id: identifies the project containing the sandbox dataset
    :param sandbox_dataset_id: identifies the dataset containing the pid table
    :param hpo_id: identifies the HPO site
    :param pid_table_id: table containing the person_ids and research_ids
    :return: tuple (mapping table query dicts, data table query dicts); each
        dict has keys DEST_DATASET, DEST_TABLE and, when the table exists in
        the dataset, QUERY
    """
    # lazy %-style args so formatting is skipped when INFO is disabled
    logging.info('Checking existing tables for %s.%s', project_id, dataset_id)
    existing_tables = list_existing_tables(project_id, dataset_id)
    site_queries = []
    unioned_mapping_queries = []
    unioned_mapping_legacy_queries = []
    unioned_queries = []
    for table in TABLES_FOR_RETRACTION:
        q_site = dict()
        q_site[DEST_DATASET] = dataset_id
        q_site[DEST_TABLE] = get_site_table(hpo_id, table)
        if q_site[DEST_TABLE] in existing_tables:
            q_site[QUERY] = RETRACT_DATA_SITE_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_site[DEST_DATASET],
                table=q_site[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
            site_queries.append(q_site)

        # death does not have mapping table; compare by value (!=), since
        # `is` on strings depends on interning and is not guaranteed
        if table != common.DEATH:
            q_unioned_mapping = dict()
            q_unioned_mapping[DEST_DATASET] = dataset_id
            q_unioned_mapping[DEST_TABLE] = ehr_union.mapping_table_for(table)
            if q_unioned_mapping[DEST_TABLE] in existing_tables:
                q_unioned_mapping[
                    QUERY] = RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_unioned_mapping[DEST_DATASET],
                        mapping_table=q_unioned_mapping[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=UNIONED_EHR + table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        person_research_id=PERSON_ID)
                unioned_mapping_queries.append(q_unioned_mapping)

            # legacy naming: mapping tables prefixed with UNIONED_EHR
            q_unioned_mapping_legacy = dict()
            q_unioned_mapping_legacy[DEST_DATASET] = dataset_id
            q_unioned_mapping_legacy[
                DEST_TABLE] = UNIONED_EHR + ehr_union.mapping_table_for(table)
            if q_unioned_mapping_legacy[DEST_TABLE] in existing_tables:
                q_unioned_mapping_legacy[
                    QUERY] = RETRACT_MAPPING_DATA_UNIONED_QUERY.format(
                        project=project_id,
                        pid_project=pid_project_id,
                        dataset=q_unioned_mapping_legacy[DEST_DATASET],
                        mapping_table=q_unioned_mapping_legacy[DEST_TABLE],
                        table_id=get_table_id(table),
                        table=UNIONED_EHR + table,
                        pid_table_id=pid_table_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        person_research_id=PERSON_ID)
                unioned_mapping_legacy_queries.append(q_unioned_mapping_legacy)

        q_unioned = dict()
        q_unioned[DEST_DATASET] = dataset_id
        q_unioned[DEST_TABLE] = UNIONED_EHR + table
        if q_unioned[DEST_TABLE] in existing_tables:
            q_unioned[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned[DEST_DATASET],
                table=q_unioned[DEST_TABLE],
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
            unioned_queries.append(q_unioned)

    # Remove from person table
    q_site_person = dict()
    q_site_person[DEST_DATASET] = dataset_id
    q_site_person[DEST_TABLE] = get_site_table(hpo_id, common.PERSON)
    if q_site_person[DEST_TABLE] in existing_tables:
        q_site_person[QUERY] = RETRACT_DATA_SITE_QUERY.format(
            project=project_id,
            pid_project=pid_project_id,
            dataset=q_site_person[DEST_DATASET],
            table=q_site_person[DEST_TABLE],
            pid_table_id=pid_table_id,
            sandbox_dataset_id=sandbox_dataset_id,
            person_research_id=PERSON_ID)
        site_queries.append(q_site_person)

    q_unioned_person = dict()
    q_unioned_person[DEST_DATASET] = dataset_id
    q_unioned_person[DEST_TABLE] = UNIONED_EHR + common.PERSON
    if q_unioned_person[DEST_TABLE] in existing_tables:
        q_unioned_person[QUERY] = RETRACT_DATA_UNIONED_QUERY.format(
            project=project_id,
            pid_project=pid_project_id,
            dataset=q_unioned_person[DEST_DATASET],
            table=q_unioned_person[DEST_TABLE],
            pid_table_id=pid_table_id,
            sandbox_dataset_id=sandbox_dataset_id,
            person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_person)

    # Remove fact_relationship records referencing retracted person_ids
    q_site_fact_relationship = dict()
    q_site_fact_relationship[DEST_DATASET] = dataset_id
    q_site_fact_relationship[DEST_TABLE] = get_site_table(
        hpo_id, common.FACT_RELATIONSHIP)
    if q_site_fact_relationship[DEST_TABLE] in existing_tables:
        q_site_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_site_fact_relationship[DEST_DATASET],
                table=q_site_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
        site_queries.append(q_site_fact_relationship)

    q_unioned_fact_relationship = dict()
    q_unioned_fact_relationship[DEST_DATASET] = dataset_id
    q_unioned_fact_relationship[
        DEST_TABLE] = UNIONED_EHR + common.FACT_RELATIONSHIP
    if q_unioned_fact_relationship[DEST_TABLE] in existing_tables:
        q_unioned_fact_relationship[
            QUERY] = RETRACT_DATA_FACT_RELATIONSHIP.format(
                project=project_id,
                pid_project=pid_project_id,
                dataset=q_unioned_fact_relationship[DEST_DATASET],
                table=q_unioned_fact_relationship[DEST_TABLE],
                PERSON_DOMAIN=PERSON_DOMAIN,
                pid_table_id=pid_table_id,
                sandbox_dataset_id=sandbox_dataset_id,
                person_research_id=PERSON_ID)
        unioned_queries.append(q_unioned_fact_relationship)

    return unioned_mapping_legacy_queries + unioned_mapping_queries, unioned_queries + site_queries
Пример #14
0
    def get_table_hpo_subquery_error(self, table, dataset_in, dataset_out):
        """Parse the HPO subquery for `table` and structurally validate it.

        Uses moz_sql_parser to check that the subquery selects from the HPO
        submission table and joins the expected mapping tables on primary
        and (implemented) foreign keys.

        :param table: name of a CDM table
        :param dataset_in: dataset containing the HPO submission tables
        :param dataset_out: dataset containing the mapping tables
        :return: a formatted SUBQUERY_FAIL_MSG describing the first mismatch
            found; falls through without an explicit return when all checks
            pass -- presumably yielding None; TODO confirm no trailing code
        """
        subquery = ehr_union.table_hpo_subquery(table, NYC_HPO_ID, dataset_in,
                                                dataset_out)

        # moz-sql-parser doesn't support the ROW_NUMBER() OVER() analytical
        # function, so strip that clause from the returned query before
        # parsing to keep the parser from erroring out.
        subquery = re.sub(
            r",\s+ROW_NUMBER\(\) OVER \(PARTITION BY nm\..+?_id\) AS row_num",
            " ", subquery)
        # 'offset' is used as a column name in the note_nlp table. BigQuery
        # accepts this, but moz_sql_parser treats it as a SQL keyword, so
        # quote it here as a test-only workaround.
        if 'offset,' in subquery:
            subquery = subquery.replace('offset,', '"offset",')
        stmt = moz_sql_parser.parse(subquery)

        # Sanity check it is a select statement
        if 'select' not in stmt:
            return SUBQUERY_FAIL_MSG.format(expr='query type',
                                            table=table,
                                            expected='select',
                                            actual=str(stmt),
                                            subquery=subquery)

        # Input table should be first in FROM expression
        actual_from = first_or_none(
            dpath.util.values(stmt, 'from/0/value/from/value') or
            dpath.util.values(stmt, 'from'))
        expected_from = dataset_in + '.' + bq_utils.get_table_id(
            NYC_HPO_ID, table)
        if expected_from != actual_from:
            return SUBQUERY_FAIL_MSG.format(expr='first object in FROM',
                                            table=table,
                                            expected=expected_from,
                                            actual=actual_from,
                                            subquery=subquery)

        # Ensure all key fields (primary or foreign) yield joins with their associated mapping tables
        # Note: ordering of joins in the subquery is assumed to be consistent with field order in the json file
        fields = resources.fields_for(table)
        id_field = table + '_id'
        # key_ind tracks the position of the join clause within stmt['from']
        key_ind = 0
        expected_join = None
        actual_join = None
        for field in fields:
            if field['name'] in self.mapped_fields:
                # key_ind += 1  # TODO use this increment when we generalize solution for all foreign keys
                if field['name'] == id_field:
                    # Primary key, mapping table associated with this one should be INNER joined
                    key_ind += 1
                    expr = 'inner join on primary key'
                    actual_join = first_or_none(
                        dpath.util.values(stmt, 'from/%s/join/value' % key_ind))
                    expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                        table)
                elif field['name'] in self.implemented_foreign_keys:
                    # Foreign key, mapping table associated with the referenced table should be LEFT joined
                    key_ind += 1
                    expr = 'left join on foreign key'
                    # Visit_detail table has 'visit_occurrence' column after 'care_site', which is different from
                    # other cdm tables, where 'visit_occurrence' comes before other foreign_keys.
                    # The test expects the same order as other cdm tables, so the expected-query has
                    # 'visit_occurrence' before 'care_site'. The following reorder is required to match the sequence
                    # to the actual-query.
                    if table == 'visit_detail' and key_ind == 2:
                        stmt['from'][2], stmt['from'][3] = stmt['from'][
                            3], stmt['from'][2]
                    actual_join = first_or_none(
                        dpath.util.values(stmt,
                                          'from/%s/left join/value' % key_ind))
                    # mapping table name is derived from the foreign key column
                    joined_table = field['name'].replace('_id', '')
                    expected_join = dataset_out + '.' + ehr_union.mapping_table_for(
                        joined_table)
                if expected_join != actual_join:
                    return SUBQUERY_FAIL_MSG.format(expr=expr,
                                                    table=table,
                                                    expected=expected_join,
                                                    actual=actual_join,
                                                    subquery=subquery)