Exemplo n.º 1
0
    def test_subqueries(self):
        """
        Verify that mapping and union subqueries are generated only for HPO
        tables that exist.

        First only the pitt measurement table is created, so each helper is
        expected to return a single subquery referencing it. After the chs
        measurement table is created too, each helper is expected to return
        two subqueries, one referencing each table.
        """
        hpo_ids = ['chs', 'pitt']
        project_id = bq_utils.app_identity.get_application_id()
        dataset_id = bq_utils.get_dataset_id()
        table = 'measurement'
        mapping_msg = 'Expected mapping subquery count %s but got %s'
        union_msg = 'Expected union subquery count %s but got %s'

        # Only the pitt table exists at this point; no subquery should be
        # generated for chs, whose table has not been created yet
        pitt_table_id = self._create_hpo_table('pitt', table, dataset_id)
        expected_count = 1

        subqueries = ehr_union._mapping_subqueries(table, hpo_ids, dataset_id,
                                                   project_id)
        actual_count = len(subqueries)
        self.assertEqual(expected_count, actual_count,
                         mapping_msg % (expected_count, actual_count))
        self.assertIn(pitt_table_id, subqueries[0])

        subqueries = ehr_union._union_subqueries(table, hpo_ids, dataset_id,
                                                 self.output_dataset_id)
        # Recompute the count from the union result; reusing the mapping
        # count here would leave the union subquery count unchecked
        actual_count = len(subqueries)
        self.assertEqual(expected_count, actual_count,
                         union_msg % (expected_count, actual_count))
        self.assertIn(pitt_table_id, subqueries[0])

        # After adding measurement table for chs, should generate subqueries for both
        chs_table_id = self._create_hpo_table('chs', table, dataset_id)
        expected_count = 2
        subqueries = ehr_union._mapping_subqueries(table, hpo_ids, dataset_id,
                                                   project_id)
        actual_count = len(subqueries)
        self.assertEqual(expected_count, actual_count,
                         mapping_msg % (expected_count, actual_count))
        self.assertTrue(any(pitt_table_id in sq for sq in subqueries))
        self.assertTrue(any(chs_table_id in sq for sq in subqueries))

        subqueries = ehr_union._union_subqueries(table, hpo_ids, dataset_id,
                                                 self.output_dataset_id)
        actual_count = len(subqueries)
        self.assertEqual(expected_count, actual_count,
                         union_msg % (expected_count, actual_count))
        self.assertTrue(any(pitt_table_id in sq for sq in subqueries))
        self.assertTrue(any(chs_table_id in sq for sq in subqueries))
Exemplo n.º 2
0
    def test_mapping_subqueries(self, mock_list_all_table_ids):
        """
        Check the subqueries built for loading mapping tables.

        For every table except person, the destination key field is expected
        to be the source key plus a per-HPO constant. For person, both
        src_person_id and person_id are expected to select person_id as is.

        :param mock_list_all_table_ids: mock whose return value simulates the
            table ids that exist
        """
        # Simulated table listing:
        #   FAKE_SITE_1 and FAKE_SITE_2 (every id in self.hpo_ids) each have
        #   person, visit_occurrence and pii_name tables;
        #   only FAKE_SITE_1 has a condition_occurrence table
        simulated_table_ids = []
        for site in self.hpo_ids:
            for table_name in ['person', 'visit_occurrence', 'pii_name']:
                simulated_table_ids.append(
                    bq_utils.get_table_id(site, table_name))
        simulated_table_ids.append(
            bq_utils.get_table_id(self.FAKE_SITE_1, 'condition_occurrence'))
        mock_list_all_table_ids.return_value = simulated_table_ids
        site_count = len(self.hpo_ids)

        # visit_occurrence: the destination key has the HPO offset added
        visit_subqueries = eu._mapping_subqueries('visit_occurrence',
                                                  self.hpo_ids, 'fake_dataset',
                                                  'fake_project')
        offsets_by_hpo = eu.get_hpo_offsets(self.hpo_ids)
        self.assertEqual(site_count, len(visit_subqueries))
        for site, query in zip(self.hpo_ids, visit_subqueries):
            source_table = bq_utils.get_table_id(site, 'visit_occurrence')
            offset = offsets_by_hpo[site]
            self.assertIn(f"'{source_table}' AS src_table_id", query)
            self.assertIn('visit_occurrence_id AS src_visit_occurrence_id',
                          query)
            self.assertIn(
                f'visit_occurrence_id + {offset} AS visit_occurrence_id',
                query)

        # person: src_person_id and person_id both carry the participant ID
        # unchanged (no offset applied)
        person_subqueries = eu._mapping_subqueries('person', self.hpo_ids,
                                                   'fake_dataset',
                                                   'fake_project')
        self.assertEqual(site_count, len(person_subqueries))
        for site, query in zip(self.hpo_ids, person_subqueries):
            source_table = bq_utils.get_table_id(site, 'person')
            self.assertIn(f"'{source_table}' AS src_table_id", query)
            self.assertIn('person_id AS src_person_id', query)
            self.assertIn('person_id AS person_id', query)

        # condition_occurrence: a subquery is produced only for the site
        # whose table exists
        condition_subqueries = eu._mapping_subqueries('condition_occurrence',
                                                      self.hpo_ids,
                                                      'fake_dataset',
                                                      'fake_project')
        self.assertEqual(1, len(condition_subqueries))
        query = condition_subqueries[0]
        source_table = bq_utils.get_table_id(self.FAKE_SITE_1,
                                             'condition_occurrence')
        offset = offsets_by_hpo[self.FAKE_SITE_1]
        self.assertIn(f"'{source_table}' AS src_table_id", query)
        self.assertIn('condition_occurrence_id AS src_condition_occurrence_id',
                      query)
        self.assertIn(
            f'condition_occurrence_id + {offset} AS condition_occurrence_id',
            query)