def get_reroute_domain_mapping_queries(project_id, dataset_id):
    """
    Generate query dicts that reroute mapping records to the appropriate domain.

    :param project_id: identifies the project in which the queries run
    :param dataset_id: identifies the dataset in which the queries run
    :return: a list of query dicts, one per domain table, for rerouting the
        mapping records to the corresponding mapping table
    """
    queries = []
    for dest_table in domain_mapping.DOMAIN_TABLE_NAMES:
        # A destination can receive records from itself plus any source table
        # for which a domain mapping is declared.
        src_tables = []
        for candidate in domain_mapping.DOMAIN_TABLE_NAMES:
            if candidate == dest_table or domain_mapping.exist_domain_mappings(
                    candidate, dest_table):
                src_tables.append(candidate)

        reroute_query = REROUTE_DOMAIN_MAPPING_RECORD_QUERY.render(
            project_id=project_id,
            dataset_id=dataset_id,
            src_tables=src_tables,
            dest_table=dest_table)

        queries.append({
            cdr_consts.QUERY: reroute_query,
            cdr_consts.DESTINATION_TABLE: mapping_table_for(dest_table),
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: dataset_id
        })
    return queries
def _all_rdr_records_included(self):
    """
    Assert every RDR record is carried into the combined dataset.

    A record is considered missing when it has no row in the mapping table,
    or when its mapped id has no row in the combined domain table; the check
    therefore expects zero rows back for each domain table, whether or not a
    corresponding EHR record exists.
    """
    for domain_table in DOMAIN_TABLES:
        mapping_table = mapping_table_for(domain_table)
        # Rows returned here are RDR records that failed to propagate.
        missing_sql = (
            'SELECT rt.{domain_table}_id as id '
            'FROM `{rdr_dataset_id}.{domain_table}` AS rt '
            'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m '
            'ON rt.{domain_table}_id = m.src_{domain_table}_id '
            'WHERE '
            ' m.{domain_table}_id IS NULL '
            'OR NOT EXISTS '
            ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t '
            ' WHERE t.{domain_table}_id = m.{domain_table}_id)').format(
                domain_table=domain_table,
                mapping_table=mapping_table,
                rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
                combined_dataset_id=bq_utils.get_combined_dataset_id())
        missing_rows = bq_utils.response2rows(bq_utils.query(missing_sql))
        self.assertEqual(
            0, len(missing_rows),
            "RDR records should map to records in mapping and combined tables")
def _all_rdr_records_included(self):
    """
    Assert every RDR record is carried into the ehr_rdr dataset.

    A record is considered missing when it has no row in the mapping table,
    or when its mapped id has no row in the combined domain table; the check
    therefore expects zero rows back for each domain table, whether or not a
    corresponding EHR record exists.
    """
    for domain_table in DOMAIN_TABLES:
        mapping_table = mapping_table_for(domain_table)
        # Rows returned here are RDR records that failed to propagate.
        missing_sql = (
            'SELECT rt.{domain_table}_id as id '
            'FROM {rdr_dataset_id}.{domain_table} rt '
            'LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m '
            'ON rt.{domain_table}_id = m.src_{domain_table}_id '
            'WHERE m.{domain_table}_id IS NULL '
            'OR NOT EXISTS '
            '(SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t '
            'WHERE t.{domain_table}_id = m.{domain_table}_id)').format(
                domain_table=domain_table,
                mapping_table=mapping_table,
                rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
                ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id())
        missing_rows = test_util.response2rows(bq_utils.query(missing_sql))
        self.assertEqual(
            0, len(missing_rows),
            "RDR records should map to records in mapping and combined tables")
def _mapping_table_checks(self):
    """
    Verify mapping tables exist, match their schema, and have expected counts.

    For each domain table: the mapping table must appear in the combined
    dataset, its BigQuery schema must match the resource definition, and its
    row count must equal consented EHR rows plus RDR rows (except for tables
    such as observation where extra records are created for demographics).
    """
    # Restrict EHR counts to persons present in the consent table.
    where = (
        'WHERE EXISTS '
        ' (SELECT 1 FROM `{combined_dataset_id}.{ehr_consent_table_id}` AS c '
        ' WHERE t.person_id = c.person_id)').format(
            combined_dataset_id=self.combined_dataset_id,
            ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id, DOMAIN_TABLES,
                                            where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()

    expected_counts = {}
    expected_diffs = ['observation']
    for domain_table in DOMAIN_TABLES:
        mapping_table = mapping_table_for(domain_table)
        self.assertIn(mapping_table, output_tables)

        # Schema check: normalize the live table's fields and compare with
        # the resource file definition.
        expected_fields = resources.fields_for(mapping_table)
        table_info = bq_utils.get_table_info(mapping_table,
                                             self.combined_dataset_id)
        raw_fields = table_info.get('schema', dict()).get('fields', [])
        normalized_fields = [
            test_util.normalize_field_payload(field) for field in raw_fields
        ]
        self.assertCountEqual(expected_fields, normalized_fields)

        # Count should be sum of EHR and RDR
        # (except for tables like observation where extra records are created
        # for demographics)
        has_person_id = 'person_id' in [
            field.get('name', '')
            for field in resources.fields_for(domain_table)
        ]
        unconsented = (self.get_unconsented_ehr_records_count(domain_table)
                       if has_person_id else 0)

        actual_count = combined_counts[mapping_table]
        expected_counts[mapping_table] = (
            actual_count if domain_table in expected_diffs else
            (ehr_counts[domain_table] - unconsented) +
            rdr_counts[domain_table])

    self.assertDictContainsSubset(expected_counts, combined_counts)
def _mapping_table_checks(self):
    """
    Verify mapping tables exist, match their schema, and have expected counts.

    For each domain table: the mapping table must appear in the combined
    dataset, its BigQuery schema must match the resource definition, and its
    row count must equal consented EHR rows plus RDR rows (except for tables
    such as observation where extra records are created for demographics).
    """
    # Restrict EHR counts to persons present in the consent table.
    where = (
        'WHERE EXISTS '
        '(SELECT 1 FROM {ehr_rdr_dataset_id}.{ehr_consent_table_id} c '
        'WHERE t.person_id = c.person_id)').format(
            ehr_rdr_dataset_id=self.combined_dataset_id,
            ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id, DOMAIN_TABLES,
                                            where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()
    expected_counts = dict()
    expected_diffs = ['observation']
    # Show full diffs on assertion failure for easier debugging.
    self.maxDiff = None
    for t in DOMAIN_TABLES:
        expected_mapping_table = mapping_table_for(t)
        self.assertIn(expected_mapping_table, output_tables)
        expected_fields = resources.fields_for(expected_mapping_table)
        actual_table_info = bq_utils.get_table_info(expected_mapping_table,
                                                    self.combined_dataset_id)
        actual_fields = actual_table_info.get('schema',
                                              dict()).get('fields', [])
        actual_fields_norm = map(test_util.normalize_field_payload,
                                 actual_fields)
        # Fixed: assertItemsEqual is Python 2 only; the Python 3 unittest
        # name for the same order-insensitive comparison is assertCountEqual.
        self.assertCountEqual(expected_fields, actual_fields_norm)
        # Count should be sum of EHR and RDR
        # (except for tables like observation where extra records are created
        # for demographics)
        actual_count = combined_counts[expected_mapping_table]
        expected_count = (actual_count if t in expected_diffs else
                          ehr_counts[t] + rdr_counts[t])
        expected_counts[expected_mapping_table] = expected_count
    # Fixed: the unittest signature is (subset, dictionary, msg=None); the
    # keyword names expected=/actual= raise TypeError on Python 3.
    self.assertDictContainsSubset(expected_counts, combined_counts)
def get_clean_domain_queries(project_id, dataset_id, sandbox_dataset_id):
    """
    Build query dicts that drop post-reroute records foreign to each domain.

    For every domain table this produces a sandbox query that backs up the
    records to be removed, plus clean-up queries for both the domain table
    and its mapping table.

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset being cleaned
    :param sandbox_dataset_id: dataset that receives the sandboxed records
    :return: sandbox query dicts followed by clean-up query dicts
    """
    clean_queries = []
    sandbox_queries = []
    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        sandbox_table = sandbox_name_for(domain_table)
        # Back up the rows that the clean-up queries below will discard.
        sandbox_queries.append({
            cdr_consts.QUERY:
                SANDBOX_DOMAIN_RECORD_QUERY_TEMPLATE.render(
                    project_id=project_id,
                    dataset_id=dataset_id,
                    domain_table=domain_table),
            cdr_consts.DESTINATION_TABLE: sandbox_table,
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: sandbox_dataset_id
        })
        # Clean the domain table itself, then its corresponding mapping table.
        for is_mapping in (False, True):
            destination = (mapping_table_for(domain_table)
                           if is_mapping else domain_table)
            clean_queries.append({
                cdr_consts.QUERY:
                    CLEAN_DOMAIN_RECORD_QUERY_TEMPLATE.render(
                        project_id=project_id,
                        dataset_id=dataset_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        domain_table=domain_table,
                        sandbox_table=sandbox_table,
                        is_mapping=is_mapping),
                cdr_consts.DESTINATION_TABLE: destination,
                cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
                cdr_consts.DESTINATION_DATASET: dataset_id
            })
    return sandbox_queries + clean_queries
def get_clean_domain_queries(project_id, dataset_id, sandbox_dataset_id):
    """
    Build query dicts that drop post-reroute records foreign to each domain.

    For every domain table this produces a sandbox query that backs up the
    records to be removed, plus clean-up queries for both the domain table
    and its mapping table.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :param sandbox_dataset_id: sandbox dataset for dataset_id
    :return: list of query dicts to run
    """
    clean_queries = []
    sandbox_queries = []
    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        # Use non-standard concept if table is observation
        if domain_table == OBSERVATION:
            domain_concept_id = 'observation_source_concept_id'
        else:
            domain_concept_id = resources.get_domain_concept_id(domain_table)

        sandbox_table = sandbox_name_for(domain_table)
        # Back up the rows that the clean-up queries below will discard.
        sandbox_queries.append({
            cdr_consts.QUERY:
                SANDBOX_DOMAIN_RECORD_QUERY_TEMPLATE.render(
                    project_id=project_id,
                    dataset_id=dataset_id,
                    domain_table=domain_table,
                    domain_concept_id=domain_concept_id),
            cdr_consts.DESTINATION_TABLE: sandbox_table,
            cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
            cdr_consts.DESTINATION_DATASET: sandbox_dataset_id
        })
        # Clean the domain table itself, then its corresponding mapping table.
        for is_mapping in (False, True):
            destination = (mapping_table_for(domain_table)
                           if is_mapping else domain_table)
            clean_queries.append({
                cdr_consts.QUERY:
                    CLEAN_DOMAIN_RECORD_QUERY_TEMPLATE.render(
                        project_id=project_id,
                        dataset_id=dataset_id,
                        sandbox_dataset_id=sandbox_dataset_id,
                        domain_table=domain_table,
                        sandbox_table=sandbox_table,
                        is_mapping=is_mapping),
                cdr_consts.DESTINATION_TABLE: destination,
                cdr_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
                cdr_consts.DESTINATION_DATASET: dataset_id
            })
    return sandbox_queries + clean_queries