def test_ehr_person_to_observation_counts(self):
    self._load_datasets()

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    q_person = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_person AS p
        '''.format(output_dataset_id=self.output_dataset_id)
    person_response = bq_utils.query(q_person)
    person_rows = bq_utils.response2rows(person_response)

    q_observation = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_observation
        WHERE observation_type_concept_id = 38000280
        '''.format(output_dataset_id=self.output_dataset_id)
    # observation should contain 4 records per person of type EHR
    expected = len(person_rows) * 4
    observation_response = bq_utils.query(q_observation)
    observation_rows = bq_utils.response2rows(observation_response)
    actual = len(observation_rows)
    self.assertEqual(
        actual, expected,
        'Expected %s EHR person records in observation but found %s' %
        (expected, actual))
def run_heel(hpo_id):
    """
    Run heel commands

    :param hpo_id: string name for the hpo identifier
    :returns: None
    :raises RuntimeError: Raised if BigQuery takes longer than 30 seconds
        to complete a job on a temporary table
    """
    # very long test
    commands = _get_heel_commands(hpo_id)
    count = 0
    for command in commands:
        count = count + 1
        logging.debug(' ---- running query # {}'.format(count))
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']
            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created within 30 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            query = 'DELETE FROM %s WHERE TRUE' % table_id
            bq_utils.query(query)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
def test_ehr_person_to_observation_counts(self, mock_tables_map):
    self._load_datasets()
    mock_tables_map.return_value = [
        common.OBSERVATION, common.LOCATION, common.CARE_SITE,
        common.VISIT_OCCURRENCE
    ]

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    q_person = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_person AS p
        '''.format(output_dataset_id=self.output_dataset_id)
    person_response = bq_utils.query(q_person)
    person_rows = bq_utils.response2rows(person_response)

    q_observation = '''
        SELECT *
        FROM {output_dataset_id}.unioned_ehr_observation
        WHERE observation_type_concept_id = 38000280
        '''.format(output_dataset_id=self.output_dataset_id)
    # observation should contain 4 records per person of type EHR
    expected = len(person_rows) * 4
    observation_response = bq_utils.query(q_observation)
    observation_rows = bq_utils.response2rows(observation_response)
    actual = len(observation_rows)
    self.assertEqual(
        actual, expected,
        'Expected %s EHR person records in observation but found %s' %
        (expected, actual))
def _create_drug_class_table(self):
    table_name = 'drug_class'
    fields = [{
        "type": "integer",
        "name": "concept_id",
        "mode": "required"
    }, {
        "type": "string",
        "name": "concept_name",
        "mode": "required"
    }, {
        "type": "string",
        "name": "drug_class_name",
        "mode": "required"
    }]
    bq_utils.create_table(table_id=table_name,
                          fields=fields,
                          drop_existing=True,
                          dataset_id=self.bigquery_dataset_id)

    bq_utils.query(q=main_constants.DRUG_CLASS_QUERY.format(
        dataset_id=self.bigquery_dataset_id),
                   use_legacy_sql=False,
                   destination_table_id='drug_class',
                   retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                   write_disposition='WRITE_TRUNCATE',
                   destination_dataset_id=self.bigquery_dataset_id)
def retraction_query_runner(queries):
    query_job_ids = []
    for query_dict in queries:
        logger.debug('Retracting from %s.%s using query %s' %
                     (query_dict[DEST_DATASET], query_dict[DEST_TABLE],
                      query_dict[QUERY]))
        if query_dict[DELETE_FLAG]:
            job_results = bq_utils.query(q=query_dict[QUERY], batch=True)
            rows_affected = job_results['numDmlAffectedRows']
            logger.debug('%s rows deleted from %s.%s' %
                         (rows_affected, query_dict[DEST_DATASET],
                          query_dict[DEST_TABLE]))
        else:
            job_results = bq_utils.query(
                q=query_dict[QUERY],
                destination_table_id=query_dict[DEST_TABLE],
                write_disposition=WRITE_TRUNCATE,
                destination_dataset_id=query_dict[DEST_DATASET],
                batch=True)
            query_job_id = job_results['jobReference']['jobId']
            query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        logger.debug('Failed on {count} job ids {ids}'.format(
            count=len(incomplete_jobs), ids=incomplete_jobs))
        logger.debug('Terminating retraction')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
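# A minimal sketch of the `queries` structure that retraction_query_runner expects,
# assuming DEST_DATASET, DEST_TABLE, QUERY and DELETE_FLAG are the module-level key
# constants referenced above; the dataset, table, and SQL shown here are hypothetical
# placeholders, not values from the original module.
example_queries = [{
    DEST_DATASET: 'fake_ehr_dataset',
    DEST_TABLE: 'fake_hpo_observation',
    QUERY: 'DELETE FROM `fake_ehr_dataset.fake_hpo_observation` WHERE person_id IN (1, 2)',
    DELETE_FLAG: True  # run as a DML delete rather than a select-and-overwrite
}]
# retraction_query_runner(example_queries)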
def run_analyses(hpo_id):
    """
    Run the achilles analyses

    :param hpo_id:
    :return:
    """
    commands = _get_run_analysis_commands(hpo_id)
    for command in commands:
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']
            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created within 15 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            if bq_utils.table_exists(table_id):
                bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
def run_heel(hpo_id):
    # very long test
    commands = _get_heel_commands(hpo_id)
    count = 0
    for command in commands:
        count = count + 1
        logging.debug(' ---- running query # {}'.format(count))
        logging.debug(' ---- Running `%s`...\n' % command)
        if sql_wrangle.is_to_temp_table(command):
            table_id = sql_wrangle.get_temp_table_name(command)
            query = sql_wrangle.get_temp_table_query(command)
            insert_query_job_result = bq_utils.query(query, False, table_id)
            query_job_id = insert_query_job_result['jobReference']['jobId']
            incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
            if len(incomplete_jobs) > 0:
                logging.critical('tempresults was not created within 30 seconds')
                raise RuntimeError('Tempresults taking too long to create')
        elif sql_wrangle.is_truncate(command):
            table_id = sql_wrangle.get_truncate_table_name(command)
            query = 'DELETE FROM %s WHERE TRUE' % table_id
            bq_utils.query(query)
        elif sql_wrangle.is_drop(command):
            table_id = sql_wrangle.get_drop_table_name(command)
            bq_utils.delete_table(table_id)
        else:
            bq_utils.query(command)
        time.sleep(1)
def test_merge_EHR(self, mock_check_cron):
    self._load_datasets()
    # enable exception propagation as described at https://goo.gl/LqDgnj
    old_dataset_items = bq_utils.list_dataset_contents(
        bq_utils.get_dataset_id())

    expected_items = ['visit_id_mapping_table']
    expected_items.extend(
        ['unioned_ehr_' + table_name for table_name in common.CDM_TABLES])

    ehr_merge.merge(bq_utils.get_dataset_id(), self.project_id)
    # check the result files were placed in bucket
    dataset_items = bq_utils.list_dataset_contents(bq_utils.get_dataset_id())

    for table_name in common.CDM_TABLES:
        cmd = 'SELECT COUNT(1) FROM unioned_ehr_{}'.format(table_name)
        result = bq_utils.query(cmd)
        self.assertEqual(
            int(result['rows'][0]['f'][0]['v']),
            2 * globals().get(table_name.upper() + '_COUNT', 0),
            msg='failed for table unioned_ehr_{}'.format(table_name))

    self.assertSetEqual(set(old_dataset_items + expected_items),
                        set(dataset_items))

    table_name = 'condition_occurrence'
    cmd_union = 'SELECT * FROM unioned_ehr_{}'.format(table_name)
    cmd_pitt = 'SELECT * FROM pitt_{}'.format(table_name)
    cmd_visit_mapping = ("SELECT global_visit_id, mapping_visit_id "
                         "FROM visit_id_mapping_table where hpo='pitt'")
    qr_union = bq_utils.query(cmd_union)
    qr_pitt = bq_utils.query(cmd_pitt)
    qr_visit_mapping = bq_utils.query(cmd_visit_mapping)

    union_result = query_result_to_payload(qr_union)
    pitt_result = query_result_to_payload(qr_pitt)
    visit_mapping_result = query_result_to_payload(qr_visit_mapping)

    def get_element_from_list_of_lists(index, list_of_lists):
        return [list_item[index] for list_item in list_of_lists]

    for ind, pitt_visit_id in enumerate(pitt_result['VISIT_OCCURRENCE_ID']):
        if pitt_visit_id not in visit_mapping_result['MAPPING_VISIT_ID']:
            continue
        global_visit_id_index = visit_mapping_result['MAPPING_VISIT_ID'].index(
            pitt_visit_id)
        global_visit_id = visit_mapping_result['GLOBAL_VISIT_ID'][
            global_visit_id_index]
        union_visit_id_index = union_result['VISIT_OCCURRENCE_ID'].index(
            global_visit_id)
        pitt_cols_without_id = [
            values for key, values in pitt_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        union_cols_without_id = [
            values for key, values in union_result.items()
            if key not in [u'VISIT_OCCURRENCE_ID', u'CONDITION_OCCURRENCE_ID']
        ]
        self.assertListEqual(
            get_element_from_list_of_lists(ind, pitt_cols_without_id),
            get_element_from_list_of_lists(union_visit_id_index,
                                           union_cols_without_id))
def get_most_recent(app_id=None, drc_bucket=None, report_for=None):
    """
    Query audit logs for paths to the most recent datasources.json files in the DRC bucket.

    Note: Results are cached in a local json file to avoid unnecessary queries.

    :param app_id: identifies the GCP project
    :param drc_bucket: identifies the DRC bucket
    :param report_for: denotes which query to use b/w achilles and results
    :return: list of dict with keys `file_path`, `upload_timestamp`
    """
    if app_id is None:
        app_id = app_identity.get_application_id()
    if drc_bucket is None:
        drc_bucket = gcs_utils.get_drc_bucket()
    if report_for == common.REPORT_FOR_ACHILLES:
        if not os.path.exists(common.LATEST_REPORTS_JSON):
            query = LATEST_REPORTS_QUERY.format(app_id=app_id,
                                                drc_bucket=drc_bucket,
                                                year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_REPORTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_REPORTS_JSON, 'r') as fp:
            return json.load(fp)
    elif report_for == common.REPORT_FOR_RESULTS:
        if not os.path.exists(common.LATEST_RESULTS_JSON):
            query = LATEST_RESULTS_QUERY.format(app_id=app_id,
                                                drc_bucket=drc_bucket,
                                                year=common.LOG_YEAR)
            query_job = bq_utils.query(query)
            result = bq_utils.response2rows(query_job)
            with open(common.LATEST_RESULTS_JSON, 'w') as fp:
                json.dump(result, fp, sort_keys=True, indent=4)
        with open(common.LATEST_RESULTS_JSON, 'r') as fp:
            return json.load(fp)
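# A minimal usage sketch for get_most_recent above: fetch the most recent
# datasources.json uploads for the achilles report. The first call runs the
# audit-log query and writes common.LATEST_REPORTS_JSON; later calls read the
# cached file. The row keys follow the docstring; everything else is assumed.
latest_reports = get_most_recent(report_for=common.REPORT_FOR_ACHILLES)
for entry in latest_reports:
    print(entry['file_path'], entry['upload_timestamp'])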
def drop_concept_table(dataset_id):
    if bq_utils.table_exists(common.CONCEPT):
        q = "DROP TABLE {dataset}.concept;".format(dataset=dataset_id)
        try:
            bq_utils.query(q)
        except HttpError as err:
            if err.resp.status != 404:
                raise
def load_analyses(hpo_id):
    """
    Populate achilles lookup table

    :param hpo_id:
    :return:
    """
    commands = _get_load_analysis_commands(hpo_id)
    for command in commands:
        bq_utils.query(command)
def test_ehr_person_to_observation(self, mock_tables_map):
    # ehr person table converts to observation records
    self._load_datasets()
    mock_tables_map.return_value = [
        common.OBSERVATION, common.LOCATION, common.CARE_SITE,
        common.VISIT_OCCURRENCE
    ]

    # perform ehr union
    ehr_union.main(self.input_dataset_id, self.output_dataset_id,
                   self.project_id, self.hpo_ids)

    person_query = '''
        SELECT
            person_id,
            gender_concept_id,
            gender_source_value,
            race_concept_id,
            race_source_value,
            CAST(birth_datetime AS STRING) AS birth_datetime,
            ethnicity_concept_id,
            ethnicity_source_value,
            EXTRACT(DATE FROM birth_datetime) AS birth_date
        FROM {output_dataset_id}.unioned_ehr_person
        '''.format(output_dataset_id=self.output_dataset_id)
    person_response = bq_utils.query(person_query)
    person_rows = bq_utils.response2rows(person_response)

    # construct dicts of expected values
    expected = []
    for person_row in person_rows:
        expected.extend(self.convert_ehr_person_to_observation(person_row))

    # query for observation table records
    query = '''
        SELECT
            person_id,
            observation_concept_id,
            value_as_concept_id,
            value_as_string,
            observation_source_value,
            observation_date
        FROM {output_dataset_id}.unioned_ehr_observation AS obs
        WHERE obs.observation_concept_id IN
            ({gender_concept_id}, {race_concept_id}, {dob_concept_id}, {ethnicity_concept_id})
        '''
    obs_query = query.format(
        output_dataset_id=self.output_dataset_id,
        gender_concept_id=eu_constants.GENDER_CONCEPT_ID,
        race_concept_id=eu_constants.RACE_CONCEPT_ID,
        dob_concept_id=eu_constants.DOB_CONCEPT_ID,
        ethnicity_concept_id=eu_constants.ETHNICITY_CONCEPT_ID)
    obs_response = bq_utils.query(obs_query)
    obs_rows = bq_utils.response2rows(obs_response)
    actual = obs_rows

    self.assertCountEqual(expected, actual)
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset):
    mock_list_datasets.return_value = [{
        'id': self.project_id + ':' + self.bq_dataset_id
    }]
    mock_is_ehr_dataset.return_value = True

    job_ids = []
    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pids=retract_data_bq.int_list_to_bq(self.person_ids))
        retract_data_bq.logger.debug('Preparing to load table %s.%s' %
                                     (self.bq_dataset_id, hpo_table))
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        retract_data_bq.logger.debug('Loading table %s.%s' %
                                     (self.bq_dataset_id, hpo_table))
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    retract_data_bq.logger.debug('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        result = bq_utils.query(row_count_queries[table])
        expected_row_count[table] = retract_data_bq.to_int(result['totalRows'])

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_before_retraction = {}
    for row in result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    deid_flag = False
    # perform retraction
    retract_data_bq.run_retraction(self.test_project_id, self.person_ids,
                                   self.hpo_id, deid_flag)

    # find actual deleted rows
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_after_retraction = {}
    for row in result:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])
def test_get_lab_concept_summary_query(self):
    summary_query = required_labs.get_lab_concept_summary_query(FAKE_HPO_ID)
    summary_response = bq_utils.query(summary_query)
    summary_rows = bq_utils.response2rows(summary_response)
    submitted_labs = [
        row for row in summary_rows
        if row['measurement_concept_id_exists'] == 1
    ]
    actual_total_labs = summary_response['totalRows']

    # Count the total number of labs required. This number should equal the
    # total number of rows returned by get_lab_concept_summary_query,
    # including both the submitted and missing labs.
    unique_ancestor_concept_query = sql_wrangle.qualify_tables(
        """SELECT DISTINCT ancestor_concept_id
           FROM `{project_id}.{dataset_id}.{table_id}`""".format(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE))
    unique_ancestor_concept_response = bq_utils.query(
        unique_ancestor_concept_query)
    expected_total_labs = unique_ancestor_concept_response['totalRows']

    # Count the number of labs in the measurement table. This number should
    # equal the number of labs submitted by the fake site.
    unique_measurement_concept_id_query = '''
        SELECT
            DISTINCT c.ancestor_concept_id
        FROM `{project_id}.{dataset_id}.{measurement_concept_sets_descendants}` AS c
        JOIN `{project_id}.{dataset_id}.{measurement}` AS m
        ON c.descendant_concept_id = m.measurement_concept_id
        '''.format(project_id=self.project_id,
                   dataset_id=self.dataset_id,
                   measurement_concept_sets_descendants=
                   MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                   measurement=bq_utils.get_table_id(FAKE_HPO_ID,
                                                     common.MEASUREMENT))
    unique_measurement_concept_id_response = bq_utils.query(
        unique_measurement_concept_id_query)
    unique_measurement_concept_id_total_labs = unique_measurement_concept_id_response[
        'totalRows']

    self.assertEqual(int(expected_total_labs),
                     int(actual_total_labs),
                     msg='Compare the total number of labs')
    self.assertEqual(int(unique_measurement_concept_id_total_labs),
                     len(submitted_labs),
                     msg='Compare the number of labs submitted in the measurement')
def test_execute_queries(self):
    gender_nonbinary_concept_id = 1585841
    gender_nonbinary_source_concept_id = 123
    sex_female_concept_id = 1585847
    sex_female_source_concept_id = 45878463
    for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
        query = tmpl.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            gender_concept_id=repopulate_person_post_deid.GENDER_CONCEPT_ID,
            gender_nonbinary_concept_id=gender_nonbinary_concept_id,
            gender_nonbinary_source_concept_id=gender_nonbinary_source_concept_id,
            sex_at_birth_concept_id=repopulate_person_post_deid.SEX_AT_BIRTH_CONCEPT_ID,
            sex_female_concept_id=sex_female_concept_id,
            sex_female_source_concept_id=sex_female_source_concept_id)
        try:
            resp = bq_utils.query(query)
        except HttpError as e:
            self.fail("failed to execute query '{}': {}".format(
                query, e.content))
        self.assertTrue(resp["jobComplete"])

    clean_cdr_engine.clean_dataset(
        self.project_id, self.dataset_id, self.sandbox_dataset_id,
        [(repopulate_person_post_deid.get_repopulate_person_post_deid_queries,)])

    rows = bq_utils.response2rows(
        bq_utils.query("SELECT * FROM `{}.{}.person`".format(
            self.project_id, self.dataset_id)))
    self.assertEqual(len(rows), 2)

    by_participant = {r["person_id"]: r for r in rows}
    self.assertPersonFields(
        by_participant[1], {
            "gender_concept_id": gender_nonbinary_concept_id,
            "gender_source_value": "nonbinary_src",
            "gender_source_concept_id": gender_nonbinary_source_concept_id,
            "sex_at_birth_concept_id": sex_female_concept_id,
            "sex_at_birth_source_value": "female_src",
            "sex_at_birth_source_concept_id": sex_female_source_concept_id
        })
    self.assertPersonFields(
        by_participant[2], {
            "gender_concept_id": 0,
            "gender_source_value": "No matching concept",
            "gender_source_concept_id": 0,
            "sex_at_birth_concept_id": 0,
            "sex_at_birth_source_value": "No matching concept",
            "sex_at_birth_source_concept_id": 0
        })
def tearDown(self):
    test_util.delete_all_tables(self.dataset_id)
    # delete concept table
    drop_concept_table(self.dataset_id)
    # recreate concept table
    q = """CREATE OR REPLACE TABLE `{project}.{dataset}.concept` AS (
        SELECT * FROM `{project}.{vocab}.concept`)""".format(
        project=self.project_id,
        dataset=self.dataset_id,
        vocab=common.VOCABULARY_DATASET)
    bq_utils.query(q)
def drop_or_truncate_table(command):
    """
    Deletes or truncates table

    :param command: query to run
    :return: None
    """
    if sql_wrangle.is_truncate(command):
        table_id = sql_wrangle.get_truncate_table_name(command)
        query = 'DELETE FROM %s WHERE TRUE' % table_id
        bq_utils.query(query)
    else:
        table_id = sql_wrangle.get_drop_table_name(command)
        bq_utils.delete_table(table_id)
def copy_metadata_table(project_id, source_dataset_id, target_dataset_id,
                        table_fields):
    """
    Copy the metadata table from the source dataset to the target dataset

    :param project_id: identifies the project
    :param source_dataset_id: identifies the dataset containing the metadata table
    :param target_dataset_id: identifies the dataset to copy the metadata table to
    :param table_fields: schema fields for the metadata table
    :return:
    """
    create_metadata_table(target_dataset_id, table_fields)
    query = COPY_QUERY.format(project=project_id, datset=source_dataset_id)
    bq_utils.query(query,
                   destination_dataset_id=target_dataset_id,
                   destination_table_id=METADATA_TABLE)
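# A minimal sketch of the table_fields argument for copy_metadata_table above,
# assuming the same BigQuery schema-field dict format used by bq_utils.create_table
# elsewhere in this listing (see _create_drug_class_table); the field names,
# project, and dataset ids here are hypothetical.
example_metadata_fields = [{
    "type": "string",
    "name": "etl_version",
    "mode": "nullable"
}, {
    "type": "timestamp",
    "name": "copy_date",
    "mode": "nullable"
}]
# copy_metadata_table('fake-project', 'source_dataset', 'target_dataset',
#                     example_metadata_fields)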
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = self.dataset_id
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=dataset_id, table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
    bucket_blob = sc_bucket.blob('observation.csv')
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        bucket_blob.upload_from_file(fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))

    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertCountEqual(actual_result, expected_observation_ids)
def test_consented_person_id(self):
    """
    Test observation data has seven (7) persons with consent records as described below
     1: No
     2: Yes
     3: NULL
     4: No followed by Yes
     5: Yes followed by No
     6: Yes followed by NULL
     7: NULL and Yes with same date/time
    """
    # sanity check
    # pre-conditions
    self.assertFalse(
        bq_utils.table_exists(EHR_CONSENT_TABLE_ID, self.combined_dataset_id))

    # test
    ehr_consent()

    # post conditions
    self.assertTrue(
        bq_utils.table_exists(EHR_CONSENT_TABLE_ID, self.combined_dataset_id),
        'Table {dataset}.{table} created by consented_person'.format(
            dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
    response = bq_utils.query('SELECT * FROM {dataset}.{table}'.format(
        dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
    rows = bq_utils.response2rows(response)
    expected = {2, 4}
    actual = set(row['person_id'] for row in rows)
    self.assertSetEqual(
        expected, actual, 'Records in {dataset}.{table}'.format(
            dataset=self.combined_dataset_id, table=EHR_CONSENT_TABLE_ID))
def _all_rdr_records_included(self):
    """
    All rdr records are included whether or not there is a corresponding ehr record
    """
    for domain_table in DOMAIN_TABLES:
        mapping_table = mapping_table_for(domain_table)
        query = (
            'SELECT rt.{domain_table}_id as id '
            'FROM `{rdr_dataset_id}.{domain_table}` AS rt '
            'LEFT JOIN `{combined_dataset_id}.{mapping_table}` AS m '
            'ON rt.{domain_table}_id = m.src_{domain_table}_id '
            'WHERE '
            ' m.{domain_table}_id IS NULL '
            'OR NOT EXISTS '
            ' (SELECT 1 FROM `{combined_dataset_id}.{domain_table}` AS t '
            '  WHERE t.{domain_table}_id = m.{domain_table}_id)').format(
                domain_table=domain_table,
                rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
                combined_dataset_id=bq_utils.get_combined_dataset_id(),
                mapping_table=mapping_table)
        response = bq_utils.query(query)
        rows = bq_utils.response2rows(response)
        self.assertEqual(
            0, len(rows),
            "RDR records should map to records in mapping and combined tables")
def _ehr_only_records_excluded(self):
    """
    EHR person records which are missing from RDR are excluded from combined
    """
    query = ('WITH ehr_only AS '
             ' (SELECT person_id '
             '  FROM `{ehr_dataset_id}.person` AS ep '
             '  WHERE NOT EXISTS '
             '   (SELECT 1 '
             '    FROM `{rdr_dataset_id}.person` AS rp '
             '    WHERE rp.person_id = ep.person_id) '
             ' ) '
             'SELECT '
             'ehr_only.person_id AS ehr_person_id, '
             'p.person_id AS combined_person_id '
             'FROM ehr_only '
             'LEFT JOIN `{combined_dataset_id}.person` AS p '
             'ON ehr_only.person_id = p.person_id').format(
                 ehr_dataset_id=self.ehr_dataset_id,
                 rdr_dataset_id=self.rdr_dataset_id,
                 combined_dataset_id=self.combined_dataset_id)
    response = bq_utils.query(query)
    rows = bq_utils.response2rows(response)
    self.assertGreater(len(rows), 0, 'Test data is missing EHR-only records')
    for row in rows:
        combined_person_id = row['combined_person_id']
        self.assertIsNone(
            combined_person_id,
            'EHR-only person_id `{ehr_person_id}` found in combined when it should be excluded'
            .format(ehr_person_id=row['ehr_person_id']))
def test_copy_rdr_tables(self):
    for table in RDR_TABLES_TO_COPY:
        self.assertFalse(
            bq_utils.table_exists(table,
                                  self.combined_dataset_id))  # sanity check
        copy_rdr_table(table)
        actual = bq_utils.table_exists(table, self.combined_dataset_id)
        self.assertTrue(
            actual,
            msg='RDR table {table} should be copied'.format(table=table))

        # Check that row count in combined is same as rdr
        query = ('WITH rdr AS '
                 ' (SELECT COUNT(1) n FROM `{rdr_dataset_id}.{table}`), '
                 'combined AS '
                 ' (SELECT COUNT(1) n FROM `{combined_dataset_id}.{table}`) '
                 'SELECT '
                 'rdr.n AS rdr_count, '
                 'combined.n AS combined_count '
                 'FROM rdr, combined ').format(
                     rdr_dataset_id=self.rdr_dataset_id,
                     combined_dataset_id=self.combined_dataset_id,
                     table=table)
        response = bq_utils.query(query)
        rows = bq_utils.response2rows(response)
        self.assertTrue(len(rows) == 1)  # sanity check
        row = rows[0]
        rdr_count, combined_count = row['rdr_count'], row['combined_count']
        msg_fmt = 'Table {table} has {rdr_count} in rdr and {combined_count} in combined (expected to be equal)'
        self.assertEqual(
            rdr_count, combined_count,
            msg_fmt.format(table=table,
                           rdr_count=rdr_count,
                           combined_count=combined_count))
def _all_rdr_records_included(self):
    """
    All rdr records are included whether or not there is a corresponding ehr record
    """
    for domain_table in DOMAIN_TABLES:
        mapping_table = mapping_table_for(domain_table)
        q = '''SELECT rt.{domain_table}_id as id
               FROM {rdr_dataset_id}.{domain_table} rt
               LEFT JOIN {ehr_rdr_dataset_id}.{mapping_table} m
               ON rt.{domain_table}_id = m.src_{domain_table}_id
               WHERE
                 m.{domain_table}_id IS NULL
               OR NOT EXISTS
                 (SELECT 1 FROM {ehr_rdr_dataset_id}.{domain_table} t
                  WHERE t.{domain_table}_id = m.{domain_table}_id)'''.format(
            domain_table=domain_table,
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            mapping_table=mapping_table)
        response = bq_utils.query(q)
        rows = test_util.response2rows(response)
        self.assertEqual(
            0, len(rows),
            "RDR records should map to records in mapping and combined tables")
def export_from_path(p, hpo_id):
    """
    Export results

    :param p: path to directory containing SQL files
    :param hpo_id: HPO to run export for
    :return: `dict` structured for report render
    """
    result = dict()
    for f in list_files_only(p):
        name = f[0:-4].upper()
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = render(sql,
                         hpo_id,
                         results_schema=bq_utils.get_dataset_id(),
                         vocab_schema='')
            query_result = bq_utils.query(sql)
            # TODO reshape results
            result[name] = query_result_to_payload(query_result)

    for d in list_dirs_only(p):
        abs_path = os.path.join(p, d)
        name = d.upper()
        # recursive call
        dir_result = export_from_path(abs_path, hpo_id)
        if name in result:
            # a sql file generated the item already
            result[name].update(dir_result)
        else:
            # add the item
            result[name] = dir_result

    return result
def query(q, dst_table_id, dst_dataset_id, write_disposition='WRITE_APPEND'):
    """
    Run query and save results to a table

    :param q: SQL statement
    :param dst_table_id: save results in a table with the specified id
    :param dst_dataset_id: identifies output dataset
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND
        (default, to preserve schema)
    :return: query result
    """
    query_job_result = bq_utils.query(q,
                                      destination_table_id=dst_table_id,
                                      destination_dataset_id=dst_dataset_id,
                                      write_disposition=write_disposition)
    query_job_id = query_job_result['jobReference']['jobId']
    logging.info(f'Job {query_job_id} started for table {dst_table_id}')
    job_status = query_job_result['status']
    error_result = job_status.get('errorResult')
    if error_result is not None:
        msg = f'Job {query_job_id} failed because: {error_result}'
        raise bq_utils.InvalidOperationError(msg)
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    return query_job_result
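# A minimal usage sketch for the query() wrapper defined above; the SQL, table,
# and dataset names are hypothetical placeholders.
result = query(
    'SELECT person_id FROM `fake_dataset.person` WHERE person_id > 0',
    dst_table_id='person_subset',
    dst_dataset_id='fake_sandbox_dataset',
    write_disposition='WRITE_TRUNCATE')  # overwrite any prior contents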
def test_load_csv(self):
    from google.appengine.api import app_identity

    app_id = app_identity.get_application_id()
    table_name = 'achilles_analysis'
    schema_file_name = table_name + '.json'
    csv_file_name = table_name + '.csv'
    schema_path = os.path.join(resources.fields_path, schema_file_name)
    local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                  csv_file_name)
    with open(local_csv_path, 'r') as fp:
        response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name, fp)
    hpo_bucket = self.hpo_bucket
    gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
    dataset_id = bq_utils.get_dataset_id()
    load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                     dataset_id, table_name)
    load_job_id = load_results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_name))
    query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                    locals())
    self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
def test_integration_create_drug_route_mappings_table(self):
    if bq_utils.table_exists(populate_route_ids.DRUG_ROUTES_TABLE_ID,
                             dataset_id=self.dataset_id):
        bq_utils.delete_table(populate_route_ids.DRUG_ROUTES_TABLE_ID,
                              dataset_id=self.dataset_id)
    if not bq_utils.table_exists(populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
                                 dataset_id=self.dataset_id):
        populate_route_ids.create_dose_form_route_mappings_table(
            self.project_id, self.dataset_id)
    populate_route_ids.create_drug_route_mappings_table(
        self.project_id, self.dataset_id,
        populate_route_ids.DOSE_FORM_ROUTES_TABLE_ID,
        self.route_mapping_prefix)
    time.sleep(10)

    query = ("SELECT COUNT(*) AS n "
             "FROM `{project_id}.{dataset_id}.{table_id}`").format(
                 project_id=self.project_id,
                 dataset_id=self.dataset_id,
                 table_id=populate_route_ids.DRUG_ROUTES_TABLE_ID)
    result = bq_utils.query(query)
    actual = bq_utils.response2rows(result)
    self.assertGreater(actual[0]["n"], 0)
def _export_query_response_by_path(p, hpo_id):
    """Utility to create response test payloads"""
    import json

    from validation import export

    for f in export.list_files_only(p):
        abs_path = os.path.join(p, f)
        with open(abs_path, 'r') as fp:
            sql = fp.read()
            sql = export.render(sql,
                                hpo_id,
                                results_schema=bq_utils.get_dataset_id(),
                                vocab_schema='synpuf_100')
            query_result = bq_utils.query(sql)
            out_file = os.path.join(TEST_DATA_EXPORT_PATH,
                                    f.replace('.sql', '_response.json'))
            with open(out_file, 'w') as out_fp:
                data = dict()
                if 'rows' in query_result:
                    data['rows'] = query_result['rows']
                if 'schema' in query_result:
                    data['schema'] = query_result['schema']
                json.dump(data,
                          out_fp,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
def most_common_heel_errors(app_id=None, dataset_id=None, hpo_ids=None):
    """
    :param app_id: Application Id
    :param dataset_id: Dataset Id
    :param hpo_ids: list of Hpo_ids
    :return: None
    """
    heel_errors = list()
    if app_id is None:
        app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if not os.path.exists(HEEL_ERRORS_JSON) and not os.path.exists(
            HEEL_ERRORS_CSV):
        for hpo_id in hpo_ids:
            if bq_utils.table_exists(
                    table_id='{hpo_id}_achilles_heel_results'.format(
                        hpo_id=hpo_id),
                    dataset_id=dataset_id):
                query = heel_error_query.format(app_id=app_id,
                                                dataset_id=dataset_id,
                                                hpo_id=hpo_id)
                query_job = bq_utils.query(query)
                result = bq_utils.response2rows(query_job)
                heel_errors.extend(result)
        with open(HEEL_ERRORS_JSON, 'w') as fp:
            json.dump(heel_errors, fp, sort_keys=True, indent=4)
    parse_json_csv()