def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      table_name + '.csv')
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)

def get_hpo_missing_pii_query(hpo_id):
    """
    Query to retrieve counts of records missing PII matches for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    person_table_id = bq_utils.get_table_id(hpo_id, common.PERSON)
    pii_name_table_id = bq_utils.get_table_id(hpo_id, common.PII_NAME)
    pii_wildcard = bq_utils.get_table_id(hpo_id, common.PII_WILDCARD)
    participant_match_table_id = bq_utils.get_table_id(
        hpo_id, common.PARTICIPANT_MATCH)
    rdr_dataset_id = bq_utils.get_rdr_dataset_id()
    rdr_date = extract_date_from_rdr_dataset_id(rdr_dataset_id)
    ehr_no_rdr_with_date = consts.EHR_NO_RDR.format(date=rdr_date)
    rdr_person_table_id = common.PERSON
    return render_query(
        consts.MISSING_PII_QUERY,
        person_table_id=person_table_id,
        rdr_dataset_id=rdr_dataset_id,
        rdr_person_table_id=rdr_person_table_id,
        ehr_no_pii=consts.EHR_NO_PII,
        ehr_no_rdr=ehr_no_rdr_with_date,
        pii_no_ehr=consts.PII_NO_EHR,
        ehr_no_participant_match=consts.EHR_NO_PARTICIPANT_MATCH,
        pii_name_table_id=pii_name_table_id,
        pii_wildcard=pii_wildcard,
        participant_match_table_id=participant_match_table_id)

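# A minimal usage sketch of the query builder above; 'nyc' is a hypothetical
# site ID, and bq_utils.query / bq_utils.response2rows are the helpers used
# by the tests elsewhere in this section:
missing_pii_query = get_hpo_missing_pii_query('nyc')
missing_pii_rows = bq_utils.response2rows(bq_utils.query(missing_pii_query))
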
def test_is_hpo_col(self):
    col = dict()
    hpo_id = 'bogus_hpo'
    hpo_id_fake = 'bogus_hpo_fake'
    table_name = 'condition_occurrence'
    col[consts.TABLE_NAME] = bq_utils.get_table_id(hpo_id, table_name)
    self.assertTrue(completeness.is_hpo_col(hpo_id, col))
    col[consts.TABLE_NAME] = table_name
    self.assertFalse(completeness.is_hpo_col(hpo_id, col))
    col[consts.TABLE_NAME] = bq_utils.get_table_id(hpo_id_fake, table_name)
    self.assertFalse(completeness.is_hpo_col(hpo_id, col))

def _load_data(self):
    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.dataset_id)

    # Create empty concept and concept_ancestor tables if they do not exist
    if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)

    # We need to load measurement.csv into bigquery_dataset_id in advance
    # for the other integration tests
    ehr_measurement_result = bq_utils.load_table_from_csv(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
        csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
        fields=resources.fields_for(common.MEASUREMENT))
    bq_utils.wait_on_jobs([ehr_measurement_result['jobReference']['jobId']])

def _union_subqueries(table_name, hpo_ids, input_dataset_id,
                      output_dataset_id):
    """
    Get list of subqueries (one for each HPO table found in the source)
    that comprise the load query

    :param table_name: name of a CDM table to load
    :param hpo_ids: list of HPOs to process
    :param input_dataset_id: identifies the source dataset
    :param output_dataset_id: identifies the output dataset
    :return: list of subqueries
    """
    result = []
    # Exclude subqueries that reference tables that are missing from the
    # source dataset
    all_table_ids = bq_utils.list_all_table_ids(input_dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id in all_table_ids:
            if table_name == common.FACT_RELATIONSHIP:
                subquery = fact_relationship_hpo_subquery(
                    hpo_id, input_dataset_id, output_dataset_id)
                result.append(subquery)
            else:
                subquery = table_hpo_subquery(table_name, hpo_id,
                                              input_dataset_id,
                                              output_dataset_id)
                result.append(subquery)
        else:
            logging.info(
                f'Excluding table {table_id} from mapping query because '
                f'it does not exist')
    return result

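# The caller of _union_subqueries is not shown here; a minimal sketch of how
# its output would typically be combined into a single load statement (the
# UNION ALL framing is an assumption, not confirmed by this section):
subqueries = _union_subqueries(common.PERSON, ['nyc', 'pitt'],
                               'input_dataset', 'output_dataset')
union_query = '\nUNION ALL\n'.join(subqueries)
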
def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_TABLE, dataset_id):
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)

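# Usage sketch, mirroring how the integration test later in this section
# consumes the query ('nyc' is a hypothetical site ID):
summary_rows = bq_utils.response2rows(
    bq_utils.query(get_lab_concept_summary_query('nyc')))
missing_labs = [
    row for row in summary_rows
    if row['measurement_concept_id_exists'] == 0
]
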
def render(sql, hpo_id, results_schema, vocab_schema=''):
    """
    Replace schema placeholders in an Achilles SQL statement

    :param sql: SQL statement containing schema placeholders
    :param hpo_id: identifies the HPO site; its table prefix is appended
        to the results schema
    :param results_schema: dataset containing the results tables
    :param vocab_schema: dataset containing the vocabulary tables, if any
    :return: the rendered SQL
    """
    table_id = bq_utils.get_table_id(hpo_id, '')
    vocab_replacement = vocab_schema + '.' if vocab_schema else ''
    sql = sql.replace(RESULTS_SCHEMA_PLACEHOLDER,
                      results_schema + '.' + table_id)
    sql = sql.replace(VOCAB_SCHEMA_PLACEHOLDER, vocab_replacement)
    return sql

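# Usage sketch: assuming RESULTS_SCHEMA_PLACEHOLDER is a literal token
# embedded in the Achilles SQL templates (its value is defined elsewhere)
# and bq_utils.get_table_id(hpo_id, '') yields a bare 'nyc_' prefix, the
# rendered SQL would reference 'my_dataset.nyc_achilles_results':
template = f'SELECT * FROM {RESULTS_SCHEMA_PLACEHOLDER}achilles_results'
rendered = render(template, hpo_id='nyc', results_schema='my_dataset',
                  vocab_schema='vocab_dataset')
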
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = self.dataset_id
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = ('SELECT observation_id FROM {dataset_id}.{table_id} '
         'ORDER BY observation_id').format(dataset_id=dataset_id,
                                           table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]

    sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
    bucket_blob = sc_bucket.blob('observation.csv')
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        bucket_blob.upload_from_file(fp)

    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))

    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertCountEqual(actual_result, expected_observation_ids)

def test_validate_five_persons_success(self, mock_check_cron):
    expected_results = []
    test_file_names = [
        os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
    ]

    for cdm_file in common.SUBMISSION_FILES:
        if cdm_file in test_file_names:
            expected_result = (cdm_file, 1, 1, 1)
            test_file = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_file)
            test_util.write_cloud_file(self.hpo_bucket,
                                       test_file,
                                       prefix=self.folder_prefix)
        else:
            expected_result = (cdm_file, 0, 0, 0)
        expected_results.append(expected_result)
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(r['results']), set(expected_results))

    # check tables exist and are clustered as expected
    for table in resources.CDM_TABLES + common.PII_TABLES:
        fields_file = os.path.join(resources.fields_path, table + '.json')
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)

def test_validate_five_persons_success(self, mock_check_cron):
    expected_results: list = []
    test_file_names: list = [
        os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
    ]

    for cdm_filename in common.SUBMISSION_FILES:
        if cdm_filename in test_file_names:
            expected_result: tuple = (cdm_filename, 1, 1, 1)
            test_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                              cdm_filename)
            test_blob = self.storage_bucket.blob(
                f'{self.folder_prefix}{cdm_filename}')
            test_blob.upload_from_filename(test_filepath)
        else:
            expected_result: tuple = (cdm_filename, 0, 0, 0)
        expected_results.append(expected_result)
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(r['results']), set(expected_results))

    # check tables exist and are clustered as expected
    for table in resources.CDM_TABLES + common.PII_TABLES:
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        fields = resources.fields_for(table)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self.table_has_clustering(table_info)

def load_test_data(self, hpo_id: str = None):
    """
    Load to bq test achilles heel results data from csv file

    :param hpo_id: if specified, prefix to use on csv test file and bq table,
        otherwise no prefix is used
    :return: contents of the file as list of objects
    """
    table_name: str = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id: str = bq_utils.get_table_id(hpo_id, table_name)
    else:
        table_id: str = table_name
    test_file_name: str = f'{table_id}.csv'
    test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                       test_file_name)

    target_bucket = self.storage_client.get_bucket(self.bucket)
    test_blob = target_bucket.blob(test_file_name)
    test_blob.upload_from_filename(test_file_path)

    gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
    load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    bq_utils.wait_on_jobs([job_id])
    return resources.csv_to_list(test_file_path)

def _mapping_subqueries(table_name, hpo_ids, dataset_id, project_id):
    """
    Get list of subqueries (one for each HPO table found in the source)
    that comprise the ID mapping query

    :param table_name: name of a CDM table whose ID field must be remapped
    :param hpo_ids: list of HPOs to process
    :param dataset_id: identifies the source dataset
    :param project_id: identifies the GCP project
    :return: list of subqueries
    """
    result = []
    hpo_unique_identifiers = get_hpo_offsets(hpo_ids)

    # Exclude subqueries that reference tables that are missing from the
    # source dataset
    all_table_ids = bq_utils.list_all_table_ids(dataset_id)
    for hpo_id in hpo_ids:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        if table_id in all_table_ids:
            subquery = f'''
            (SELECT '{table_id}' AS src_table_id,
              {table_name}_id AS src_{table_name}_id,
              {table_name}_id + {hpo_unique_identifiers[hpo_id]} AS {table_name}_id
              FROM `{project_id}.{dataset_id}.{table_id}`)
            '''
            result.append(subquery)
        else:
            logging.info(
                f'Excluding table {table_id} from mapping query because '
                f'it does not exist')
    return result

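# get_hpo_offsets is defined elsewhere; a minimal sketch of the offset scheme
# the subquery above relies on (each HPO gets a disjoint constant added to
# its IDs so remapped IDs stay unique across sites; the spacing constant and
# ordering here are assumptions, not the module's actual values):
def get_hpo_offsets_sketch(hpo_ids, id_spacing=10 ** 15):
    # one contiguous, non-overlapping ID block per HPO
    return {
        hpo_id: (index + 1) * id_spacing
        for index, hpo_id in enumerate(hpo_ids)
    }
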
def test_queries_to_retract_from_ehr_dataset(self):
    hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
    hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

    # hpo tables
    existing_table_ids = [hpo_person, hpo_death]
    for table in self.tables_to_retract_unioned:
        table_id = bq_utils.get_table_id(self.hpo_id, table)
        existing_table_ids.append(table_id)

    # unioned tables
    ignored_tables = []
    for cdm_table in resources.CDM_TABLES:
        unioned_table_id = rbq.UNIONED_EHR + cdm_table
        existing_table_ids.append(unioned_table_id)
        if cdm_table not in self.tables_to_retract_unioned:
            ignored_tables.append(unioned_table_id)

    # mock existing tables
    mock_table_ids = []
    for table_id in existing_table_ids:
        mock_table_id = MagicMock()
        mock_table_id.table_id = table_id
        mock_table_ids.append(mock_table_id)
    self.client.list_tables.return_value = mock_table_ids

    person_id_query = rbq.JINJA_ENV.from_string(rbq.PERSON_ID_QUERY).render(
        person_research_id=rbq.PERSON_ID,
        pid_project=self.project_id,
        sandbox_dataset_id=self.sandbox_dataset_id,
        pid_table_id=self.pid_table_id)
    qs = rbq.queries_to_retract_from_ehr_dataset(self.client, self.project_id,
                                                 self.ehr_dataset_id,
                                                 self.hpo_id, person_id_query)

    expected_tables = set(existing_table_ids) - set(ignored_tables)
    actual_tables = set()
    for query in qs:
        fq_table = re.search('`(.*)`', query)
        if fq_table:
            table = fq_table.group(1).split('.')[-1]
            actual_tables.add(table)
    self.assertSetEqual(expected_tables, actual_tables)

def get_heel_error_query(hpo_id):
    """
    Query to retrieve errors in Achilles Heel for an HPO site

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    table_id = bq_utils.get_table_id(hpo_id,
                                     consts.ACHILLES_HEEL_RESULTS_TABLE)
    return render_query(consts.HEEL_ERROR_QUERY_VALIDATION,
                        table_id=table_id)

def get_drug_class_counts_query(hpo_id):
    """
    Query to retrieve counts of drug classes in an HPO site's drug_exposure
    table

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    table_id = bq_utils.get_table_id(hpo_id, consts.DRUG_CHECK_TABLE)
    return render_query(consts.DRUG_CHECKS_QUERY_VALIDATION,
                        table_id=table_id)

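# Usage sketch for the two query builders above; render_query is assumed to
# substitute the keyword arguments into the named query template, and 'nyc'
# is a hypothetical site ID:
heel_rows = bq_utils.response2rows(
    bq_utils.query(get_heel_error_query('nyc')))
drug_rows = bq_utils.response2rows(
    bq_utils.query(get_drug_class_counts_query('nyc')))
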
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset):
    mock_list_datasets.return_value = [{
        'id': self.project_id + ':' + self.bq_dataset_id
    }]
    mock_is_ehr_dataset.return_value = True

    job_ids = []
    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pids=retract_data_bq.int_list_to_bq(self.person_ids))
        retract_data_bq.logger.debug('Preparing to load table %s.%s' %
                                     (self.bq_dataset_id, hpo_table))
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        retract_data_bq.logger.debug('Loading table %s.%s' %
                                     (self.bq_dataset_id, hpo_table))
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    retract_data_bq.logger.debug('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        result = bq_utils.query(row_count_queries[table])
        expected_row_count[table] = retract_data_bq.to_int(
            result['totalRows'])

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_before_retraction = {}
    for row in result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    deid_flag = False
    # perform retraction
    retract_data_bq.run_retraction(self.test_project_id, self.person_ids,
                                   self.hpo_id, deid_flag)

    # find actual deleted rows
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_after_retraction = {}
    for row in result:
        row_count_after_retraction[row['table_id']] = row['row_count']
    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])

def test_validate_five_persons_success(self, mock_check_cron):
    prefix = 'dummy-prefix-2018-03-22/'
    expected_result_items = resources._csv_to_list(
        test_util.FIVE_PERSONS_SUCCESS_RESULT_CSV)
    json_export_files = self.get_json_export_files(test_util.FAKE_HPO_ID)

    # upload all five_persons files
    for cdm_file in test_util.FIVE_PERSONS_FILES:
        test_util.write_cloud_file(self.hpo_bucket, cdm_file, prefix=prefix)
    expected_tables = [
        'person', 'visit_occurrence', 'condition_occurrence',
        'procedure_occurrence', 'drug_exposure', 'measurement'
    ]
    cdm_files = [table + '.csv' for table in expected_tables]

    main.app.testing = True
    with main.app.test_client() as c:
        c.get(test_util.VALIDATE_HPO_FILES_URL)

        # check the result file was put in bucket
        expected_object_names = (cdm_files + common.IGNORE_LIST +
                                 json_export_files)
        expected_objects = [prefix + item for item in expected_object_names]
        list_bucket_result = gcs_utils.list_bucket(self.hpo_bucket)
        actual_objects = [item['name'] for item in list_bucket_result]
        self.assertSetEqual(set(expected_objects), set(actual_objects))

        # result says file found, parsed, loaded
        actual_result = test_util.read_cloud_file(
            self.hpo_bucket, prefix + common.RESULT_CSV)
        actual_result_file = StringIO.StringIO(actual_result)
        actual_result_items = resources._csv_file_to_list(actual_result_file)
        expected_result_items.sort()
        actual_result_items.sort()
        self.assertListEqual(expected_result_items, actual_result_items)
        self.assertTrue(
            main.all_required_files_loaded(test_util.FAKE_HPO_ID,
                                           folder_prefix=prefix))

    # check tables exist and are clustered as expected
    for table in expected_tables:
        fields_file = os.path.join(resources.fields_path, table + '.json')
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        with open(fields_file, 'r') as fp:
            fields = json.load(fp)
            field_names = [field['name'] for field in fields]
            if 'person_id' in field_names:
                self.table_has_clustering(table_info)

def create_tables(hpo_id, drop_existing=False):
    """
    Create the achilles related tables

    :param hpo_id: associated hpo id
    :param drop_existing: if True, drop existing tables
    :return:
    """
    for table_name in ACHILLES_HEEL_TABLES:
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name, table_id, drop_existing)

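# Usage sketch: recreate the Achilles Heel tables for a site before a fresh
# run ('nyc' is a hypothetical site ID):
create_tables('nyc', drop_existing=True)
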
def exist_participant_match(ehr_dataset_id, hpo_id):
    """
    This function checks if the hpo has submitted the participant_match data

    :param ehr_dataset_id: identifies the EHR dataset
    :param hpo_id: identifies the HPO site
    :return: True if the site's participant_match table exists, False
        otherwise
    """
    return bq_utils.table_exists(
        bq_utils.get_table_id(hpo_id, PARTICIPANT_MATCH), ehr_dataset_id)

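# Usage sketch: callers can skip participant-match validation for sites that
# have not submitted the table ('nyc' is a hypothetical site ID):
if not exist_participant_match(bq_utils.get_dataset_id(), 'nyc'):
    logging.info('nyc has not submitted participant_match, skipping check')
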
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
    self.dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(self.dataset_id)
    self.project_id = app_identity.get_application_id()
    self.TEST_FIELDS = [
        {
            "type": "integer",
            "name": "integer_field",
            "mode": "required",
            "description": "An integer field"
        },
        # DC-586 Import RDR rules should support null fields
        {
            "type": "integer",
            "name": "nullable_integer_field",
            "mode": "nullable",
            "description": "A nullable integer field"
        },
        {
            "type": "string",
            "name": "string_field",
            "mode": "required",
            "description": "A string field"
        },
        {
            "type": "date",
            "name": "date_field",
            "mode": "required",
            "description": "A date field"
        },
        {
            "type": "timestamp",
            "name": "timestamp_field",
            "mode": "required",
            "description": "A timestamp field"
        },
        {
            "type": "boolean",
            "name": "boolean_field",
            "mode": "required",
            "description": "A boolean field"
        },
        {
            "type": "float",
            "name": "float_field",
            "mode": "required",
            "description": "A float field"
        }
    ]
    self.DT_FORMAT = '%Y-%m-%d %H:%M:%S'
    self.client = StorageClient(self.project_id)
    self.client.empty_bucket(self.hpo_bucket)

def get_hpo_subqueries(app_id, dataset_id, all_table_ids):
    result = []
    for hpo_id in get_hpo_ids():
        table_id = bq_utils.get_table_id(hpo_id,
                                         common.ACHILLES_HEEL_RESULTS)
        if table_id in all_table_ids:
            subquery = QUERY_FORMAT.format(dataset_name=hpo_id,
                                           app_id=app_id,
                                           dataset_id=dataset_id,
                                           table_id=table_id)
            result.append(subquery)
    return result

def test_queries_to_retract_from_ehr_dataset(self,
                                             mock_list_existing_tables):
    hpo_person = bq_utils.get_table_id(self.hpo_id, common.PERSON)
    hpo_death = bq_utils.get_table_id(self.hpo_id, common.DEATH)

    # hpo tables
    existing_table_ids = [hpo_person, hpo_death]
    for table in self.tables_to_retract_unioned:
        table_id = bq_utils.get_table_id(self.hpo_id, table)
        existing_table_ids.append(table_id)

    # unioned tables
    ignored_tables = []
    for cdm_table in resources.CDM_TABLES:
        unioned_table_id = retract_data_bq.UNIONED_EHR + cdm_table
        existing_table_ids.append(unioned_table_id)
        if cdm_table not in self.tables_to_retract_unioned:
            ignored_tables.append(unioned_table_id)

    mapped_tables = cdm.tables_to_map()
    # fact_relationship does not have pid, is handled separately from other
    # mapped tables
    for mapped_table in mapped_tables:
        mapping_table = ehr_union.mapping_table_for(mapped_table)
        existing_table_ids.append(mapping_table)
        legacy_mapping_table = retract_data_bq.UNIONED_EHR + mapping_table
        existing_table_ids.append(legacy_mapping_table)
        if mapped_table not in self.tables_to_retract_unioned:
            ignored_tables.append(mapping_table)
            ignored_tables.append(legacy_mapping_table)
    mock_list_existing_tables.return_value = existing_table_ids
    mqs, qs = retract_data_bq.queries_to_retract_from_ehr_dataset(
        self.project_id, self.ehr_dataset_id, self.project_id,
        self.sandbox_dataset_id, self.hpo_id, self.pid_table_id)
    actual_dest_tables = set(q[retract_data_bq.DEST_TABLE] for q in qs + mqs)
    expected_dest_tables = (set(existing_table_ids) - {hpo_person} -
                            set(ignored_tables))
    self.assertSetEqual(expected_dest_tables, actual_dest_tables)

def test_get_lab_concept_summary_query(self):
    summary_query = required_labs.get_lab_concept_summary_query(FAKE_HPO_ID)
    summary_response = bq_utils.query(summary_query)
    summary_rows = bq_utils.response2rows(summary_response)
    submitted_labs = [
        row for row in summary_rows
        if row['measurement_concept_id_exists'] == 1
    ]
    actual_total_labs = summary_response['totalRows']

    # Count the total number of labs required. This number should be equal
    # to the total number of rows in the results generated by
    # get_lab_concept_summary_query, including the submitted and missing labs.
    unique_ancestor_concept_query = sql_wrangle.qualify_tables(
        """SELECT DISTINCT ancestor_concept_id
           FROM `{project_id}.{dataset_id}.{table_id}`""".format(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE))
    unique_ancestor_concept_response = bq_utils.query(
        unique_ancestor_concept_query)
    expected_total_labs = unique_ancestor_concept_response['totalRows']

    # Count the number of labs in the measurement table. This number should
    # be equal to the number of labs submitted by the fake site.
    unique_measurement_concept_id_query = '''
        SELECT DISTINCT c.ancestor_concept_id
        FROM `{project_id}.{dataset_id}.{measurement_concept_sets_descendants}` AS c
        JOIN `{project_id}.{dataset_id}.{measurement}` AS m
        ON c.descendant_concept_id = m.measurement_concept_id
        '''.format(project_id=self.project_id,
                   dataset_id=self.dataset_id,
                   measurement_concept_sets_descendants=
                   MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                   measurement=bq_utils.get_table_id(FAKE_HPO_ID,
                                                     common.MEASUREMENT))
    unique_measurement_concept_id_response = bq_utils.query(
        unique_measurement_concept_id_query)
    unique_measurement_concept_id_total_labs = (
        unique_measurement_concept_id_response['totalRows'])

    self.assertEqual(int(expected_total_labs),
                     int(actual_total_labs),
                     msg='Compare the total number of labs')
    self.assertEqual(int(unique_measurement_concept_id_total_labs),
                     len(submitted_labs),
                     msg='Compare the number of labs submitted in the '
                         'measurement table')

def is_hpo_col(hpo_id, col):
    """
    True if col is on specified HPO table

    :param hpo_id: identifies the HPO
    :param col: column summary
    :return: True if col is on specified HPO table, False otherwise
    """
    hpo_tables = [
        bq_utils.get_table_id(hpo_id, table)
        for table in resources.CDM_TABLES
    ]
    return col[consts.TABLE_NAME] in hpo_tables

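# Usage sketch, mirroring the unit test for is_hpo_col elsewhere in this
# section ('nyc' is a hypothetical site ID):
col = {consts.TABLE_NAME: bq_utils.get_table_id('nyc', 'person')}
assert is_hpo_col('nyc', col)
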
def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
    test_util.delete_all_tables(self.EHR_DATASET_ID)
    self._empty_bucket()

def _filter_hpo_tables(
        tables: List[bigquery.table.TableListItem],
        hpo_id: str) -> List[bigquery.table.TableListItem]:
    """
    Given a list of tables get those associated with an HPO submission

    :param tables: list of tables to filter
    :param hpo_id: identifies the HPO
    :return: list of tables associated with the HPO
    """
    expected_tables = [
        bq_utils.get_table_id(hpo_id, table)
        for table in resources.CDM_TABLES
    ]
    return [table for table in tables if table.table_id in expected_tables]

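# Usage sketch with the google-cloud-bigquery client; `client` and
# `dataset_ref` are assumed to be configured elsewhere, and list_tables
# yields the TableListItem objects this filter expects:
hpo_tables = _filter_hpo_tables(list(client.list_tables(dataset_ref)), 'nyc')
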
def test_query_result(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)

    load_job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(PERSON))

    table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
    q = 'SELECT person_id FROM %s' % table_id
    result = bq_utils.query(q)
    self.assertEqual(5, int(result['totalRows']))

def setUp(self):
    super(BqUtilsTest, self).setUp()
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_app_identity_stub()
    self.testbed.init_memcache_stub()
    self.testbed.init_urlfetch_stub()
    self.testbed.init_blobstore_stub()
    self.testbed.init_datastore_v3_stub()
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
    self._drop_tables()
    self._empty_bucket()

def test_query_result(self):
    sc_bucket = self.client.get_bucket(self.hpo_bucket)
    bucket_blob = sc_bucket.blob('person.csv')
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        bucket_blob.upload_from_file(fp)
    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, common.PERSON)

    load_job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(common.PERSON))

    table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
    q = 'SELECT person_id FROM %s' % table_id
    result = bq_utils.query(q)
    self.assertEqual(5, int(result['totalRows']))

def validate_submission(hpo_id, bucket, bucket_items, folder_prefix):
    """
    Load submission in BigQuery and summarize outcome

    :param hpo_id: identifies the HPO site
    :param bucket: bucket containing the submission
    :param bucket_items: items in the bucket (e.g. from gcs_utils.list_bucket)
    :param folder_prefix: prefix of the submission folder within the bucket
    :return: a dict with keys results, errors, warnings
      results is a list of tuples (file_name, found, parsed, loaded)
      errors and warnings are both lists of tuples (file_name, message)
    """
    logging.info('Validating %s submission in gs://%s/%s', hpo_id, bucket,
                 folder_prefix)
    # separate cdm from the unknown (unexpected) files
    folder_items = [
        item['name'][len(folder_prefix):]
        for item in bucket_items
        if item['name'].startswith(folder_prefix)
    ]
    found_cdm_files, found_pii_files, unknown_files = categorize_folder_items(
        folder_items)

    errors = []
    results = []

    # Create all tables first to simplify downstream processes
    # (e.g. ehr_union doesn't have to check if tables exist)
    for file_name in resources.CDM_FILES + common.PII_FILES:
        table_name = file_name.split('.')[0]
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name, table_id,
                                       drop_existing=True)

    for cdm_file_name in sorted(resources.CDM_FILES):
        file_results, file_errors = perform_validation_on_file(
            cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    for pii_file_name in sorted(common.PII_FILES):
        file_results, file_errors = perform_validation_on_file(
            pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    # (filename, message) for each unknown file
    warnings = [(unknown_file, common.UNKNOWN_FILE)
                for unknown_file in unknown_files]
    return dict(results=results, errors=errors, warnings=warnings)

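# Usage sketch, following the integration tests above ('nyc' and the folder
# prefix are hypothetical):
bucket = gcs_utils.get_hpo_bucket('nyc')
bucket_items = gcs_utils.list_bucket(bucket)
report = validate_submission('nyc', bucket, bucket_items,
                             'dummy-prefix-2018-03-22/')
for file_name, found, parsed, loaded in report['results']:
    logging.info('%s found=%s parsed=%s loaded=%s', file_name, found,
                 parsed, loaded)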