def merge(dataset_id, project_id):
    """merge hpo ehr data

    :param dataset_id: source and target dataset
    :param project_id: project in which everything happens
    :returns: list of tables generated successfully
    """
    logging.info('Starting merge')
    existing_tables = bq_utils.list_dataset_contents(dataset_id)
    hpos_to_merge = []
    hpos_with_visit = []
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        if hpo_id + '_person' in existing_tables:
            hpos_to_merge.append(hpo_id)
        if hpo_id + '_visit_occurrence' in existing_tables:
            hpos_with_visit.append(hpo_id)
    logging.info('HPOs to merge: %s' % hpos_to_merge)
    logging.info('HPOs with visit_occurrence: %s' % hpos_with_visit)

    create_mapping_table(hpos_with_visit, project_id, dataset_id)

    # before loading, [drop and] create all tables to ensure they are set up properly
    for cdm_file_name in common.CDM_FILES:
        cdm_table_name = cdm_file_name.split('.')[0]
        result_table = result_table_for(cdm_table_name)
        bq_utils.create_standard_table(cdm_table_name,
                                       result_table,
                                       drop_existing=True)

    jobs_to_wait_on = []
    for table_name in common.CDM_TABLES:
        q = construct_query(table_name, hpos_to_merge, hpos_with_visit,
                            project_id, dataset_id)
        logging.info('Merging table: ' + table_name)
        result_table = result_table_for(table_name)
        query_result = query(q,
                             destination_table_id=result_table,
                             write_disposition='WRITE_TRUNCATE')
        query_job_id = query_result['jobReference']['jobId']
        jobs_to_wait_on.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(jobs_to_wait_on)
    if len(incomplete_jobs) == 0:
        tables_created = []
        for job_id in jobs_to_wait_on:
            job_details = bq_utils.get_job_details(job_id)
            status = job_details['status']
            table = job_details['configuration']['query']['destinationTable']['tableId']
            if 'errors' in status:
                logging.error('Job ID %s errors: %s' % (job_id, status['errors']))
            else:
                tables_created.append(table)
        return tables_created
    else:
        message = "Merge failed because job id(s) %s did not complete." % incomplete_jobs
        logging.error(message)
        raise RuntimeError(message)
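A minimal sketch of how merge might be invoked as a one-off script. The environment variable names and the __main__ guard are illustrative assumptions, not part of this module.

# Hypothetical driver for the merge() defined above; the environment variable
# names below are assumptions for illustration only.
import logging
import os

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    dataset_id = os.environ['BIGQUERY_DATASET_ID']  # assumed variable name
    project_id = os.environ['APPLICATION_ID']       # assumed variable name
    tables_created = merge(dataset_id, project_id)
    logging.info('Tables created by merge: %s' % tables_created)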
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = self.dataset_id
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=dataset_id, table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources.csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    sc_bucket = self.client.get_bucket(gcs_utils.get_hpo_bucket(hpo_id))
    bucket_blob = sc_bucket.blob('observation.csv')
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        bucket_blob.upload_from_file(fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))
    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertCountEqual(actual_result, expected_observation_ids)
def create_mapping_table(hpos_with_visit, project_id, dataset_id):
    """ creates the visit mapping table

    :param hpos_with_visit: hpos whose visit_occurrence tables should be included
    :param project_id: project with the dataset
    :param dataset_id: dataset with the tables
    :raises RuntimeError: if the visit mapping table fails to load
    """
    # build one query block per hpo with a visit table; their union populates
    # the visit id mapping table
    visit_hpo_queries = []
    for hpo in hpos_with_visit:
        visit_hpo_queries.append(VISIT_ID_HPO_BLOCK % locals())
    union_all_blocks = '\n UNION ALL'.join(visit_hpo_queries)
    visit_mapping_query = VIST_ID_MAPPING_QUERY_SKELETON % locals()
    logging.info('Loading ' + VISIT_ID_MAPPING_TABLE)
    query_result = query(visit_mapping_query,
                         destination_table_id=VISIT_ID_MAPPING_TABLE,
                         write_disposition='WRITE_TRUNCATE')
    visit_mapping_query_job_id = query_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([visit_mapping_query_job_id])
    if len(incomplete_jobs) == 0:
        query_result = bq_utils.get_job_details(visit_mapping_query_job_id)
        if 'errors' in query_result['status']:
            errors = query_result['status']['errors']
            message = 'Failed to load %s due to the following error(s): %s' % (
                VISIT_ID_MAPPING_TABLE, errors)
            logging.error(message)
            raise RuntimeError(message)
    else:
        message = 'Failed to load %s. Job with ID %s never completed.' % (
            VISIT_ID_MAPPING_TABLE, visit_mapping_query_job_id)
        logging.error(message)
        raise RuntimeError(message)
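The mapping query is assembled by filling a per-HPO SELECT block template for each site, joining the blocks with UNION ALL, and dropping the result into a skeleton query via `% locals()`. A self-contained illustration of that string-building pattern, using simplified stand-in templates (the real VISIT_ID_HPO_BLOCK and VIST_ID_MAPPING_QUERY_SKELETON are defined elsewhere in this module):

# Simplified stand-in templates, for illustration only.
EXAMPLE_HPO_BLOCK = ('SELECT visit_occurrence_id, "%(hpo)s" AS hpo_id'
                     ' FROM %(hpo)s_visit_occurrence')
EXAMPLE_SKELETON = 'SELECT * FROM (%(union_all_blocks)s)'


def build_example_mapping_query(hpos_with_visit):
    # one block per hpo, then union them and wrap in the skeleton
    visit_hpo_queries = []
    for hpo in hpos_with_visit:
        visit_hpo_queries.append(EXAMPLE_HPO_BLOCK % locals())
    union_all_blocks = '\n UNION ALL '.join(visit_hpo_queries)
    return EXAMPLE_SKELETON % locals()


print(build_example_mapping_query(['chs', 'pitt']))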
def test_load_ehr_observation(self):
    observation_query = "SELECT observation_id FROM {}.{} ORDER BY observation_id"
    hpo_id = 'pitt'
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                'observation.csv', fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' +
                      str(load_job_errors))
    query_string = observation_query.format(bq_utils.get_dataset_id(),
                                            'pitt_observation')
    query_results_response = bq_utils.query_table(query_string)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertListEqual(actual_result, expected_observation_ids)
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    """
    Attempts to load a csv file into BigQuery

    :param file_name: name of the file to validate
    :param found_file_names: files found in the submission folder
    :param hpo_id: identifies the hpo site
    :param folder_prefix: directory containing the submission
    :param bucket: bucket containing the submission
    :return: tuple (results, errors) where
        results is a list of tuples (file_name, found, parsed, loaded)
        errors is a list of tuples (file_name, message)
    """
    errors = []
    results = []
    logging.info(f"Validating file '{file_name}'")
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]
    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name, folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        if not incomplete_jobs:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    f"Issues found in gs://{bucket}/{folder_prefix}/{file_name}")
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message = (
                f"Loading hpo_id '{hpo_id}' table '{table_name}' failed because "
                f"job id '{load_job_id}' did not complete.\n")
            message += f"Aborting processing 'gs://{bucket}/{folder_prefix}'."
            logging.error(message)
            raise InternalValidationError(message)
    if file_name in common.SUBMISSION_FILES:
        results.append((file_name, found, parsed, loaded))
    return results, errors
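A minimal sketch of how a caller might aggregate the (results, errors) pairs across an entire submission; the wrapper name and the list of files passed in are assumptions for illustration.

# Hypothetical aggregation loop around perform_validation_on_file(); only the
# shape of the returned tuples comes from the function above.
def validate_submission_files(file_names, found_file_names, hpo_id,
                              folder_prefix, bucket):
    all_results = []
    all_errors = []
    for file_name in file_names:
        results, errors = perform_validation_on_file(file_name,
                                                     found_file_names, hpo_id,
                                                     folder_prefix, bucket)
        all_results.extend(results)  # (file_name, found, parsed, loaded)
        all_errors.extend(errors)    # (file_name, message)
    return all_results, all_errors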
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    errors = []
    results = []
    logging.info('Validating file `{file_name}`'.format(file_name=file_name))
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]
    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name, folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        if len(incomplete_jobs) == 0:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    'Issues found in gs://{bucket}/{folder_prefix}/{file_name}'.format(
                        bucket=bucket,
                        folder_prefix=folder_prefix,
                        file_name=file_name))
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
            message = message_fmt % (hpo_id, table_name, load_job_id)
            message += ' Aborting processing `gs://%s/%s`.' % (bucket, folder_prefix)
            logging.error(message)
            raise InternalValidationError(message)
    if file_name in common.REQUIRED_FILES or found:
        results.append((file_name, found, parsed, loaded))
    return results, errors
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it
        has already been processed before
    :raises BucketDoesNotExistError: Raised when a configured bucket does not exist
    :raises InternalValidationError: Raised when an internal error is encountered during validation
    """
    logging.info('Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items, force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        folder_items = [
            item['name'].split('/')[1]
            for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST or is_pii(item)
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []
        found_cdm_file_names = found_cdm_files

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for cdm_file_name in common.CDM_FILES:
            cdm_table_name = cdm_file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, cdm_table_name)
            bq_utils.create_standard_table(cdm_table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            logging.info('Validating file `{file_name}`'.format(
                file_name=cdm_file_name))
            found = parsed = loaded = 0
            cdm_table_name = cdm_file_name.split('.')[0]
            if cdm_file_name in found_cdm_file_names:
                found = 1
                load_results = bq_utils.load_cdm_csv(hpo_id, cdm_table_name,
                                                     folder_prefix)
                load_job_id = load_results['jobReference']['jobId']
                incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
                if len(incomplete_jobs) == 0:
                    job_resource = bq_utils.get_job_details(job_id=load_job_id)
                    job_status = job_resource['status']
                    if 'errorResult' in job_status:
                        # These are issues (which we report back) as opposed to internal errors
                        issues = [
                            item['message'] for item in job_status['errors']
                        ]
                        errors.append((cdm_file_name, ' || '.join(issues)))
                        logging.info(
                            'Issues found in gs://{bucket}/{folder_prefix}/{cdm_file_name}'.format(
                                bucket=bucket,
                                folder_prefix=folder_prefix,
                                cdm_file_name=cdm_file_name))
                        for issue in issues:
                            logging.info(issue)
                    else:
                        # Processed ok
                        parsed = loaded = 1
                else:
                    # Incomplete jobs are internal unrecoverable errors.
                    # Aborting the process allows for this submission to be validated when system recovers.
                    message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
                    message = message_fmt % (hpo_id, cdm_table_name, load_job_id)
                    message += ' Aborting processing `gs://%s/%s`.' % (bucket, folder_prefix)
                    logging.error(message)
                    raise InternalValidationError(message)
            if cdm_file_name in common.REQUIRED_FILES or found:
                results.append((cdm_file_name, found, parsed, loaded))

        # (filename, message) for each unknown file
        warnings = [(unknown_file, UNKNOWN_FILE) for unknown_file in unknown_files]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_warnings_in_gcs(bucket, folder_prefix + WARNINGS_CSV, warnings)
        _save_errors_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)
            logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                         (bucket, folder_prefix))
            _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket, folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
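run_validation handles one site at a time; a minimal driver that loops over every configured HPO id might look like the sketch below. The function name and the skip-and-continue policy on a missing bucket are assumptions, not something this module prescribes; the hpo_csv source of ids mirrors its use in merge above.

# Hypothetical driver over all sites; BucketDoesNotExistError is the exception
# named in the run_validation docstring.
def validate_all_hpos():
    for item in resources.hpo_csv():
        hpo_id = item['hpo_id']
        try:
            run_validation(hpo_id)
        except BucketDoesNotExistError as e:
            # assumed policy: log the missing bucket and continue with the next site
            logging.warning('Skipping hpo_id %s: %s' % (hpo_id, e))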