def write_results_to_drc_bucket(project, validation_dataset=None):
    """
    Write the results of participant matching to the drc bucket.

    :param project: a string representing the project name
    :param validation_dataset: the identifier for the match values
        destination dataset
    :return: None
    :raises: RuntimeError if validation_dataset is not defined.
    """
    LOGGER.info('Writing to the DRC bucket')
    if validation_dataset is None:
        LOGGER.error('Validation_dataset name is not defined.')
        raise RuntimeError('validation_dataset name cannot be None.')

    date_string = _get_date_string(validation_dataset)
    hpo_sites = readers.get_hpo_site_names()

    # generate aggregate site report covering every hpo site
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset,
                            consts.REPORT_DIRECTORY.format(date=date_string),
                            consts.REPORT_TITLE)
    _, errors = writers.create_site_validation_report(project,
                                                      validation_dataset,
                                                      hpo_sites,
                                                      bucket,
                                                      filename)

    if errors > 0:
        # lazy %-style logging args (formatted only if the record is emitted),
        # consistent with write_results_to_site_buckets
        LOGGER.error("Encountered %d read errors when writing drc report",
                     errors)
def write_results_to_site_buckets(project, validation_dataset=None):
    """
    Write the results of participant matching to each site's bucket.

    :param project: a string representing the project name
    :param validation_dataset: the identifier for the match values
        destination dataset
    :return: None
    :raises: RuntimeError if validation_dataset is not defined.
    """
    LOGGER.info('Writing to site buckets')

    if validation_dataset is None:
        LOGGER.error('Validation_dataset name is not defined.')
        raise RuntimeError('validation_dataset name cannot be None.')

    date_string = _get_date_string(validation_dataset)

    # the report path is identical for every site, so build it once
    report_path = os.path.join(
        consts.REPORT_DIRECTORY.format(date=date_string),
        consts.REPORT_TITLE
    )

    # generate one validation report per hpo site, in that site's own bucket
    for hpo_site in readers.get_hpo_site_names():
        site_bucket = gcs_utils.get_hpo_bucket(hpo_site)
        _, error_count = writers.create_site_validation_report(
            project, validation_dataset, [hpo_site], site_bucket, report_path
        )

        if error_count > 0:
            LOGGER.error("Encountered %d read errors when writing %s site report",
                         error_count, hpo_site)
def test_create_site_validation_report_with_errors(
        self, mock_report_file, mock_query, mock_upload):
    """
    Verify a report is still generated and uploaded when the site query
    fails with an access-token refresh error: the report file should
    contain the header plus an 'unable to report' line for the site.
    """
    # preconditions
    mock_query.side_effect = oauth2client.client.HttpAccessTokenRefreshError()
    bucket = 'abc'
    filename = 'output.csv'

    # test
    writer.create_site_validation_report(
        self.project, self.dataset, [self.site], bucket, filename
    )

    # post conditions
    self.assertEqual(mock_report_file.call_count, 1)
    self.assertEqual(mock_query.call_count, len([self.site]))
    self.assertEqual(mock_upload.call_count, 1)

    expected_query = consts.VALIDATION_RESULTS_VALUES.format(
        project=self.project,
        dataset=self.dataset,
        table=self.site + consts.VALIDATION_TABLE_SUFFIX,
    )
    # assert_called_with raises AssertionError itself on mismatch; wrapping
    # it in assertEqual(..., None) only obscured that, so call it directly
    mock_query.assert_called_with(expected_query, batch=True)
    mock_upload.assert_called_with(bucket, filename, ANY)

    expected_report_calls = [
        call(),
        call().write('person_id,first_name,last_name,birth_date,sex,address,phone_number,email,algorithm\n'),
        call().write("Unable to report id validation match records for site:\t{}.\n".format(self.site)),
        call().seek(0),
        call().close()
    ]
    self.assertEqual(mock_report_file.mock_calls, expected_report_calls)
def test_create_site_validation_report(
        self, mock_report_file, mock_query, mock_response, mock_upload):
    """
    Verify the happy path: query results for one site are rendered into
    csv rows (one per person_id) and the report file is uploaded.
    """
    # preconditions
    bucket = 'abc'
    filename = 'output.csv'
    # two result rows: a full match, and a partial match with a missing sex
    mock_response.return_value = [
        {
            consts.ADDRESS_ONE_FIELD: consts.MATCH,
            consts.ADDRESS_TWO_FIELD: consts.MATCH,
            consts.CITY_FIELD: consts.MATCH,
            consts.STATE_FIELD: consts.MATCH,
            consts.ZIP_CODE_FIELD: consts.MATCH,
            consts.PERSON_ID_FIELD: 1,
            consts.FIRST_NAME_FIELD: consts.MATCH,
            consts.LAST_NAME_FIELD: consts.MATCH,
            consts.MIDDLE_NAME_FIELD: consts.MATCH,
            consts.BIRTH_DATE_FIELD: consts.MATCH,
            consts.PHONE_NUMBER_FIELD: consts.MATCH,
            consts.EMAIL_FIELD: consts.MATCH,
            consts.ALGORITHM_FIELD: consts.MATCH,
            consts.SEX_FIELD: consts.MATCH,
        },
        {
            consts.ADDRESS_ONE_FIELD: consts.MATCH,
            consts.ADDRESS_TWO_FIELD: consts.MATCH,
            consts.CITY_FIELD: consts.MATCH,
            consts.STATE_FIELD: consts.MATCH,
            consts.ZIP_CODE_FIELD: consts.MISMATCH,
            consts.PERSON_ID_FIELD: 2,
            consts.FIRST_NAME_FIELD: consts.MATCH,
            consts.LAST_NAME_FIELD: consts.MATCH,
            consts.MIDDLE_NAME_FIELD: consts.MATCH,
            consts.BIRTH_DATE_FIELD: consts.MISMATCH,
            consts.PHONE_NUMBER_FIELD: consts.MATCH,
            consts.EMAIL_FIELD: consts.MATCH,
            consts.ALGORITHM_FIELD: consts.MATCH,
            consts.SEX_FIELD: consts.MISSING,
        },
    ]

    # test
    writer.create_site_validation_report(
        self.project, self.dataset, [self.site], bucket, filename
    )

    # post conditions
    self.assertEqual(mock_report_file.call_count, 1)
    self.assertEqual(mock_query.call_count, len([self.site]))
    self.assertEqual(mock_response.call_count, len([self.site]))
    self.assertEqual(mock_upload.call_count, 1)

    expected_query = consts.VALIDATION_RESULTS_VALUES.format(
        project=self.project,
        dataset=self.dataset,
        table=self.site + consts.VALIDATION_TABLE_SUFFIX,
    )
    # assert_called_with raises AssertionError itself on mismatch; wrapping
    # it in assertEqual(..., None) only obscured that, so call it directly
    mock_query.assert_called_with(expected_query, batch=True)
    mock_upload.assert_called_with(bucket, filename, ANY)

    expected_report_calls = [
        call(),
        call().write('person_id,first_name,last_name,birth_date,sex,address,phone_number,email,algorithm\n'),
        call().write('1,match,match,match,match,match,match,match,match\n'),
        call().write('2,match,match,no_match,missing,no_match,match,match,match\n'),
        call().seek(0),
        call().close()
    ]
    self.assertEqual(mock_report_file.mock_calls, expected_report_calls)
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    Creates a validation dataset, one result table per hpo site, runs each
    field comparison for every site, consolidates the per-field results into
    one record per participant, and writes per-site and aggregate reports.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset: the dataset containing the pii information for
        comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    # append the date suffix unless the last 8 characters already match the
    # DRC date pattern
    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='',
            rdr_dataset=rdr_dataset,
            ehr_dataset=ehr_dataset
        ),
        overwrite_existing=True)

    # pull the actual dataset id out of the creation response; falls back to
    # '' if the response is missing the expected keys
    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    # drop and recreate one empty validation result table per site
    for site_name in hpo_sites:
        bq_utils.create_table(
            site_name + consts.VALIDATION_TABLE_SUFFIX,
            field_list,
            drop_existing=True,
            dataset_id=validation_dataset
        )

    # NOTE(review): results is never populated below, so the function always
    # returns an empty dict despite the docstring -- confirm whether callers
    # use the return value
    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_FIRST,
            consts.FIRST_NAME_FIELD
        )
        # write first name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.FIRST_NAME_FIELD
        )

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_LAST,
            consts.LAST_NAME_FIELD
        )
        # write last name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.LAST_NAME_FIELD
        )

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_MIDDLE,
            consts.MIDDLE_NAME_FIELD
        )
        # write middle name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.MIDDLE_NAME_FIELD
        )

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ZIP,
            consts.ZIP_CODE_FIELD
        )
        # write zip code matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.ZIP_CODE_FIELD
        )

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_CITY,
            consts.CITY_FIELD
        )
        # write city matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.CITY_FIELD
        )

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_STATE,
            consts.STATE_FIELD
        )
        # write state matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.STATE_FIELD
        )

    # validate street addresses -- this comparison yields two match sets,
    # one for each address line
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO,
            consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD
        )
        # write street address matches for hpo to table
        writers.append_to_result_table(
            site,
            address_one_matches,
            project,
            validation_dataset,
            consts.ADDRESS_ONE_FIELD
        )
        writers.append_to_result_table(
            site,
            address_two_matches,
            project,
            validation_dataset,
            consts.ADDRESS_TWO_FIELD
        )

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_EMAIL_ADDRESS,
            consts.EMAIL_FIELD
        )
        # write email matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.EMAIL_FIELD
        )

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_PHONE,
            consts.PHONE_NUMBER_FIELD
        )
        # write phone number matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.PHONE_NUMBER_FIELD
        )

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_SEX
        )
        # write gender match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.SEX_FIELD
        )

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_BIRTH_DATETIME
        )
        # write birthday match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.BIRTH_DATE_FIELD
        )

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset, site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset, site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE
        )
        writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, filename
        )

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    # NOTE(review): unlike write_results_to_drc_bucket, this path omits the
    # dated REPORT_DIRECTORY component -- confirm which location is intended
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(
        project, validation_dataset, hpo_sites, bucket, filename
    )

    return results