# NOTE: only the standard-library and Google client imports below are
# reconstructed from usage; the project-internal names used throughout this
# module (bq_utils, bq_consts, consts, readers, writers, resources, gcs_utils,
# create_dataset, get_sandbox_dataset_id, the _compare_* helpers) are assumed
# to be provided by the host project's other modules.
import logging
import os
import re

import googleapiclient.errors
import oauth2client.client

LOGGER = logging.getLogger(__name__)


def create_empty_dataset(project_id, dataset_id, snapshot_dataset_id):
    """
    Create the empty tables in the new snapshot dataset

    :param project_id: identifies the project containing the datasets
    :param dataset_id: identifies the source dataset being snapshotted
    :param snapshot_dataset_id: identifies the new snapshot dataset
    :return:
    """
    create_dataset(
        project_id=project_id,
        dataset_id=snapshot_dataset_id,
        description='Snapshot of {dataset_id}'.format(dataset_id=dataset_id),
        overwrite_existing=True)
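# Usage sketch for create_empty_dataset (hedged: the project and dataset ids
# below are placeholders, not values from this codebase). A snapshot dataset
# is typically created before running destructive operations on the source:
#
#   create_empty_dataset(project_id='my-project',
#                        dataset_id='combined20200101',
#                        snapshot_dataset_id='combined20200101_snapshot')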
def create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function to create the sandbox dataset if it does not already exist

    :param project_id: project_id
    :param dataset_id: any dataset_id
    :return: the sandbox dataset_id
    """
    sandbox_dataset_id = get_sandbox_dataset_id(dataset_id)
    friendly_name = 'Sandbox for {dataset_id}'.format(dataset_id=dataset_id)
    description = ('Sandbox created for storing records affected by the '
                   'cleaning rules applied to {dataset_id}').format(
                       dataset_id=dataset_id)
    create_dataset(project_id=project_id,
                   dataset_id=sandbox_dataset_id,
                   friendly_name=friendly_name,
                   description=description,
                   overwrite_existing=bq_consts.FALSE)

    return sandbox_dataset_id
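# Usage sketch for create_sandbox_dataset (hedged: identifiers are
# placeholders, and the returned id follows whatever convention
# get_sandbox_dataset_id implements, e.g. a '<dataset_id>_sandbox' suffix):
#
#   sandbox_id = create_sandbox_dataset('my-project', 'combined20200101')
#   # rows affected by a cleaning rule can now be archived into sandbox_id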
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset: the dataset containing the pii information for
        comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    LOGGER.info(f"Calling match_participants with:\n"
                f"project:\t{project}\n"
                f"rdr_dataset:\t{rdr_dataset}\n"
                f"ehr_dataset:\t{ehr_dataset}\n"
                f"dest_dataset_id:\t{dest_dataset_id}\n")

    ehr_tables = bq_utils.list_dataset_contents(ehr_dataset)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info(
        f"Created new validation results dataset:\t{validation_dataset}")

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    # TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    for site in hpo_sites:
        LOGGER.info(f"Beginning identity validation for site: {site}")
        results = {}

        # validate first names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_FIRST,
                                                consts.FIRST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.FIRST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.FIRST_NAME_FIELD)
            LOGGER.info(f"Validated first names for: {site}")

        # validate last names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_LAST,
                                                consts.LAST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.LAST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.LAST_NAME_FIELD)
            LOGGER.info(f"Validated last names for: {site}")

        # validate middle names
        try:
            match_values = None
            # match_values = _compare_name_fields(
            #     project,
            #     validation_dataset,
            #     ehr_dataset,
            #     site,
            #     consts.OBS_PII_NAME_MIDDLE,
            #     consts.MIDDLE_NAME_FIELD,
            #     ehr_tables
            # )
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.MIDDLE_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            # results = _add_matches_to_results(results, match_values,
            #                                   consts.MIDDLE_NAME_FIELD)
            LOGGER.info("Not validating middle names")

        # validate zip codes
        try:
            match_values = None
            match_values = _compare_zip_codes(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD,
                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.ZIP_CODE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.ZIP_CODE_FIELD)
            LOGGER.info(f"Validated zip codes for: {site}")

        # validate city
        try:
            match_values = None
            match_values = _compare_cities(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_CITY,
                                           consts.CITY_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.CITY_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.CITY_FIELD)
            LOGGER.info(f"Validated city names for: {site}")

        # validate state
        try:
            match_values = None
            match_values = _compare_states(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_STATE,
                                           consts.STATE_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.STATE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.STATE_FIELD)
            LOGGER.info(f"Validated states for: {site}")

        # validate street addresses
        try:
            address_one_matches = None
            address_two_matches = None
            address_one_matches, address_two_matches = _compare_street_addresses(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ONE,
                consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
                consts.ADDRESS_TWO_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for fields: {consts.ADDRESS_ONE_FIELD}, {consts.ADDRESS_TWO_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, address_one_matches,
                                              consts.ADDRESS_ONE_FIELD)
            results = _add_matches_to_results(results, address_two_matches,
                                              consts.ADDRESS_TWO_FIELD)
            LOGGER.info(f"Validated street addresses for: {site}")

        # validate email addresses
        try:
            match_values = None
            match_values = _compare_email_addresses(
                project, validation_dataset, ehr_dataset, site,
                consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.EMAIL_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.EMAIL_FIELD)
            LOGGER.info(f"Validated email addresses for: {site}")

        # validate phone numbers
        try:
            match_values = None
            match_values = _compare_phone_numbers(project, validation_dataset,
                                                  ehr_dataset, site,
                                                  consts.OBS_PII_PHONE,
                                                  consts.PHONE_NUMBER_FIELD,
                                                  ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.PHONE_NUMBER_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.PHONE_NUMBER_FIELD)
            LOGGER.info(f"Validated phone numbers for: {site}")

        # validate genders
        try:
            match_values = None
            match_values = _compare_genders(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_SEX, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.SEX_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.SEX_FIELD)
            LOGGER.info(f"Validated genders for: {site}")

        # validate birth dates
        try:
            match_values = None
            match_values = _compare_birth_dates(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_BIRTH_DATETIME,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.BIRTH_DATETIME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.BIRTH_DATE_FIELD)
            LOGGER.info(f"Validated birth dates for: {site}")

        LOGGER.info("Writing results to BQ table")
        # write dictionary to a table
        try:
            writers.write_to_result_table(project, validation_dataset, site,
                                          results)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                f"Did not write site information to validation dataset: {site}"
            )
            write_errors += 1
        LOGGER.info(f"Wrote validation results for site: {site}")

    LOGGER.info(f"FINISHED: Validation dataset created: {validation_dataset}")

    if read_errors > 0:
        LOGGER.error(
            f"Encountered {read_errors} read errors creating validation dataset:\t{validation_dataset}"
        )

    if write_errors > 0:
        LOGGER.error(
            f"Encountered {write_errors} write errors creating validation dataset:\t{validation_dataset}"
        )

    return read_errors + write_errors
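# A minimal sketch of what the _add_matches_to_results helper used above might
# look like; its real implementation is not shown in this module, so the
# assumed shape of `results` (a dict keyed by person_id, holding one match
# value per validated field) is inferred from how results are accumulated and
# then written out per site. The name is suffixed to mark it as hypothetical.
def _add_matches_to_results_sketch(results, match_values, field_name):
    """Fold one field's match values into the running per-person results."""
    for person_id, match_value in (match_values or {}).items():
        results.setdefault(person_id, {})[field_name] = match_value
    return results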
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset: the dataset containing the pii information for
        comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    LOGGER.info(
        'Calling match_participants with:\n'
        'project:\t%s\n'
        'rdr_dataset:\t%s\n'
        'ehr_dataset:\t%s\n'
        'dest_dataset_id:\t%s\n', project, rdr_dataset, ehr_dataset,
        dest_dataset_id)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info('Created new validation results dataset:\t%s',
                validation_dataset)

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    # TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0
    results = {}

    # validate first names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_FIRST,
                                                 consts.FIRST_NAME_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.FIRST_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.FIRST_NAME_FIELD)
                write_errors += 1

    LOGGER.info('Validated first names')

    # validate last names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_LAST,
                                                 consts.LAST_NAME_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write last name matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.LAST_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.LAST_NAME_FIELD)
                write_errors += 1

    LOGGER.info('Validated last names')

    # validate middle names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_MIDDLE,
                                                 consts.MIDDLE_NAME_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.MIDDLE_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.MIDDLE_NAME_FIELD)
                write_errors += 1

    LOGGER.info('Validated middle names')

    # validate zip codes
    for site in hpo_sites:
        match_values, exc = _compare_zip_codes(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write zip code matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.ZIP_CODE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ZIP_CODE_FIELD)
                write_errors += 1

    LOGGER.info('Validated zip codes')

    # validate city
    for site in hpo_sites:
        match_values, exc = _compare_cities(project, validation_dataset,
                                            rdr_dataset, ehr_dataset, site,
                                            consts.OBS_PII_STREET_ADDRESS_CITY,
                                            consts.CITY_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write city matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.CITY_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.CITY_FIELD)
                write_errors += 1

    LOGGER.info('Validated city names')

    # validate state
    for site in hpo_sites:
        match_values, exc = _compare_states(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_STATE, consts.STATE_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write state matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.STATE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.STATE_FIELD)
                write_errors += 1

    LOGGER.info('Validated states')

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches, exc = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write street address matches for hpo to table
            try:
                writers.append_to_result_table(site, address_one_matches,
                                               project, validation_dataset,
                                               consts.ADDRESS_ONE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ADDRESS_ONE_FIELD)
                write_errors += 1

            try:
                writers.append_to_result_table(site, address_two_matches,
                                               project, validation_dataset,
                                               consts.ADDRESS_TWO_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ADDRESS_TWO_FIELD)
                write_errors += 1

    LOGGER.info('Validated street addresses')

    # validate email addresses
    for site in hpo_sites:
        match_values, exc = _compare_email_addresses(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write email matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.EMAIL_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.EMAIL_FIELD)
                write_errors += 1

    LOGGER.info('Validated email addresses')

    # validate phone numbers
    for site in hpo_sites:
        match_values, exc = _compare_phone_numbers(project, validation_dataset,
                                                   ehr_dataset, site,
                                                   consts.OBS_PII_PHONE,
                                                   consts.PHONE_NUMBER_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write phone number matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.PHONE_NUMBER_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.PHONE_NUMBER_FIELD)
                write_errors += 1

    LOGGER.info('Validated phone numbers')

    # validate genders
    for site in hpo_sites:
        match_values, exc = _compare_genders(project, validation_dataset,
                                             ehr_dataset, site,
                                             consts.OBS_PII_SEX)
        if exc is not None:
            read_errors += 1
        else:
            # write gender match for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.SEX_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.SEX_FIELD)
                write_errors += 1

    LOGGER.info('Validated genders')

    # validate birth dates
    for site in hpo_sites:
        match_values, exc = _compare_birth_dates(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_BIRTH_DATETIME)
        if exc is not None:
            read_errors += 1
        else:
            # write birthday match for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.BIRTH_DATE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.BIRTH_DATE_FIELD)
                write_errors += 1

    LOGGER.info('Validated birth dates')

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        try:
            writers.merge_fields_into_single_record(project,
                                                    validation_dataset, site)
            LOGGER.info('Merged participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

        try:
            writers.remove_sparse_records(project, validation_dataset, site)
            LOGGER.info('Removed sparse participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

        try:
            writers.change_nulls_to_missing_value(project, validation_dataset,
                                                  site)
            LOGGER.info(
                'Changed nulls to missing values in participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

    LOGGER.info("Finished creating validation dataset")

    if read_errors > 0:
        LOGGER.error(
            "Encountered %d read errors creating validation dataset:\t%s",
            read_errors, validation_dataset)

    if write_errors > 0:
        LOGGER.error(
            "Encountered %d write errors creating validation dataset:\t%s",
            write_errors, validation_dataset)

    return results, read_errors + write_errors
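# In the version above, read-error handling has moved into the comparators,
# which now return a (match_values, exc) pair instead of raising. A minimal
# hedged sketch of that convention follows; the wrapper name is hypothetical,
# and the real comparators likely inline this pattern around their BigQuery
# reads rather than sharing a helper.
def _guarded_compare(compare_fn, *args, **kwargs):
    """Run a comparator, returning (match_values, None) or (None, exc)."""
    try:
        return compare_fn(*args, **kwargs), None
    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError, RuntimeError) as exc:
        LOGGER.exception("Unable to compare site data with %s",
                         getattr(compare_fn, '__name__', 'comparator'))
        return None, exc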
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset: the dataset containing the pii information for
        comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='',
            rdr_dataset=rdr_dataset,
            ehr_dataset=ehr_dataset
        ),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    # TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(
            site_name + consts.VALIDATION_TABLE_SUFFIX,
            field_list,
            drop_existing=True,
            dataset_id=validation_dataset
        )

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_NAME_FIRST, consts.FIRST_NAME_FIELD
        )
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.FIRST_NAME_FIELD
        )

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_NAME_LAST, consts.LAST_NAME_FIELD
        )
        # write last name matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.LAST_NAME_FIELD
        )

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_NAME_MIDDLE, consts.MIDDLE_NAME_FIELD
        )
        # write middle name matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.MIDDLE_NAME_FIELD
        )

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD
        )
        # write zip code matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.ZIP_CODE_FIELD
        )

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_CITY, consts.CITY_FIELD
        )
        # write city matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.CITY_FIELD
        )

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_STATE, consts.STATE_FIELD
        )
        # write state matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.STATE_FIELD
        )

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO,
            consts.ADDRESS_ONE_FIELD, consts.ADDRESS_TWO_FIELD
        )
        # write street address matches for hpo to table
        writers.append_to_result_table(
            site, address_one_matches, project, validation_dataset,
            consts.ADDRESS_ONE_FIELD
        )
        writers.append_to_result_table(
            site, address_two_matches, project, validation_dataset,
            consts.ADDRESS_TWO_FIELD
        )

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD
        )
        # write email matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.EMAIL_FIELD
        )

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_PHONE, consts.PHONE_NUMBER_FIELD
        )
        # write phone number matches for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.PHONE_NUMBER_FIELD
        )

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_SEX
        )
        # write gender match for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.SEX_FIELD
        )

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_BIRTH_DATETIME
        )
        # write birthday match for hpo to table
        writers.append_to_result_table(
            site, match_values, project, validation_dataset,
            consts.BIRTH_DATE_FIELD
        )

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset,
                                                site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset,
                                              site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE
        )
        writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, filename
        )

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(
        project, validation_dataset, hpo_sites, bucket, filename
    )

    return results
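# A hedged sketch of _get_date_string, inferred from its call sites above: the
# returned value is appended to dest_dataset_id whenever that id does not
# already end in an 8-digit date, so the helper presumably extracts a YYYYMMDD
# suffix from the rdr dataset name. The fallback to today's date when no
# suffix exists is an assumption, and the name is suffixed to mark the sketch
# as hypothetical.
from datetime import datetime


def _get_date_string_sketch(rdr_dataset):
    """Return the trailing YYYYMMDD of a dataset id, or today's date."""
    match = re.search(r'\d{8}$', rdr_dataset)
    if match:
        return match.group()
    return datetime.now().strftime('%Y%m%d')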