def get_generate_ext_table_queries(project_id, deid_dataset_id,
                                   combined_dataset_id):
    """
    Generate the queries for generating the ext tables
    :param project_id: project_id containing the dataset to generate ext tables in
    :param deid_dataset_id: deid_dataset_id to generate ext tables in
    :param combined_dataset_id: combined_dataset_id to use the mapping tables from
    :return: list of query dicts
    """
    queries = []

    mapping_table_ids = get_mapping_table_ids(project_id, combined_dataset_id)
    create_and_populate_source_mapping_table(project_id, combined_dataset_id)

    for mapping_table_id in mapping_table_ids:
        cdm_table_id = get_cdm_table_from_mapping(mapping_table_id)
        ext_table_id = cdm_table_id + EXT_TABLE_SUFFIX
        ext_table_fields = get_table_fields(cdm_table_id)
        bq_utils.create_table(ext_table_id,
                              ext_table_fields,
                              drop_existing=True,
                              dataset_id=deid_dataset_id)
        query = dict()
        query[cdr_consts.QUERY] = REPLACE_SRC_QUERY.format(
            project_id=project_id,
            combined_dataset_id=combined_dataset_id,
            mapping_table_id=mapping_table_id,
            site_mappings_table_id=SITE_TABLE_ID,
            cdm_table_id=cdm_table_id)
        query[cdr_consts.DESTINATION_TABLE] = ext_table_id
        query[cdr_consts.DESTINATION_DATASET] = deid_dataset_id
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_EMPTY
        queries.append(query)

    return queries
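A minimal sketch of how the query dicts returned above might be consumed, assuming the surrounding project's bq_utils and cdr_consts modules are importable; the run_ext_table_queries helper and its keyword use of bq_utils.query (mirroring the calls elsewhere on this page) are illustrative, not part of the original module.

def run_ext_table_queries(project_id, deid_dataset_id, combined_dataset_id):
    # Hypothetical driver (illustration only): execute each generated ext
    # table query against its declared destination table and dataset.
    queries = get_generate_ext_table_queries(project_id, deid_dataset_id,
                                             combined_dataset_id)
    for query in queries:
        bq_utils.query(
            q=query[cdr_consts.QUERY],
            destination_table_id=query[cdr_consts.DESTINATION_TABLE],
            destination_dataset_id=query[cdr_consts.DESTINATION_DATASET],
            write_disposition=query[cdr_consts.DISPOSITION])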
Example no. 2
    def _create_drug_class_table(self):
        table_name = 'drug_class'
        fields = [{
            "type": "integer",
            "name": "concept_id",
            "mode": "required"
        }, {
            "type": "string",
            "name": "concept_name",
            "mode": "required"
        }, {
            "type": "string",
            "name": "drug_class_name",
            "mode": "required"
        }]
        bq_utils.create_table(table_id=table_name,
                              fields=fields,
                              drop_existing=True,
                              dataset_id=self.bigquery_dataset_id)

        bq_utils.query(q=main_constants.DRUG_CLASS_QUERY.format(
            dataset_id=self.bigquery_dataset_id),
                       use_legacy_sql=False,
                       destination_table_id='drug_class',
                       retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                       write_disposition='WRITE_TRUNCATE',
                       destination_dataset_id=self.bigquery_dataset_id)
Example no. 3
    def test_create_existing_table_without_drop_raises_error(self):
        table_id = 'some_random_table_id'
        fields = [dict(name='id', type='integer', mode='required'),
                  dict(name='name', type='string', mode='nullable')]
        bq_utils.create_table(table_id, fields)
        with self.assertRaises(bq_utils.InvalidOperationError) as cm:
            bq_utils.create_table(table_id, fields, drop_existing=False)
Example no. 4
def create_dose_form_route_mappings_table(project_id, dataset_id=None):
    """
    Creates "_logging_dose_form_route_mappings" table with only id columns from resources/dose_form_route_mappings.csv

    :param project_id:
    :param dataset_id: BQ dataset_id
    :return: upload metadata for created table
    """
    if dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        dataset_id = bq_utils.get_dataset_id()

    dose_form_routes_table_id = DOSE_FORM_ROUTES_TABLE_ID

    LOGGER.info("Creating %s.%s", dataset_id, DOSE_FORM_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DOSE_FORM_ROUTES_TABLE_ID,
                          DOSE_FORM_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    dose_form_route_mappings_csv = os.path.join(resources.resource_path,
                                                DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings_list = resources.csv_to_list(
        dose_form_route_mappings_csv)
    dose_form_routes_populate_query = INSERT_ROUTES_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        routes_table_id=DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=get_mapping_list(dose_form_route_mappings_list))
    result = bq_utils.query(dose_form_routes_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, dose_form_routes_table_id)
    return result
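A brief usage sketch for the function above, assuming bq_utils is importable and that the returned query result carries a jobReference as in Example no. 9 below; the project id is a placeholder.

# Hypothetical usage (illustration only): build the mapping table in the
# default dataset and wait for the populate query job to finish.
result = create_dose_form_route_mappings_table('my-project-id')
incomplete_jobs = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
if incomplete_jobs:
    raise bq_utils.BigQueryJobWaitError(incomplete_jobs)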
Example no. 5
def create_metadata_table(dataset_id, fields_list):
    """
    Creates a metadata table in a given dataset.
    :param dataset_id: name of the dataset
    :param fields_list: list of fields for the metadata table
    :return:
    """
    if not bq_utils.table_exists(METADATA_TABLE, dataset_id):
        bq_utils.create_table(table_id=METADATA_TABLE,
                              fields=fields_list,
                              dataset_id=dataset_id)
Example no. 6
    def test_create_table_drop_existing_success(self):
        table_id = 'some_random_table_id'
        fields = [dict(name='id', type='integer', mode='required'),
                  dict(name='name', type='string', mode='nullable')]
        result_1 = bq_utils.create_table(table_id, fields)
        # sanity check
        table_id = result_1['tableReference']['tableId']
        self.assertTrue(bq_utils.table_exists(table_id))
        result_2 = bq_utils.create_table(table_id, fields, drop_existing=True)
        # same id and second one created after first one
        self.assertEqual(result_1['id'], result_2['id'])
        self.assertTrue(result_2['creationTime'] > result_1['creationTime'])
Example no. 7
def copy_suppressed_table_schemas(known_tables, dest_dataset):
    """
    Copy only table schemas for suppressed tables.

    :param known_tables:  list of tables the software 'knows' about for deid purposes.
    :param dest_dataset:  name of the dataset to copy tables to.
    """
    for table in SUPPRESSED_TABLES:
        if table in known_tables:
            field_list = fields_for(table)
            # create a table schema only.
            bq_utils.create_table(table,
                                  field_list,
                                  drop_existing=True,
                                  dataset_id=dest_dataset)
Example no. 8
    def test_create_table(self):
        table_id = 'some_random_table_id'
        fields = [dict(name='person_id', type='integer', mode='required'),
                  dict(name='name', type='string', mode='nullable')]
        result = bq_utils.create_table(table_id, fields)
        self.assertTrue('kind' in result)
        self.assertEqual(result['kind'], 'bigquery#table')
        table_info = bq_utils.get_table_info(table_id)
        self._table_has_clustering(table_info)
Example no. 9
def create_drug_route_mappings_table(project_id, route_mapping_dataset_id,
                                     dose_form_routes_table_id,
                                     route_mapping_prefix):
    """
    Creates "drug_route_mappings" table using the query GET_DRUGS_FROM_DOSE_FORM
    
    :param project_id: the project containing the routes dataset
    :param route_mapping_dataset_id: dataset where the dose_form_route mapping table exists
            and where the drug_route mapping table will be created
    :param dose_form_routes_table_id: table_id of the dose_form_routes mapping table
    :param route_mapping_prefix: prefix for the dose_form_routes_mapping_table
    :return: upload metadata and created drug_route_table_id
    """
    if route_mapping_dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        route_mapping_dataset_id = bq_utils.get_dataset_id()

    LOGGER.info("Creating %s.%s", route_mapping_dataset_id,
                DRUG_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DRUG_ROUTES_TABLE_ID,
                          DRUG_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=route_mapping_dataset_id)

    drug_routes_populate_query = GET_DRUGS_FROM_DOSE_FORM.format(
        project_id=project_id,
        vocabulary_dataset=common.VOCABULARY_DATASET,
        route_mapping_dataset_id=route_mapping_dataset_id,
        dose_form_route_mapping_table=dose_form_routes_table_id,
        route_mapping_prefix=route_mapping_prefix)
    result = bq_utils.query(q=drug_routes_populate_query,
                            write_disposition='WRITE_TRUNCATE',
                            destination_dataset_id=route_mapping_dataset_id,
                            destination_table_id=DRUG_ROUTES_TABLE_ID,
                            batch=True)
    incomplete_jobs = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
    if incomplete_jobs:
        LOGGER.info('Failed job id {id}'.format(id=incomplete_jobs[0]))
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    LOGGER.info("Created %s.%s", route_mapping_dataset_id,
                DRUG_ROUTES_TABLE_ID)
    return result
Example no. 10
    def test_create_table(self):
        table_id = 'some_random_table_id'
        fields = [
            dict(name='id', type='integer', mode='required'),
            dict(name='name', type='string', mode='nullable')
        ]
        result = bq_utils.create_table(table_id, fields)
        self.assertTrue('kind' in result)
        self.assertEqual(result['kind'], 'bigquery#table')
        # sanity check
        self.assertTrue(bq_utils.table_exists(table_id))
Example no. 11
def get_generate_ext_table_queries(project_id, dataset_id, sandbox_dataset_id,
                                   mapping_dataset_id):
    """
    Generate the queries for generating the ext tables
    :param project_id: project_id containing the dataset to generate ext tables in
    :param dataset_id: dataset_id to generate ext tables in
    :param sandbox_dataset_id: sandbox_dataset_id to store sandboxed rows.
    :param mapping_dataset_id: mapping_tables_dataset_id to use the mapping tables from
    :return: list of query dicts
    """
    queries = []

    # FIXME: Remove this reference in future
    LOGGER.info(f'sandbox_dataset_id : {sandbox_dataset_id}')

    mapping_table_ids = get_mapping_table_ids(project_id, mapping_dataset_id)

    for mapping_table_id in mapping_table_ids:
        cdm_table_id = get_cdm_table_from_mapping(mapping_table_id)
        ext_table_id = cdm_table_id + EXT_TABLE_SUFFIX
        ext_table_fields = get_table_fields(cdm_table_id, ext_table_id)
        bq_utils.create_table(ext_table_id,
                              ext_table_fields,
                              drop_existing=True,
                              dataset_id=dataset_id)
        query = dict()
        query[cdr_consts.QUERY] = REPLACE_SRC_QUERY.render(
            project_id=project_id,
            sandbox_dataset_id=sandbox_dataset_id,
            mapping_dataset_id=mapping_dataset_id,
            mapping_table_id=mapping_table_id,
            site_mappings_table_id=SITE_TABLE_ID,
            cdm_table_id=cdm_table_id)
        query[cdr_consts.DESTINATION_TABLE] = ext_table_id
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_EMPTY
        queries.append(query)

    return queries
Example no. 12
def create_unit_mapping_table(project_id, dataset_id):
    """
    Creates the unit_mapping table and populates it with the values from resources/unit_mapping.csv
    :param project_id:
    :param dataset_id:
    :return:
    """
    bq_utils.create_table(table_id=UNIT_MAPPING_TABLE,
                          fields=UNIT_MAPPING_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    unit_mappings_csv = os.path.join(resources.resource_path,
                                     UNIT_MAPPING_FILE + ".csv")
    unit_mappings_list = resources.csv_to_list(unit_mappings_csv)
    unit_mappings_populate_query = INSERT_UNITS_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        units_table_id=UNIT_MAPPING_TABLE,
        mapping_list=get_mapping_list(unit_mappings_list))
    result = bq_utils.query(unit_mappings_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, UNIT_MAPPING_TABLE)
    return result
Example no. 13
    def _create_drug_class_table(bigquery_dataset_id):

        table_name = 'drug_class'
        fields = [{
            "type": "integer",
            "name": "concept_id",
            "mode": "required"
        }, {
            "type": "string",
            "name": "concept_name",
            "mode": "required"
        }, {
            "type": "string",
            "name": "drug_class_name",
            "mode": "required"
        }]
        bq_utils.create_table(table_id=table_name,
                              fields=fields,
                              drop_existing=True,
                              dataset_id=bigquery_dataset_id)

        bq_utils.query(q=main_consts.DRUG_CLASS_QUERY.format(
            dataset_id=bigquery_dataset_id),
                       use_legacy_sql=False,
                       destination_table_id='drug_class',
                       retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                       write_disposition='WRITE_TRUNCATE',
                       destination_dataset_id=bigquery_dataset_id)

        # ensure concept ancestor table exists
        if not bq_utils.table_exists(common.CONCEPT_ANCESTOR):
            bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                           common.CONCEPT_ANCESTOR)
            q = """INSERT INTO {dataset}.concept_ancestor
            SELECT * FROM {vocab}.concept_ancestor""".format(
                dataset=bigquery_dataset_id, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
Example no. 14
def create_and_populate_source_mapping_table(project_id, dataset_id):
    """
    Creates the site mapping table and inserts the site mappings
    :param project_id: project_id containing the dataset
    :param dataset_id: dataset to create the mapping table in
    :return: number of rows inserted, as a string
    """
    mapping_list = get_hpo_and_rdr_mappings()
    site_mapping_insert_string = convert_to_bq_string(mapping_list)
    result = bq_utils.create_table(SITE_TABLE_ID,
                                   SITE_MAPPING_FIELDS,
                                   drop_existing=True,
                                   dataset_id=dataset_id)
    site_mappings_insert_query = INSERT_SITE_MAPPINGS_QUERY.format(
        combined_dataset_id=dataset_id,
        project_id=project_id,
        table_id=SITE_TABLE_ID,
        values=site_mapping_insert_string)
    result = bq_utils.query(site_mappings_insert_query)
    rows_affected = result['numDmlAffectedRows']
    return rows_affected
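A short usage sketch, assuming the module-level LOGGER and the function above; the project and dataset names are placeholders.

# Hypothetical usage (illustration only): create the site mapping table in a
# combined dataset and report how many mapping rows were inserted.
rows_affected = create_and_populate_source_mapping_table('my-project-id',
                                                         'combined_dataset')
LOGGER.info("Inserted %s site mapping rows", rows_affected)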
Example no. 15
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: results of the field comparison for each hpo
    """
    LOGGER.info(f"Calling match_participants with:\n"
                f"project:\t{project}\n"
                f"rdr_dataset:\t{rdr_dataset}\n"
                f"ehr_dataset:\t{ehr_dataset}\n"
                f"dest_dataset_id:\t{dest_dataset_id}\n")

    ehr_tables = bq_utils.list_dataset_contents(ehr_dataset)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info(
        f"Created new validation results dataset:\t{validation_dataset}")

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    # validate first names
    for site in hpo_sites:
        LOGGER.info(f"Beginning identity validation for site: {site}")
        results = {}

        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_FIRST,
                                                consts.FIRST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.FIRST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.FIRST_NAME_FIELD)
            LOGGER.info(f"Validated first names for: {site}")

        # validate last names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_LAST,
                                                consts.LAST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.LAST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.LAST_NAME_FIELD)
            LOGGER.info(f"Validated last names for: {site}")

        # validate middle names
        try:
            match_values = None
            # match_values = _compare_name_fields(
            #     project, validation_dataset, ehr_dataset, site,
            #     consts.OBS_PII_NAME_MIDDLE, consts.MIDDLE_NAME_FIELD,
            #     ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.MIDDLE_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            # results = _add_matches_to_results(results, match_values, consts.MIDDLE_NAME_FIELD)
            LOGGER.info("Not validating middle names")

        # validate zip codes
        try:
            match_values = None
            match_values = _compare_zip_codes(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD,
                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.ZIP_CODE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.ZIP_CODE_FIELD)
            LOGGER.info(f"Validated zip codes for: {site}")

        # validate city
        try:
            match_values = None
            match_values = _compare_cities(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_CITY,
                                           consts.CITY_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.CITY_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.CITY_FIELD)
            LOGGER.info(f"Validated city names for: {site}")

        # validate state
        try:
            match_values = None
            match_values = _compare_states(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_STATE,
                                           consts.STATE_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.STATE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.STATE_FIELD)
            LOGGER.info(f"Validated states for: {site}")

        # validate street addresses
        try:
            address_one_matches = None
            address_two_matches = None
            match_values = None
            address_one_matches, address_two_matches = _compare_street_addresses(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ONE,
                consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
                consts.ADDRESS_TWO_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for fields: {consts.ADDRESS_ONE_FIELD}, {consts.ADDRESS_TWO_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, address_one_matches,
                                              consts.ADDRESS_ONE_FIELD)
            results = _add_matches_to_results(results, address_two_matches,
                                              consts.ADDRESS_TWO_FIELD)
            LOGGER.info(f"Validated street addresses for: {site}")

        # validate email addresses
        try:
            match_values = None
            match_values = _compare_email_addresses(
                project, validation_dataset, ehr_dataset, site,
                consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.EMAIL_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.EMAIL_FIELD)
            LOGGER.info(f"Validated email addresses for: {site}")

        # validate phone numbers
        try:
            match_values = None
            match_values = _compare_phone_numbers(project, validation_dataset,
                                                  ehr_dataset, site,
                                                  consts.OBS_PII_PHONE,
                                                  consts.PHONE_NUMBER_FIELD,
                                                  ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.PHONE_NUMBER_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.PHONE_NUMBER_FIELD)
            LOGGER.info(f"Validated phone numbers for: {site}")

        # validate genders
        try:
            match_values = None
            match_values = _compare_genders(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_SEX, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.SEX_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.SEX_FIELD)
            LOGGER.info(f"Validated genders for: {site}")

        # validate birth dates
        try:
            match_values = None
            match_values = _compare_birth_dates(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_BIRTH_DATETIME,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.BIRTH_DATETIME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.BIRTH_DATE_FIELD)
            LOGGER.info(f"Validated birth dates for: {site}")

        LOGGER.info(f"Writing results to BQ table")
        # write dictionary to a table
        try:
            writers.write_to_result_table(project, validation_dataset, site,
                                          results)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                f"Did not write site information to validation dataset:  {site}"
            )
            write_errors += 1

        LOGGER.info(f"Wrote validation results for site: {site}")

    LOGGER.info(f"FINISHED: Validation dataset created:  {validation_dataset}")

    if read_errors > 0:
        LOGGER.error(
            f"Encountered {read_errors} read errors creating validation dataset:\t{validation_dataset}"
        )

    if write_errors > 0:
        LOGGER.error(
            f"Encountered {write_errors} write errors creating validation dataset:\t{validation_dataset}"
        )

    return read_errors + write_errors
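A minimal sketch of calling this entry point and acting on its error count; the dataset names are placeholders and the RuntimeError policy is illustrative, not part of the original code.

# Hypothetical caller (illustration only): run participant matching and fail
# loudly if any read or write errors were counted.
errors = match_participants('my-project-id', 'rdr_dataset', 'ehr_dataset',
                            'drc_validation')
if errors > 0:
    raise RuntimeError(f'{errors} errors while creating the validation dataset')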
Example no. 16
    def test_integration_queries_to_retract_from_fake_dataset(
            self, mock_list_datasets, mock_is_ehr_dataset,
            mock_is_unioned_dataset, mock_is_combined_dataset,
            mock_is_deid_dataset):
        mock_list_datasets.return_value = [{
            'id':
            self.project_id + ':' + self.bq_dataset_id
        }]
        mock_is_deid_dataset.return_value = False
        mock_is_combined_dataset.return_value = False
        mock_is_unioned_dataset.return_value = False
        mock_is_ehr_dataset.return_value = True

        # create and load person_ids to pid table
        bq_utils.create_table(self.pid_table_id,
                              retract_data_bq.PID_TABLE_FIELDS,
                              drop_existing=True,
                              dataset_id=self.bq_dataset_id)
        bq_formatted_insert_values = ', '.join([
            '(%s, %s)' % (person_id, research_id)
            for (person_id, research_id) in self.person_research_ids
        ])
        q = INSERT_PID_TABLE.format(
            dataset_id=self.bq_dataset_id,
            pid_table_id=self.pid_table_id,
            person_research_ids=bq_formatted_insert_values)
        bq_utils.query(q)

        job_ids = []
        row_count_queries = {}
        # load the cdm files into dataset
        for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
            cdm_file_name = os.path.basename(cdm_file)
            cdm_table = cdm_file_name.split('.')[0]
            hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
            # store query for checking number of rows to delete
            row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
                dataset_id=self.bq_dataset_id,
                table_id=hpo_table,
                pid_table_id=self.pid_table_id)
            retract_data_bq.logger.info('Preparing to load table %s.%s' %
                                        (self.bq_dataset_id, hpo_table))
            with open(cdm_file, 'rb') as f:
                gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                        cdm_file_name, f)
            result = bq_utils.load_cdm_csv(self.hpo_id,
                                           cdm_table,
                                           dataset_id=self.bq_dataset_id)
            retract_data_bq.logger.info('Loading table %s.%s' %
                                        (self.bq_dataset_id, hpo_table))
            job_id = result['jobReference']['jobId']
            job_ids.append(job_id)
        incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
        self.assertEqual(len(incomplete_jobs), 0,
                         'NYC five person load job did not complete')
        retract_data_bq.logger.info('All tables loaded successfully')

        # use query results to count number of expected row deletions
        expected_row_count = {}
        for table in row_count_queries:
            result = bq_utils.query(row_count_queries[table])
            expected_row_count[table] = int(result['totalRows'])

        # separate check to find number of actual deleted rows
        q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_before_retraction = {}
        for row in result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # perform retraction
        retract_data_bq.run_bq_retraction(self.test_project_id,
                                          self.bq_dataset_id,
                                          self.test_project_id,
                                          self.pid_table_id, self.hpo_id,
                                          self.dataset_ids)

        # find actual deleted rows
        q_result = bq_utils.query(q)
        result = bq_utils.response2rows(q_result)
        row_count_after_retraction = {}
        for row in result:
            row_count_after_retraction[row['table_id']] = row['row_count']
        for table in expected_row_count:
            self.assertEqual(
                expected_row_count[table], row_count_before_retraction[table] -
                row_count_after_retraction[table])
Example no. 17
def create_person_id_src_hpo_map(input_dataset, credentials):
    """
    Create a table containing person_ids and src_hpo_ids

    :param input_dataset:  the input dataset to deid
    :param credentials:  the credentials needed to create a new table.
    """
    map_tablename = "_mapping_person_src_hpos"
    sql = ("select person_id, src_hpo_id "
           "from {input_dataset}._mapping_{table} "
           "join {input_dataset}.{table} "
           "using ({table}_id) "
           "where src_hpo_id not like 'rdr'")

    # list dataset contents
    dataset_tables = bq_utils.list_dataset_contents(input_dataset)
    mapping_tables = []
    mapped_tables = []
    for table in dataset_tables:
        if table.startswith('_mapping_'):
            mapping_tables.append(table)
            mapped_tables.append(table[9:])

    # make sure mapped tables all exist
    check_tables = []
    for table in mapped_tables:
        if table in dataset_tables:
            check_tables.append(table)

    # make sure check_tables contain person_id fields
    person_id_tables = []
    for table in check_tables:
        info = bq_utils.get_table_info(table, dataset_id=input_dataset)
        schema = info.get('schema', {})
        for field_info in schema.get('fields', []):
            if 'person_id' in field_info.get('name'):
                person_id_tables.append(table)

    # revamp mapping tables to contain only mapping tables for tables
    # with person_id fields
    mapping_tables = ['_mapping_' + table for table in person_id_tables]

    sql_statement = []
    for table in person_id_tables:
        sql_statement.append(
            sql.format(table=table, input_dataset=input_dataset))

    final_query = ' UNION ALL '.join(sql_statement)

    # create the mapping table
    if map_tablename not in dataset_tables:
        fields = [{
            "type": "integer",
            "name": "person_id",
            "mode": "required",
            "description": "the person_id of someone with an ehr record"
        }, {
            "type": "string",
            "name": "src_hpo_id",
            "mode": "required",
            "description": "the src_hpo_id of an ehr record"
        }]
        bq_utils.create_table(map_tablename, fields, dataset_id=input_dataset)

    bq_utils.query(final_query,
                   destination_table_id=map_tablename,
                   destination_dataset_id=input_dataset,
                   write_disposition=bq_consts.WRITE_TRUNCATE)
    LOGGER.info(f"Created mapping table:\t{input_dataset}.{map_tablename}")
Example no. 18
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: results of the field comparison for each hpo
    """
    LOGGER.info(
        'Calling match_participants with:\n'
        'project:\t%s\n'
        'rdr_dataset:\t%s\n'
        'ehr_dataset:\t%s\n'
        'dest_dataset_id:\t%s\n', project, rdr_dataset, ehr_dataset,
        dest_dataset_id)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info('Created new validation results dataset:\t%s',
                validation_dataset)

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0
    results = {}

    # validate first names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_FIRST,
                                                 consts.FIRST_NAME_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.FIRST_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.FIRST_NAME_FIELD)
                write_errors += 1

    LOGGER.info('Validated first names')

    # validate last names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_LAST,
                                                 consts.LAST_NAME_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write last name matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.LAST_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.LAST_NAME_FIELD)
                write_errors += 1

    LOGGER.info('Validated last names')

    # validate middle names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_MIDDLE,
                                                 consts.MIDDLE_NAME_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.MIDDLE_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.MIDDLE_NAME_FIELD)
                write_errors += 1

    LOGGER.info('Validated middle names')

    # validate zip codes
    for site in hpo_sites:
        match_values, exc = _compare_zip_codes(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write zip code matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.ZIP_CODE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ZIP_CODE_FIELD)
                write_errors += 1

    LOGGER.info('Validated zip codes')

    # validate city
    for site in hpo_sites:
        match_values, exc = _compare_cities(project, validation_dataset,
                                            rdr_dataset, ehr_dataset, site,
                                            consts.OBS_PII_STREET_ADDRESS_CITY,
                                            consts.CITY_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write city matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.CITY_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.CITY_FIELD)
                write_errors += 1

    LOGGER.info('Validated city names')

    # validate state
    for site in hpo_sites:
        match_values, exc = _compare_states(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_STATE, consts.STATE_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write state matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.STATE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.STATE_FIELD)
                write_errors += 1

    LOGGER.info('Validated states')

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches, exc = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write street address matches for hpo to table
            try:
                writers.append_to_result_table(site, address_one_matches,
                                               project, validation_dataset,
                                               consts.ADDRESS_ONE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ADDRESS_ONE_FIELD)
                write_errors += 1

            try:
                writers.append_to_result_table(site, address_two_matches,
                                               project, validation_dataset,
                                               consts.ADDRESS_TWO_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ADDRESS_TWO_FIELD)
                write_errors += 1

    LOGGER.info('Validated street addresses')

    # validate email addresses
    for site in hpo_sites:
        match_values, exc = _compare_email_addresses(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write email matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.EMAIL_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.EMAIL_FIELD)
                write_errors += 1

    LOGGER.info('Validated email addresses')

    # validate phone numbers
    for site in hpo_sites:
        match_values, exc = _compare_phone_numbers(project, validation_dataset,
                                                   ehr_dataset, site,
                                                   consts.OBS_PII_PHONE,
                                                   consts.PHONE_NUMBER_FIELD)

        if exc is not None:
            read_errors += 1
        else:
            # write phone number matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.PHONE_NUMBER_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.PHONE_NUMBER_FIELD)
                write_errors += 1

    LOGGER.info('Validated phone numbers')

    # validate genders
    for site in hpo_sites:
        match_values, exc = _compare_genders(project, validation_dataset,
                                             ehr_dataset, site,
                                             consts.OBS_PII_SEX)

        if exc is not None:
            read_errors += 1
        else:
            # write gender match for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.SEX_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.SEX_FIELD)
                write_errors += 1

    LOGGER.info('Validated genders')

    # validate birth dates
    for site in hpo_sites:
        match_values, exc = _compare_birth_dates(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_BIRTH_DATETIME)

        if exc is not None:
            read_errors += 1
        else:
            # write birthday match for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.BIRTH_DATE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.BIRTH_DATE_FIELD)
                write_errors += 1

    LOGGER.info('Validated birth dates')

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        try:
            writers.merge_fields_into_single_record(project,
                                                    validation_dataset, site)
            LOGGER.info('Merged participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

        try:
            writers.remove_sparse_records(project, validation_dataset, site)
            LOGGER.info('Removed sparse participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

        try:
            writers.change_nulls_to_missing_value(project, validation_dataset,
                                                  site)
            LOGGER.info(
                'Changed nulls to missing values in participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

    LOGGER.info("Finished creating validation dataset")

    if read_errors > 0:
        LOGGER.error(
            "Encountered %d read errors creating validation dataset:\t%s",
            read_errors, validation_dataset)

    if write_errors > 0:
        LOGGER.error(
            "Encountered %d write errors creating validation dataset:\t%s",
            write_errors, validation_dataset)

    return results, read_errors + write_errors
Example no. 19
    def _create_hpo_table(self, hpo_id, table, dataset_id):
        table_id = bq_utils.get_table_id(hpo_id, table)
        bq_utils.create_table(table_id,
                              resources.fields_for(table),
                              dataset_id=dataset_id)
        return table_id
Example no. 20
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='',
            rdr_dataset=rdr_dataset,
            ehr_dataset=ehr_dataset
        ),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO:  create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(
            site_name + consts.VALIDATION_TABLE_SUFFIX,
            field_list,
            drop_existing=True,
            dataset_id=validation_dataset
        )

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_FIRST,
            consts.FIRST_NAME_FIELD
        )

        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.FIRST_NAME_FIELD
        )

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_LAST,
            consts.LAST_NAME_FIELD
        )
        # write last name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.LAST_NAME_FIELD
        )

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_MIDDLE,
            consts.MIDDLE_NAME_FIELD
        )
        # write middle name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.MIDDLE_NAME_FIELD
        )

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ZIP,
            consts.ZIP_CODE_FIELD
        )
        # write zip code matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.ZIP_CODE_FIELD
        )

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_CITY,
            consts.CITY_FIELD
        )
        # write city matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.CITY_FIELD
        )

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_STATE,
            consts.STATE_FIELD
        )
        # write state matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.STATE_FIELD
        )

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO,
            consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD
        )
        # write street address matches for hpo to table
        writers.append_to_result_table(
            site,
            address_one_matches,
            project,
            validation_dataset,
            consts.ADDRESS_ONE_FIELD
        )
        writers.append_to_result_table(
            site,
            address_two_matches,
            project,
            validation_dataset,
            consts.ADDRESS_TWO_FIELD
        )

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_EMAIL_ADDRESS,
            consts.EMAIL_FIELD
        )
        # write email matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.EMAIL_FIELD
        )

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_PHONE,
            consts.PHONE_NUMBER_FIELD
        )
        # write phone number matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.PHONE_NUMBER_FIELD
        )

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_SEX
        )
        # write gender match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.SEX_FIELD
        )

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_BIRTH_DATETIME
        )
        # write birthday match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.BIRTH_DATE_FIELD
        )

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset, site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset, site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE
        )
        writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, filename
        )

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(
        project, validation_dataset, hpo_sites, bucket, filename
    )

    return results