Example #1
def write_results_to_drc_bucket(project, validation_dataset=None):
    """
    Write the results of participant matching to the DRC bucket.

    :param project: a string representing the project name
    :param validation_dataset:  the identifier for the match values
        destination dataset

    :return: None
    :raises:  RuntimeError if validation_dataset is not defined.
    """
    LOGGER.info('Writing to the DRC bucket')
    if validation_dataset is None:
        LOGGER.error('Validation_dataset name is not defined.')
        raise RuntimeError('validation_dataset name cannot be None.')

    date_string = _get_date_string(validation_dataset)
    hpo_sites = readers.get_hpo_site_names()
    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset,
                            consts.REPORT_DIRECTORY.format(date=date_string),
                            consts.REPORT_TITLE)
    _, errors = writers.create_site_validation_report(project,
                                                      validation_dataset,
                                                      hpo_sites, bucket,
                                                      filename)

    if errors > 0:
        LOGGER.error(
            f"Encountered {errors} read errors when writing drc report")
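A brief usage sketch (the project and dataset names below are placeholders, not values from the source); calling without a validation dataset exercises the documented RuntimeError path:

# Hypothetical invocation; both argument values are placeholders.
write_results_to_drc_bucket('my-gcp-project',
                            validation_dataset='validation_20190101')

try:
    # validation_dataset defaults to None, so this raises RuntimeError.
    write_results_to_drc_bucket('my-gcp-project')
except RuntimeError:
    pass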
Example #2
def write_results_to_site_buckets(project, validation_dataset=None):
    """
    Write the results of participant matching to each site's bucket.

    :param project: a string representing the project name
    :param validation_dataset:  the identifier for the match values
        destination dataset

    :return: None
    :raises:  RuntimeError if validation_dataset is not defined.
    """
    LOGGER.info('Writing to site buckets')
    if validation_dataset is None:
        LOGGER.error('Validation_dataset name is not defined.')
        raise RuntimeError('validation_dataset name cannot be None.')

    date_string = _get_date_string(validation_dataset)
    hpo_sites = readers.get_hpo_site_names()
    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE
        )
        _, errors = writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, filename
        )

        if errors > 0:
            LOGGER.error(
                "Encountered %d read errors when writing %s site report",
                errors, site)
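For comparison, the object paths built by this function and by write_results_to_drc_bucket differ only in that the DRC report is nested under the validation dataset name. A minimal sketch, assuming consts.REPORT_DIRECTORY is a date-parameterized format string and consts.REPORT_TITLE is a file name (the literal values below are stand-ins, not the real constants):

import os

date_string = '20190101'                                    # placeholder date
report_directory = 'drc_{date}'.format(date=date_string)    # stand-in for consts.REPORT_DIRECTORY
report_title = 'validation_report.csv'                      # stand-in for consts.REPORT_TITLE

# Per-site report path, written to each site's own bucket.
site_path = os.path.join(report_directory, report_title)

# Aggregate report path, written to the DRC bucket under the dataset name.
drc_path = os.path.join('validation_20190101', report_directory, report_title)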
Example #3
    def test_create_site_validation_report_with_errors(
            self,
            mock_report_file,
            mock_query,
            mock_upload
    ):
        # preconditions
        mock_query.side_effect = oauth2client.client.HttpAccessTokenRefreshError()

        bucket = 'abc'
        filename = 'output.csv'

        # test
        writer.create_site_validation_report(
            self.project, self.dataset, [self.site], bucket, filename
        )

        # post conditions
        self.assertEqual(mock_report_file.call_count, 1)
        self.assertEqual(mock_query.call_count, len([self.site]))
        self.assertEqual(mock_upload.call_count, 1)

        expected_query = consts.VALIDATION_RESULTS_VALUES.format(
            project=self.project,
            dataset=self.dataset,
            table=self.site + consts.VALIDATION_TABLE_SUFFIX,
        )
        mock_query.assert_called_with(expected_query, batch=True)

        mock_upload.assert_called_with(bucket, filename, ANY)

        expected_report_calls = [
            call(),
            call().write('person_id,first_name,last_name,birth_date,sex,address,phone_number,email,algorithm\n'),
            call().write("Unable to report id validation match records for site:\t{}.\n".format(self.site)),
            call().seek(0),
            call().close()
        ]
        self.assertEqual(mock_report_file.mock_calls, expected_report_calls)
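The three mock parameters in the test signature imply a stack of mock.patch decorators that is not shown in this excerpt. A sketch of what that stack likely looks like; the patch targets are illustrative assumptions, not taken from the source (decorators apply bottom-up, so the lowest decorator supplies the first mock argument):

    # Assumed decorator stack; module paths are guesses for illustration only.
    @mock.patch('validation.participants.writers.upload_object')   # -> mock_upload
    @mock.patch('validation.participants.writers.bq_utils.query')  # -> mock_query
    @mock.patch('validation.participants.writers.StringIO')        # -> mock_report_file
    def test_create_site_validation_report_with_errors(
            self, mock_report_file, mock_query, mock_upload):
        ...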
Example #4
    def test_create_site_validation_report(
            self,
            mock_report_file,
            mock_query,
            mock_response,
            mock_upload
    ):
        # preconditions
        bucket = 'abc'
        filename = 'output.csv'
        mock_response.return_value = [
            {
                consts.ADDRESS_ONE_FIELD: consts.MATCH,
                consts.ADDRESS_TWO_FIELD: consts.MATCH,
                consts.CITY_FIELD: consts.MATCH,
                consts.STATE_FIELD: consts.MATCH,
                consts.ZIP_CODE_FIELD: consts.MATCH,
                consts.PERSON_ID_FIELD: 1,
                consts.FIRST_NAME_FIELD: consts.MATCH,
                consts.LAST_NAME_FIELD: consts.MATCH,
                consts.MIDDLE_NAME_FIELD: consts.MATCH,
                consts.BIRTH_DATE_FIELD: consts.MATCH,
                consts.PHONE_NUMBER_FIELD: consts.MATCH,
                consts.EMAIL_FIELD: consts.MATCH,
                consts.ALGORITHM_FIELD: consts.MATCH,
                consts.SEX_FIELD: consts.MATCH,
            },
            {
                consts.ADDRESS_ONE_FIELD: consts.MATCH,
                consts.ADDRESS_TWO_FIELD: consts.MATCH,
                consts.CITY_FIELD: consts.MATCH,
                consts.STATE_FIELD: consts.MATCH,
                consts.ZIP_CODE_FIELD: consts.MISMATCH,
                consts.PERSON_ID_FIELD: 2,
                consts.FIRST_NAME_FIELD: consts.MATCH,
                consts.LAST_NAME_FIELD: consts.MATCH,
                consts.MIDDLE_NAME_FIELD: consts.MATCH,
                consts.BIRTH_DATE_FIELD: consts.MISMATCH,
                consts.PHONE_NUMBER_FIELD: consts.MATCH,
                consts.EMAIL_FIELD: consts.MATCH,
                consts.ALGORITHM_FIELD: consts.MATCH,
                consts.SEX_FIELD: consts.MISSING,
            },
        ]

        # test
        writer.create_site_validation_report(
            self.project, self.dataset, [self.site], bucket, filename
        )

        # post conditions
        self.assertEqual(mock_report_file.call_count, 1)
        self.assertEqual(mock_query.call_count, len([self.site]))
        self.assertEqual(mock_response.call_count, len([self.site]))
        self.assertEqual(mock_upload.call_count, 1)

        expected_query = consts.VALIDATION_RESULTS_VALUES.format(
            project=self.project,
            dataset=self.dataset,
            table=self.site + consts.VALIDATION_TABLE_SUFFIX,
        )
        mock_query.assert_called_with(expected_query, batch=True)

        mock_upload.assert_called_with(bucket, filename, ANY)

        expected_report_calls = [
            call(),
            call().write('person_id,first_name,last_name,birth_date,sex,address,phone_number,email,algorithm\n'),
            call().write('1,match,match,match,match,match,match,match,match\n'),
            call().write('2,match,match,no_match,missing,no_match,match,match,match\n'),
            call().seek(0),
            call().close()
        ]
        self.assertEqual(mock_report_file.mock_calls, expected_report_calls)
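Reading the expected CSV lines against the mocked rows shows how the per-field constants surface in the report: person 2's MISMATCH zip code collapses the single aggregated address column to 'no_match', its MISMATCH birth date also renders as 'no_match', and its MISSING sex renders as 'missing'. A hedged sketch of that address aggregation (the real logic lives inside writer.create_site_validation_report, which is not shown here; the string keys and rendered values are assumptions inferred from this test's data):

def summarize_address(row):
    """Collapse the address sub-fields into the report's single 'address' column.

    Assumed behavior, inferred only from the two mocked rows above: the column
    reads 'match' when every address sub-field matched, otherwise 'no_match'.
    The string keys below stand in for the consts field names.
    """
    address_fields = ('address_1', 'address_2', 'city', 'state', 'zip')
    return 'match' if all(row.get(f) == 'match' for f in address_fields) else 'no_match'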
Example #5
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset:  the dataset created from the results given to us by
        the rdr team
    :param ehr_dataset:  the dataset containing the pii information for
        comparisons
    :param dest_dataset_id:  the desired identifier for the match values
        destination dataset

    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='',
            rdr_dataset=rdr_dataset,
            ehr_dataset=ehr_dataset
        ),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    # TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(
            site_name + consts.VALIDATION_TABLE_SUFFIX,
            field_list,
            drop_existing=True,
            dataset_id=validation_dataset
        )

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_FIRST,
            consts.FIRST_NAME_FIELD
        )

        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.FIRST_NAME_FIELD
        )

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_LAST,
            consts.LAST_NAME_FIELD
        )
        # write last name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.LAST_NAME_FIELD
        )

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_NAME_MIDDLE,
            consts.MIDDLE_NAME_FIELD
        )
        # write middle name matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.MIDDLE_NAME_FIELD
        )

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ZIP,
            consts.ZIP_CODE_FIELD
        )
        # write zip code matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.ZIP_CODE_FIELD
        )

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_CITY,
            consts.CITY_FIELD
        )
        # write city matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.CITY_FIELD
        )

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_STATE,
            consts.STATE_FIELD
        )
        # write state matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.STATE_FIELD
        )

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project,
            validation_dataset,
            rdr_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO,
            consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD
        )
        # write street address matches for hpo to table
        writers.append_to_result_table(
            site,
            address_one_matches,
            project,
            validation_dataset,
            consts.ADDRESS_ONE_FIELD
        )
        writers.append_to_result_table(
            site,
            address_two_matches,
            project,
            validation_dataset,
            consts.ADDRESS_TWO_FIELD
        )

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_EMAIL_ADDRESS,
            consts.EMAIL_FIELD
        )
        # write email matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.EMAIL_FIELD
        )

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_PHONE,
            consts.PHONE_NUMBER_FIELD
        )
        # write phone number matches for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.PHONE_NUMBER_FIELD
        )

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_SEX
        )
        # write sex/gender match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.SEX_FIELD
        )

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(
            project,
            validation_dataset,
            ehr_dataset,
            site,
            consts.OBS_PII_BIRTH_DATETIME
        )
        # write birth date match for hpo to table
        writers.append_to_result_table(
            site,
            match_values,
            project,
            validation_dataset,
            consts.BIRTH_DATE_FIELD
        )

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset, site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset, site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE
        )
        writers.create_site_validation_report(
            project, validation_dataset, [site], bucket, filename
        )

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(
        project, validation_dataset, hpo_sites, bucket, filename
    )

    return results
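A hedged end-to-end sketch of how these excerpts appear to fit together (project and dataset identifiers are placeholders; the real entry point that wires these calls is not part of these excerpts):

project = 'my-gcp-project'                  # placeholder
rdr_dataset = 'rdr20190101'                 # placeholder
ehr_dataset = 'unioned_ehr20190101'         # placeholder
validation_dataset = 'validation_20190101'  # already ends in a date, so no suffix is appended

# Run the field-by-field matching and populate the per-site validation tables.
match_participants(project, rdr_dataset, ehr_dataset, validation_dataset)

# The report writers from Examples #1 and #2 can then regenerate the reports
# without rerunning the matching step.
write_results_to_site_buckets(project, validation_dataset=validation_dataset)
write_results_to_drc_bucket(project, validation_dataset=validation_dataset)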