def get_generate_ext_table_queries(project_id, deid_dataset_id,
                                   combined_dataset_id):
    """
    Generate the queries for generating the ext tables

    :param project_id: project_id containing the dataset to generate ext tables in
    :param deid_dataset_id: deid_dataset_id to generate ext tables in
    :param combined_dataset_id: combined_dataset_id to use the mapping tables from
    :return: list of query dicts
    """
    queries = []

    mapping_table_ids = get_mapping_table_ids(project_id, combined_dataset_id)
    create_and_populate_source_mapping_table(project_id, combined_dataset_id)

    for mapping_table_id in mapping_table_ids:
        cdm_table_id = get_cdm_table_from_mapping(mapping_table_id)
        ext_table_id = cdm_table_id + EXT_TABLE_SUFFIX
        ext_table_fields = get_table_fields(cdm_table_id)
        bq_utils.create_table(ext_table_id,
                              ext_table_fields,
                              drop_existing=True,
                              dataset_id=deid_dataset_id)
        query = dict()
        query[cdr_consts.QUERY] = REPLACE_SRC_QUERY.format(
            project_id=project_id,
            combined_dataset_id=combined_dataset_id,
            mapping_table_id=mapping_table_id,
            site_mappings_table_id=SITE_TABLE_ID,
            cdm_table_id=cdm_table_id)
        query[cdr_consts.DESTINATION_TABLE] = ext_table_id
        query[cdr_consts.DESTINATION_DATASET] = deid_dataset_id
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_EMPTY
        queries.append(query)
    return queries
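A minimal caller sketch for the generator above, assuming the helpers and constants from that snippet are in scope; the project and dataset names are placeholders, and running each dict through bq_utils.query is only one plausible way to execute them, not the repository's pipeline runner.

# Sketch only: names and the execution loop are assumptions.
queries = get_generate_ext_table_queries('my-project', 'deid_dataset',
                                         'combined_dataset')
for q in queries:
    bq_utils.query(q[cdr_consts.QUERY],
                   destination_table_id=q[cdr_consts.DESTINATION_TABLE],
                   destination_dataset_id=q[cdr_consts.DESTINATION_DATASET],
                   write_disposition=q[cdr_consts.DISPOSITION])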
def _create_drug_class_table(self):
    table_name = 'drug_class'
    fields = [{
        "type": "integer",
        "name": "concept_id",
        "mode": "required"
    }, {
        "type": "string",
        "name": "concept_name",
        "mode": "required"
    }, {
        "type": "string",
        "name": "drug_class_name",
        "mode": "required"
    }]
    bq_utils.create_table(table_id=table_name,
                          fields=fields,
                          drop_existing=True,
                          dataset_id=self.bigquery_dataset_id)

    bq_utils.query(q=main_constants.DRUG_CLASS_QUERY.format(
        dataset_id=self.bigquery_dataset_id),
                   use_legacy_sql=False,
                   destination_table_id='drug_class',
                   retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                   write_disposition='WRITE_TRUNCATE',
                   destination_dataset_id=self.bigquery_dataset_id)
def test_create_existing_table_without_drop_raises_error(self):
    table_id = 'some_random_table_id'
    fields = [
        dict(name='id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    bq_utils.create_table(table_id, fields)
    with self.assertRaises(bq_utils.InvalidOperationError) as cm:
        bq_utils.create_table(table_id, fields, drop_existing=False)
def create_dose_form_route_mappings_table(project_id, dataset_id=None):
    """
    Creates "_logging_dose_form_route_mappings" table with only id columns
    from resources/dose_form_route_mappings.csv

    :param project_id:
    :param dataset_id: BQ dataset_id
    :return: upload metadata for created table
    """
    if dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        dataset_id = bq_utils.get_dataset_id()

    dose_form_routes_table_id = DOSE_FORM_ROUTES_TABLE_ID

    LOGGER.info("Creating %s.%s", dataset_id, DOSE_FORM_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DOSE_FORM_ROUTES_TABLE_ID,
                          DOSE_FORM_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)

    dose_form_route_mappings_csv = os.path.join(resources.resource_path,
                                                DOSE_FORM_ROUTES_FILE + ".csv")
    dose_form_route_mappings_list = resources.csv_to_list(
        dose_form_route_mappings_csv)
    dose_form_routes_populate_query = INSERT_ROUTES_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        routes_table_id=DOSE_FORM_ROUTES_TABLE_ID,
        mapping_list=get_mapping_list(dose_form_route_mappings_list))
    result = bq_utils.query(dose_form_routes_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, dose_form_routes_table_id)
    return result
def create_metadata_table(dataset_id, fields_list):
    """
    Creates a metadata table in a given dataset.

    :param dataset_id: name of the dataset
    :param fields_list: list of field definitions for the metadata table
    :return:
    """
    if not bq_utils.table_exists(METADATA_TABLE, dataset_id):
        bq_utils.create_table(table_id=METADATA_TABLE,
                              fields=fields_list,
                              dataset_id=dataset_id)
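A minimal sketch of how the helper above might be invoked, assuming a hand-written field list and a placeholder dataset name; in practice the field definitions likely come from a resource file.

# Sketch only: the field list and dataset name are assumptions.
fields_list = [
    dict(name='etl_version', type='string', mode='nullable'),
    dict(name='ehr_source', type='string', mode='nullable')
]
create_metadata_table('my_rdr_dataset', fields_list)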
def test_create_table_drop_existing_success(self):
    table_id = 'some_random_table_id'
    fields = [
        dict(name='id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result_1 = bq_utils.create_table(table_id, fields)
    # sanity check
    table_id = result_1['tableReference']['tableId']
    self.assertTrue(bq_utils.table_exists(table_id))
    result_2 = bq_utils.create_table(table_id, fields, drop_existing=True)
    # same id and second one created after first one
    self.assertEqual(result_1['id'], result_2['id'])
    self.assertTrue(result_2['creationTime'] > result_1['creationTime'])
def copy_suppressed_table_schemas(known_tables, dest_dataset):
    """
    Copy only table schemas for suppressed tables.

    :param known_tables: list of tables the software 'knows' about for deid purposes.
    :param dest_dataset: name of the dataset to copy tables to.
    """
    for table in SUPPRESSED_TABLES:
        if table in known_tables:
            field_list = fields_for(table)
            # create a table schema only.
            bq_utils.create_table(table,
                                  field_list,
                                  drop_existing=True,
                                  dataset_id=dest_dataset)
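A brief usage sketch, assuming SUPPRESSED_TABLES is defined in the same module; the combined and deid dataset names are placeholders.

# Sketch only: dataset names are assumptions.
known_tables = bq_utils.list_dataset_contents('combined_dataset')
copy_suppressed_table_schemas(known_tables, 'deid_dataset')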
def test_create_table(self):
    table_id = 'some_random_table_id'
    fields = [
        dict(name='person_id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result = bq_utils.create_table(table_id, fields)
    self.assertTrue('kind' in result)
    self.assertEqual(result['kind'], 'bigquery#table')
    table_info = bq_utils.get_table_info(table_id)
    self._table_has_clustering(table_info)
def create_drug_route_mappings_table(project_id, route_mapping_dataset_id,
                                     dose_form_routes_table_id,
                                     route_mapping_prefix):
    """
    Creates "drug_route_mappings" table using the query GET_DRUGS_FROM_DOSE_FORM

    :param project_id: the project containing the routes dataset
    :param route_mapping_dataset_id: dataset where the dose_form_route mapping table
        exists and where the drug_route mapping table will be created
    :param dose_form_routes_table_id: table_id of the dose_form_routes mapping table
    :param route_mapping_prefix: prefix for the dose_form_routes_mapping_table
    :return: upload metadata and created drug_route_table_id
    """
    if route_mapping_dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        route_mapping_dataset_id = bq_utils.get_dataset_id()
    LOGGER.info("Creating %s.%s", route_mapping_dataset_id,
                DRUG_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DRUG_ROUTES_TABLE_ID,
                          DRUG_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=route_mapping_dataset_id)

    drug_routes_populate_query = GET_DRUGS_FROM_DOSE_FORM.format(
        project_id=project_id,
        vocabulary_dataset=common.VOCABULARY_DATASET,
        route_mapping_dataset_id=route_mapping_dataset_id,
        dose_form_route_mapping_table=dose_form_routes_table_id,
        route_mapping_prefix=route_mapping_prefix)
    result = bq_utils.query(q=drug_routes_populate_query,
                            write_disposition='WRITE_TRUNCATE',
                            destination_dataset_id=route_mapping_dataset_id,
                            destination_table_id=DRUG_ROUTES_TABLE_ID,
                            batch=True)
    incomplete_jobs = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
    if incomplete_jobs:
        LOGGER.info('Failed job id {id}'.format(id=incomplete_jobs[0]))
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    LOGGER.info("Created %s.%s", route_mapping_dataset_id,
                DRUG_ROUTES_TABLE_ID)
    return result
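A hedged sketch of how the two route-mapping helpers might be chained: build the dose-form-route lookup first, then derive the drug-route mapping table from it. The project, dataset, and prefix values are placeholders.

# Sketch only: project, dataset, and prefix values are assumptions.
create_dose_form_route_mappings_table('my-project', 'route_dataset')
create_drug_route_mappings_table('my-project', 'route_dataset',
                                 DOSE_FORM_ROUTES_TABLE_ID,
                                 route_mapping_prefix='rm_')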
def test_create_table(self):
    table_id = 'some_random_table_id'
    fields = [
        dict(name='id', type='integer', mode='required'),
        dict(name='name', type='string', mode='nullable')
    ]
    result = bq_utils.create_table(table_id, fields)
    self.assertTrue('kind' in result)
    self.assertEqual(result['kind'], 'bigquery#table')
    # sanity check
    self.assertTrue(bq_utils.table_exists(table_id))
def get_generate_ext_table_queries(project_id, dataset_id, sandbox_dataset_id,
                                   mapping_dataset_id):
    """
    Generate the queries for generating the ext tables

    :param project_id: project_id containing the dataset to generate ext tables in
    :param dataset_id: dataset_id to generate ext tables in
    :param sandbox_dataset_id: sandbox_dataset_id to store sandboxed rows.
    :param mapping_dataset_id: mapping_tables_dataset_id to use the mapping tables from
    :return: list of query dicts
    """
    queries = []

    # FIXME: Remove this reference in future
    LOGGER.info(f'sandbox_dataset_id : {sandbox_dataset_id}')

    mapping_table_ids = get_mapping_table_ids(project_id, mapping_dataset_id)

    for mapping_table_id in mapping_table_ids:
        cdm_table_id = get_cdm_table_from_mapping(mapping_table_id)
        ext_table_id = cdm_table_id + EXT_TABLE_SUFFIX
        ext_table_fields = get_table_fields(cdm_table_id, ext_table_id)
        bq_utils.create_table(ext_table_id,
                              ext_table_fields,
                              drop_existing=True,
                              dataset_id=dataset_id)
        query = dict()
        query[cdr_consts.QUERY] = REPLACE_SRC_QUERY.render(
            project_id=project_id,
            sandbox_dataset_id=sandbox_dataset_id,
            mapping_dataset_id=mapping_dataset_id,
            mapping_table_id=mapping_table_id,
            site_mappings_table_id=SITE_TABLE_ID,
            cdm_table_id=cdm_table_id)
        query[cdr_consts.DESTINATION_TABLE] = ext_table_id
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_EMPTY
        queries.append(query)
    return queries
def create_unit_mapping_table(project_id, dataset_id):
    """
    This function creates the unit_mapping table and populates it with the
    values from resources/unit_mapping.csv

    :param project_id:
    :param dataset_id:
    :return:
    """
    bq_utils.create_table(table_id=UNIT_MAPPING_TABLE,
                          fields=UNIT_MAPPING_FIELDS,
                          drop_existing=True,
                          dataset_id=dataset_id)
    unit_mappings_csv = os.path.join(resources.resource_path,
                                     UNIT_MAPPING_FILE + ".csv")
    unit_mappings_list = resources.csv_to_list(unit_mappings_csv)
    unit_mappings_populate_query = INSERT_UNITS_QUERY.format(
        dataset_id=dataset_id,
        project_id=project_id,
        units_table_id=UNIT_MAPPING_TABLE,
        mapping_list=get_mapping_list(unit_mappings_list))
    result = bq_utils.query(unit_mappings_populate_query)
    LOGGER.info("Created %s.%s", dataset_id, UNIT_MAPPING_TABLE)
    return result
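A short usage sketch with placeholder project and dataset names; waiting on the returned job id mirrors the pattern used elsewhere in these snippets.

# Sketch only: project and dataset names are assumptions.
result = create_unit_mapping_table('my-project', 'my_combined_dataset')
incomplete = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
if incomplete:
    raise bq_utils.BigQueryJobWaitError(incomplete)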
def _create_drug_class_table(bigquery_dataset_id):
    table_name = 'drug_class'
    fields = [{
        "type": "integer",
        "name": "concept_id",
        "mode": "required"
    }, {
        "type": "string",
        "name": "concept_name",
        "mode": "required"
    }, {
        "type": "string",
        "name": "drug_class_name",
        "mode": "required"
    }]
    bq_utils.create_table(table_id=table_name,
                          fields=fields,
                          drop_existing=True,
                          dataset_id=bigquery_dataset_id)

    bq_utils.query(q=main_consts.DRUG_CLASS_QUERY.format(
        dataset_id=bigquery_dataset_id),
                   use_legacy_sql=False,
                   destination_table_id='drug_class',
                   retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                   write_disposition='WRITE_TRUNCATE',
                   destination_dataset_id=bigquery_dataset_id)

    # ensure concept ancestor table exists
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)
        q = """INSERT INTO {dataset}.concept_ancestor
            SELECT * FROM {vocab}.concept_ancestor""".format(
            dataset=bigquery_dataset_id, vocab=common.VOCABULARY_DATASET)
        bq_utils.query(q)
def create_and_populate_source_mapping_table(project_id, dataset_id):
    """
    creates the site mapping table and inserts the site mappings

    :param project_id: project_id containing the dataset
    :param dataset_id: dataset to create the mapping table in
    :return: number of rows inserted in string form
    """
    mapping_list = get_hpo_and_rdr_mappings()
    site_mapping_insert_string = convert_to_bq_string(mapping_list)
    result = bq_utils.create_table(SITE_TABLE_ID,
                                   SITE_MAPPING_FIELDS,
                                   drop_existing=True,
                                   dataset_id=dataset_id)
    site_mappings_insert_query = INSERT_SITE_MAPPINGS_QUERY.format(
        combined_dataset_id=dataset_id,
        project_id=project_id,
        table_id=SITE_TABLE_ID,
        values=site_mapping_insert_string)
    result = bq_utils.query(site_mappings_insert_query)
    rows_affected = result['numDmlAffectedRows']
    return rows_affected
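A minimal sketch of calling the helper above, with placeholder project and dataset names.

# Sketch only: names are assumptions.
rows_affected = create_and_populate_source_mapping_table(
    'my-project', 'combined_dataset')
LOGGER.info('Inserted %s site mappings', rows_affected)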
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by the
        rdr team
    :param ehr_dataset: the dataset containing the pii information for comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    LOGGER.info(f"Calling match_participants with:\n"
                f"project:\t{project}\n"
                f"rdr_dataset:\t{rdr_dataset}\n"
                f"ehr_dataset:\t{ehr_dataset}\n"
                f"dest_dataset_id:\t{dest_dataset_id}\n")

    ehr_tables = bq_utils.list_dataset_contents(ehr_dataset)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info(
        f"Created new validation results dataset:\t{validation_dataset}")

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    # validate first names
    for site in hpo_sites:
        LOGGER.info(f"Beginning identity validation for site: {site}")
        results = {}
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_FIRST,
                                                consts.FIRST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.FIRST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.FIRST_NAME_FIELD)
            LOGGER.info(f"Validated first names for: {site}")

        # validate last names
        try:
            match_values = None
            match_values = _compare_name_fields(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_NAME_LAST,
                                                consts.LAST_NAME_FIELD,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.LAST_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.LAST_NAME_FIELD)
            LOGGER.info(f"Validated last names for: {site}")

        # validate middle names
        try:
            match_values = None
            # match_values = _compare_name_fields(
            #     project,
            #     validation_dataset,
            #     ehr_dataset,
            #     site,
            #     consts.OBS_PII_NAME_MIDDLE,
            #     consts.MIDDLE_NAME_FIELD,
            #     ehr_tables
            # )
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.MIDDLE_NAME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            # results = _add_matches_to_results(results, match_values, consts.MIDDLE_NAME_FIELD)
            LOGGER.info("Not validating middle names")

        # validate zip codes
        try:
            match_values = None
            match_values = _compare_zip_codes(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD,
                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.ZIP_CODE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.ZIP_CODE_FIELD)
            LOGGER.info(f"Validated zip codes for: {site}")

        # validate city
        try:
            match_values = None
            match_values = _compare_cities(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_CITY,
                                           consts.CITY_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.CITY_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.CITY_FIELD)
            LOGGER.info(f"Validated city names for: {site}")

        # validate state
        try:
            match_values = None
            match_values = _compare_states(project, validation_dataset,
                                           rdr_dataset, ehr_dataset, site,
                                           consts.OBS_PII_STREET_ADDRESS_STATE,
                                           consts.STATE_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.STATE_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.STATE_FIELD)
            LOGGER.info(f"Validated states for: {site}")

        # validate street addresses
        try:
            address_one_matches = None
            address_two_matches = None
            match_values = None
            address_one_matches, address_two_matches = _compare_street_addresses(
                project, validation_dataset, rdr_dataset, ehr_dataset, site,
                consts.OBS_PII_STREET_ADDRESS_ONE,
                consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
                consts.ADDRESS_TWO_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for fields: {consts.ADDRESS_ONE_FIELD}, {consts.ADDRESS_TWO_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, address_one_matches,
                                              consts.ADDRESS_ONE_FIELD)
            results = _add_matches_to_results(results, address_two_matches,
                                              consts.ADDRESS_TWO_FIELD)
            LOGGER.info(f"Validated street addresses for: {site}")

        # validate email addresses
        try:
            match_values = None
            match_values = _compare_email_addresses(
                project, validation_dataset, ehr_dataset, site,
                consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.EMAIL_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.EMAIL_FIELD)
            LOGGER.info(f"Validated email addresses for: {site}")

        # validate phone numbers
        try:
            match_values = None
            match_values = _compare_phone_numbers(project, validation_dataset,
                                                  ehr_dataset, site,
                                                  consts.OBS_PII_PHONE,
                                                  consts.PHONE_NUMBER_FIELD,
                                                  ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.PHONE_NUMBER_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.PHONE_NUMBER_FIELD)
            LOGGER.info(f"Validated phone numbers for: {site}")

        # validate genders
        try:
            match_values = None
            match_values = _compare_genders(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_SEX, ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.SEX_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.SEX_FIELD)
            LOGGER.info(f"Validated genders for: {site}")

        # validate birth dates
        try:
            match_values = None
            match_values = _compare_birth_dates(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_BIRTH_DATETIME,
                                                ehr_tables)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError, RuntimeError):
            LOGGER.exception(
                f"Could not read data for field: {consts.BIRTH_DATETIME_FIELD} at site: {site}"
            )
            read_errors += 1
        else:
            results = _add_matches_to_results(results, match_values,
                                              consts.BIRTH_DATE_FIELD)
            LOGGER.info(f"Validated birth dates for: {site}")

        LOGGER.info(f"Writing results to BQ table")

        # write dictionary to a table
        try:
            writers.write_to_result_table(project, validation_dataset, site,
                                          results)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception(
                f"Did not write site information to validation dataset: {site}"
            )
            write_errors += 1

        LOGGER.info(f"Wrote validation results for site: {site}")

    LOGGER.info(f"FINISHED: Validation dataset created: {validation_dataset}")

    if read_errors > 0:
        LOGGER.error(
            f"Encountered {read_errors} read errors creating validation dataset:\t{validation_dataset}"
        )

    if write_errors > 0:
        LOGGER.error(
            f"Encountered {write_errors} write errors creating validation dataset:\t{validation_dataset}"
        )

    return read_errors + write_errors
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset, mock_is_unioned_dataset,
        mock_is_combined_dataset, mock_is_deid_dataset):
    mock_list_datasets.return_value = [{
        'id': self.project_id + ':' + self.bq_dataset_id
    }]
    mock_is_deid_dataset.return_value = False
    mock_is_combined_dataset.return_value = False
    mock_is_unioned_dataset.return_value = False
    mock_is_ehr_dataset.return_value = True

    # create and load person_ids to pid table
    bq_utils.create_table(self.pid_table_id,
                          retract_data_bq.PID_TABLE_FIELDS,
                          drop_existing=True,
                          dataset_id=self.bq_dataset_id)
    bq_formatted_insert_values = ', '.join([
        '(%s, %s)' % (person_id, research_id)
        for (person_id, research_id) in self.person_research_ids
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    bq_utils.query(q)

    job_ids = []
    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pid_table_id=self.pid_table_id)
        retract_data_bq.logger.info('Preparing to load table %s.%s' %
                                    (self.bq_dataset_id, hpo_table))
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        retract_data_bq.logger.info('Loading table %s.%s' %
                                    (self.bq_dataset_id, hpo_table))
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    retract_data_bq.logger.info('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        result = bq_utils.query(row_count_queries[table])
        expected_row_count[table] = int(result['totalRows'])

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_before_retraction = {}
    for row in result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # perform retraction
    retract_data_bq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                                      self.test_project_id, self.pid_table_id,
                                      self.hpo_id, self.dataset_ids)

    # find actual deleted rows
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_after_retraction = {}
    for row in result:
        row_count_after_retraction[row['table_id']] = row['row_count']
    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table], row_count_before_retraction[table] -
            row_count_after_retraction[table])
def create_person_id_src_hpo_map(input_dataset, credentials):
    """
    Create a table containing person_ids and src_hpo_ids

    :param input_dataset: the input dataset to deid
    :param credentials: the credentials needed to create a new table.
    """
    map_tablename = "_mapping_person_src_hpos"
    sql = ("select person_id, src_hpo_id "
           "from {input_dataset}._mapping_{table} "
           "join {input_dataset}.{table} "
           "using ({table}_id) "
           "where src_hpo_id not like 'rdr'")

    # list dataset contents
    dataset_tables = bq_utils.list_dataset_contents(input_dataset)

    mapping_tables = []
    mapped_tables = []
    for table in dataset_tables:
        if table.startswith('_mapping_'):
            mapping_tables.append(table)
            mapped_tables.append(table[9:])

    # make sure mapped tables all exist
    check_tables = []
    for table in mapped_tables:
        if table in dataset_tables:
            check_tables.append(table)

    # make sure check_tables contain person_id fields
    person_id_tables = []
    for table in check_tables:
        info = bq_utils.get_table_info(table, dataset_id=input_dataset)
        schema = info.get('schema', {})
        for field_info in schema.get('fields', []):
            if 'person_id' in field_info.get('name'):
                person_id_tables.append(table)

    # revamp mapping tables to contain only mapping tables for tables
    # with person_id fields
    mapping_tables = ['_mapping_' + table for table in person_id_tables]

    sql_statement = []
    for table in person_id_tables:
        sql_statement.append(
            sql.format(table=table, input_dataset=input_dataset))

    final_query = ' UNION ALL '.join(sql_statement)

    # create the mapping table
    if map_tablename not in dataset_tables:
        fields = [{
            "type": "integer",
            "name": "person_id",
            "mode": "required",
            "description": "the person_id of someone with an ehr record"
        }, {
            "type": "string",
            "name": "src_hpo_id",
            "mode": "required",
            "description": "the src_hpo_id of an ehr record"
        }]
        bq_utils.create_table(map_tablename, fields, dataset_id=input_dataset)

    bq_utils.query(final_query,
                   destination_table_id=map_tablename,
                   destination_dataset_id=input_dataset,
                   write_disposition=bq_consts.WRITE_TRUNCATE)
    LOGGER.info(f"Created mapping table:\t{input_dataset}.{map_tablename}")
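To see what the per-table template produces, the snippet below renders it for one hypothetical mapped table; it simply re-uses the format string from the function above, and the dataset and table names are placeholders.

# Illustrative only: render the per-table template for one assumed mapped
# table to see the SQL that gets joined with UNION ALL.
sql = ("select person_id, src_hpo_id "
       "from {input_dataset}._mapping_{table} "
       "join {input_dataset}.{table} "
       "using ({table}_id) "
       "where src_hpo_id not like 'rdr'")
print(sql.format(input_dataset='combined_dataset',
                 table='condition_occurrence'))
# select person_id, src_hpo_id
# from combined_dataset._mapping_condition_occurrence
# join combined_dataset.condition_occurrence using (condition_occurrence_id)
# where src_hpo_id not like 'rdr'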
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by the
        rdr team
    :param ehr_dataset: the dataset containing the pii information for comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    LOGGER.info(
        'Calling match_participants with:\n'
        'project:\t%s\n'
        'rdr_dataset:\t%s\n'
        'ehr_dataset:\t%s\n'
        'dest_dataset_id:\t%s\n', project, rdr_dataset, ehr_dataset,
        dest_dataset_id)

    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')
    LOGGER.info('Created new validation results dataset:\t%s',
                validation_dataset)

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    read_errors = 0
    write_errors = 0

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_FIRST,
                                                 consts.FIRST_NAME_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.FIRST_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.FIRST_NAME_FIELD)
                write_errors += 1
    LOGGER.info('Validated first names')

    # validate last names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_LAST,
                                                 consts.LAST_NAME_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write last name matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.LAST_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.LAST_NAME_FIELD)
                write_errors += 1
    LOGGER.info('Validated last names')

    # validate middle names
    for site in hpo_sites:
        match_values, exc = _compare_name_fields(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_NAME_MIDDLE,
                                                 consts.MIDDLE_NAME_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write middle name matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.MIDDLE_NAME_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.MIDDLE_NAME_FIELD)
                write_errors += 1
    LOGGER.info('Validated middle names')

    # validate zip codes
    for site in hpo_sites:
        match_values, exc = _compare_zip_codes(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write zip code matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.ZIP_CODE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ZIP_CODE_FIELD)
                write_errors += 1
    LOGGER.info('Validated zip codes')

    # validate city
    for site in hpo_sites:
        match_values, exc = _compare_cities(project, validation_dataset,
                                            rdr_dataset, ehr_dataset, site,
                                            consts.OBS_PII_STREET_ADDRESS_CITY,
                                            consts.CITY_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write city matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.CITY_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.CITY_FIELD)
                write_errors += 1
    LOGGER.info('Validated city names')

    # validate state
    for site in hpo_sites:
        match_values, exc = _compare_states(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_STATE, consts.STATE_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write state matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.STATE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.STATE_FIELD)
                write_errors += 1
    LOGGER.info('Validated states')

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches, exc = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write street address matches for hpo to table
            try:
                writers.append_to_result_table(site, address_one_matches,
                                               project, validation_dataset,
                                               consts.ADDRESS_ONE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ADDRESS_ONE_FIELD)
                write_errors += 1

            try:
                writers.append_to_result_table(site, address_two_matches,
                                               project, validation_dataset,
                                               consts.ADDRESS_TWO_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.ADDRESS_TWO_FIELD)
                write_errors += 1
    LOGGER.info('Validated street addresses')

    # validate email addresses
    for site in hpo_sites:
        match_values, exc = _compare_email_addresses(
            project, validation_dataset, ehr_dataset, site,
            consts.OBS_PII_EMAIL_ADDRESS, consts.EMAIL_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write email matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.EMAIL_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.EMAIL_FIELD)
                write_errors += 1
    LOGGER.info('Validated email addresses')

    # validate phone numbers
    for site in hpo_sites:
        match_values, exc = _compare_phone_numbers(project, validation_dataset,
                                                   ehr_dataset, site,
                                                   consts.OBS_PII_PHONE,
                                                   consts.PHONE_NUMBER_FIELD)
        if exc is not None:
            read_errors += 1
        else:
            # write phone number matches for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.PHONE_NUMBER_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.PHONE_NUMBER_FIELD)
                write_errors += 1
    LOGGER.info('Validated phone numbers')

    # validate genders
    for site in hpo_sites:
        match_values, exc = _compare_genders(project, validation_dataset,
                                             ehr_dataset, site,
                                             consts.OBS_PII_SEX)
        if exc is not None:
            read_errors += 1
        else:
            # write gender match for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.SEX_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.SEX_FIELD)
                write_errors += 1
    LOGGER.info('Validated genders')

    # validate birth dates
    for site in hpo_sites:
        match_values, exc = _compare_birth_dates(project, validation_dataset,
                                                 ehr_dataset, site,
                                                 consts.OBS_PII_BIRTH_DATETIME)
        if exc is not None:
            read_errors += 1
        else:
            # write birthday match for hpo to table
            try:
                writers.append_to_result_table(site, match_values, project,
                                               validation_dataset,
                                               consts.BIRTH_DATE_FIELD)
            except (oauth2client.client.HttpAccessTokenRefreshError,
                    googleapiclient.errors.HttpError):
                LOGGER.exception(
                    "Unable to insert records in table:\t%s\tfor field: %s",
                    site, consts.BIRTH_DATE_FIELD)
                write_errors += 1
    LOGGER.info('Validated birth dates')

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        try:
            writers.merge_fields_into_single_record(project,
                                                    validation_dataset, site)
            LOGGER.info('Merged participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

        try:
            writers.remove_sparse_records(project, validation_dataset, site)
            LOGGER.info('Removed sparse participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

        try:
            writers.change_nulls_to_missing_value(project, validation_dataset,
                                                  site)
            LOGGER.info(
                'Changed nulls to missing values in participant match records')
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            write_errors += 1

    LOGGER.info("Finished creating validation dataset")

    if read_errors > 0:
        LOGGER.error(
            "Encountered %d read errors creating validation dataset:\t%s",
            read_errors, validation_dataset)

    if write_errors > 0:
        LOGGER.error(
            "Encountered %d write errors creating validation dataset:\t%s",
            write_errors, validation_dataset)

    return results, read_errors + write_errors
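A hedged sketch of invoking the version of match_participants directly above, with placeholder project and dataset names; this version returns the results dict along with a combined read/write error count.

# Sketch only: project and dataset names are assumptions.
results, error_count = match_participants('my-project', 'rdr20200101',
                                          'ehr20200101',
                                          'validation_dataset_20200101')
if error_count:
    LOGGER.error('Participant matching finished with %d errors', error_count)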
def _create_hpo_table(self, hpo_id, table, dataset_id):
    table_id = bq_utils.get_table_id(hpo_id, table)
    bq_utils.create_table(table_id,
                          resources.fields_for(table),
                          dataset_id=dataset_id)
    return table_id
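A small sketch of how the fixture helper above might be used in a test, with a placeholder HPO id; the table_exists check follows the pattern of the other tests in these snippets.

# Sketch only: the hpo id, table, and dataset attribute are assumptions.
person_table_id = self._create_hpo_table('fake_hpo', 'person',
                                         self.bq_dataset_id)
self.assertTrue(bq_utils.table_exists(person_table_id, self.bq_dataset_id))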
def match_participants(project, rdr_dataset, ehr_dataset, dest_dataset_id):
    """
    Entry point for performing participant matching of PPI, EHR, and PII data.

    :param project: a string representing the project name
    :param rdr_dataset: the dataset created from the results given to us by the
        rdr team
    :param ehr_dataset: the dataset containing the pii information for comparisons
    :param dest_dataset_id: the desired identifier for the match values
        destination dataset
    :return: results of the field comparison for each hpo
    """
    date_string = _get_date_string(rdr_dataset)

    if not re.match(consts.DRC_DATE_REGEX, dest_dataset_id[-8:]):
        dest_dataset_id += date_string

    # create new dataset for the intermediate tables and results
    dataset_result = bq_utils.create_dataset(
        dataset_id=dest_dataset_id,
        description=consts.DESTINATION_DATASET_DESCRIPTION.format(
            version='', rdr_dataset=rdr_dataset, ehr_dataset=ehr_dataset),
        overwrite_existing=True)

    validation_dataset = dataset_result.get(bq_consts.DATASET_REF, {})
    validation_dataset = validation_dataset.get(bq_consts.DATASET_ID, '')

    # create intermediate observation table in new dataset
    readers.create_match_values_table(project, rdr_dataset, dest_dataset_id)

    hpo_sites = readers.get_hpo_site_names()

    #TODO: create a proper config file to store this path
    field_list = resources.fields_for('identity_match')

    for site_name in hpo_sites:
        bq_utils.create_table(site_name + consts.VALIDATION_TABLE_SUFFIX,
                              field_list,
                              drop_existing=True,
                              dataset_id=validation_dataset)

    results = {}

    # validate first names
    for site in hpo_sites:
        match_values = _compare_name_fields(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_NAME_FIRST,
                                            consts.FIRST_NAME_FIELD)
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.FIRST_NAME_FIELD)

    # validate last names
    for site in hpo_sites:
        match_values = _compare_name_fields(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_NAME_LAST,
                                            consts.LAST_NAME_FIELD)
        # write last name matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.LAST_NAME_FIELD)

    # validate middle names
    for site in hpo_sites:
        match_values = _compare_name_fields(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_NAME_MIDDLE,
                                            consts.MIDDLE_NAME_FIELD)
        # write middle name matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.MIDDLE_NAME_FIELD)

    # validate zip codes
    for site in hpo_sites:
        match_values = _compare_zip_codes(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ZIP, consts.ZIP_CODE_FIELD)
        # write zip code matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.ZIP_CODE_FIELD)

    # validate city
    for site in hpo_sites:
        match_values = _compare_cities(project, validation_dataset,
                                       rdr_dataset, ehr_dataset, site,
                                       consts.OBS_PII_STREET_ADDRESS_CITY,
                                       consts.CITY_FIELD)
        # write city matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.CITY_FIELD)

    # validate state
    for site in hpo_sites:
        match_values = _compare_states(project, validation_dataset,
                                       rdr_dataset, ehr_dataset, site,
                                       consts.OBS_PII_STREET_ADDRESS_STATE,
                                       consts.STATE_FIELD)
        # write state matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.STATE_FIELD)

    # validate street addresses
    for site in hpo_sites:
        address_one_matches, address_two_matches = _compare_street_addresses(
            project, validation_dataset, rdr_dataset, ehr_dataset, site,
            consts.OBS_PII_STREET_ADDRESS_ONE,
            consts.OBS_PII_STREET_ADDRESS_TWO, consts.ADDRESS_ONE_FIELD,
            consts.ADDRESS_TWO_FIELD)
        # write street address matches for hpo to table
        writers.append_to_result_table(site, address_one_matches, project,
                                       validation_dataset,
                                       consts.ADDRESS_ONE_FIELD)
        writers.append_to_result_table(site, address_two_matches, project,
                                       validation_dataset,
                                       consts.ADDRESS_TWO_FIELD)

    # validate email addresses
    for site in hpo_sites:
        match_values = _compare_email_addresses(project, validation_dataset,
                                                ehr_dataset, site,
                                                consts.OBS_PII_EMAIL_ADDRESS,
                                                consts.EMAIL_FIELD)
        # write email matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.EMAIL_FIELD)

    # validate phone numbers
    for site in hpo_sites:
        match_values = _compare_phone_numbers(project, validation_dataset,
                                              ehr_dataset, site,
                                              consts.OBS_PII_PHONE,
                                              consts.PHONE_NUMBER_FIELD)
        # write phone number matches for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.PHONE_NUMBER_FIELD)

    # validate genders
    for site in hpo_sites:
        match_values = _compare_genders(project, validation_dataset,
                                        ehr_dataset, site, consts.OBS_PII_SEX)
        # write gender match for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset, consts.SEX_FIELD)

    # validate birth dates
    for site in hpo_sites:
        match_values = _compare_birth_dates(project, validation_dataset,
                                            ehr_dataset, site,
                                            consts.OBS_PII_BIRTH_DATETIME)
        # write birthday match for hpo to table
        writers.append_to_result_table(site, match_values, project,
                                       validation_dataset,
                                       consts.BIRTH_DATE_FIELD)

    # generate single clean record for each participant at each site
    for site in hpo_sites:
        writers.merge_fields_into_single_record(project, validation_dataset,
                                                site)
        writers.remove_sparse_records(project, validation_dataset, site)
        writers.change_nulls_to_missing_value(project, validation_dataset,
                                              site)

    # generate hpo site reports
    for site in hpo_sites:
        bucket = gcs_utils.get_hpo_bucket(site)
        filename = os.path.join(
            consts.REPORT_DIRECTORY.format(date=date_string),
            consts.REPORT_TITLE)
        writers.create_site_validation_report(project, validation_dataset,
                                              [site], bucket, filename)

    # generate aggregate site report
    bucket = gcs_utils.get_drc_bucket()
    filename = os.path.join(validation_dataset, consts.REPORT_TITLE)
    writers.create_site_validation_report(project, validation_dataset,
                                          hpo_sites, bucket, filename)

    return results