def remove_ehr_data_queries(project_id, ticket_number, pids_project_id,
                            pids_dataset_id, tablename):
    """
    Creates sandboxes and drops all EHR data found for deactivated participants
    after their deactivation date

    :param project_id: BQ name of the project
    :param ticket_number: Jira ticket number to identify and title sandbox tables
    :param pids_project_id: Identifies the project containing the deactivated
        participants PIDs table
    :param pids_dataset_id: Identifies the dataset containing the deactivated
        participants PIDs table
    :param tablename: The name of the table to house the deactivated participant data
    :returns: list of query dictionaries to sandbox and drop the affected records
    """
    ehr_union_dataset = bq_utils.get_unioned_dataset_id()

    # gets the deactivated participant data to ensure it's up-to-date
    df = psr.get_deactivated_participants(pids_project_id, pids_dataset_id,
                                          tablename,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)

    # stores the dataframe in a BQ dataset table
    destination_table = pids_dataset_id + '.' + tablename
    psr.store_participant_data(df, project_id, destination_table)

    # creates sandbox and truncate queries to run for deactivated participant data drops
    queries = rdp.create_queries(
        project_id,
        ticket_number=ticket_number,
        # the deactivated participants table is stored in the same project
        # as the data being retracted
        pids_project_id=project_id,
        pids_dataset_id=pids_dataset_id,
        pids_table=tablename,
        datasets=[ehr_union_dataset])

    return queries
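
# A minimal usage sketch for the entry point above, assuming hypothetical
# project, dataset, and ticket values; the client construction is illustrative.
# rdp.run_queries is the same helper the integration test below uses to execute
# the generated sandbox-then-truncate statements in order.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')  # hypothetical project id
queries = remove_ehr_data_queries(project_id='my-project',
                                  ticket_number='DC-1234',
                                  pids_project_id='my-pids-project',
                                  pids_dataset_id='pids_dataset',
                                  tablename='_deactivated_participants')
rdp.run_queries(queries, client)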
def setup_rule(self, client):
    """
    Responsible for grabbing and storing deactivated participant data.

    :param client: client object passed to store the data
    """
    LOGGER.info("Querying RDR API for deactivated participant data")

    # gets the deactivated participant data to ensure it's up-to-date
    df = psr.get_deactivated_participants(self.api_project_id,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)
    LOGGER.info(f"Found '{len(df)}' deactivated participants via RDR API")

    # stores the dataframe in a BQ dataset table named _deactivated_participants
    psr.store_participant_data(df, self.project_id, self.destination_table)
    LOGGER.info(f"Finished storing participant records in: "
                f"`{self.destination_table}`")

    LOGGER.debug("instantiating class client object")
    self.client = client

    # reinitialize self.affected_tables to the tables actually in the dataset
    LOGGER.debug(
        "reinitializing self.affected_tables to actual tables available")
    tables_list = self.client.list_tables(self.dataset_id)
    self.affected_tables = [
        table_item.table_id for table_item in tables_list
    ]
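
# A minimal sketch of how a cleaning-rule framework might drive setup_rule;
# the rule class name and constructor arguments are hypothetical assumptions,
# and only setup_rule and the BigQuery client come from the code above.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')  # hypothetical project id
rule = RemoveEhrDataPastDeactivation(           # hypothetical class name
    project_id='my-project',
    dataset_id='unioned_ehr',
    sandbox_dataset_id='sandbox',
    api_project_id='rdr-api-project')
rule.setup_rule(client)  # fetches/stores PIDs, then records affected tables
print(rule.affected_tables)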
def remove_ehr_data_queries(client, api_project_id, project_id, dataset_id,
                            sandbox_dataset_id):
    """
    Sandboxes and drops all EHR data found for deactivated participants
    after their deactivation date

    :param client: BQ client
    :param api_project_id: Project containing the RDR Participant Summary API
    :param project_id: Identifies the project containing the target dataset
    :param dataset_id: Identifies the dataset to retract deactivated participants from
    :param sandbox_dataset_id: Identifies the sandbox dataset to store records for dataset_id
    :returns queries: List of query dictionaries
    """
    # gets the deactivated participant data to ensure it's up-to-date
    df = psr.get_deactivated_participants(api_project_id,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)

    # stores the dataframe in a BQ dataset table named _deactivated_participants
    destination_table = f'{sandbox_dataset_id}.{DEACTIVATED_PARTICIPANTS}'
    psr.store_participant_data(df, project_id, destination_table)

    fq_deact_table = f'{project_id}.{destination_table}'
    deact_table_ref = gbq.TableReference.from_string(fq_deact_table)

    LOGGER.info(f"Retracting deactivated participants from '{dataset_id}'")
    LOGGER.info(
        f"Using sandbox dataset '{sandbox_dataset_id}' for '{dataset_id}'")

    # creates sandbox and truncate queries to run for deactivated participant data drops
    queries = rdp.generate_queries(client, project_id, dataset_id,
                                   sandbox_dataset_id, deact_table_ref)
    return queries
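
# A short illustration of the table-reference handling above, using the real
# google.cloud.bigquery API; the fully qualified table name is a hypothetical
# example of what fq_deact_table resolves to.
from google.cloud import bigquery as gbq

ref = gbq.TableReference.from_string(
    'my-project.sandbox._deactivated_participants')
print(ref.project, ref.dataset_id, ref.table_id)
# -> my-project sandbox _deactivated_participants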
def test_get_deactivated_participants(self, mock_get):
    # Pre conditions
    mock_get.return_value.status_code = 200
    mock_get.return_value.json.return_value = self.json_response_entry

    # Tests
    df = psr.get_deactivated_participants(self.project_id, self.columns)

    psr.store_participant_data(df, self.project_id,
                               f'{self.dataset_id}.{self.tablename}')

    # Post conditions
    values = [(111, 'NO_CONTACT', datetime.date(2018, 12, 7)),
              (222, 'NO_CONTACT', datetime.date(2018, 12, 7))]

    self.assertTableValuesMatch(
        '.'.join([self.project_id, self.destination_table]),
        self.bq_columns, values)
def test_get_deactivated_participants(self, mock_get_deactivated_participants,
                                      mock_store_participant_data):
    # pre conditions
    mock_get_deactivated_participants.return_value = self.fake_dataframe

    # tests
    dataframe_response = psr.get_deactivated_participants(
        self.project_id, self.dataset_id, self.tablename, self.columns)

    dataset_response = psr.store_participant_data(dataframe_response,
                                                  self.project_id,
                                                  self.destination_table)
    expected_response = mock_store_participant_data(dataframe_response,
                                                    self.project_id,
                                                    self.destination_table)

    # post conditions
    pandas.testing.assert_frame_equal(
        dataframe_response,
        pandas.DataFrame(self.updated_deactivated_participants,
                         columns=self.columns))

    self.assertEqual(expected_response, dataset_response)
def test_remove_ehr_data_past_deactivation_date(self, mock_retraction_info,
                                                mock_get):
    # pre conditions for participant summary API module
    mock_get.return_value.status_code = 200
    mock_get.return_value.json.return_value = self.json_response_entry

    # Ensure deactivated participants table is created and/or updated
    df = psr.get_deactivated_participants(self.project_id, self.dataset_id,
                                          self.tablename, self.columns)
    psr.store_participant_data(df, self.project_id,
                               f'{self.dataset_id}.{self.tablename}')

    # pre conditions for retraction module
    d = {
        'project_id': [self.project_id] * 6,
        'dataset_id': [self.dataset_id] * 6,
        'table': [
            'condition_occurrence', 'drug_exposure', 'measurement',
            'observation', 'procedure_occurrence', 'visit_occurrence'
        ],
        'date_column': [
            None, None, 'measurement_date', 'observation_date',
            'procedure_date', None
        ],
        'start_date_column': [
            'condition_start_date', 'drug_exposure_start_date', None, None,
            None, 'visit_start_date'
        ],
        'end_date_column': [
            'condition_end_date', 'drug_exposure_end_date', None, None, None,
            'visit_end_date'
        ]
    }
    retraction_info = pandas.DataFrame(data=d)
    mock_retraction_info.return_value = retraction_info

    load_data_queries = []
    dropped_row_count_queries = []
    kept_row_count_queries = []
    sandbox_row_count_queries = []

    # Queries to load the dummy data into the tables
    measurement_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{measurement}}`
        (measurement_id, person_id, measurement_concept_id, measurement_date,
         measurement_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
        project=self.project_id,
        dataset=self.dataset_id,
        measurement=TABLES[0])
    load_data_queries.append(measurement_query)

    observation_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{observation}}`
        (observation_id, person_id, observation_concept_id, observation_date,
         observation_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
        project=self.project_id,
        dataset=self.dataset_id,
        observation=TABLES[1])
    load_data_queries.append(observation_query)

    procedure_occ_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{procedure}}`
        (procedure_occurrence_id, person_id, procedure_concept_id,
         procedure_date, procedure_datetime, procedure_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             procedure=TABLES[2])
    load_data_queries.append(procedure_occ_query)

    condition_occ_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{condition}}`
        (condition_occurrence_id, person_id, condition_concept_id,
         condition_start_date, condition_start_datetime, condition_end_date,
         condition_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'),
             date('2018-12-08'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             condition=TABLES[3])
    load_data_queries.append(condition_occ_query)

    drug_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{drug}}`
        (drug_exposure_id, person_id, drug_concept_id,
         drug_exposure_start_date, drug_exposure_start_datetime,
         drug_exposure_end_date, drug_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'),
             date('2018-12-08'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             drug=TABLES[4])
    load_data_queries.append(drug_query)

    visit_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{visit}}`
        (visit_occurrence_id, person_id, visit_concept_id, visit_start_date,
         visit_start_datetime, visit_end_date, visit_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'),
             date('2017-12-08'), 0),
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'),
             date('2018-12-08'), 0)"""
    ).render(project=self.project_id,
             dataset=self.dataset_id,
             visit=TABLES[5])
    load_data_queries.append(visit_query)

    # Create tables
    fq_table_names = []
    for table_name in TABLES:
        fq_table_names.append(
            f'{self.project_id}.{self.dataset_id}.{table_name}')
    bq.create_tables(self.client,
                     self.project_id,
                     fq_table_names,
                     exists_ok=True)

    # Load queries
    for query in load_data_queries:
        response = self.client.query(query)
        self.assertIsNotNone(response.result())
        self.assertIsNone(response.exception())

    # Build queries for checking the number of rows to drop, keep, and
    # sandbox for each deactivated pid
    for ehr in self.deactivated_participants:
        pid = ehr[0]
        for row in retraction_info.itertuples(index=False):
            if row.date_column is None:
                dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
                kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    start_date_column=row.start_date_column,
                    end_date_column=row.end_date_column)
                sandbox_query = rdp.SANDBOX_QUERY_END_DATE.render(
                    project=self.project_id,
                    dataset=self.dataset_id,
                    table=row.table,
                    pid=pid,
                    end_date_column=row.end_date_column,
                    start_date_column=row.start_date_column,
                    deactivated_pids_project=self.project_id,
                    deactivated_pids_dataset=self.dataset_id,
                    deactivated_pids_table=self.tablename)
            else:
                dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    date_column=row.date_column)
                kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY.format(
                    dataset_id=self.dataset_id,
                    table_id=row.table,
                    pid_table_id=self.tablename,
                    pid=pid,
                    date_column=row.date_column)
                sandbox_query = rdp.SANDBOX_QUERY_DATE.render(
                    project=self.project_id,
                    dataset=self.dataset_id,
                    table=row.table,
                    pid=pid,
                    date_column=row.date_column,
                    deactivated_pids_project=self.project_id,
                    deactivated_pids_dataset=self.dataset_id,
                    deactivated_pids_table=self.tablename)
            dropped_row_count_queries.append({
                clean_consts.QUERY: dropped_query,
                clean_consts.DESTINATION_DATASET: self.dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })
            kept_row_count_queries.append({
                clean_consts.QUERY: kept_query,
                clean_consts.DESTINATION_DATASET: self.dataset_id,
                clean_consts.DESTINATION_TABLE: row.table
            })
            sandbox_row_count_queries.append({
                clean_consts.QUERY: sandbox_query,
                clean_consts.DESTINATION_DATASET: self.sandbox_id,
                clean_consts.DESTINATION_TABLE: self.tablename
            })

    # Use query results to count the number of rows expected to be dropped
    expected_dropped_row_count = {}
    for query_dict in dropped_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_dropped_row_count:
            expected_dropped_row_count[
                query_dict['destination_table_id']] += result.total_rows
        else:
            expected_dropped_row_count[
                query_dict['destination_table_id']] = result.total_rows

    # Capture row counts before retraction; used below to derive the number
    # of rows actually deleted
    q = rdpt.TABLE_ROWS_QUERY.format(dataset_id=self.dataset_id)
    q_result = self.client.query(q)
    row_count_before_retraction = {}
    for row in q_result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # Use query results to count the number of rows expected to be kept;
    # each kept query runs per pid, so subsequent passes subtract that
    # pid's dropped rows from the running total
    expected_kept_row_count = {}
    for query_dict in kept_row_count_queries:
        response = self.client.query(query_dict['query'])
        result = response.result()
        if query_dict['destination_table_id'] in expected_kept_row_count:
            expected_kept_row_count[query_dict['destination_table_id']] -= (
                row_count_before_retraction[
                    query_dict['destination_table_id']] - result.total_rows)
        else:
            expected_kept_row_count[query_dict['destination_table_id']] = (
                row_count_before_retraction[
                    query_dict['destination_table_id']] -
                (row_count_before_retraction[
                    query_dict['destination_table_id']] -
                 result.total_rows))

    # Perform retraction
    query_list = red.remove_ehr_data_queries(self.project_id,
                                             self.ticket_number,
                                             self.project_id, self.dataset_id,
                                             self.tablename)
    rdp.run_queries(query_list, self.client)

    # Find actual deleted rows
    q_result = self.client.query(q)
    results = q_result.result()
    row_count_after_retraction = {}
    for row in results:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_dropped_row_count:
        self.assertEqual(
            expected_dropped_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])

    for table in expected_kept_row_count:
        self.assertEqual(expected_kept_row_count[table],
                         row_count_after_retraction[table])
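
# For intuition on the kept-row bookkeeping in the test above: each kept query
# runs once per (pid, table) pair, so each pass can only subtract that pid's
# dropped rows from the running total. A self-contained toy example with
# hypothetical numbers (not from the test data): a table with 10 rows, where
# pid 1 drops 2 rows and pid 2 drops 3.
row_count_before = {'observation': 10}
kept_query_results = [('observation', 8), ('observation', 7)]  # rows kept per pid

expected_kept = {}
for table, kept_rows in kept_query_results:
    dropped_for_pid = row_count_before[table] - kept_rows
    if table in expected_kept:
        expected_kept[table] -= dropped_for_pid
    else:
        expected_kept[table] = row_count_before[table] - dropped_for_pid

assert expected_kept == {'observation': 5}  # 10 - 2 - 3 rows remain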