Example #1
def remove_ehr_data_queries(project_id, ticket_number, pids_project_id,
                            pids_dataset_id, tablename):
    """
    Creates sandbox and truncate queries to drop all EHR data found for
    deactivated participants after their deactivation date

    :param project_id: BQ name of the project
    :param ticket_number: Jira ticket number used to identify and title sandbox tables
    :param pids_project_id: BQ project containing the deactivated participants PID table
    :param pids_dataset_id: BQ dataset containing the deactivated participants PID table
    :param tablename: The name of the table housing the deactivated participant data
    :returns: list of query dictionaries to sandbox and drop the data
    """

    ehr_union_dataset = bq_utils.get_unioned_dataset_id()

    # fetch the latest deactivated participant data to ensure it is up to date
    df = psr.get_deactivated_participants(pids_project_id, pids_dataset_id,
                                          tablename,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)
    # store the dataframe in a BQ table
    destination_table = f'{pids_dataset_id}.{tablename}'
    psr.store_participant_data(df, project_id, destination_table)
    # create sandbox and truncate queries for the deactivated participant data drops
    queries = rdp.create_queries(
        project_id,
        ticket_number=ticket_number,
        # the deactivated participants table is stored in the same project
        # as the data being retracted
        pids_project_id=project_id,
        pids_dataset_id=pids_dataset_id,
        pids_table=tablename,
        datasets=[ehr_union_dataset])

    return queries
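For context, Example #6 below drives this function and then executes the returned queries with rdp.run_queries. A minimal usage sketch based on that call; all identifier values are placeholders, `red` is the module alias used in Example #6, and `client` is assumed to be a google.cloud.bigquery client:

# Usage sketch (placeholder identifiers; `red` aliases the module defining
# remove_ehr_data_queries, as in Example #6 below).
queries = red.remove_ehr_data_queries('my-project', 'DC-1234', 'my-project',
                                      'my_dataset',
                                      '_deactivated_participants')
rdp.run_queries(queries, client)  # client: a google.cloud.bigquery.Client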
Example #2
    def setup_rule(self, client):
        """
        Responsible for grabbing and storing deactivated participant data.

        :param client: client object passed to store the data
        """
        LOGGER.info("Querying RDR API for deactivated participant data")
        # fetch the latest deactivated participant data to ensure it is up to date
        df = psr.get_deactivated_participants(self.api_project_id,
                                              DEACTIVATED_PARTICIPANTS_COLUMNS)

        LOGGER.info(f"Found '{len(df)}' deactivated participants via RDR API")

        # store the dataframe in a BQ table named _deactivated_participants
        psr.store_participant_data(df, self.project_id, self.destination_table)

        LOGGER.info(f"Finished storing participant records in: "
                    f"`{self.destination_table}`")

        LOGGER.debug("instantiating class client object")
        self.client = client

        # reinitializing self.affected_tables
        LOGGER.debug(
            "reinitializing self.affected_tables to actual tables available")
        tables_list = self.client.list_tables(self.dataset_id)
        self.affected_tables = [
            table_item.table_id for table_item in tables_list
        ]
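This setup_rule method belongs to a cleaning-rule class; the attributes it reads (api_project_id, project_id, destination_table, dataset_id) would be set by the class constructor, which the listing does not show. A minimal driving sketch under that assumption; the rule class name and constructor arguments are hypothetical:

# Hypothetical driver; the rule class name and constructor arguments are
# assumptions inferred from the attributes the method uses.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')
rule = DeactivatedParticipantsRule('my-project', 'my_dataset',
                                   'my_sandbox')  # hypothetical class
rule.setup_rule(client)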
Example #3
def remove_ehr_data_queries(client, api_project_id, project_id, dataset_id,
                            sandbox_dataset_id):
    """
    Creates queries to sandbox and drop all EHR data found for deactivated
    participants after their deactivation date

    :param client: BQ client
    :param api_project_id: Project containing the RDR Participant Summary API
    :param project_id: Identifies the project containing the target dataset
    :param dataset_id: Identifies the dataset to retract deactivated participants from
    :param sandbox_dataset_id: Identifies the sandbox dataset to store records for dataset_id
    :returns queries: List of query dictionaries
    """
    # fetch the latest deactivated participant data to ensure it is up to date
    df = psr.get_deactivated_participants(api_project_id,
                                          DEACTIVATED_PARTICIPANTS_COLUMNS)

    # store the dataframe in a BQ table named _deactivated_participants
    destination_table = f'{sandbox_dataset_id}.{DEACTIVATED_PARTICIPANTS}'
    psr.store_participant_data(df, project_id, destination_table)

    fq_deact_table = f'{project_id}.{destination_table}'
    deact_table_ref = gbq.TableReference.from_string(fq_deact_table)
    LOGGER.info(f"Retracting deactivated participants from '{dataset_id}'")
    LOGGER.info(
        f"Using sandbox dataset '{sandbox_dataset_id}' for '{dataset_id}'")
    # create sandbox and truncate queries for the deactivated participant data drops
    queries = rdp.generate_queries(client, project_id, dataset_id,
                                   sandbox_dataset_id, deact_table_ref)
    return queries
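A minimal invocation sketch for this variant, assuming a standard google.cloud.bigquery client; all project and dataset names are placeholders:

# Usage sketch with placeholder identifiers.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')
queries = remove_ehr_data_queries(client,
                                  api_project_id='rdr-api-project',
                                  project_id='my-project',
                                  dataset_id='my_ehr_dataset',
                                  sandbox_dataset_id='my_sandbox')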
Example #4
    def test_get_deactivated_participants(self, mock_get):
        # Pre conditions
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = self.json_response_entry

        # Tests
        df = psr.get_deactivated_participants(self.project_id, self.columns)

        psr.store_participant_data(df, self.project_id,
                                   f'{self.dataset_id}.{self.tablename}')

        # Post conditions
        values = [(111, 'NO_CONTACT', datetime.date(2018, 12, 7)),
                  (222, 'NO_CONTACT', datetime.date(2018, 12, 7))]
        self.assertTableValuesMatch(
            '.'.join([self.project_id, self.destination_table]),
            self.bq_columns, values)
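The mock_get parameter implies this test method is decorated with a patch over the HTTP layer used by psr.get_deactivated_participants. A sketch of the presumed decoration; the patch target path is an assumption and must match wherever the participant summary module imports requests:

# Presumed test decoration; the patch target path is an assumption.
from unittest import mock

@mock.patch('utils.participant_summary_requests.requests.get')
def test_get_deactivated_participants(self, mock_get):
    mock_get.return_value.status_code = 200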
Example #5
    def test_get_deactivated_participants(self,
                                          mock_get_deactivated_participants,
                                          mock_store_participant_data):

        # pre conditions
        mock_get_deactivated_participants.return_value = self.fake_dataframe

        # tests
        dataframe_response = psr.get_deactivated_participants(
            self.project_id, self.dataset_id, self.tablename, self.columns)

        dataset_response = psr.store_participant_data(dataframe_response,
                                                      self.project_id,
                                                      self.destination_table)
        expected_response = mock_store_participant_data(
            dataframe_response, self.project_id, self.destination_table)

        # post conditions
        pandas.testing.assert_frame_equal(
            dataframe_response,
            pandas.DataFrame(self.updated_deactivated_participants,
                             columns=self.columns))

        self.assertEqual(expected_response, dataset_response)
Example #6
    def test_remove_ehr_data_past_deactivation_date(self, mock_retraction_info,
                                                    mock_get):
        # pre conditions for participant summary API module
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = self.json_response_entry

        # Ensure the deactivated participants table is created and/or updated
        df = psr.get_deactivated_participants(self.project_id, self.dataset_id,
                                              self.tablename, self.columns)
        psr.store_participant_data(df, self.project_id,
                                   f'{self.dataset_id}.{self.tablename}')
        # pre conditions for retraction module
        d = {
            'project_id': [
                self.project_id, self.project_id, self.project_id,
                self.project_id, self.project_id, self.project_id
            ],
            'dataset_id': [
                self.dataset_id, self.dataset_id, self.dataset_id,
                self.dataset_id, self.dataset_id, self.dataset_id
            ],
            'table': [
                'condition_occurrence', 'drug_exposure', 'measurement',
                'observation', 'procedure_occurrence', 'visit_occurrence'
            ],
            'date_column': [
                None, None, 'measurement_date', 'observation_date',
                'procedure_date', None
            ],
            'start_date_column': [
                'condition_start_date', 'drug_exposure_start_date', None, None,
                None, 'visit_start_date'
            ],
            'end_date_column': [
                'condition_end_date', 'drug_exposure_end_date', None, None,
                None, 'visit_end_date'
            ]
        }
        retraction_info = pandas.DataFrame(data=d)
        mock_retraction_info.return_value = retraction_info

        load_data_queries = []
        dropped_row_count_queries = []
        kept_row_count_queries = []
        sandbox_row_count_queries = []

        # Queries to load the dummy data into the tables
        measurement_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{measurement}}`
        (measurement_id, person_id, measurement_concept_id, measurement_date,
        measurement_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
            project=self.project_id,
            dataset=self.dataset_id,
            measurement=TABLES[0])
        load_data_queries.append(measurement_query)

        observation_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{observation}}`
        (observation_id, person_id, observation_concept_id, observation_date,
        observation_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), 0),
            (5678, 2, 0, date('2017-12-07'), 0),
            (2345, 3, 0, date('2018-12-07'), 0)""").render(
            project=self.project_id,
            dataset=self.dataset_id,
            observation=TABLES[1])
        load_data_queries.append(observation_query)

        procedure_occ_query = jinja_env.from_string(
            """
        INSERT INTO `{{project}}.{{dataset}}.{{procedure}}`
        (procedure_occurrence_id, person_id, procedure_concept_id, procedure_date,
        procedure_datetime, procedure_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), 0)"""
        ).render(project=self.project_id,
                 dataset=self.dataset_id,
                 procedure=TABLES[2])
        load_data_queries.append(procedure_occ_query)

        condition_occ_query = jinja_env.from_string(
            """
        INSERT INTO `{{project}}.{{dataset}}.{{condition}}`
        (condition_occurrence_id, person_id, condition_concept_id, condition_start_date,
        condition_start_datetime, condition_end_date, condition_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
        ).render(project=self.project_id,
                 dataset=self.dataset_id,
                 condition=TABLES[3])
        load_data_queries.append(condition_occ_query)

        drug_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{drug}}`
        (drug_exposure_id, person_id, drug_concept_id, drug_exposure_start_date, 
        drug_exposure_start_datetime, drug_exposure_end_date, drug_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
                                           ).render(project=self.project_id,
                                                    dataset=self.dataset_id,
                                                    drug=TABLES[4])
        load_data_queries.append(drug_query)

        visit_query = jinja_env.from_string("""
        INSERT INTO `{{project}}.{{dataset}}.{{visit}}`
        (visit_occurrence_id, person_id, visit_concept_id, visit_start_date, visit_start_datetime,
        visit_end_date, visit_type_concept_id)
        VALUES
            (1234, 1, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0),
            (5678, 2, 0, date('2017-12-07'), timestamp('2017-12-07T08:21:14'), date('2017-12-08'), 0), 
            (2345, 3, 0, date('2018-12-07'), timestamp('2018-12-07T08:21:14'), date('2018-12-08'), 0)"""
                                            ).render(project=self.project_id,
                                                     dataset=self.dataset_id,
                                                     visit=TABLES[5])
        load_data_queries.append(visit_query)

        # Create tables
        fq_table_names = []
        for table_name in TABLES:
            fq_table_names.append(
                f'{self.project_id}.{self.dataset_id}.{table_name}')
        bq.create_tables(self.client,
                         self.project_id,
                         fq_table_names,
                         exists_ok=True)

        # Run the data load queries
        for query in load_data_queries:
            response = self.client.query(query)
            self.assertIsNotNone(response.result())
            self.assertIsNone(response.exception())

        # Build queries counting the rows to drop, keep, and sandbox
        # for each deactivated participant
        for ehr in self.deactivated_participants:
            pid = ehr[0]
            for row in retraction_info.itertuples(index=False):
                if row.date_column is None:
                    dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY_END_DATE.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        start_date_column=row.start_date_column,
                        end_date_column=row.end_date_column)
                    sandbox_query = rdp.SANDBOX_QUERY_END_DATE.render(
                        project=self.project_id,
                        dataset=self.dataset_id,
                        table=row.table,
                        pid=pid,
                        end_date_column=row.end_date_column,
                        start_date_column=row.start_date_column,
                        deactivated_pids_project=self.project_id,
                        deactivated_pids_dataset=self.dataset_id,
                        deactivated_pids_table=self.tablename)
                else:
                    dropped_query = rdpt.EXPECTED_DROPPED_ROWS_QUERY.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        date_column=row.date_column)
                    kept_query = rdpt.EXPECTED_KEPT_ROWS_QUERY.format(
                        dataset_id=self.dataset_id,
                        table_id=row.table,
                        pid_table_id=self.tablename,
                        pid=pid,
                        date_column=row.date_column)
                    sandbox_query = rdp.SANDBOX_QUERY_DATE.render(
                        project=self.project_id,
                        dataset=self.dataset_id,
                        table=row.table,
                        pid=pid,
                        date_column=row.date_column,
                        deactivated_pids_project=self.project_id,
                        deactivated_pids_dataset=self.dataset_id,
                        deactivated_pids_table=self.tablename)
                dropped_row_count_queries.append({
                    clean_consts.QUERY:
                    dropped_query,
                    clean_consts.DESTINATION_DATASET:
                    self.dataset_id,
                    clean_consts.DESTINATION_TABLE:
                    row.table
                })
                kept_row_count_queries.append({
                    clean_consts.QUERY:
                    kept_query,
                    clean_consts.DESTINATION_DATASET:
                    self.dataset_id,
                    clean_consts.DESTINATION_TABLE:
                    row.table
                })
                sandbox_row_count_queries.append({
                    clean_consts.QUERY:
                    sandbox_query,
                    clean_consts.DESTINATION_DATASET:
                    self.sandbox_id,
                    clean_consts.DESTINATION_TABLE:
                    self.tablename
                })

        # Use query results to count the expected number of dropped rows per table
        expected_dropped_row_count = {}
        for query_dict in dropped_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            table = query_dict['destination_table_id']
            if table in expected_dropped_row_count:
                expected_dropped_row_count[table] += result.total_rows
            else:
                expected_dropped_row_count[table] = result.total_rows

        # Record per-table row counts before retraction
        q = rdpt.TABLE_ROWS_QUERY.format(dataset_id=self.dataset_id)
        q_result = self.client.query(q)
        row_count_before_retraction = {}
        for row in q_result:
            row_count_before_retraction[row['table_id']] = row['row_count']

        # Use query results to count the expected number of kept rows per table
        expected_kept_row_count = {}
        for query_dict in kept_row_count_queries:
            response = self.client.query(query_dict['query'])
            result = response.result()
            table = query_dict['destination_table_id']
            # rows dropped for this pid = rows in the table before retraction
            # minus the rows this query expects to keep for this pid
            dropped_for_pid = (row_count_before_retraction[table] -
                               result.total_rows)
            if table in expected_kept_row_count:
                expected_kept_row_count[table] -= dropped_for_pid
            else:
                expected_kept_row_count[table] = (
                    row_count_before_retraction[table] - dropped_for_pid)

        # Perform retraction
        query_list = red.remove_ehr_data_queries(self.project_id,
                                                 self.ticket_number,
                                                 self.project_id,
                                                 self.dataset_id,
                                                 self.tablename)
        rdp.run_queries(query_list, self.client)

        # Record per-table row counts after retraction
        q_result = self.client.query(q)
        results = q_result.result()
        row_count_after_retraction = {}
        for row in results:
            row_count_after_retraction[row['table_id']] = row['row_count']

        for table in expected_dropped_row_count:
            self.assertEqual(
                expected_dropped_row_count[table],
                row_count_before_retraction[table] -
                row_count_after_retraction[table])

        for table in expected_kept_row_count:
            self.assertEqual(expected_kept_row_count[table],
                             row_count_after_retraction[table])
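Example #6 indexes a module-level TABLES constant that the listing does not show; its order can be inferred from the template renders above (measurement=TABLES[0] through visit=TABLES[5]). A reconstruction under that assumption:

# Inferred from the render calls in Example #6; order is measurement,
# observation, procedure, condition, drug, visit.
TABLES = [
    'measurement', 'observation', 'procedure_occurrence',
    'condition_occurrence', 'drug_exposure', 'visit_occurrence'
]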