def run_retraction_cron():
    project_id = bq_utils.app_identity.get_application_id()
    output_project_id = bq_utils.get_output_project_id()
    hpo_id = bq_utils.get_retraction_hpo_id()
    retraction_type = bq_utils.get_retraction_type()
    pid_table_id = bq_utils.get_retraction_pid_table_id()
    sandbox_dataset_id = bq_utils.get_retraction_sandbox_dataset_id()

    # retract from bq
    dataset_ids = bq_utils.get_retraction_dataset_ids()
    logging.info(f"Dataset id/s to target from env variable: {dataset_ids}")
    logging.info("Running retraction on BQ datasets")
    if output_project_id:
        # retract from output dataset
        retract_data_bq.run_bq_retraction(output_project_id, sandbox_dataset_id,
                                          project_id, pid_table_id, hpo_id,
                                          dataset_ids, retraction_type)
    # retract from default dataset
    retract_data_bq.run_bq_retraction(project_id, sandbox_dataset_id, project_id,
                                      pid_table_id, hpo_id, dataset_ids,
                                      retraction_type)
    logging.info("Completed retraction on BQ datasets")

    # retract from gcs
    folder = bq_utils.get_retraction_submission_folder()
    logging.info(f"Submission folder/s to target from env variable: {folder}")
    logging.info("Running retraction from internal bucket folders")
    retract_data_gcs.run_gcs_retraction(project_id, sandbox_dataset_id,
                                        pid_table_id, hpo_id, folder,
                                        force_flag=True)
    logging.info("Completed retraction from internal bucket folders")
    return 'retraction-complete'
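run_retraction_cron pulls all of its configuration from the environment through the bq_utils getters, so a retraction run is driven entirely by environment variables. The sketch below shows one way the handler might be exercised; the variable names and values are assumptions for illustration, inferred from the getter names, and are not taken from this code.

import os

# Hypothetical environment setup -- the variable names below are assumptions
# inferred from the bq_utils getter names, not confirmed by this code.
os.environ['RETRACTION_HPO_ID'] = 'fake_hpo'                  # assumed name
os.environ['RETRACTION_TYPE'] = 'rid_based'                   # assumed name/value
os.environ['RETRACTION_PID_TABLE_ID'] = 'pid_rid_table'       # assumed name
os.environ['RETRACTION_SANDBOX_DATASET_ID'] = 'sandbox'       # assumed name
os.environ['RETRACTION_DATASET_IDS'] = 'dataset_a dataset_b'  # assumed name/format
os.environ['RETRACTION_SUBMISSION_FOLDER'] = '2021-01-01/'    # assumed name

# The cron endpoint simply wraps this call, so invoking the handler directly
# performs the same BQ and GCS retraction.
assert run_retraction_cron() == 'retraction-complete'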
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset,
        mock_is_unioned_dataset, mock_is_combined_dataset,
        mock_is_deid_dataset):
    mock_list_datasets.return_value = [self.bq_dataset_id]
    mock_is_deid_dataset.return_value = False
    mock_is_combined_dataset.return_value = False
    mock_is_unioned_dataset.return_value = False
    mock_is_ehr_dataset.return_value = True

    # create and load person_ids to pid table
    bq_utils.create_table(self.pid_table_id,
                          retract_data_bq.PID_TABLE_FIELDS,
                          drop_existing=True,
                          dataset_id=self.bq_dataset_id)
    bq_formatted_insert_values = ', '.join([
        '(%s, %s)' % (person_id, research_id)
        for (person_id, research_id) in self.person_research_ids
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    bq_utils.query(q)

    job_ids = []
    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pid_table_id=self.pid_table_id)
        logging.info('Preparing to load table %s.%s' %
                     (self.bq_dataset_id, hpo_table))
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id,
                                       cdm_table,
                                       dataset_id=self.bq_dataset_id)
        logging.info('Loading table %s.%s' %
                     (self.bq_dataset_id, hpo_table))
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    logging.info('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        result = bq_utils.query(row_count_queries[table])
        expected_row_count[table] = int(result['totalRows'])

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_before_retraction = {}
    for row in result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    # perform retraction
    retract_data_bq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                                      self.test_project_id, self.pid_table_id,
                                      self.hpo_id, self.dataset_ids,
                                      self.retraction_type)

    # find actual deleted rows
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_after_retraction = {}
    for row in result:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_count_before_retraction[table] -
            row_count_after_retraction[table])
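The excerpt does not show the patch decorators that supply the five mock arguments to this test. A plausible stack, consistent with unittest.mock passing the bottom-most decorator's mock as the first argument after self, is sketched below; the patched module paths are assumptions inferred from the mock argument names.

from unittest import mock

# Hypothetical patch stack for the test above; module paths are assumed.
@mock.patch('retraction.retract_data_bq.is_deid_dataset')      # assumed path
@mock.patch('retraction.retract_data_bq.is_combined_dataset')  # assumed path
@mock.patch('retraction.retract_data_bq.is_unioned_dataset')   # assumed path
@mock.patch('retraction.retract_data_bq.is_ehr_dataset')       # assumed path
@mock.patch('retraction.retract_data_bq.list_datasets')        # assumed path
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset,
        mock_is_unioned_dataset, mock_is_combined_dataset,
        mock_is_deid_dataset):
    ...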
def test_integration_queries_to_retract_from_fake_dataset(
        self, mock_list_datasets, mock_is_ehr_dataset,
        mock_is_unioned_dataset, mock_is_combined_dataset,
        mock_is_deid_dataset):
    mock_list_datasets.return_value = [self.bq_dataset_id]
    mock_is_deid_dataset.return_value = False
    mock_is_combined_dataset.return_value = False
    mock_is_unioned_dataset.return_value = False
    mock_is_ehr_dataset.return_value = True

    # create and load person_ids to pid table
    bq.create_tables(
        self.client,
        self.test_project_id, [
            f'{self.test_project_id}.{self.bq_dataset_id}.{self.pid_table_id}'
        ],
        exists_ok=False,
        fields=[rbq.PID_TABLE_FIELDS])
    bq_formatted_insert_values = ', '.join([
        f'({person_id}, {research_id})'
        for (person_id, research_id) in self.person_research_ids
    ])
    q = INSERT_PID_TABLE.format(
        dataset_id=self.bq_dataset_id,
        pid_table_id=self.pid_table_id,
        person_research_ids=bq_formatted_insert_values)
    job = self.client.query(q)
    job.result()

    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = f'{self.hpo_id}_{cdm_table}'
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pid_table_id=self.pid_table_id)
        logging.info(
            f'Preparing to load table {self.bq_dataset_id}.{hpo_table}')
        with open(cdm_file, 'rb') as f:
            job_config = bigquery.LoadJobConfig()
            job_config.source_format = bigquery.SourceFormat.CSV
            job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_EMPTY'
            job_config.schema = bq.get_table_schema(cdm_table)
            load_job = self.client.load_table_from_file(
                f,
                f'{self.test_project_id}.{self.bq_dataset_id}.{hpo_table}',
                job_config=job_config)
            load_job.result()
    logging.info('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        job = self.client.query(row_count_queries[table])
        result = job.result()
        expected_row_count[table] = result.to_dataframe()['count'].to_list()[0]

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    job = self.client.query(q)
    result = job.result().to_dataframe()
    row_counts_before_retraction = pd.Series(result.row_count.values,
                                             index=result.table_id).to_dict()

    # perform retraction
    rbq.run_bq_retraction(self.test_project_id, self.bq_dataset_id,
                          self.test_project_id, self.pid_table_id, self.hpo_id,
                          self.dataset_ids, self.retraction_type)

    # find actual deleted rows
    job = self.client.query(q)
    result = job.result().to_dataframe()
    row_counts_after_retraction = pd.Series(result.row_count.values,
                                            index=result.table_id).to_dict()

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_counts_before_retraction[table] -
            row_counts_after_retraction[table])
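Neither version of the test defines the query templates it formats. A plausible shape for each, inferred from the format parameters and result handling above, is sketched below; these are assumptions, not the repository's actual SQL. Note that the older test reads result['totalRows'], so its variant of EXPECTED_ROWS_QUERY likely selected the matching rows directly, whereas the refactored test reads a count column as sketched here.

# Plausible shapes of the templates used above (assumptions, not the actual SQL).
INSERT_PID_TABLE = """
INSERT INTO `{dataset_id}.{pid_table_id}` (person_id, research_id)
VALUES {person_research_ids}
"""

# Count of rows in an HPO table whose person_id appears in the pid table.
EXPECTED_ROWS_QUERY = """
SELECT COUNT(*) AS count
FROM `{dataset_id}.{table_id}`
WHERE person_id IN (SELECT person_id FROM `{dataset_id}.{pid_table_id}`)
"""

# Per-table row counts from BigQuery's table metadata.
TABLE_ROWS_QUERY = """
SELECT table_id, row_count
FROM `{dataset_id}.__TABLES__`
"""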