def main(output_folder, do_download, gdc_executable, manifest_filepath, source_slides_folder):
    logger = get_logger(filename_handler='code.data_processing.' + os.path.basename(__file__) + '_'
                        + strftime("%Y-%m-%d_%H:%M:%S", gmtime()) + '.log')
    logger.info('Data_processing control parameters:')
    logger.info('    do_download: ' + str(do_download))
    logger.info('    gdc_executable: ' + str(gdc_executable))
    logger.info('    manifest_filepath: ' + str(manifest_filepath))
    logger.info('    source_slides_folder: ' + str(source_slides_folder))
    logger.info('Meta-parameters:')
    logger.info('    desired_magnification %s' % str(desired_magnification))
    logger.info('    tile_width %s' % str(desired_tile_width))
    logger.info('    expected_tile_shape %s' % str(expected_tile_shape))
    logger.info('    background_threshold %s' % str(background_threshold))
    logger.info('    background_pixel_value %s' % str(background_pixel_value))
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Download SVS files using the manifest and the GDC extraction tool
    if do_download:
        if not os.path.exists(os.path.join(output_folder, 'has_been_downloaded')):
            if manifest_filepath is None or gdc_executable is None:
                raise ValueError('Download from TCGA is on: need both --gdc-executable and --manifest arguments filled')
            if not os.path.exists(gdc_executable):
                raise FileNotFoundError(f'Specified GDC executable {gdc_executable} not found')
            if not os.path.exists(manifest_filepath):
                raise FileNotFoundError(f'Specified manifest file {manifest_filepath} not found')
            crude_slides_output_folder = os.path.join(output_folder, 'downloaded_slides')
            if not os.path.exists(crude_slides_output_folder):
                os.makedirs(crude_slides_output_folder)
            logger.info(f'Downloading slides into {crude_slides_output_folder}...')
            slides_filepaths, md5sums, cases_ids = svs_factory.download_svs_files(
                gdc_executable, manifest_filepath, crude_slides_output_folder)
            # Write control file once the download is finished
            with open(os.path.join(output_folder, 'has_been_downloaded'), 'w') as f:
                f.write('\n'.join(','.join(a) for a in zip(slides_filepaths, md5sums, cases_ids)))
            logger.info('    done')
        else:
            logger.info('Slides already downloaded -> skipping')

        # Retrieve all downloaded SVS files from the control file, in case the previous step was skipped
        with open(os.path.join(output_folder, 'has_been_downloaded'), 'r') as f:
            download_content = f.read().splitlines()
        download_content = list(map(lambda line: line.split(','), download_content))
        slides_filepaths, md5sums, cases_ids = list(map(list, zip(*download_content)))
    else:
        if source_slides_folder is None:
            raise ValueError('No download from TCGA: need --source-slides-folder argument filled with folder '
                             'containing slides to process')
        elif not os.path.exists(source_slides_folder):
            raise FileNotFoundError(f'Input folder {source_slides_folder} with slides to be processed not found')
        logger.info(f'Performing no download from TCGA as requested; listing source slides from {source_slides_folder}')
        slides_filepaths = svs_factory.list_slides_in_folder(source_slides_folder)
        # If no slide filepaths were found at the top level, fall back to slides nested one folder deeper
        if len(slides_filepaths) == 0:
            slides_filepaths = svs_factory.list_slides_in_folder(source_slides_folder, with_supfolder=True)
        md5sums = ['no_md5sum'] * len(slides_filepaths)
        cases_ids = ['no_case_id'] * len(slides_filepaths)

    # Tile all slides into super-patches
    has_been_tiled_filename = 'has_been_tiled_mag%d' % desired_magnification
    if not os.path.exists(os.path.join(output_folder, has_been_tiled_filename)):
        logger.info('Tiling slides into super-patches...')
        logger.info('    found %d files to be processed' % len(slides_filepaths))
        output_tiles_folders = svs_factory.tile_slides(slides_filepaths, desired_tile_width,
                                                       desired_overlap, desired_magnification)
        assert None not in output_tiles_folders
        assert len(output_tiles_folders) == len(slides_filepaths)
        # Write control file once tiling is finished, with tiles folders, slide names, md5sums and cases IDs
        with open(os.path.join(output_folder, has_been_tiled_filename), 'w') as f:
            f.write('\n'.join(','.join(a) for a in zip(output_tiles_folders,
                                                       list(map(os.path.basename, slides_filepaths)),
                                                       md5sums, cases_ids)))
    else:
        logger.info('Slides already tiled at magnification %d -> skipping' % desired_magnification)

    # Independently of the previous processing, extract the labels of the SVS files
    filtered_tiles_output_folder = os.path.join(output_folder, 'filtered_tiles')
    has_been_filtered_filename = os.path.join(output_folder, 'has_been_moved_and_filtered')
    if not os.path.exists(has_been_filtered_filename):
        logger.info('Extracting labels...')
        # Retrieve the directories holding the super-patches from the control file, in case the previous step was skipped
        with open(os.path.join(output_folder, has_been_tiled_filename), 'r') as f:
            tiled_content = f.read().splitlines()
        tiled_content = list(map(lambda line: line.split(','), tiled_content))
        output_tiles_folders, svs_filenames, md5sums, cases_ids = list(map(list, zip(*tiled_content)))
        associated_labels = list(map(case_factory.infer_class_from_tcga_name, svs_filenames))
        logger.info('    done')
        logger.info('Moving and background-filtering tiles into %s' % filtered_tiles_output_folder)
        data_folders = svs_factory.move_and_filter_tiles_folders(
            output_tiles_folders, associated_labels, svs_filenames, cases_ids,
            filtered_tiles_output_folder, background_pixel_value, background_threshold,
            expected_tile_shape, logger=logger)
        logger.info('    done')
        with open(has_been_filtered_filename, 'w') as f:
            f.write('\n'.join(data_folders))
        logger.info('Wrote `has_been_moved_and_filtered` file')
    else:
        logger.info('Tiles already moved and filtered -> skipping')

    # Seek class folders
    # data_folders = [f for f in os.listdir(filtered_tiles_output_folder)
    #                 if not os.path.isfile(os.path.join(filtered_tiles_output_folder, f))]
    with open(has_been_filtered_filename, 'r') as f:
        data_folders = f.read().splitlines()
    logger.info('Found %d source slides folders in %s' % (len(data_folders), filtered_tiles_output_folder))
    # logger.info('Performing train/val/test splitting with background removal')
    # train_cases_ids, val_cases_ids, test_cases_ids = case_factory.split_svs_samples_casewise(
    #     output_tiles_folders, cases_ids, train_size, val_size, test_size)
    logger.info('Pre-processing done')
    return list(map(lambda f: os.path.join(output_folder, f), data_folders))
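
# A hypothetical command-line entry point for the pre-processing function above,
# given as a minimal sketch only. The flag names mirror those referenced in the
# error messages (--gdc-executable, --manifest, --source-slides-folder), but the
# repository's actual CLI may differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='TCGA slide download, tiling and filtering pipeline')
    parser.add_argument('--output-folder', required=True,
                        help='Folder receiving control files and processed tiles')
    parser.add_argument('--do-download', action='store_true',
                        help='Download slides from TCGA via the GDC client')
    parser.add_argument('--gdc-executable', default=None,
                        help='Path to the GDC data transfer tool (required with --do-download)')
    parser.add_argument('--manifest', default=None,
                        help='Path to the GDC manifest file (required with --do-download)')
    parser.add_argument('--source-slides-folder', default=None,
                        help='Folder containing slides to process when not downloading')
    args = parser.parse_args()

    main(args.output_folder, args.do_download, args.gdc_executable,
         args.manifest, args.source_slides_folder)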
def main(hyper_parameters):
    prefix_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    logger = get_logger(filename_handler='code.' + __file__ + '_' + prefix_time + '.log',
                        verbose=hyper_parameters['verbose'])
    logger.info('Hyper parameters')
    logger.info(pprint.pformat(hyper_parameters, indent=4))

    # Pre-processing should have been done beforehand; retrieve the data by pointing at its output folder
    slides_folders = end_to_end_data_preprocessing(hyper_parameters['input_data_folder'],
                                                   hyper_parameters['do_download'],
                                                   hyper_parameters['gdc_executable'],
                                                   hyper_parameters['manifest_file'],
                                                   hyper_parameters['source_slides_folder'])

    logger.info('Initializing model...')
    if not os.path.exists(hyper_parameters['models_save_folder']):
        os.makedirs(hyper_parameters['models_save_folder'])
    # Instantiate the instance classifier model, then the MIL wrapper
    instance_classifier, input_width = instantiate_model(
        model_type=hyper_parameters['underlying_model_type'],
        pretrained=hyper_parameters['underlying_model_pretrained'],
        n_classes=1)
    logger.info(f'    {hyper_parameters["underlying_model_type"]} architecture initialized')
    mil_model = MaxMinMIL(instance_classifier,
                          alpha=hyper_parameters['alpha'],
                          beta=hyper_parameters['beta'],
                          cuda=hyper_parameters['cuda'])
    logger.info('    MIL wrapper initialized')
    if hyper_parameters['underlying_model_load_from'] is not None:
        logger.warning('    Initializing model from %s' % hyper_parameters['underlying_model_load_from'])
        mil_model.load_state_dict(torch.load(hyper_parameters['underlying_model_load_from']))
    if hyper_parameters['cuda']:
        n_devices = torch.cuda.device_count()
        if n_devices > 1:
            mil_model = nn.DataParallel(mil_model)
        mil_model.cuda()
    logger.info('done')

    optimizer = optim.Adam(mil_model.parameters(),
                           lr=hyper_parameters['learning_rate'],
                           weight_decay=hyper_parameters['weight_decay'])

    # Load data and split case-wise into train, val and test sets
    logger.info('Pre-loading all data...')
    train_dataset, val_dataset, test_dataset = build_datasets(source_slides_folders=slides_folders,
                                                              model_input_width=input_width,
                                                              hyper_parameters=hyper_parameters,
                                                              logger=logger)
    logger.info('Train size %d' % len(train_dataset))
    logger.info('Val size %d' % len(val_dataset))
    logger.info('Test size %d' % len(test_dataset))
    train_dataloader = to_dataloader(train_dataset, True)
    val_dataloader = to_dataloader(val_dataset, False) if len(val_dataset) else None
    test_dataloader = to_dataloader(test_dataset, False) if len(test_dataset) else None

    # Instantiate a summary writer if tensorboard is activated
    if hyper_parameters['with_tensorboard']:
        summary_writer_filename = 'summary_' + prefix_time
        summary_writer_folder_path = os.path.join('tensorboard', summary_writer_filename)
        summary_writer = SummaryWriter(log_dir=summary_writer_folder_path)
    else:
        summary_writer = None
    val_losses = []

    # Training loop: for each epoch, train then validate
    logger.info('Starting training...')
    start_training_time = time.time()
    for epoch in range(hyper_parameters['n_epochs']):
        # Train
        train_loss, train_savepath = perform_epoch(mil_model, optimizer, epoch, train_dataloader,
                                                   hyper_parameters=hyper_parameters,
                                                   is_training=True, logger=logger,
                                                   set_name='training', prefix_time=prefix_time,
                                                   summary_writer=summary_writer)
        # Validate
        if val_dataloader:
            with torch.no_grad():
                val_loss, _ = perform_epoch(mil_model, optimizer, epoch, val_dataloader,
                                            hyper_parameters=hyper_parameters,
                                            is_training=False, logger=logger,
                                            set_name='validation', prefix_time=prefix_time,
                                            summary_writer=summary_writer)
            # Early stopping
            val_losses.append(val_loss)
            do_stop, best_value = early_stopping(val_losses,
                                                 patience=hyper_parameters['early_stopping_patience'])
            if do_stop:
                logger.warning('Early stopping triggered: stopping training after no improvement on val set for '
                               '%d epochs with value %.3f'
                               % (hyper_parameters['early_stopping_patience'], best_value))
                break
    logger.warning('Total training time %s' % (time.time() - start_training_time))

    # Test
    if test_dataloader:
        logger.info('Starting testing...')
        with torch.no_grad():
            perform_epoch(mil_model, optimizer, -1, test_dataloader,
                          hyper_parameters=hyper_parameters, is_training=False,
                          logger=logger, set_name='test', prefix_time=prefix_time,
                          summary_writer=summary_writer)
    return
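
# The early_stopping helper called in the training loop above is imported from
# elsewhere in the repository and not shown here. Below is a minimal sketch of
# the assumed contract (name suffixed with _sketch to avoid shadowing the real
# helper): given the history of validation losses, stop once the best (lowest)
# loss has not improved for `patience` epochs, and report that best value. The
# actual implementation may differ.
def early_stopping_sketch(val_losses, patience=5):
    best_value = min(val_losses)
    best_epoch = val_losses.index(best_value)
    # Stop if the best epoch lies at least `patience` epochs in the past
    do_stop = (len(val_losses) - 1 - best_epoch) >= patience
    return do_stop, best_value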
def main(source_folder, output_folder, gdc_executable_path):
    logger = get_logger(filename_handler='data_processing.log', verbose=True)
    # Parenthesize the conditional so the fallback applies to the path, not to the whole log message
    logger.info('Source folder %s' % (os.path.abspath(source_folder) if source_folder else 'None'))
    logger.info('Output folder %s' % os.path.abspath(output_folder))
    logger.info('Meta-parameters:')
    logger.info('    desired_magnification %s' % str(desired_magnification))
    logger.info('    tile_width %s' % str(desired_tile_width))
    logger.info('    expected_tile_shape %s' % str(expected_tile_shape))
    logger.info('    background_threshold %s' % str(background_threshold))
    logger.info('    background_pixel_value %s' % str(background_pixel_value))
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Download SVS files using the manifest and the GDC extraction tool
    if not os.path.exists(os.path.join(output_folder, 'has_been_downloaded')):
        logger.info('Downloading slides...')
        svs_filepaths, md5sums, cases_ids = svs_factory.download_svs_files(source_folder, gdc_executable_path)
        # Write control file once the download is finished
        with open(os.path.join(output_folder, 'has_been_downloaded'), 'w') as f:
            f.write('\n'.join(','.join(a) for a in zip(svs_filepaths, md5sums, cases_ids)))
        logger.info('    done')
    else:
        logger.info('Slides already downloaded -> skipping')

    # Tile all slides into super-patches
    if not os.path.exists(os.path.join(output_folder, 'has_been_tiled_mag%d' % desired_magnification)):
        logger.info('Tiling slides into super-patches...')
        # Retrieve all downloaded SVS files from the control file, in case the previous step was skipped
        with open(os.path.join(output_folder, 'has_been_downloaded'), 'r') as f:
            download_content = f.read().splitlines()
        download_content = list(map(lambda line: line.split(','), download_content))
        svs_filepaths, md5sums, cases_ids = list(map(list, zip(*download_content)))
        logger.info('    found %d files to be processed' % len(svs_filepaths))
        output_tiles_folders = svs_factory.tile_slides(svs_filepaths, desired_tile_width,
                                                       desired_overlap, desired_magnification)
        assert None not in output_tiles_folders
        assert len(output_tiles_folders) == len(svs_filepaths)
        # Write control file once tiling is finished, with tiles folders, slide names, md5sums and cases IDs
        with open(os.path.join(output_folder, 'has_been_tiled_mag%d' % desired_magnification), 'w') as f:
            f.write('\n'.join(','.join(a) for a in zip(output_tiles_folders,
                                                       list(map(os.path.basename, svs_filepaths)),
                                                       md5sums, cases_ids)))
    else:
        logger.info('Slides already tiled at magnification %d -> skipping' % desired_magnification)

    # Independently of the previous processing, extract the labels of the SVS files
    if not os.path.exists(os.path.join(output_folder, 'has_been_moved_and_filtered')):
        logger.info('Extracting labels...')
        # Retrieve the directories holding the super-patches from the control file, in case the previous step was skipped
        with open(os.path.join(output_folder, 'has_been_tiled_mag%d' % desired_magnification), 'r') as f:
            tiled_content = f.read().splitlines()
        tiled_content = list(map(lambda line: line.split(','), tiled_content))
        output_tiles_folders, svs_filenames, md5sums, cases_ids = list(map(list, zip(*tiled_content)))
        associated_labels = list(map(case_factory.infer_class_from_tcga_name, svs_filenames))
        logger.info('    done')
        logger.info('Moving and background-filtering tiles into %s' % output_folder)
        data_folders = svs_factory.move_and_filter_tiles_folders(
            output_tiles_folders, associated_labels, svs_filenames, cases_ids,
            output_folder, background_pixel_value, background_threshold,
            expected_tile_shape, logger=logger)
        logger.info('    done')
        with open(os.path.join(output_folder, 'has_been_moved_and_filtered'), 'w') as f:
            f.write('\n'.join(data_folders))
        logger.info('Wrote `has_been_moved_and_filtered` file')
    else:
        logger.info('Tiles already moved and filtered -> skipping')

    # Seek class folders: every non-file entry in the output folder is a class folder
    data_folders = [f for f in os.listdir(output_folder)
                    if not os.path.isfile(os.path.join(output_folder, f))]
    logger.info('Found %d source slides folders' % len(data_folders))
    # logger.info('Performing train/val/test splitting with background removal')
    # train_cases_ids, val_cases_ids, test_cases_ids = case_factory.split_svs_samples_casewise(
    #     output_tiles_folders, cases_ids, train_size, val_size, test_size)
    return list(map(lambda f: os.path.join(output_folder, f), data_folders))
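
# case_factory.infer_class_from_tcga_name is used above to derive labels from
# slide filenames but is defined elsewhere in the repository. For illustration
# only, a sketch of how such a label could be inferred from a TCGA barcode,
# whose fourth field is a sample-type code (01-09 tumor, 10-19 normal). The
# real helper may use a different rule or label encoding.
def infer_class_from_tcga_name_sketch(svs_filename):
    # e.g. 'TCGA-A1-A0SB-01Z-00-DX1...svs' -> sample-type field '01Z'
    sample_type_field = svs_filename.split('-')[3]
    sample_type_code = int(sample_type_field[:2])
    return 'tumor' if sample_type_code < 10 else 'normal'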