def gen_train_patches(input_folder: Path, output_folder: Path,
                      num_train_per_class: int, num_workers: int,
                      patch_size: int, purple_threshold: int,
                      purple_scale_size: int, image_ext: str,
                      type_histopath: bool) -> None:
    """
    Produces training patches for every subfolder of the input folder.

    Each subfolder is processed with its own overlap factor, obtained from
    get_subfolder_to_overlap for the requested number of patches per class,
    and its patches are written to a subfolder of the same name under the
    output folder.

    Args:
        input_folder: Folder containing the subfolders containing WSI.
        output_folder: Folder to save the patches to.
        num_train_per_class: The desired number of training patches per class.
        num_workers: Number of workers to use for IO.
        patch_size: Size of the patches extracted from the WSI.
        purple_threshold: Number of purple points for region to be considered purple.
        purple_scale_size: Scalar to use for reducing image to check for purple.
        image_ext: Image extension for saving patches.
        type_histopath: Only look for purple histopathology images and filter whitespace.
    """
    # Discover the subfolders and the overlap factor to use for each of them.
    subfolders = get_subfolder_paths(folder=input_folder)
    print(f"{subfolders} subfolders found from {input_folder}")
    overlap_by_subfolder = get_subfolder_to_overlap(
        subfolders=subfolders, desired_crops_per_class=num_train_per_class)

    # Settings that are the same for every subfolder.
    shared_settings = {
        "by_folder": False,
        "num_workers": num_workers,
        "patch_size": patch_size,
        "purple_threshold": purple_threshold,
        "purple_scale_size": purple_scale_size,
        "image_ext": image_ext,
        "type_histopath": type_histopath,
    }

    # Generate the patches, one subfolder at a time.
    for class_folder in subfolders:
        produce_patches(
            input_folder=class_folder,
            output_folder=output_folder / class_folder.name,
            inverse_overlap_factor=overlap_by_subfolder[class_folder],
            **shared_settings)

    print("\nfinished all folders\n")
def balance_classes(training_folder: Path) -> None:
    """
    Evens out the class distribution so that training isn't skewed.

    The image list of every subfolder is handed to duplicate_until_n with
    the size of the largest class as the target.

    Args:
        training_folder: Folder containing the subfolders to be balanced.
    """
    # Collect the image paths of each class subfolder.
    images_by_class = {}
    for class_folder in get_subfolder_paths(folder=training_folder):
        images_by_class[class_folder] = get_image_paths(folder=class_folder)

    # The largest class sets the target size for all the others.
    biggest_size = max(map(len, images_by_class.values()))

    for class_images in images_by_class.values():
        duplicate_until_n(image_paths=class_images, n=biggest_size)

    print(f"balanced all training classes to have {biggest_size} images\n")
def get_predictions(patches_eval_folder: Path, output_folder: Path,
                    checkpoints_folder: Path, auto_select: bool,
                    eval_model: Path, device: torch.device,
                    classes: List[str], num_classes: int,
                    path_mean: List[float], path_std: List[float],
                    num_layers: int, pretrain: bool, batch_size: int,
                    num_workers: int) -> None:
    """
    Main function for running the model on all of the generated patches.

    For every subfolder of patches_eval_folder (one per WSI) a CSV file named
    after that subfolder is written to output_folder, with one row per patch:
    image_name, ground_truth (the subfolder name), prediction, confidence.

    Args:
        patches_eval_folder: Folder containing patches to evaluate on.
        output_folder: Folder to save the model results to.
        checkpoints_folder: Directory to search for the best checkpoint when
            auto_select is set.
        auto_select: Automatically select the model with the highest
            validation accuracy.
        eval_model: Path to the model to evaluate when auto_select is not set.
        device: Device to use for running model.
        classes: Names of the classes in the dataset.
        num_classes: Number of classes in the dataset.
        path_mean: Means of the WSIs for each dimension.
        path_std: Standard deviations of the WSIs for each dimension.
        num_layers: Number of layers to use in the ResNet model from [18, 34, 50, 101, 152].
        pretrain: Use pretrained ResNet weights.
        batch_size: Mini-batch size to use for evaluation.
        num_workers: Number of workers to use for IO.
    """
    # Initialize the model.
    model_path = get_best_model(
        checkpoints_folder=checkpoints_folder) if auto_select else eval_model

    model = create_model(num_classes=num_classes,
                         num_layers=num_layers,
                         pretrain=pretrain)
    ckpt = torch.load(f=model_path)
    model.load_state_dict(state_dict=ckpt["model_state_dict"])
    model = model.to(device=device)
    model.train(mode=False)
    print(f"model loaded from {model_path}")

    # For outputting the predictions.
    class_num_to_class = {i: classes[i] for i in range(num_classes)}

    start = time.time()

    # Load the data for each folder.
    image_folders = get_subfolder_paths(folder=patches_eval_folder)

    # Where we want to write out the predictions.
    # Confirm the output directory exists.
    output_folder.mkdir(parents=True, exist_ok=True)

    # The preprocessing and the softmax layer are the same for every WSI,
    # so build them once instead of once per folder/batch.
    eval_transform = transforms.Compose(transforms=[
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=path_mean, std=path_std)
    ])
    softmax = nn.Softmax(dim=1)

    # For each WSI.
    for image_folder in image_folders:

        # Temporary fix. Need not to make folders with no crops.
        # Older torchvision releases raise RuntimeError for a folder without
        # images, newer ones raise FileNotFoundError; skip in both cases.
        try:
            dataset = datasets.ImageFolder(root=str(image_folder),
                                           transform=eval_transform)
        except (RuntimeError, FileNotFoundError):
            print(
                "WARNING: One of the image directories is empty. Skipping this directory."
            )
            continue

        dataloader = torch.utils.data.DataLoader(dataset=dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=num_workers)

        # Exact number of patches (the last batch may be smaller than
        # batch_size, so len(dataloader) * batch_size would overcount).
        num_test_image_windows = len(dataset)

        # Load the image names so we know the coordinates of the patches we are predicting.
        # The patches live in a subfolder that carries the same name as the WSI folder.
        # NOTE(review): assumes get_image_paths returns the files in the same
        # order in which ImageFolder enumerates them - confirm.
        image_folder = image_folder.joinpath(image_folder.name)
        window_names = get_image_paths(folder=image_folder)

        print(f"testing on {num_test_image_windows} crops from {image_folder}")

        csv_path = output_folder.joinpath(f"{image_folder.name}.csv")
        # Gradients are never needed for inference; disabling them avoids
        # building the autograd graph and lowers memory use.
        with csv_path.open(mode="w") as writer, torch.no_grad():
            writer.write("image_name,ground_truth,prediction,confidence\n")

            # Loop through all of the patches.
            for batch_num, (test_inputs, _) in enumerate(dataloader):
                offset = batch_num * batch_size
                batch_window_names = window_names[offset:offset + batch_size]

                confidences, test_preds = torch.max(
                    softmax(model(test_inputs.to(device=device))), dim=1)

                for i, (pred, confidence) in enumerate(
                        zip(test_preds.tolist(), confidences.tolist())):
                    # Find the patch name and the predicted class.
                    image_name = batch_window_names[i].name
                    writer.write(f"{image_name},{image_folder.name},"
                                 f"{class_num_to_class[pred]},"
                                 f"{confidence:.5f}\n")

    print(f"time for {patches_eval_folder}: {time.time() - start:.2f} seconds")
def split(all_wsi, train_folder, val_folder, test_folder, val_split,
          test_split, keep_orig_copy, labels_train, labels_val, labels_test):
    """
    Splits the WSI of every class into training, validation and test sets.

    For each class subfolder of all_wsi, the last test_split images go to the
    test set, the val_split images before them go to the validation set and
    the remaining images go to the training set. The images are copied or
    moved into per-class subfolders of the three output folders and one CSV
    file of labels is written per set.

    Args:
        all_wsi: Location of the WSI organized in subfolders by class.
        train_folder: Folder to store the training WSI in.
        val_folder: Folder to store the validation WSI in.
        test_folder: Folder to store the testing WSI in.
        val_split: Number of WSI per class to use in the validation set.
        test_split: Number of WSI per class to use in the test set.
        keep_orig_copy: Copy the WSI if true, move them otherwise.
        labels_train: Location to store the CSV file labels for training.
        labels_val: Location to store the CSV file labels for validation.
        labels_test: Location to store the CSV file labels for testing.

    Raises:
        AssertionError: If a class has too few images for the requested split.
        OSError: If an image cannot be copied or moved.
    """
    # Imported locally so this function does not depend on a module-level import.
    import shutil

    # Based on whether we want to move or keep the files. Using shutil instead
    # of a shell command keeps paths with spaces or shell metacharacters
    # working and surfaces failures instead of silently ignoring them.
    transfer = shutil.copy if keep_orig_copy else shutil.move

    # Create folders.
    for folder in (train_folder, val_folder, test_folder):
        for _class in config.classes:
            confirm_output_folder(join(folder, _class))

    train_img_to_label = {}
    val_img_to_label = {}
    test_img_to_label = {}

    def move_set(folder, image_files):
        """
        Copies or moves the images into their class subfolder of folder.

        Return:
            A dictionary where key is (str) image file name and value is
            (str) image class.
        """
        img_to_label = {}
        for image_file in image_files:
            img_name = basename(image_file)
            # The class is the name of the folder that holds the image.
            img_class = basename(dirname(image_file))
            # Build the destination from class and file name so that it does
            # not depend on how many leading components all_wsi has.
            transfer(image_file, join(folder, img_class, img_name))
            img_to_label[img_name] = img_class
        return img_to_label

    # Sort the images and move/copy them appropriately.
    for subfolder in get_subfolder_paths(all_wsi):
        image_paths = get_image_paths(subfolder)

        # Make sure we have enough slides in each class.
        assert len(image_paths) > val_split + test_split, \
            "Not enough slides in each class."

        # Assign training, test, and validation images.
        test_idx = len(image_paths) - test_split
        val_idx = test_idx - val_split
        train_images = image_paths[:val_idx]
        val_images = image_paths[val_idx:test_idx]
        test_images = image_paths[test_idx:]
        print(f'class {basename(subfolder)}:',
              f'#train={len(train_images)}',
              f'#val={len(val_images)} ',
              f'#test={len(test_images)}')

        # Each set is merged into its own label dictionary.
        train_img_to_label.update(
            move_set(folder=train_folder, image_files=train_images))
        val_img_to_label.update(
            move_set(folder=val_folder, image_files=val_images))
        test_img_to_label.update(
            move_set(folder=test_folder, image_files=test_images))

    def write_to_csv(dest_filename, image_label_dict):
        """Writes the image names and their labels to a CSV file."""
        with open(dest_filename, 'w') as writer:
            writer.write('img,gt\n')
            for img in sorted(image_label_dict):
                writer.write(img + ',' + image_label_dict[img] + '\n')

    write_to_csv(dest_filename=labels_train,
                 image_label_dict=train_img_to_label)
    write_to_csv(dest_filename=labels_val,
                 image_label_dict=val_img_to_label)
    write_to_csv(dest_filename=labels_test,
                 image_label_dict=test_img_to_label)
def split(keep_orig_copy: bool, wsi_train: Path, wsi_val: Path, wsi_test: Path,
          classes: List[str], all_wsi: Path, val_wsi_per_class: int,
          test_wsi_per_class: int, labels_train: Path, labels_test: Path,
          labels_val: Path) -> None:
    """
    Main function for splitting data. Note that we want the validation and test sets to be balanced.

    For each class subfolder of all_wsi, the last test_wsi_per_class images
    go to the test set, the val_wsi_per_class images before them go to the
    validation set and the remaining images go to the training set.

    Args:
        keep_orig_copy: Whether to move or copy the WSI when splitting into training, validation, and test sets.
        wsi_train: Location to be created to store WSI for training.
        wsi_val: Location to be created to store WSI for validation.
        wsi_test: Location to be created to store WSI for testing.
        classes: Names of the classes in the dataset.
        all_wsi: Location of the WSI organized in subfolders by class.
        val_wsi_per_class: Number of WSI per class to use in the validation set.
        test_wsi_per_class: Number of WSI per class to use in the test set.
        labels_train: Location to store the CSV file labels for training.
        labels_test: Location to store the CSV file labels for testing.
        labels_val: Location to store the CSV file labels for validation.

    Raises:
        AssertionError: If a class has too few images for the requested split.
    """
    # Based on whether we want to move or keep the files.
    transfer = shutil.copyfile if keep_orig_copy else shutil.move

    # Create folders.
    for split_folder in (wsi_train, wsi_val, wsi_test):
        for _class in classes:
            # Confirm the output directory exists.
            split_folder.joinpath(_class).mkdir(parents=True, exist_ok=True)

    train_img_to_label: Dict[Path, str] = {}
    val_img_to_label: Dict[Path, str] = {}
    test_img_to_label: Dict[Path, str] = {}

    def move_set(folder: Path, image_files: List[Path]) -> Dict[Path, str]:
        """
        Moves or copies the images into their class subfolder of folder.

        Args:
            folder: Folder to move images to.
            image_files: Image files to move.

        Returns:
            A dictionary mapping image filenames to classes.
        """
        img_to_label = {}
        for image_file in image_files:
            # The class is the name of the folder that holds the image.
            label = image_file.parent.name
            # Build the destination from class and file name. Stripping only
            # the first path component would break for an absolute or nested
            # all_wsi, pointing into directories that were never created.
            transfer(src=image_file,
                     dst=folder.joinpath(label, image_file.name))
            img_to_label[Path(image_file.name)] = label
        return img_to_label

    # Sort the images and move/copy them appropriately.
    for subfolder in get_subfolder_paths(folder=all_wsi):
        image_paths = get_image_paths(folder=subfolder)

        # Make sure we have enough slides in each class.
        assert len(
            image_paths
        ) > val_wsi_per_class + test_wsi_per_class, "Not enough slides in each class."

        # Assign training, test, and validation images.
        test_idx = len(image_paths) - test_wsi_per_class
        val_idx = test_idx - val_wsi_per_class
        train_images = image_paths[:val_idx]
        val_images = image_paths[val_idx:test_idx]
        test_images = image_paths[test_idx:]
        print(f"class {Path(subfolder).name} "
              f"#train={len(train_images)} "
              f"#val={len(val_images)} "
              f"#test={len(test_images)}")

        # Move the training images.
        train_img_to_label.update(
            move_set(folder=wsi_train, image_files=train_images))

        # Move the validation images.
        val_img_to_label.update(
            move_set(folder=wsi_val, image_files=val_images))

        # Move the testing images.
        test_img_to_label.update(
            move_set(folder=wsi_test, image_files=test_images))

    def write_to_csv(dest_filename: Path,
                     image_label_dict: Dict[Path, str]) -> None:
        """
        Write the image names and corresponding labels to a CSV file.

        Args:
            dest_filename: Destination filename for the CSV file.
            image_label_dict: Dictionary mapping filenames to labels.
        """
        with dest_filename.open(mode="w") as writer:
            writer.write("img,gt\n")
            for img in sorted(image_label_dict):
                writer.write(f"{img},{image_label_dict[img]}\n")

    write_to_csv(dest_filename=labels_train,
                 image_label_dict=train_img_to_label)
    write_to_csv(dest_filename=labels_val, image_label_dict=val_img_to_label)
    write_to_csv(dest_filename=labels_test,
                 image_label_dict=test_img_to_label)