def main(args: argparse.Namespace) -> None:
    print(f'>>> Starting data augmentation (original + {args.n_aug} new images)')

    root_dir: str = args.root_dir
    dest_dir: str = args.dest_dir

    folders: List[Path] = list(Path(root_dir).glob("*"))
    dest_folders: List[Path] = [Path(dest_dir, p.name) for p in folders]
    print(f"Will augment data from {len(folders)} folders ({map_(str, folders)})")

    # Create all the destination folders
    for d_folder in dest_folders:
        d_folder.mkdir(parents=True, exist_ok=True)

    names: List[str] = map_(lambda p: str(p.name), folders[0].glob("*.png"))

    partial_process = partial(process_name, folders=folders, dest_folders=dest_folders,
                              n_aug=args.n_aug, args=args)
    mmap_(partial_process, names)
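# The entry points in these scripts lean on a few small functional helpers
# (map_, mmap_, uc_, flatten_) imported from a shared utils module. Their exact
# implementations are not shown here; below is a minimal sketch of what they
# could look like, assuming mmap_ fans the work out over a process pool. The
# real code in the repository may differ.
from functools import partial
from multiprocessing import Pool
from typing import Any, Callable, Iterable, List, Tuple


def map_(fn: Callable, iterable: Iterable) -> List:
    # Eager map that returns a list instead of a lazy iterator
    return list(map(fn, iterable))


def mmap_(fn: Callable, iterable: Iterable) -> List:
    # Same as map_, but distributed over a multiprocessing pool
    with Pool() as pool:
        return pool.map(fn, list(iterable))


def _apply(fn: Callable, args: Tuple) -> Any:
    return fn(*args)


def uc_(fn: Callable) -> Callable:
    # "Uncurry": wrap fn so a single tuple argument is unpacked into positional args
    return partial(_apply, fn)


def flatten_(nested: Iterable[Iterable]) -> List:
    # Flatten one level of nesting
    return [item for sub in nested for item in sub]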
def main():
    assert len(argv) == 3

    folder = Path(argv[1])
    changes = eval(argv[2])

    remap_ = partial(remap, changes)

    targets: Iterable[str] = map(str, folder.glob("*.png"))

    mmap_(remap_, targets)
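# remap() itself is defined elsewhere. A plausible sketch, assuming `changes` is a
# dict of {old_label: new_label} passed on the command line (e.g. "{3: 1, 4: 2}")
# and that each target is a grayscale label PNG rewritten in place; names and
# behaviour here are assumptions, not the repository's exact code.
import numpy as np
from PIL import Image


def remap(changes: dict, filename: str) -> None:
    # Load the label map, swap the requested values, and overwrite the file
    orig = np.asarray(Image.open(filename))
    arr = orig.copy()
    for old, new in changes.items():
        arr[orig == old] = new
    Image.fromarray(arr).save(filename)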
def main(args: Namespace) -> None:
    inputs: List[Path] = list(Path(args.base_folder, args.GT_subfolder).glob(args.regex))
    names: List[str] = [p.name for p in inputs]
    print(f"Found {len(names)} images to weaken")
    if args.verbose:
        pprint(names[:10])

    strategy: Callable = eval(args.strategy)
    strat: Callable = partial(weaken_img, strategy=strategy)

    # sizes: np.ndarray = np.zeros(len(inputs), dtype=np.uint32)
    # for i, (pn) in tqdm(enumerate(zip(inputs, names)), ncols=100, total=len(names)):
    #     sizes[i] = strat(pn)
    orig_sizes, new_sizes = map_(np.asarray, zip(*mmap_(strat, zip(inputs, names))))
    assert len(orig_sizes) == len(new_sizes) == len(names)

    try:
        print("Orig sizes: (min, mean, max)",
              orig_sizes[orig_sizes > 0].min(), orig_sizes.mean(), orig_sizes.max())
        print(f"Annotated {new_sizes.sum()} pixels for {len(new_sizes)} images")
    except ValueError:
        pass
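# The weakening strategy is eval'd from the command line, so it can be any callable
# applied to a binary ground-truth array. As a purely hypothetical example (not one
# of the repository's own strategies, and with an assumed signature), one could
# erode the annotation so only a thinner core remains labelled:
import numpy as np
from scipy.ndimage import binary_erosion


def erosion_strat(gt: np.ndarray, iterations: int = 5) -> np.ndarray:
    # Shrink the positive region; the weakened label keeps only the eroded core
    return binary_erosion(gt, iterations=iterations).astype(gt.dtype)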
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: List[Path] = [p for p in src_path.rglob('*.nii.gz') if "_4d" not in str(p)]
    assert len(nii_paths) % 2 == 0, "Uneven number of .nii, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: List[Path] = sorted(p for p in nii_paths if "_gt" not in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "_gt" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    validation_paths: List[Tuple[Path, Path]] = random.sample(paths, args.retain)
    training_paths: List[Tuple[Path, Path]] = [p for p in paths if p not in validation_paths]
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths, n_augment in zip(["train", "val"],
                                       [training_paths, validation_paths],
                                       [args.n_augment, 0]):
        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        all_sizes = mmap_(uc_(pfun), zip(img_paths, gt_paths))
        # for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #     uc_(pfun)(paths)
        all_slices_sizes_px, all_slices_sizes_mm2, all_volume_size_px, all_volume_size_mm3 = zip(*all_sizes)

        flat_sizes_px = flatten_(all_slices_sizes_px)
        flat_sizes_mm2 = flatten_(all_slices_sizes_mm2)
        print("px", len(flat_sizes_px), min(flat_sizes_px), max(flat_sizes_px))
        print('\t', "px 5/95", np.percentile(flat_sizes_px, 5), np.percentile(flat_sizes_px, 95))
        print('\t', "mm2", f"{min(flat_sizes_mm2):.02f}", f"{max(flat_sizes_mm2):.02f}")

        _, axes = plt.subplots(nrows=2, ncols=2)
        axes = axes.flatten()

        axes[0].set_title("Slice surface (pixel)")
        axes[0].boxplot(all_slices_sizes_px, whis=[0, 100])
        axes[1].set_title("Slice surface (mm2)")
        axes[1].boxplot(all_slices_sizes_mm2, whis=[0, 100])
        axes[2].set_title("LV volume (pixel)")
        axes[2].hist(all_volume_size_px, bins=len(all_volume_size_px) // 2)
        axes[3].set_title("LV volume (mm3)")
        axes[3].hist(all_volume_size_mm3, bins=len(all_volume_size_px) // 2)
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: List[Path] = [p for p in src_path.rglob('*.nii')]
    assert len(nii_paths) % 2 == 0, "Uneven number of .nii, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: List[Path] = sorted(p for p in nii_paths if "_Labels" not in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "_Labels" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    validation_paths: List[Tuple[Path, Path]] = random.sample(paths, args.retain)
    training_paths: List[Tuple[Path, Path]] = [p for p in paths if p not in validation_paths]
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths in zip(["train", "val"], [training_paths, validation_paths]):
        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(process_patient, dest_dir=dest_dir, shape=args.shape, cr=args.crop)
        sizess = mmap_(uc_(pfun), zip(img_paths, gt_paths))
        # for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #     uc_(pfun)(paths)

        all_sizes = np.array(flatten_(sizess))
        all_pos = all_sizes[all_sizes > 0]
        print(f"sizes: min={np.min(all_pos)}, 5th={np.percentile(all_pos, 5):0.02f}, median={np.median(all_pos):0.0f}, "
              + f"mean={np.mean(all_pos):0.02f}, 95th={np.percentile(all_pos, 95):0.02f}, max={np.max(all_pos)}")
def main(args) -> None:
    W, H = args.wh  # Tuple[int, int]
    r: int = args.r

    for folder, n_img in zip(["train", "val"], args.n):
        gt_folder: Path = Path(args.dest, folder, 'gt')
        img_folder: Path = Path(args.dest, folder, 'img')
        gt_folder.mkdir(parents=True, exist_ok=True)
        img_folder.mkdir(parents=True, exist_ok=True)

        gen_fn = partial(gen_img, W=W, H=H, r=r, gt_folder=gt_folder, img_folder=img_folder)

        mmap_(gen_fn, range(n_img))
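# gen_img() is defined elsewhere in the script. A hedged sketch of what a toy
# generator could look like, assuming each call draws one random disk of radius r
# on a W x H canvas and writes a noisy image plus its binary ground truth (file
# naming and noise model are assumptions):
import numpy as np
from pathlib import Path
from random import randint
from PIL import Image


def gen_img(i: int, W: int, H: int, r: int, gt_folder: Path, img_folder: Path) -> None:
    # Random disk centre, kept fully inside the canvas
    cx, cy = randint(r, W - r - 1), randint(r, H - r - 1)
    xx, yy = np.meshgrid(np.arange(W), np.arange(H))
    gt = ((xx - cx) ** 2 + (yy - cy) ** 2 <= r ** 2).astype(np.uint8)

    # Image = ground truth + Gaussian noise, clipped back to [0, 255]
    noisy = gt * 255 + np.random.normal(0, 25, size=gt.shape)
    img = np.clip(noisy, 0, 255).astype(np.uint8)

    Image.fromarray(img).save(Path(img_folder, f"{i:04d}.png"))
    Image.fromarray(gt * 255).save(Path(gt_folder, f"{i:04d}.png"))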
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: List[Path] = [p for p in src_path.rglob('*.mhd')]
    assert len(nii_paths) % 2 == 0, "Uneven number of .mhd, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: List[Path] = sorted(p for p in nii_paths if "_segmentation" not in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "_segmentation" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    validation_paths: List[Tuple[Path, Path]] = random.sample(paths, args.retain)
    training_paths: List[Tuple[Path, Path]] = [p for p in paths if p not in validation_paths]
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths, n_augment in zip(["train", "val"],
                                       [training_paths, validation_paths],
                                       [args.n_augment, 0]):
        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        sizes = mmap_(uc_(pfun), zip(img_paths, gt_paths))
        # sizes = []
        # for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #     sizes.append(uc_(pfun)(paths))
        sizes_3d, sizes_2d_min, sizes_2d_max = map_(np.asarray, zip(*sizes))

        print("2d sizes: ", sizes_2d_min.min(), sizes_2d_max.max())
        print("3d sizes: ", sizes_3d.min(), sizes_3d.mean(), sizes_3d.max())
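# save_slices(), shared in spirit by several of these scripts, is expected to load
# an image/GT volume pair, normalise the image, resize every axial slice to
# args.shape and write paired PNGs, returning per-case size statistics (the return
# signature varies per dataset). The exact code is not shown here; below is only a
# rough sketch of one plausible per-slice step, with a hypothetical helper name:
import numpy as np
from skimage.transform import resize


def resize_slice(img_slice: np.ndarray, gt_slice: np.ndarray, shape) -> tuple:
    # Image: linear interpolation; GT: nearest neighbour so labels stay integral
    r_img = resize(img_slice, shape, order=1, preserve_range=True, anti_aliasing=False)
    r_gt = resize(gt_slice, shape, order=0, preserve_range=True, anti_aliasing=False)
    return r_img.astype(np.float32), r_gt.astype(np.uint8)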
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    all_paths: List[Path] = list(src_path.rglob('*.nii'))
    nii_paths: List[Path] = [p for p in all_paths if "_4D" not in str(p)]
    assert len(nii_paths) % 6 == 0, "Number of .nii not multiple of 6, some pairs are broken"

    # We sort now, but also id matching is checked while iterating later on
    CT_nii_paths: List[Path] = sorted(p for p in nii_paths if "CT." in str(p))
    CBF_nii_paths: List[Path] = sorted(p for p in nii_paths if "CT_CBF" in str(p))
    CBV_nii_paths: List[Path] = sorted(p for p in nii_paths if "CT_CBV" in str(p))
    MTT_nii_paths: List[Path] = sorted(p for p in nii_paths if "CT_MTT" in str(p))
    Tmax_nii_paths: List[Path] = sorted(p for p in nii_paths if "CT_Tmax" in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "OT" in str(p))
    assert len(CT_nii_paths) == len(CBF_nii_paths) == len(CBV_nii_paths) == len(MTT_nii_paths) \
        == len(Tmax_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, ...]] = list(zip(CT_nii_paths, CBF_nii_paths, CBV_nii_paths,
                                             MTT_nii_paths, Tmax_nii_paths, gt_nii_paths))

    print(f"Found {len(CT_nii_paths)} pairs in total")
    pprint(paths[:2])

    validation_paths: List[Tuple[Path, ...]] = random.sample(paths, args.retain)
    training_paths: List[Tuple[Path, ...]] = [p for p in paths if p not in validation_paths]
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths, n_augment in zip(["train", "val"],
                                       [training_paths, validation_paths],
                                       [args.n_augment, 0]):
        # ct_paths, cbf_paths, cbv_paths, mtt_paths, tmax_paths, gt_paths = zip(*_paths)
        six_paths = list(zip(*_paths))

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(six_paths[0])} pairs to {dest_dir}")
        assert len(set(map_(len, six_paths))) == 1

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        space_dicts = mmap_(uc_(pfun), zip(*six_paths))
        # for case_paths in tqdm(list(zip(*six_paths)), ncols=50):
        #     uc_(pfun)(case_paths)

        final_dict = {k: v for space_dict in space_dicts for k, v in space_dict.items()}

        with open(Path(dest_dir, "spacing.pkl"), 'wb') as f:
            pickle.dump(final_dict, f, pickle.HIGHEST_PROTOCOL)
            print(f"Saved spacing dictionary to {f}")
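# The spacing.pkl written above maps each case id to its voxel spacing, so pixel
# counts can later be converted back into physical sizes. Reading it back is a
# one-liner; a minimal sketch, assuming the layout produced above (helper name is
# an assumption):
import pickle
from pathlib import Path


def load_spacing(dest_dir: Path) -> dict:
    with open(Path(dest_dir, "spacing.pkl"), 'rb') as f:
        return pickle.load(f)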
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    training_ids: List[str]
    validation_ids: List[str]
    training_ids, validation_ids = get_splits(args.id_list, args.retains, args.fold)

    split_ids: List[str]
    for mode, split_ids in zip(["train", "val"], [training_ids, validation_ids]):
        dest_mode: Path = Path(dest_path, mode)
        print(f"Slicing {len(split_ids)} pairs to {dest_mode}")

        pfun: Callable = partial(slice_patient,
                                 dest_path=dest_mode,
                                 source_path=src_path,
                                 shape=tuple(args.shape),
                                 n_augment=args.n_augment if mode == "train" else 0)
        mmap_(pfun, split_ids)
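# get_splits() is imported from elsewhere and is not shown here. A plausible sketch,
# assuming args.id_list is a text file with one patient id per line, args.retains is
# the validation size, and args.fold selects which contiguous chunk of ids is held
# out; all of these details are assumptions:
from pathlib import Path
from typing import List, Tuple


def get_splits(id_list: str, retains: int, fold: int) -> Tuple[List[str], List[str]]:
    ids: List[str] = Path(id_list).read_text().split()

    validation_ids: List[str] = ids[fold * retains:(fold + 1) * retains]
    training_ids: List[str] = [i for i in ids if i not in validation_ids]
    assert len(validation_ids) == retains

    return training_ids, validation_ids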
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    all_paths: List[Path] = list(src_path.rglob('*.nii.gz'))
    nii_paths: List[Path] = [p for p in all_paths if "_4D" not in str(p)]
    assert len(nii_paths) % 3 == 0, "Number of .nii not multiple of 3, some triplets are broken"

    # We sort now, but also id matching is checked while iterating later on
    flair_nii_paths: List[Path] = sorted(p for p in nii_paths if "FLAIR" in str(p))
    t1_nii_paths: List[Path] = sorted(p for p in nii_paths if "T1" in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "wmh.nii" in str(p))
    assert len(flair_nii_paths) == len(t1_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, ...]] = list(zip(flair_nii_paths, t1_nii_paths, gt_nii_paths))

    print(f"Found {len(flair_nii_paths)} pairs in total")
    pprint(paths[:2])

    validation_paths: List[Tuple[Path, ...]] = random.sample(paths, args.retain)
    training_paths: List[Tuple[Path, ...]] = [p for p in paths if p not in validation_paths]
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths, n_augment in zip(["train", "val"],
                                       [training_paths, validation_paths],
                                       [args.n_augment, 0]):
        three_paths = list(zip(*_paths))

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(three_paths[0])} pairs to {dest_dir}")
        assert len(set(map_(len, three_paths))) == 1

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment,
                       discard_negatives=args.discard_negatives)
        sizes = mmap_(uc_(pfun), zip(*three_paths))
        all_neg, all_pos, space_dicts = zip(*sizes)
        neg, pos = sum(all_neg), sum(all_pos)
        ratio = pos / neg
        print(f"Ratio between pos/neg: {ratio} ({pos}/{neg})")

        final_dict = {k: v for space_dict in space_dicts for k, v in space_dict.items()}

        with open(Path(dest_dir, "spacing.pkl"), 'wb') as f:
            pickle.dump(final_dict, f, pickle.HIGHEST_PROTOCOL)
            print(f"Saved spacing dictionary to {f}")
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: list[Path] = [p for p in src_path.rglob('*.mhd')]
    assert len(nii_paths) % 2 == 0, "Uneven number of .mhd, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: list[Path] = sorted(p for p in nii_paths if "_segmentation" not in str(p))
    gt_nii_paths: list[Path] = sorted(p for p in nii_paths if "_segmentation" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths) == 50  # Hardcode that value for sanity test
    paths: list[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    pids: list[str] = sorted(set(map_(get_p_id, img_nii_paths)))
    # Sanity test: one scan per patient, so ids and images must match one-to-one
    assert len(pids) == len(img_nii_paths), (len(pids), len(img_nii_paths))

    random.shuffle(pids)  # Shuffle before to avoid any problem if the patients are sorted in any way

    fold_size: int = args.retains + args.retains_test
    offset: int = args.fold * fold_size
    # offset by (fold_size) at the beginning
    validation_slice = slice(offset, offset + args.retains)
    # offset by (fold_size + val_retains) at the beginning
    test_slice = slice(offset + args.retains, offset + args.retains + args.retains_test)

    validation_pids: list[str] = pids[validation_slice]
    test_pids: list[str] = pids[test_slice]
    training_pids: list[str] = [pid for pid in pids
                                if (pid not in validation_pids) and (pid not in test_pids)]

    assert len(validation_pids) == args.retains
    assert (len(validation_pids) + len(training_pids) + len(test_pids)) == len(pids)
    assert set(validation_pids).union(training_pids).union(test_pids) == set(pids)
    assert set(validation_pids).isdisjoint(training_pids)
    assert set(validation_pids).isdisjoint(test_pids)
    assert set(test_pids).isdisjoint(training_pids)
    # assert len(test_pids) == args.retains_test

    validation_paths: list[Tuple[Path, Path]] = [p for p in paths if get_p_id(p[0]) in validation_pids]
    test_paths: list[Tuple[Path, Path]] = [p for p in paths if get_p_id(p[0]) in test_pids]
    training_paths: list[Tuple[Path, Path]] = [p for p in paths if get_p_id(p[0]) in training_pids]

    # redundant sanity, but you never know
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert set(validation_paths).isdisjoint(set(test_paths))
    assert set(test_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths) + len(test_paths))
    assert len(validation_paths) == args.retains
    assert len(test_paths) == args.retains_test
    assert len(training_paths) == (len(paths) - fold_size)

    for mode, _paths, n_augment in zip(["train", "val", "test"],
                                       [training_paths, validation_paths, test_paths],
                                       [args.n_augment, 0, 0]):
        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        sizes = mmap_(uc_(pfun), zip(img_paths, gt_paths))
        # sizes = []
        # for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #     sizes.append(uc_(pfun)(paths))
        sizes_3d, sizes_2d_min, sizes_2d_max = map_(np.asarray, zip(*sizes))

        print("2d sizes: ", sizes_2d_min.min(), sizes_2d_max.max())
        print("3d sizes: ", sizes_3d.min(), sizes_3d.mean(), sizes_3d.max())
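# get_p_id() is defined elsewhere in this script. For PROMISE12-style names
# (Case00.mhd / Case00_segmentation.mhd) the patient id could simply be the file
# stem with the segmentation suffix stripped; a hedged sketch, not necessarily the
# repository's exact code:
from pathlib import Path


def get_p_id(path: Path) -> str:
    return path.stem.replace("_segmentation", "")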
def main(args: argparse.Namespace):
    train_path: Path = Path(os.path.join(args.source_dir, 'train'))
    val_path: Path = Path(os.path.join(args.source_dir, 'val'))
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert train_path.exists() and val_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones in the training directory
    all_paths_train: List[Path] = list(train_path.rglob('*.nii.gz'))
    nii_paths_train: List[Path] = [p for p in all_paths_train if "_4D" not in str(p)]
    assert len(nii_paths_train) % 2 == 0, "Number of .nii not multiple of 2, some pairs GT/CT are broken"

    # Get all the file names, avoid the temporal ones in the validation directory
    all_paths_val: List[Path] = list(val_path.rglob('*.nii.gz'))
    nii_paths_val: List[Path] = [p for p in all_paths_val if "_4D" not in str(p)]
    assert len(nii_paths_val) % 2 == 0, "Number of .nii not multiple of 2, some pairs GT/CT are broken"

    # For training
    IMG_nii_paths_train: List[Path] = sorted(p for p in nii_paths_train if "imagesTr" in str(p))
    gt_nii_paths_train: List[Path] = sorted(p for p in nii_paths_train if "labelsTr" in str(p))
    assert len(IMG_nii_paths_train) == len(gt_nii_paths_train)
    paths_train: List[Tuple[Path, ...]] = list(zip(IMG_nii_paths_train, gt_nii_paths_train))

    # For validation
    IMG_nii_paths_val: List[Path] = sorted(p for p in nii_paths_val if "imagesTr" in str(p))
    gt_nii_paths_val: List[Path] = sorted(p for p in nii_paths_val if "labelsTr" in str(p))
    assert len(IMG_nii_paths_val) == len(gt_nii_paths_val)
    paths_val: List[Tuple[Path, ...]] = list(zip(IMG_nii_paths_val, gt_nii_paths_val))

    print(f"Found {len(IMG_nii_paths_train)} training pairs in total")
    pprint(paths_train[:2])
    print(f"Found {len(IMG_nii_paths_val)} validation pairs in total")
    pprint(paths_val[:2])

    validation_paths: List[Tuple[Path, ...]] = [p for p in paths_val]
    training_paths: List[Tuple[Path, ...]] = [p for p in paths_train]
    assert set(validation_paths).isdisjoint(set(training_paths))
    # len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths, n_augment in zip(["train", "val"],
                                       [training_paths, validation_paths],
                                       [args.n_augment, 0]):
        three_paths = list(zip(*_paths))

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(three_paths[0])} pairs to {dest_dir}")
        assert len(set(map_(len, three_paths))) == 1

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment,
                       discard_negatives=args.discard_negatives)
        sizes = mmap_(uc_(pfun), zip(*three_paths))
        all_neg, all_pos, space_dicts = zip(*sizes)
        neg, pos = sum(all_neg), sum(all_pos)
        ratio = pos / neg
        print(f"Ratio between pos/neg: {ratio} ({pos}/{neg})")

        final_dict = {k: v for space_dict in space_dicts for k, v in space_dict.items()}

        with open(Path(dest_dir, "spacing.pkl"), 'wb') as f:
            pickle.dump(final_dict, f, pickle.HIGHEST_PROTOCOL)
            print(f"Saved spacing dictionary to {f}")
def main(args: argparse.Namespace):
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: list[Path] = [p for p in src_path.rglob('*.nii.gz') if "_4d" not in str(p)]
    assert len(nii_paths) % 2 == 0, "Uneven number of .nii, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: list[Path] = sorted(p for p in nii_paths if "_gt" not in str(p))
    gt_nii_paths: list[Path] = sorted(p for p in nii_paths if "_gt" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths) == 200  # Hardcode that value for sanity test
    paths: list[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    pids: list[str] = sorted(set(map_(get_p_id, img_nii_paths)))
    # Sanity test: there are two scans per patient: we don't want to mix them up
    assert len(pids) == (len(img_nii_paths) // 2), (len(pids), len(img_nii_paths))

    random.shuffle(pids)  # Shuffle before to avoid any problem if the patients are sorted in any way

    fold_size: int = args.retains + args.retains_test
    offset: int = args.fold * fold_size
    # offset by (fold_size) at the beginning
    validation_slice = slice(offset, offset + args.retains)
    # offset by (fold_size + val_retains) at the beginning
    test_slice = slice(offset + args.retains, offset + args.retains + args.retains_test)

    validation_pids: list[str] = pids[validation_slice]
    test_pids: list[str] = pids[test_slice]
    training_pids: list[str] = [pid for pid in pids
                                if (pid not in validation_pids) and (pid not in test_pids)]

    assert len(validation_pids) == args.retains
    assert (len(validation_pids) + len(training_pids) + len(test_pids)) == len(pids)
    assert set(validation_pids).union(training_pids).union(test_pids) == set(pids)
    assert set(validation_pids).isdisjoint(training_pids)
    assert set(validation_pids).isdisjoint(test_pids)
    assert set(test_pids).isdisjoint(training_pids)
    # assert len(test_pids) == args.retains_test

    validation_paths: list[Tuple[Path, Path]] = [p for p in paths if get_p_id(p[0]) in validation_pids]
    test_paths: list[Tuple[Path, Path]] = [p for p in paths if get_p_id(p[0]) in test_pids]
    training_paths: list[Tuple[Path, Path]] = [p for p in paths if get_p_id(p[0]) in training_pids]

    # redundant sanity, but you never know
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert set(validation_paths).isdisjoint(set(test_paths))
    assert set(test_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths) + len(test_paths))
    assert len(validation_paths) == 2 * args.retains
    assert len(test_paths) == 2 * args.retains_test
    assert len(training_paths) == (len(paths) - 2 * fold_size)

    for mode, _paths, n_augment in zip(["train", "val", "test"],
                                       [training_paths, validation_paths, test_paths],
                                       [args.n_augment, 0, 0]):
        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        all_sizes = mmap_(uc_(pfun), zip(img_paths, gt_paths))
        # for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #     uc_(pfun)(paths)
        all_slices_sizes_px, all_slices_sizes_mm2, all_volume_size_px, all_volume_size_mm3 = zip(*all_sizes)

        flat_sizes_px = flatten_(all_slices_sizes_px)
        flat_sizes_mm2 = flatten_(all_slices_sizes_mm2)
        print("px", len(flat_sizes_px), min(flat_sizes_px), max(flat_sizes_px))
        print('\t', "px 5/95", np.percentile(flat_sizes_px, 5), np.percentile(flat_sizes_px, 95))
        print('\t', "mm2", f"{min(flat_sizes_mm2):.02f}", f"{max(flat_sizes_mm2):.02f}")

        _, axes = plt.subplots(nrows=2, ncols=2)
        axes = axes.flatten()

        axes[0].set_title("Slice surface (pixel)")
        axes[0].boxplot(all_slices_sizes_px, whis=[0, 100])
        axes[1].set_title("Slice surface (mm2)")
        axes[1].boxplot(all_slices_sizes_mm2, whis=[0, 100])
        axes[2].set_title("LV volume (pixel)")
        axes[2].hist(all_volume_size_px, bins=len(all_volume_size_px) // 2)
        axes[3].set_title("LV volume (mm3)")
        axes[3].hist(all_volume_size_mm3, bins=len(all_volume_size_px) // 2)
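# Here get_p_id() must collapse the two cardiac phases of each ACDC patient
# (e.g. patient001_frame01.nii.gz and patient001_frame12.nii.gz) onto a single id,
# which is why len(pids) == len(img_nii_paths) // 2 above. A hedged sketch,
# assuming the id is the first underscore-separated token of the filename; the
# repository's own implementation may differ:
from pathlib import Path


def get_p_id(path: Path) -> str:
    return path.name.split("_")[0]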