def main(args: argparse.Namespace):
    """Split paired image/ground-truth volumes into train/val sets and slice them.

    Reads ``args.source_dir``, ``args.dest_dir``, ``args.retain``,
    ``args.n_augment`` and ``args.shape``.

    Every ``*.nii.gz`` file under the source directory whose path does not
    contain ``"_4d"`` is collected. Files whose path contains ``"_gt"`` are
    treated as ground truths and the rest as images; both lists are sorted and
    paired positionally. ``args.retain`` pairs are sampled at random for
    validation and the remaining pairs are used for training. Each split is
    handed to ``save_slices`` (through ``mmap_``/``uc_``) and size statistics
    are printed and plotted.

    Raises:
        AssertionError: if the source directory is missing, the destination
            already exists, or the files cannot be paired evenly.
    """
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: List[Path] = [p for p in src_path.rglob('*.nii.gz') if "_4d" not in str(p)]
    assert len(nii_paths) % 2 == 0, "Uneven number of .nii, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: List[Path] = sorted(p for p in nii_paths if "_gt" not in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "_gt" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    validation_paths: List[Tuple[Path, Path]] = random.sample(paths, args.retain)
    # Set lookup keeps the split linear instead of scanning the list for every pair
    validation_set = set(validation_paths)
    training_paths: List[Tuple[Path, Path]] = [p for p in paths if p not in validation_set]
    assert validation_set.isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths, n_augment in zip(["train", "val"],
                                       [training_paths, validation_paths],
                                       [args.n_augment, 0]):
        if not _paths:
            # zip(*[]) yields nothing to unpack, so an empty split
            # (e.g. args.retain == 0) would crash on the next statement.
            print(f"No pairs for {mode}, skipping")
            continue

        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        # mmap_/uc_ are helpers defined elsewhere; presumably a (parallel) map
        # over the pairs with argument unpacking -- confirm against their definitions.
        # Serial alternative for debugging:
        #   for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #       uc_(pfun)(paths)
        all_sizes = mmap_(uc_(pfun), zip(img_paths, gt_paths))

        # One 4-tuple per processed pair, transposed into four per-pair sequences
        all_slices_sizes_px, all_slices_sizes_mm2, all_volume_size_px, all_volume_size_mm3 = zip(*all_sizes)

        flat_sizes_px = flatten_(all_slices_sizes_px)
        flat_sizes_mm2 = flatten_(all_slices_sizes_mm2)
        print("px", len(flat_sizes_px), min(flat_sizes_px), max(flat_sizes_px))
        print('\t', "px 5/95", np.percentile(flat_sizes_px, 5), np.percentile(flat_sizes_px, 95))
        print('\t', "mm2", f"{min(flat_sizes_mm2):.02f}", f"{max(flat_sizes_mm2):.02f}")

        # A single volume would give 0 bins, which histogramming rejects
        n_bins = max(1, len(all_volume_size_px) // 2)

        _, axes = plt.subplots(nrows=2, ncols=2)
        axes = axes.flatten()

        axes[0].set_title("Slice surface (pixel)")
        axes[0].boxplot(all_slices_sizes_px, whis=[0, 100])

        axes[1].set_title("Slice surface (mm2)")
        axes[1].boxplot(all_slices_sizes_mm2, whis=[0, 100])

        axes[2].set_title("LV volume (pixel)")
        axes[2].hist(all_volume_size_px, bins=n_bins)

        axes[3].set_title("LV volume (mm3)")
        axes[3].hist(all_volume_size_mm3, bins=n_bins)
def main(args: argparse.Namespace):
    """Split paired image/label volumes into train/val sets and process each patient.

    Reads ``args.source_dir``, ``args.dest_dir``, ``args.retain``,
    ``args.shape`` and ``args.crop``.

    Every ``*.nii`` file under the source directory is collected. Files whose
    path contains ``"_Labels"`` are treated as ground truths and the rest as
    images; both lists are sorted and paired positionally. ``args.retain``
    pairs are sampled at random for validation and the remaining pairs are
    used for training. Each split is handed to ``process_patient`` (through
    ``mmap_``/``uc_``) and summary statistics of the strictly positive
    returned sizes are printed.

    Raises:
        AssertionError: if the source directory is missing, the destination
            already exists, or the files cannot be paired evenly.
    """
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names (no filtering is applied in this variant)
    nii_paths: List[Path] = list(src_path.rglob('*.nii'))
    assert len(nii_paths) % 2 == 0, "Uneven number of .nii, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: List[Path] = sorted(p for p in nii_paths if "_Labels" not in str(p))
    gt_nii_paths: List[Path] = sorted(p for p in nii_paths if "_Labels" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths)
    paths: List[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    validation_paths: List[Tuple[Path, Path]] = random.sample(paths, args.retain)
    # Set lookup keeps the split linear instead of scanning the list for every pair
    validation_set = set(validation_paths)
    training_paths: List[Tuple[Path, Path]] = [p for p in paths if p not in validation_set]
    assert validation_set.isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths))

    for mode, _paths in zip(["train", "val"], [training_paths, validation_paths]):
        if not _paths:
            # zip(*[]) yields nothing to unpack, so an empty split
            # (e.g. args.retain == 0) would crash on the next statement.
            print(f"No pairs for {mode}, skipping")
            continue

        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(process_patient, dest_dir=dest_dir, shape=args.shape, cr=args.crop)
        # mmap_/uc_ are helpers defined elsewhere; presumably a (parallel) map
        # over the pairs with argument unpacking -- confirm against their definitions.
        # Serial alternative for debugging:
        #   for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #       uc_(pfun)(paths)
        sizess = mmap_(uc_(pfun), zip(img_paths, gt_paths))

        all_sizes = np.array(flatten_(sizess))
        # Only strictly positive sizes enter the statistics
        all_pos = all_sizes[all_sizes > 0]

        if all_pos.size == 0:
            # np.min / np.percentile raise on an empty array
            print("sizes: no strictly positive size found")
            continue

        print(
            f"sizes: min={np.min(all_pos)}, 5th={np.percentile(all_pos, 5):0.02f}, median={np.median(all_pos):0.0f}, "
            + f"mean={np.mean(all_pos):0.02f}, 95th={np.percentile(all_pos, 95):0.02f}, max={np.max(all_pos)}"
        )
def main(args: argparse.Namespace):
    """Split paired volumes into per-fold train/val/test sets by patient and slice them.

    Reads ``args.source_dir``, ``args.dest_dir``, ``args.retains``,
    ``args.retains_test``, ``args.fold``, ``args.n_augment`` and ``args.shape``.

    Every ``*.nii.gz`` file under the source directory whose path does not
    contain ``"_4d"`` is collected. Files whose path contains ``"_gt"`` are
    treated as ground truths and the rest as images; both lists are sorted and
    paired positionally. Patient ids (from ``get_p_id``) are shuffled, then a
    window of ``args.retains + args.retains_test`` ids, offset by
    ``args.fold`` windows, is split into validation and test ids; every other
    id goes to training. Each split is handed to ``save_slices`` (through
    ``mmap_``/``uc_``) and size statistics are printed and plotted.

    Raises:
        AssertionError: if any of the sanity checks on the inputs or on the
            resulting splits fails (including the hard-coded 200 pairs).
    """
    src_path: Path = Path(args.source_dir)
    dest_path: Path = Path(args.dest_dir)

    # Assume the cleaning up is done before calling the script
    assert src_path.exists()
    assert not dest_path.exists()

    # Get all the file names, avoid the temporal ones
    nii_paths: list[Path] = [p for p in src_path.rglob('*.nii.gz') if "_4d" not in str(p)]
    assert len(nii_paths) % 2 == 0, "Uneven number of .nii, one+ pair is broken"

    # We sort now, but also id matching is checked while iterating later on
    img_nii_paths: list[Path] = sorted(p for p in nii_paths if "_gt" not in str(p))
    gt_nii_paths: list[Path] = sorted(p for p in nii_paths if "_gt" in str(p))
    assert len(img_nii_paths) == len(gt_nii_paths) == 200  # Hardcode that value for sanity test
    paths: list[Tuple[Path, Path]] = list(zip(img_nii_paths, gt_nii_paths))

    print(f"Found {len(img_nii_paths)} pairs in total")
    pprint(paths[:5])

    pids: list[str] = sorted(set(map_(get_p_id, img_nii_paths)))
    # Sanity test: there is two scans per patients: we don't want to mix them up
    assert len(pids) == (len(img_nii_paths) // 2), (len(pids), len(img_nii_paths))

    # Shuffle before to avoid any problem if the patients are sorted in any way
    random.shuffle(pids)

    fold_size: int = args.retains + args.retains_test
    offset: int = args.fold * fold_size
    # offset by (fold_size) at the beginning
    validation_slice = slice(offset, offset + args.retains)
    # offset by (fold_size + val_retains) at the beginning
    test_slice = slice(offset + args.retains, offset + args.retains + args.retains_test)

    validation_pids: list[str] = pids[validation_slice]
    test_pids: list[str] = pids[test_slice]

    # Sets give constant-time membership for the filters below
    validation_pid_set = set(validation_pids)
    test_pid_set = set(test_pids)
    training_pids: list[str] = [
        pid for pid in pids
        if pid not in validation_pid_set and pid not in test_pid_set
    ]
    training_pid_set = set(training_pids)

    assert len(validation_pids) == args.retains
    assert (len(validation_pids) + len(training_pids) + len(test_pids)) == len(pids)
    assert validation_pid_set.union(training_pids).union(test_pids) == set(pids)
    assert validation_pid_set.isdisjoint(training_pids)
    assert validation_pid_set.isdisjoint(test_pids)
    assert test_pid_set.isdisjoint(training_pids)

    validation_paths: list[Tuple[Path, Path]] = [
        p for p in paths if get_p_id(p[0]) in validation_pid_set
    ]
    test_paths: list[Tuple[Path, Path]] = [
        p for p in paths if get_p_id(p[0]) in test_pid_set
    ]
    training_paths: list[Tuple[Path, Path]] = [
        p for p in paths if get_p_id(p[0]) in training_pid_set
    ]

    # redundant sanity, but you never know
    assert set(validation_paths).isdisjoint(set(training_paths))
    assert set(validation_paths).isdisjoint(set(test_paths))
    assert set(test_paths).isdisjoint(set(training_paths))
    assert len(paths) == (len(validation_paths) + len(training_paths) + len(test_paths))
    assert len(validation_paths) == 2 * args.retains
    assert len(test_paths) == 2 * args.retains_test
    assert len(training_paths) == (len(paths) - 2 * fold_size)

    for mode, _paths, n_augment in zip(["train", "val", "test"],
                                       [training_paths, validation_paths, test_paths],
                                       [args.n_augment, 0, 0]):
        if not _paths:
            # zip(*[]) yields nothing to unpack, so an empty split
            # (e.g. args.retains_test == 0) would crash on the next statement.
            print(f"No pairs for {mode}, skipping")
            continue

        img_paths, gt_paths = zip(*_paths)  # type: Tuple[Any, Any]

        dest_dir = Path(dest_path, mode)
        print(f"Slicing {len(img_paths)} pairs to {dest_dir}")
        assert len(img_paths) == len(gt_paths)

        pfun = partial(save_slices, dest_dir=dest_dir, shape=args.shape, n_augment=n_augment)
        # mmap_/uc_ are helpers defined elsewhere; presumably a (parallel) map
        # over the pairs with argument unpacking -- confirm against their definitions.
        # Serial alternative for debugging:
        #   for paths in tqdm(list(zip(img_paths, gt_paths)), ncols=50):
        #       uc_(pfun)(paths)
        all_sizes = mmap_(uc_(pfun), zip(img_paths, gt_paths))

        # One 4-tuple per processed pair, transposed into four per-pair sequences
        all_slices_sizes_px, all_slices_sizes_mm2, all_volume_size_px, all_volume_size_mm3 = zip(*all_sizes)

        flat_sizes_px = flatten_(all_slices_sizes_px)
        flat_sizes_mm2 = flatten_(all_slices_sizes_mm2)
        print("px", len(flat_sizes_px), min(flat_sizes_px), max(flat_sizes_px))
        print('\t', "px 5/95", np.percentile(flat_sizes_px, 5), np.percentile(flat_sizes_px, 95))
        print('\t', "mm2", f"{min(flat_sizes_mm2):.02f}", f"{max(flat_sizes_mm2):.02f}")

        # A single volume would give 0 bins, which histogramming rejects
        n_bins = max(1, len(all_volume_size_px) // 2)

        _, axes = plt.subplots(nrows=2, ncols=2)
        axes = axes.flatten()

        axes[0].set_title("Slice surface (pixel)")
        axes[0].boxplot(all_slices_sizes_px, whis=[0, 100])

        axes[1].set_title("Slice surface (mm2)")
        axes[1].boxplot(all_slices_sizes_mm2, whis=[0, 100])

        axes[2].set_title("LV volume (pixel)")
        axes[2].hist(all_volume_size_px, bins=n_bins)

        axes[3].set_title("LV volume (mm3)")
        axes[3].hist(all_volume_size_mm3, bins=n_bins)