def generate_shepplogan_dataset(output_dir, img_size, labels_distribution,
                                samples=100, smoothing=True):
    """
    Generates a synthetic dataset of 2D Shepp-Logan phantoms.

    Images are saved as torch tensors (.pt) in `output_dir/subjects` and a
    `data.tsv` file describing the dataset is written in `output_dir`.

    Args:
        output_dir: (str) folder containing the synthetic dataset.
        img_size: (int) size of the generated (square) images.
        labels_distribution: (dict) maps each label to the probabilities of its subtypes.
        samples: (int) number of samples generated per label.
        smoothing: (bool) if True, the generated phantoms are smoothed.
    """
    check_and_clean(join(output_dir, "subjects"))
    commandline_to_json({
        "output_dir": output_dir,
        "img_size": img_size,
        "labels_distribution": labels_distribution,
        "samples": samples,
        "smoothing": smoothing
    })

    columns = ["participant_id", "session_id", "diagnosis", "subtype"]
    data_df = pd.DataFrame(columns=columns)

    for i, label in enumerate(labels_distribution.keys()):
        for j in range(samples):
            participant_id = "sub-CLNC%i%04d" % (i, j)
            session_id = "ses-M00"
            subtype = np.random.choice(np.arange(len(labels_distribution[label])),
                                       p=labels_distribution[label])
            row_df = pd.DataFrame([[participant_id, session_id, label, subtype]],
                                  columns=columns)
            data_df = data_df.append(row_df)

            # Image generation
            path_out = join(output_dir, "subjects",
                            "%s_%s%s.pt" % (participant_id, session_id,
                                            FILENAME_TYPE["shepplogan"]))
            img = generate_shepplogan_phantom(img_size, label=subtype, smoothing=smoothing)
            torch_img = torch.from_numpy(img).float().unsqueeze(0)
            torch.save(torch_img, path_out)

    data_df.to_csv(join(output_dir, 'data.tsv'), sep="\t", index=False)

    missing_path = join(output_dir, "missing_mods")
    if not exists(missing_path):
        makedirs(missing_path)

    sessions = data_df.session_id.unique()
    for session in sessions:
        session_df = data_df[data_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["t1w"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, "missing_mods_%s.tsv" % session),
                      sep="\t", index=False)

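# Illustrative usage sketch (not part of the original module): the output folder,
# image size and label distribution below are hypothetical. `labels_distribution`
# maps each diagnosis label to the probabilities of its phantom subtypes; each
# list must sum to 1, and `samples` images are generated per label.
def _example_shepplogan_usage():
    labels_distribution = {
        "AD": [0.33, 0.33, 0.34],  # hypothetical subtype probabilities
        "CN": [1.0, 0.0, 0.0],
    }
    generate_shepplogan_dataset(
        output_dir="shepplogan_synthetic",  # hypothetical output folder
        img_size=128,
        labels_distribution=labels_distribution,
        samples=100,
        smoothing=True,
    )
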
def group_backprop(options):
    """
    Computes the mean saliency map of a dataset with vanilla backpropagation.

    For each fold and each model selection found in `options.model_path`, the
    gradients are averaged over the whole dataset and saved in the 'gradients'
    subfolder (NIfTI + npy for 3D inputs, jpg + npy otherwise).
    """
    main_logger = return_logger(options.verbose, "main process")
    options = translate_parameters(options)

    fold_list = [fold for fold in os.listdir(options.model_path) if fold[:5] == "fold-"]
    if len(fold_list) == 0:
        raise ValueError("No folds were found at path %s" % options.model_path)

    for fold in fold_list:
        main_logger.info(fold)
        for selection in options.selection:
            results_path = path.join(options.model_path, fold, 'gradients',
                                     selection, options.name)

            model_options = argparse.Namespace()
            model_options = read_json(model_options,
                                      path.join(options.model_path, 'commandline.json'))
            model_options = translate_parameters(model_options)
            model_options.gpu = options.gpu

            if options.tsv_path is None:
                options.tsv_path = model_options.tsv_path
            if options.input_dir is None:
                options.input_dir = model_options.input_dir
            if options.target_diagnosis is None:
                options.target_diagnosis = options.diagnosis

            criterion = get_criterion(model_options.loss)

            # Data management (remove data not well predicted by the CNN)
            training_df = load_data_test(options.tsv_path, [options.diagnosis],
                                         baseline=options.baseline)
            training_df.reset_index(drop=True, inplace=True)

            # Model creation
            _, all_transforms = get_transforms(
                model_options.mode,
                minmaxnormalization=model_options.minmaxnormalization)
            data_example = return_dataset(model_options.mode, options.input_dir,
                                          training_df, model_options.preprocessing,
                                          train_transformations=None,
                                          all_transformations=all_transforms,
                                          params=options)

            model = create_model(model_options, data_example.size)
            model_dir = os.path.join(options.model_path, fold, 'models', selection)
            model, best_epoch = load_model(model, model_dir, gpu=options.gpu,
                                           filename='model_best.pth.tar')
            options.output_dir = results_path
            commandline_to_json(options, logger=main_logger)

            # Keep only subjects who were correctly / wrongly predicted by the network
            training_df = sort_predicted(model, training_df, options.input_dir,
                                         model_options, criterion, options.keep_true,
                                         batch_size=options.batch_size,
                                         num_workers=options.num_workers,
                                         gpu=options.gpu)

            if len(training_df) > 0:
                # Save the tsv file used for the saliency maps alongside the results
                training_df.to_csv(path.join(results_path, 'data.tsv'), sep='\t', index=False)

                data_train = return_dataset(model_options.mode, options.input_dir,
                                            training_df, model_options.preprocessing,
                                            train_transformations=None,
                                            all_transformations=all_transforms,
                                            params=options)

                train_loader = DataLoader(data_train,
                                          batch_size=options.batch_size,
                                          shuffle=True,
                                          num_workers=options.num_workers,
                                          pin_memory=True)

                interpreter = VanillaBackProp(model, gpu=options.gpu)

                # Accumulate the gradients of all images to compute the mean map
                cum_map = 0
                for data in train_loader:
                    if options.gpu:
                        input_batch = data['image'].cuda()
                    else:
                        input_batch = data['image']

                    maps = interpreter.generate_gradients(
                        input_batch,
                        data_train.diagnosis_code[options.target_diagnosis])
                    cum_map += maps.sum(axis=0)

                mean_map = cum_map / len(data_train)

                if len(data_train.size) == 4:
                    # 3D inputs: save as NIfTI (with the template affine if given) and npy
                    if options.nifti_template_path is not None:
                        image_nii = nib.load(options.nifti_template_path)
                        affine = image_nii.affine
                    else:
                        affine = np.eye(4)

                    mean_map_nii = nib.Nifti1Image(mean_map[0], affine)
                    nib.save(mean_map_nii, path.join(results_path, "map.nii.gz"))
                    np.save(path.join(results_path, "map.npy"), mean_map[0])
                else:
                    # 2D inputs: save as jpg and npy
                    jpg_path = path.join(results_path, "map.jpg")
                    plt.imshow(mean_map[0], cmap="coolwarm",
                               vmin=-options.vmax, vmax=options.vmax)
                    plt.colorbar()
                    plt.savefig(jpg_path)
                    plt.close()
                    numpy_path = path.join(results_path, "map.npy")
                    np.save(numpy_path, mean_map[0])
            else:
                main_logger.warning("There are no subjects for the given options")

def generate_trivial_dataset(caps_dir, output_dir, n_subjects, tsv_path=None,
                             preprocessing="linear", mask_path=None, atrophy_percent=60):
    """
    Generates a fully separable dataset.

    Based on the images of the CAPS directory, half of each image is processed
    using a mask to occlude a specific region. This procedure creates a fully
    separable dataset (images with the right half processed vs. images with the
    left half processed).

    Args:
        caps_dir: (str) path to the CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in CAPS format.
        n_subjects: (int) number of subjects in each class of the synthetic dataset.
        tsv_path: (str) path to a tsv file listing the subjects/sessions.
        preprocessing: (str) preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: (str) path to the extracted masks used to generate the two labels.
        atrophy_percent: (float) percentage of atrophy applied.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_dir,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "atrophy_percent": atrophy_percent
    })

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dir, output_dir)
    data_df = baseline_df(data_df, "None")

    home = str(Path.home())
    cache_clinicadl = join(home, '.cache', 'clinicadl', 'ressources', 'masks')
    url_aramis = 'https://aramislab.paris.inria.fr/files/data/masks/'
    FILE1 = RemoteFileStructure(
        filename='AAL2.tar.gz',
        url=url_aramis,
        checksum='89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471')

    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}")

    if mask_path is None:
        if not exists(join(cache_clinicadl, 'AAL2')):
            try:
                print('Try to download AAL2 masks')
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print('File: ' + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, 'AAL2')
                except RuntimeError:
                    print('Unable to extract downloaded files')
            except IOError as err:
                print('Unable to download required templates:', err)
                raise ValueError(
                    '''Unable to download masks, please download them manually at
                    https://aramislab.paris.inria.fr/files/data/masks/
                    and provide a valid path.''')
        else:
            mask_path = join(cache_clinicadl, 'AAL2')

    # Create subjects dir
    makedirs(join(output_dir, 'subjects'), exist_ok=True)

    # Output tsv file
    columns = ['participant_id', 'session_id', 'diagnosis', 'age_bl', 'sex']
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        filename = f'sub-TRIV{i}_ses-M00' + FILENAME_TYPE['cropped'] + '.nii.gz'
        path_image = join(output_dir, 'subjects', f'sub-TRIV{i}', 'ses-M00', 't1_linear')
        makedirs(path_image, exist_ok=True)

        image_path = find_image_path(caps_dir, participant_id, session_id, preprocessing)
        image_nii = nib.load(image_path)
        image = image_nii.get_data()

        atlas_to_mask = nib.load(join(mask_path, f'mask-{label + 1}.nii')).get_data()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(image, atlas_to_mask, atrophy_percent)
        trivial_image_nii = nib.Nifti1Image(trivial_image, affine=image_nii.affine)
        trivial_image_nii.to_filename(join(path_image, filename))

        # Append row to output tsv
        row = [f'sub-TRIV{i}', 'ses-M00', diagnosis_list[label], 60, 'F']
        row_df = pd.DataFrame([row], columns=columns)
        output_df = output_df.append(row_df)

    output_df.to_csv(join(output_dir, 'data.tsv'), sep='\t', index=False)

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t", index=False)

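# Illustrative usage sketch (not part of the original module): all paths are
# hypothetical. With mask_path=None the AAL2 masks are downloaded to
# ~/.cache/clinicadl/ressources/masks on first use; 2 * n_subjects atrophied
# images (alternating AD / CN labels) are written to output_dir in CAPS format.
def _example_trivial_usage():
    generate_trivial_dataset(
        caps_dir="/path/to/caps",        # hypothetical CAPS directory
        output_dir="trivial_synthetic",  # hypothetical output folder
        n_subjects=10,
        tsv_path="/path/to/labels.tsv",  # hypothetical subject/session list
        preprocessing="linear",
        mask_path=None,                  # triggers the AAL2 mask download
        atrophy_percent=60,
    )
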
def generate_random_dataset(caps_dir, output_dir, n_subjects, tsv_path=None,
                            mean=0, sigma=0.5, preprocessing="t1-linear"):
    """
    Generates a random dataset.

    Creates a random dataset for an intractable classification task from the
    first subject of the tsv file (other subjects/sessions are ignored). The
    degree of noise can be parameterized.

    Args:
        caps_dir: (str) path to the (input) CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in (output) CAPS format.
        n_subjects: (int) number of subjects in each class of the synthetic dataset.
        tsv_path: (str) path to a tsv file listing the subjects/sessions.
        mean: (float) mean of the gaussian noise.
        sigma: (float) standard deviation of the gaussian noise.
        preprocessing: (str) preprocessing performed. Must be in ['t1-linear', 't1-extensive'].

    Returns:
        A folder written at the output_dir location (in CAPS format) and a tsv
        file describing this output.
    """
    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_dir,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "mean": mean,
        "sigma": sigma
    })

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dir, output_dir)

    # Create subjects dir
    makedirs(join(output_dir, 'subjects'), exist_ok=True)

    # Retrieve image of first subject
    participant_id = data_df.loc[0, 'participant_id']
    session_id = data_df.loc[0, 'session_id']

    image_path = find_image_path(caps_dir, participant_id, session_id, preprocessing)
    image_nii = nib.load(image_path)
    image = image_nii.get_data()

    # Create output tsv file
    participant_id_list = [f'sub-RAND{i}' for i in range(2 * n_subjects)]
    session_id_list = ['ses-M00'] * 2 * n_subjects
    diagnosis_list = ['AD'] * n_subjects + ['CN'] * n_subjects
    data = np.array([participant_id_list, session_id_list, diagnosis_list])
    data = data.T
    output_df = pd.DataFrame(data, columns=['participant_id', 'session_id', 'diagnosis'])
    output_df['age_bl'] = 60
    output_df['sex'] = 'F'
    output_df.to_csv(join(output_dir, 'data.tsv'), sep='\t', index=False)

    for i in range(2 * n_subjects):
        gauss = np.random.normal(mean, sigma, image.shape)
        participant_id = f'sub-RAND{i}'
        noisy_image = image + gauss
        noisy_image_nii = nib.Nifti1Image(noisy_image,
                                          header=image_nii.header,
                                          affine=image_nii.affine)
        noisy_image_nii_path = join(output_dir, 'subjects', participant_id,
                                    'ses-M00', 't1_linear')
        noisy_image_nii_filename = participant_id + '_ses-M00' + FILENAME_TYPE['cropped'] + '.nii.gz'
        makedirs(noisy_image_nii_path, exist_ok=True)
        nib.save(noisy_image_nii, join(noisy_image_nii_path, noisy_image_nii_filename))

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t", index=False)

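# Illustrative usage sketch (not part of the original module): all paths are
# hypothetical. The first subject of the tsv file is duplicated 2 * n_subjects
# times with additive gaussian noise N(mean, sigma); the first n_subjects copies
# are labelled AD and the rest CN, so the labels carry no signal.
def _example_random_usage():
    generate_random_dataset(
        caps_dir="/path/to/caps",        # hypothetical CAPS directory
        output_dir="random_synthetic",   # hypothetical output folder
        n_subjects=10,
        tsv_path="/path/to/labels.tsv",  # hypothetical subject/session list
        mean=0,
        sigma=0.5,
        preprocessing="t1-linear",
    )
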
def individual_backprop(options):
    """
    Computes one saliency map per subject with vanilla backpropagation.

    For each fold and each model selection found in `options.model_path`, a
    gradient map is computed for every (participant, session) of the dataset
    and saved in the 'gradients' subfolder (NIfTI for 3D inputs, jpg otherwise,
    plus a npy array in both cases).
    """
    main_logger = return_logger(options.verbose, "main process")
    options = translate_parameters(options)

    fold_list = [fold for fold in os.listdir(options.model_path) if fold[:5] == "fold-"]
    if len(fold_list) == 0:
        raise ValueError("No folds were found at path %s" % options.model_path)

    model_options = argparse.Namespace()
    model_options = read_json(model_options,
                              path.join(options.model_path, 'commandline.json'))
    model_options = translate_parameters(model_options)
    model_options.gpu = options.gpu

    if model_options.network_type == "multicnn":
        raise NotImplementedError("The interpretation of multi-CNN is not implemented.")

    if options.tsv_path is None and options.input_dir is None:
        options.multi_cohort = model_options.multi_cohort
    if options.tsv_path is None:
        options.tsv_path = model_options.tsv_path
    if options.input_dir is None:
        options.input_dir = model_options.input_dir
    if options.target_diagnosis is None:
        options.target_diagnosis = options.diagnosis

    for fold in fold_list:
        main_logger.info(fold)
        for selection in options.selection:
            results_path = path.join(options.model_path, fold, 'gradients',
                                     selection, options.name)

            criterion = get_criterion(model_options.loss)

            # Data management (remove data not well predicted by the CNN)
            training_df = load_data_test(options.tsv_path, [options.diagnosis],
                                         baseline=options.baseline,
                                         multi_cohort=options.multi_cohort)
            training_df.reset_index(drop=True, inplace=True)

            # Model creation
            _, all_transforms = get_transforms(
                model_options.mode,
                minmaxnormalization=model_options.minmaxnormalization)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                data_example = return_dataset(model_options.mode, options.input_dir,
                                              training_df, model_options.preprocessing,
                                              train_transformations=None,
                                              all_transformations=all_transforms,
                                              prepare_dl=options.prepare_dl,
                                              multi_cohort=options.multi_cohort,
                                              params=model_options)

            model = create_model(model_options, data_example.size)
            model_dir = os.path.join(options.model_path, fold, 'models', selection)
            model, best_epoch = load_model(model, model_dir, gpu=options.gpu,
                                           filename='model_best.pth.tar')
            options.output_dir = results_path
            commandline_to_json(options, logger=main_logger)

            # Keep only subjects who were correctly / wrongly predicted by the network
            training_df = sort_predicted(model, training_df, options.input_dir,
                                         model_options, criterion, options.keep_true,
                                         batch_size=options.batch_size,
                                         num_workers=options.num_workers,
                                         gpu=options.gpu)

            if len(training_df) > 0:
                # Save the tsv file used for the saliency maps alongside the results
                training_df.to_csv(path.join(results_path, 'data.tsv'), sep='\t', index=False)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    data_train = return_dataset(model_options.mode, options.input_dir,
                                                training_df, model_options.preprocessing,
                                                train_transformations=None,
                                                all_transformations=all_transforms,
                                                prepare_dl=options.prepare_dl,
                                                multi_cohort=options.multi_cohort,
                                                params=model_options)

                train_loader = DataLoader(data_train,
                                          batch_size=options.batch_size,
                                          shuffle=True,
                                          num_workers=options.num_workers,
                                          pin_memory=True)

                interpreter = VanillaBackProp(model, gpu=options.gpu)

                for data in train_loader:
                    if options.gpu:
                        input_batch = data['image'].cuda()
                    else:
                        input_batch = data['image']

                    map_np = interpreter.generate_gradients(
                        input_batch,
                        data_train.diagnosis_code[options.target_diagnosis])

                    # Iterate over the actual batch size (the last batch may be smaller)
                    for i in range(map_np.shape[0]):
                        single_path = path.join(results_path,
                                                data['participant_id'][i],
                                                data['session_id'][i])
                        os.makedirs(single_path, exist_ok=True)

                        if len(data_train.size) == 4:
                            # 3D inputs: save as NIfTI (with the template affine if given)
                            if options.nifti_template_path is not None:
                                image_nii = nib.load(options.nifti_template_path)
                                affine = image_nii.affine
                            else:
                                affine = np.eye(4)

                            map_nii = nib.Nifti1Image(map_np[i, 0, :, :, :], affine)
                            nib.save(map_nii, path.join(single_path, "map.nii.gz"))
                        else:
                            # 2D inputs: save as jpg
                            jpg_path = path.join(single_path, "map.jpg")
                            plt.imshow(map_np[i, 0, :, :], cmap="coolwarm",
                                       vmin=-options.vmax, vmax=options.vmax)
                            plt.colorbar()
                            plt.savefig(jpg_path)
                            plt.close()

                        np.save(path.join(single_path, "map.npy"), map_np[i])