Example #1
def generate_shepplogan_dataset(output_dir,
                                img_size,
                                labels_distribution,
                                samples=100,
                                smoothing=True):
    """
    Generates a synthetic dataset of 2D Shepp-Logan phantoms.

    For each label in `labels_distribution`, `samples` images are generated,
    the subtype of each image being drawn from the corresponding probability
    vector. Images are saved as torch tensors in `output_dir/subjects`,
    together with a `data.tsv` file and `missing_mods` files.
    """

    check_and_clean(join(output_dir, "subjects"))
    commandline_to_json({
        "output_dir": output_dir,
        "img_size": img_size,
        "labels_distribution": labels_distribution,
        "samples": samples,
        "smoothing": smoothing
    })
    columns = ["participant_id", "session_id", "diagnosis", "subtype"]
    data_df = pd.DataFrame(columns=columns)

    for i, label in enumerate(labels_distribution.keys()):
        for j in range(samples):
            participant_id = "sub-CLNC%i%04d" % (i, j)
            session_id = "ses-M00"
            subtype = np.random.choice(np.arange(
                len(labels_distribution[label])),
                                       p=labels_distribution[label])
            row_df = pd.DataFrame(
                [[participant_id, session_id, label, subtype]],
                columns=columns)
            # DataFrame.append was removed in pandas 2.0; build the table with pd.concat
            data_df = pd.concat([data_df, row_df], ignore_index=True)

            # Image generation
            path_out = join(
                output_dir, "subjects", "%s_%s%s.pt" %
                (participant_id, session_id, FILENAME_TYPE["shepplogan"]))
            img = generate_shepplogan_phantom(img_size,
                                              label=subtype,
                                              smoothing=smoothing)
            torch_img = torch.from_numpy(img).float().unsqueeze(0)
            torch.save(torch_img, path_out)

    data_df.to_csv(join(output_dir, 'data.tsv'), sep="\t", index=False)

    missing_path = join(output_dir, "missing_mods")
    if not exists(missing_path):
        makedirs(missing_path)

    sessions = data_df.session_id.unique()
    for session in sessions:
        session_df = data_df[data_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["t1w"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, "missing_mods_%s.tsv" % session),
                      sep="\t",
                      index=False)
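A minimal usage sketch for generate_shepplogan_dataset; the output path and the labels_distribution mapping (label name to a probability vector over phantom subtypes) are hypothetical values chosen for illustration, not values taken from the library.

# Hypothetical call: two classes, each drawn from three Shepp-Logan subtypes.
generate_shepplogan_dataset(
    output_dir="./shepplogan_caps",        # hypothetical output folder
    img_size=128,
    labels_distribution={
        "AD": [0.05, 0.85, 0.10],          # probabilities for subtypes 0, 1, 2
        "CN": [0.85, 0.05, 0.10],
    },
    samples=100,
    smoothing=True,
)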
Example #2
def group_backprop(options):
    """
    Computes the mean vanilla back-propagation saliency map over a group of
    subjects, for each fold and model selection found in `options.model_path`,
    and saves it as NIfTI/numpy (3D inputs) or JPEG/numpy (2D inputs).
    """

    main_logger = return_logger(options.verbose, "main process")
    options = translate_parameters(options)

    fold_list = [
        fold for fold in os.listdir(options.model_path)
        if fold.startswith("fold-")
    ]
    if len(fold_list) == 0:
        raise ValueError("No folds were found at path %s" % options.model_path)

    for fold in fold_list:
        main_logger.info(fold)
        for selection in options.selection:
            results_path = path.join(options.model_path, fold, 'gradients',
                                     selection, options.name)

            model_options = argparse.Namespace()
            model_options = read_json(
                model_options, path.join(options.model_path,
                                         'commandline.json'))
            model_options = translate_parameters(model_options)
            model_options.gpu = options.gpu

            if options.tsv_path is None:
                options.tsv_path = model_options.tsv_path
            if options.input_dir is None:
                options.input_dir = model_options.input_dir
            if options.target_diagnosis is None:
                options.target_diagnosis = options.diagnosis

            criterion = get_criterion(model_options.loss)

            # Data management (remove data not well predicted by the CNN)
            training_df = load_data_test(options.tsv_path, [options.diagnosis],
                                         baseline=options.baseline)
            training_df.reset_index(drop=True, inplace=True)

            # Model creation
            _, all_transforms = get_transforms(
                model_options.mode,
                minmaxnormalization=model_options.minmaxnormalization)
            data_example = return_dataset(model_options.mode,
                                          options.input_dir,
                                          training_df,
                                          model_options.preprocessing,
                                          train_transformations=None,
                                          all_transformations=all_transforms,
                                          params=options)

            model = create_model(model_options, data_example.size)
            model_dir = os.path.join(options.model_path, fold, 'models',
                                     selection)
            model, best_epoch = load_model(model,
                                           model_dir,
                                           gpu=options.gpu,
                                           filename='model_best.pth.tar')
            options.output_dir = results_path
            commandline_to_json(options, logger=main_logger)

            # Keep only subjects who were correctly / wrongly predicted by the network
            training_df = sort_predicted(model,
                                         training_df,
                                         options.input_dir,
                                         model_options,
                                         criterion,
                                         options.keep_true,
                                         batch_size=options.batch_size,
                                         num_workers=options.num_workers,
                                         gpu=options.gpu)

            if len(training_df) > 0:

                # Save the tsv file describing the subjects used for the saliency maps
                training_df.to_csv(path.join(results_path, 'data.tsv'),
                                   sep='\t',
                                   index=False)

                data_train = return_dataset(model_options.mode,
                                            options.input_dir,
                                            training_df,
                                            model_options.preprocessing,
                                            train_transformations=None,
                                            all_transformations=all_transforms,
                                            params=options)

                train_loader = DataLoader(data_train,
                                          batch_size=options.batch_size,
                                          shuffle=True,
                                          num_workers=options.num_workers,
                                          pin_memory=True)

                interpreter = VanillaBackProp(model, gpu=options.gpu)

                cum_map = 0
                for data in train_loader:
                    if options.gpu:
                        input_batch = data['image'].cuda()
                    else:
                        input_batch = data['image']

                    maps = interpreter.generate_gradients(
                        input_batch,
                        data_train.diagnosis_code[options.target_diagnosis])
                    cum_map += maps.sum(axis=0)

                mean_map = cum_map / len(data_train)

                if len(data_train.size) == 4:
                    if options.nifti_template_path is not None:
                        image_nii = nib.load(options.nifti_template_path)
                        affine = image_nii.affine
                    else:
                        affine = np.eye(4)

                    mean_map_nii = nib.Nifti1Image(mean_map[0], affine)
                    nib.save(mean_map_nii, path.join(results_path,
                                                     "map.nii.gz"))
                    np.save(path.join(results_path, "map.npy"), mean_map[0])
                else:
                    jpg_path = path.join(results_path, "map.jpg")
                    plt.imshow(mean_map[0],
                               cmap="coolwarm",
                               vmin=-options.vmax,
                               vmax=options.vmax)
                    plt.colorbar()
                    plt.savefig(jpg_path)
                    plt.close()
                    numpy_path = path.join(results_path, "map.npy")
                    np.save(numpy_path, mean_map[0])
            else:
                main_logger.warning("There are no subjects for the given options")
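The loop above accumulates per-batch gradient maps and divides by the dataset size to obtain one mean saliency map. Below is a self-contained sketch of that averaging pattern, with toy NumPy arrays standing in for the batches produced by the interpreter; the names are illustrative only.

import numpy as np

def average_maps(map_batches, n_samples):
    # Sum maps over the batch dimension of every batch, then divide by the
    # total number of samples to obtain the mean map.
    cum_map = 0
    for maps in map_batches:               # each `maps` has shape (batch, 1, H, W)
        cum_map += maps.sum(axis=0)
    return cum_map / n_samples

# Toy data: two batches of 4 and 2 samples (6 samples in total).
batches = [np.ones((4, 1, 8, 8)), np.ones((2, 1, 8, 8))]
mean_map = average_maps(batches, n_samples=6)   # shape (1, 8, 8), all ones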
Example #3
def generate_trivial_dataset(caps_dir,
                             output_dir,
                             n_subjects,
                             tsv_path=None,
                             preprocessing="linear",
                             mask_path=None,
                             atrophy_percent=60):
    """
    Generates a fully separable dataset.

    Generates a dataset, based on the images of the CAPS directory, where half
    of each image is processed using a mask to occlude a specific region. This
    procedure creates a fully separable dataset (images with the right half
    processed and images with the left half processed).

    Args:
        caps_dir: (str) path to the CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in CAPS format.
        n_subjects: (int) number of subjects in each class of the synthetic
            dataset.
        tsv_path: (str) path to tsv file of list of subjects/sessions.
        preprocessing: (str) preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: (str) path to the extracted masks to generate the two labels.
        atrophy_percent: (float) percentage of atrophy applied.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_dir,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "atrophy_percent": atrophy_percent
    })

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dir, output_dir)
    data_df = baseline_df(data_df, "None")

    home = str(Path.home())
    cache_clinicadl = join(home, '.cache', 'clinicadl', 'ressources', 'masks')
    url_aramis = 'https://aramislab.paris.inria.fr/files/data/masks/'
    FILE1 = RemoteFileStructure(
        filename='AAL2.tar.gz',
        url=url_aramis,
        checksum=
        '89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471')
    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}"
        )

    if mask_path is None:
        if not exists(join(cache_clinicadl, 'AAL2')):
            try:
                print('Try to download AAL2 masks')
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print('File: ' + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, 'AAL2')
                except RuntimeError:
                    print('Unable to extract downloaded files')
            except IOError as err:
                print('Unable to download required templates:', err)
                raise ValueError(
                    'Unable to download masks, please download them manually '
                    'at https://aramislab.paris.inria.fr/files/data/masks/ '
                    'and provide a valid path.')
        else:
            mask_path = join(cache_clinicadl, 'AAL2')

    # Create subjects dir
    makedirs(join(output_dir, 'subjects'), exist_ok=True)

    # Output tsv file
    columns = ['participant_id', 'session_id', 'diagnosis', 'age_bl', 'sex']
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        filename = f'sub-TRIV{i}_ses-M00' + FILENAME_TYPE['cropped'] + '.nii.gz'
        path_image = join(output_dir, 'subjects', f'sub-TRIV{i}', 'ses-M00',
                          't1_linear')

        makedirs(path_image, exist_ok=True)

        image_path = find_image_path(caps_dir, participant_id, session_id,
                                     preprocessing)
        image_nii = nib.load(image_path)
        # get_fdata() replaces the get_data() accessor removed from recent nibabel
        image = image_nii.get_fdata()

        atlas_to_mask = nib.load(join(mask_path,
                                      f'mask-{label + 1}.nii')).get_fdata()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(
            image, atlas_to_mask, atrophy_percent)
        trivial_image_nii = nib.Nifti1Image(trivial_image,
                                            affine=image_nii.affine)
        trivial_image_nii.to_filename(join(path_image, filename))

        # Append row to output tsv
        row = [f'sub-TRIV{i}', 'ses-M00', diagnosis_list[label], 60, 'F']
        row_df = pd.DataFrame([row], columns=columns)
        output_df = pd.concat([output_df, row_df], ignore_index=True)

    output_df.to_csv(join(output_dir, 'data.tsv'), sep='\t', index=False)

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t",
                      index=False)
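A hypothetical call to generate_trivial_dataset; the CAPS directory and TSV file paths below are placeholders and must point to data prepared beforehand.

generate_trivial_dataset(
    caps_dir="./caps",                 # hypothetical existing CAPS directory
    output_dir="./trivial_caps",       # hypothetical output directory
    n_subjects=10,                     # 10 subjects per class (AD / CN)
    tsv_path="./participants.tsv",     # hypothetical subjects/sessions list
    preprocessing="linear",
    mask_path=None,                    # None triggers the AAL2 mask download
    atrophy_percent=60,
)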
Example #4
def generate_random_dataset(caps_dir,
                            output_dir,
                            n_subjects,
                            tsv_path=None,
                            mean=0,
                            sigma=0.5,
                            preprocessing="t1-linear"):
    """
    Generates a random dataset.

    Creates a random dataset for an intractable classification task from the
    first subject of the tsv file (subjects/sessions other than the first one
    are ignored). The degree of noise can be parameterized.

    Args:
        caps_dir: (str) Path to the (input) CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in (output)
            CAPS format.
        n_subjects: (int) number of subjects in each class of the
            synthetic dataset
        tsv_path: (str) path to tsv file of list of subjects/sessions.
        mean: (float) mean of the gaussian noise
        sigma: (float) standard deviation of the gaussian noise
        preprocessing: (str) preprocessing performed. Must be in ['t1-linear', 't1-extensive'].

    Returns:
        A folder written at the output_dir location (in CAPS format), together
        with a tsv file describing this output.

    """
    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_dir,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "mean": mean,
        "sigma": sigma
    })
    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dir, output_dir)

    # Create subjects dir
    makedirs(join(output_dir, 'subjects'), exist_ok=True)

    # Retrieve image of first subject
    participant_id = data_df.loc[0, 'participant_id']
    session_id = data_df.loc[0, 'session_id']

    image_path = find_image_path(caps_dir, participant_id, session_id,
                                 preprocessing)
    image_nii = nib.load(image_path)
    # get_fdata() replaces the get_data() accessor removed from recent nibabel
    image = image_nii.get_fdata()

    # Create output tsv file
    participant_id_list = [f'sub-RAND{i}' for i in range(2 * n_subjects)]
    session_id_list = ['ses-M00'] * 2 * n_subjects
    diagnosis_list = ['AD'] * n_subjects + ['CN'] * n_subjects
    data = np.array([participant_id_list, session_id_list, diagnosis_list])
    data = data.T
    output_df = pd.DataFrame(
        data, columns=['participant_id', 'session_id', 'diagnosis'])
    output_df['age_bl'] = 60
    output_df['sex'] = 'F'
    output_df.to_csv(join(output_dir, 'data.tsv'), sep='\t', index=False)

    for i in range(2 * n_subjects):
        gauss = np.random.normal(mean, sigma, image.shape)
        participant_id = f'sub-RAND{i}'
        noisy_image = image + gauss
        noisy_image_nii = nib.Nifti1Image(noisy_image,
                                          header=image_nii.header,
                                          affine=image_nii.affine)
        noisy_image_nii_path = join(output_dir, 'subjects', participant_id,
                                    'ses-M00', 't1_linear')
        noisy_image_nii_filename = participant_id + '_ses-M00' + FILENAME_TYPE[
            'cropped'] + '.nii.gz'
        makedirs(noisy_image_nii_path, exist_ok=True)
        nib.save(noisy_image_nii,
                 join(noisy_image_nii_path, noisy_image_nii_filename))

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t",
                      index=False)
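A hypothetical call to generate_random_dataset; as above, the paths are placeholders. Only the first subject/session of the TSV file is actually read.

generate_random_dataset(
    caps_dir="./caps",                 # hypothetical existing CAPS directory
    output_dir="./random_caps",        # hypothetical output directory
    n_subjects=10,                     # 10 subjects per class (AD / CN)
    tsv_path="./participants.tsv",     # hypothetical subjects/sessions list
    mean=0,
    sigma=0.5,
    preprocessing="t1-linear",
)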
Example #5
def individual_backprop(options):
    """
    Computes a vanilla back-propagation saliency map for every subject, for
    each fold and model selection found in `options.model_path`, and saves each
    map individually under `<results_path>/<participant_id>/<session_id>`.
    """

    main_logger = return_logger(options.verbose, "main process")
    options = translate_parameters(options)

    fold_list = [
        fold for fold in os.listdir(options.model_path)
        if fold.startswith("fold-")
    ]
    if len(fold_list) == 0:
        raise ValueError("No folds were found at path %s" % options.model_path)

    model_options = argparse.Namespace()
    model_options = read_json(
        model_options, path.join(options.model_path, 'commandline.json'))
    model_options = translate_parameters(model_options)
    model_options.gpu = options.gpu

    if model_options.network_type == "multicnn":
        raise NotImplementedError(
            "The interpretation of multi-CNN is not implemented.")

    if options.tsv_path is None and options.input_dir is None:
        options.multi_cohort = model_options.multi_cohort
    if options.tsv_path is None:
        options.tsv_path = model_options.tsv_path
    if options.input_dir is None:
        options.input_dir = model_options.input_dir
    if options.target_diagnosis is None:
        options.target_diagnosis = options.diagnosis

    for fold in fold_list:
        main_logger.info(fold)
        for selection in options.selection:
            results_path = path.join(options.model_path, fold, 'gradients',
                                     selection, options.name)

            criterion = get_criterion(model_options.loss)

            # Data management (remove data not well predicted by the CNN)
            training_df = load_data_test(options.tsv_path, [options.diagnosis],
                                         baseline=options.baseline,
                                         multi_cohort=options.multi_cohort)
            training_df.reset_index(drop=True, inplace=True)

            # Model creation
            _, all_transforms = get_transforms(
                model_options.mode,
                minmaxnormalization=model_options.minmaxnormalization)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                data_example = return_dataset(
                    model_options.mode,
                    options.input_dir,
                    training_df,
                    model_options.preprocessing,
                    train_transformations=None,
                    all_transformations=all_transforms,
                    prepare_dl=options.prepare_dl,
                    multi_cohort=options.multi_cohort,
                    params=model_options)

            model = create_model(model_options, data_example.size)
            model_dir = os.path.join(options.model_path, fold, 'models',
                                     selection)
            model, best_epoch = load_model(model,
                                           model_dir,
                                           gpu=options.gpu,
                                           filename='model_best.pth.tar')
            options.output_dir = results_path
            commandline_to_json(options, logger=main_logger)

            # Keep only subjects who were correctly / wrongly predicted by the network
            training_df = sort_predicted(model,
                                         training_df,
                                         options.input_dir,
                                         model_options,
                                         criterion,
                                         options.keep_true,
                                         batch_size=options.batch_size,
                                         num_workers=options.num_workers,
                                         gpu=options.gpu)

            if len(training_df) > 0:

                # Save the tsv file describing the subjects used for the saliency maps
                training_df.to_csv(path.join(results_path, 'data.tsv'),
                                   sep='\t',
                                   index=False)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    data_train = return_dataset(
                        model_options.mode,
                        options.input_dir,
                        training_df,
                        model_options.preprocessing,
                        train_transformations=None,
                        all_transformations=all_transforms,
                        prepare_dl=options.prepare_dl,
                        multi_cohort=options.multi_cohort,
                        params=model_options)

                train_loader = DataLoader(data_train,
                                          batch_size=options.batch_size,
                                          shuffle=True,
                                          num_workers=options.num_workers,
                                          pin_memory=True)

                interpreter = VanillaBackProp(model, gpu=options.gpu)

                for data in train_loader:
                    if options.gpu:
                        input_batch = data['image'].cuda()
                    else:
                        input_batch = data['image']

                    map_np = interpreter.generate_gradients(
                        input_batch,
                        data_train.diagnosis_code[options.target_diagnosis])
                    # iterate over the actual batch size (the last batch may be
                    # smaller than options.batch_size)
                    for i in range(map_np.shape[0]):
                        single_path = path.join(results_path,
                                                data['participant_id'][i],
                                                data['session_id'][i])
                        os.makedirs(single_path, exist_ok=True)

                        if len(data_train.size) == 4:
                            if options.nifti_template_path is not None:
                                image_nii = nib.load(
                                    options.nifti_template_path)
                                affine = image_nii.affine
                            else:
                                affine = np.eye(4)

                            map_nii = nib.Nifti1Image(map_np[i, 0, :, :, :],
                                                      affine)
                            nib.save(map_nii,
                                     path.join(single_path, "map.nii.gz"))
                        else:
                            jpg_path = path.join(single_path, "map.jpg")
                            plt.imshow(map_np[i, 0, :, :],
                                       cmap="coolwarm",
                                       vmin=-options.vmax,
                                       vmax=options.vmax)
                            plt.colorbar()
                            plt.savefig(jpg_path)
                            plt.close()
                        np.save(path.join(single_path, "map.npy"), map_np[i])
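Unlike group_backprop, which averages maps over the whole dataset, individual_backprop writes one map per subject. A small sketch, using a hypothetical results path that mirrors the layout written above, of reloading one of those saved maps for inspection:

import numpy as np
from os import path

# Placeholder path mimicking <model_path>/<fold>/gradients/<selection>/<name>/<participant>/<session>
single_path = path.join("model/fold-0/gradients/best_loss/test", "sub-0001", "ses-M00")
map_np = np.load(path.join(single_path, "map.npy"))
print(map_np.shape)        # (1, H, W) for 2D inputs, (1, D, H, W) for 3D inputs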