def prepare_roi(file):
    from .extract_utils import extract_roi

    logger.debug(f"  Processing of {file}.")
    container = container_from_filename(file)
    subfolder = "roi_based"
    if parameters["preprocessing"] == "custom":
        if not parameters["roi_custom_template"]:
            raise ClinicaDLArgumentError(
                "A custom template must be defined when the modality is set to custom."
            )
        parameters["roi_template"] = parameters["roi_custom_template"]
        parameters["roi_mask_pattern"] = parameters["roi_custom_mask_pattern"]
    else:
        from .extract_utils import PATTERN_DICT, TEMPLATE_DICT

        parameters["roi_template"] = TEMPLATE_DICT[parameters["preprocessing"]]
        parameters["roi_mask_pattern"] = PATTERN_DICT[parameters["preprocessing"]]
    parameters["masks_location"] = path.join(
        caps_directory, "masks", f"tpl-{parameters['roi_template']}"
    )
    if len(parameters["roi_list"]) == 0:
        raise ClinicaDLArgumentError("A list of regions of interest must be given.")
    else:
        check_mask_list(
            parameters["masks_location"],
            parameters["roi_list"],
            parameters["roi_mask_pattern"],
            None
            if parameters["use_uncropped_image"] is None
            else not parameters["use_uncropped_image"],
        )
    output_mode = extract_roi(
        file,
        masks_location=parameters["masks_location"],
        mask_pattern=parameters["roi_mask_pattern"],
        cropped_input=None
        if parameters["use_uncropped_image"] is None
        else not parameters["use_uncropped_image"],
        roi_names=parameters["roi_list"],
        uncrop_output=parameters["uncropped_roi"],
    )
    logger.debug("  ROI extracted.")
    write_output_imgs(output_mode, container, subfolder)
def get_criterion(criterion=None): compatible_losses = ["CrossEntropyLoss", "MultiMarginLoss"] if criterion is None: return nn.CrossEntropyLoss() if criterion not in compatible_losses: raise ClinicaDLArgumentError( f"Classification loss must be chosen in {compatible_losses}." ) return getattr(nn, criterion)()
def get_criterion(criterion=None): compatible_losses = [ "L1Loss", "MSELoss", "KLDivLoss", "BCEWithLogitsLoss", "HuberLoss", "SmoothL1Loss", ] if criterion is None: return nn.MSELoss() if criterion not in compatible_losses: raise ClinicaDLArgumentError( f"Reconstruction loss must be chosen in {compatible_losses}.") return getattr(nn, criterion)()
def _check_tsv_path(tsv_path, multi_cohort):
    if multi_cohort:
        if not tsv_path.endswith(".tsv"):
            raise ClinicaDLArgumentError(
                "If multi_cohort is given, the TSV_DIRECTORY argument should be a path to a TSV file."
            )
        else:
            tsv_df = pd.read_csv(tsv_path, sep="\t")
            SplitManager._check_multi_cohort_tsv(tsv_df, "labels")
    else:
        if tsv_path.endswith(".tsv"):
            raise ClinicaDLConfigurationError(
                f"You gave the path to a TSV file in tsv_path {tsv_path}. "
                f"To use the multi-cohort framework, please add 'multi_cohort=true' to the "
                f"configuration file or the --multi_cohort flag."
            )
def _create_caps_dict(caps_directory, multi_cohort):
    if multi_cohort:
        if not caps_directory.endswith(".tsv"):
            raise ClinicaDLArgumentError(
                "If multi_cohort is given, the CAPS_DIRECTORY argument should be a path to a TSV file."
            )
        else:
            caps_df = pd.read_csv(caps_directory, sep="\t")
            SplitManager._check_multi_cohort_tsv(caps_df, "CAPS")
            caps_dict = dict()
            for idx in range(len(caps_df)):
                cohort = caps_df.loc[idx, "cohort"]
                caps_path = caps_df.loc[idx, "path"]
                check_caps_folder(caps_path)
                caps_dict[cohort] = caps_path
    else:
        check_caps_folder(caps_directory)
        caps_dict = {"single": caps_directory}

    return caps_dict
def create_caps_dict(caps_directory: str, multi_cohort: bool) -> Dict[str, str]:
    from clinica.utils.inputs import check_caps_folder

    if multi_cohort:
        if not caps_directory.endswith(".tsv"):
            raise ClinicaDLArgumentError(
                "If multi_cohort is True, the CAPS_DIRECTORY argument should be a path to a TSV file."
            )
        else:
            caps_df = pd.read_csv(caps_directory, sep="\t")
            check_multi_cohort_tsv(caps_df, "CAPS")
            caps_dict = dict()
            for idx in range(len(caps_df)):
                cohort = caps_df.loc[idx, "cohort"]
                caps_path = caps_df.loc[idx, "path"]
                check_caps_folder(caps_path)
                caps_dict[cohort] = caps_path
    else:
        check_caps_folder(caps_directory)
        caps_dict = {"single": caps_directory}

    return caps_dict
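# Hedged sketch (illustrative cohort names and paths, not from the source)
# of the multi-cohort TSV consumed by create_caps_dict: one row per cohort
# with the mandatory "cohort" and "path" columns.
import pandas as pd

def _write_example_caps_tsv(tsv_path="caps_paths.tsv"):
    caps_df = pd.DataFrame(
        {
            "cohort": ["ADNI", "AIBL"],
            "path": ["/data/ADNI/caps", "/data/AIBL/caps"],
        }
    )
    caps_df.to_csv(tsv_path, sep="\t", index=False)
    # create_caps_dict(tsv_path, multi_cohort=True) would then return
    # {"ADNI": "/data/ADNI/caps", "AIBL": "/data/AIBL/caps"},
    # provided each path passes check_caps_folder.
    return tsv_path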
def find_file_type(
    preprocessing: str,
    uncropped_image: bool,
    acq_label: str,
    suvr_reference_region: str,
) -> Dict[str, str]:
    if preprocessing == "t1-linear":
        if uncropped_image:
            file_type = T1W_LINEAR
        else:
            file_type = T1W_LINEAR_CROPPED
    elif preprocessing == "pet-linear":
        if acq_label is None or suvr_reference_region is None:
            raise ClinicaDLArgumentError(
                "acq_label and suvr_reference_region must be defined "
                "when using `pet-linear` preprocessing."
            )
        file_type = pet_linear_nii(acq_label, suvr_reference_region, uncropped_image)
    else:
        raise NotImplementedError(
            f"Generation of synthetic data is not implemented for preprocessing {preprocessing}"
        )

    return file_type
def split_diagnoses(
    formatted_data_path: str,
    n_splits: int = 5,
    subset_name: str = "validation",
    MCI_sub_categories: bool = True,
    stratification: str = None,
):
    """
    Performs a k-fold split for each label independently on the subject level.
    The train folder will contain two lists per fold per diagnosis (baseline and longitudinal),
    whereas the test folder will only include the list of baseline sessions for each split.

    Writes three files per split per <label>.tsv file present in formatted_data_path:
        - formatted_data_path/train_splits-<n_splits>/split-<split>/<label>.tsv
        - formatted_data_path/train_splits-<n_splits>/split-<split>/<label>_baseline.tsv
        - formatted_data_path/<subset_name>_splits-<n_splits>/split-<split>/<label>_baseline.tsv

    Args:
        formatted_data_path: Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        n_splits: Number of splits in the k-fold cross-validation.
        subset_name: Name of the subset that is complementary to train.
        MCI_sub_categories: If True, manages MCI sub-categories to avoid data leakage.
        stratification: Name of the variable used to stratify the k-fold.
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_splits": n_splits,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "stratification": stratification,
        },
        filename="kfold.json",
    )

    # Read files
    results_path = formatted_data_path

    train_path = path.join(results_path, f"train_splits-{n_splits}")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    os.makedirs(train_path)
    for i in range(n_splits):
        os.mkdir(path.join(train_path, f"split-{i}"))

    test_path = path.join(results_path, f"{subset_name}_splits-{n_splits}")
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)
    for i in range(n_splits):
        os.mkdir(path.join(test_path, f"split-{i}"))

    diagnosis_df_paths = os.listdir(results_path)
    diagnosis_df_paths = [
        x
        for x in diagnosis_df_paths
        if (x.endswith(".tsv") and not x.endswith("_baseline.tsv"))
    ]

    MCI_special_treatment = False
    if "MCI.tsv" in diagnosis_df_paths:
        if MCI_sub_categories:
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found. "
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline session must be kept first, otherwise all sessions would be mixed together
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis = diagnosis_df_path.split(".")[0]
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), sep="\t")
        write_splits(
            diagnosis, diagnosis_df, stratification, n_splits, train_path, test_path
        )
        logger.info(f"K-fold split for diagnosis {diagnosis} is done.")

    if MCI_special_treatment:
        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path
        )
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with "
                "MCI subcategories."
            )
        MCI_df.reset_index(drop=False, inplace=True)
        logger.debug(MCI_df)
        write_splits(
            "MCI",
            MCI_df,
            stratification,
            n_splits,
            train_path,
            test_path,
            supplementary_diagnoses=supplementary_diagnoses,
        )
        logger.info("K-fold split for diagnosis MCI is done.")
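# Hedged usage sketch for the k-fold split above (the folder name is an
# assumption): a 5-fold split producing train and validation lists for
# every <label>.tsv previously written by getlabels.
def _example_kfold_split(labels_dir="data/labels_lists"):
    split_diagnoses(
        labels_dir,
        n_splits=5,
        subset_name="validation",
        MCI_sub_categories=True,
    )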
def create_split(
    diagnosis,
    diagnosis_df,
    split_label,
    n_test,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    supplementary_train_df=None,
    ignore_demographics=False,
):
    """
    Split data at the subject level into a training and a test set
    with equivalent age, sex and split_label distributions.

    Args:
        diagnosis: (str) diagnosis on which the split is done.
        diagnosis_df: DataFrame with columns including ['participant_id', 'session_id', 'diagnosis'].
        split_label: (str) label on which the split is done (categorical variable).
        n_test: (float) If >= 1, number of subjects to put in the test set.
            If < 1, proportion of subjects to put in the test set.
        p_age_threshold: (float) threshold for the t-test on age.
        p_sex_threshold: (float) threshold for the chi2 test on sex.
        supplementary_train_df: (DataFrame) data that must be included in the train set.
        ignore_demographics: (bool) If True the diagnoses are split without taking into account
            the demographics distributions (age, sex).

    Returns:
        train_df (DataFrame) subjects in the train set
        test_df (DataFrame) subjects in the test set
    """
    if supplementary_train_df is not None:
        sup_train_sex = [sex_dict[x] for x in supplementary_train_df.sex.values]
        sup_train_age = [float(x) for x in supplementary_train_df.age.values]
    else:
        sup_train_sex = []
        sup_train_age = []

    baseline_df = extract_baseline(diagnosis_df)

    if n_test >= 1:
        n_test = int(n_test)
    else:
        n_test = int(n_test * len(baseline_df))

    if not {split_label}.issubset(set(baseline_df.columns.values)):
        raise ClinicaDLArgumentError(
            f"The column {split_label} is missing. "
            f"Please add it using the --variables_of_interest flag in getlabels."
        )

    if not ignore_demographics:
        try:
            sex_label = find_label(baseline_df.columns.values, "sex")
            age_label = find_label(baseline_df.columns.values, "age")
        except ClinicaDLArgumentError:
            raise ClinicaDLArgumentError(
                "This dataset does not have age or sex values. "
                "Please add the flag --ignore_demographics to split "
                "without trying to balance age or sex distributions."
            )

        sex = list(baseline_df[sex_label].values)
        age = list(baseline_df[age_label].values)
        category = list(baseline_df[split_label].values)
        category = category_conversion(category)
        category = remove_unicity(category)

        flag_selection = True
        n_try = 0

        while flag_selection:
            splits = StratifiedShuffleSplit(n_splits=1, test_size=n_test)
            for train_index, test_index in splits.split(category, category):
                # Compute the p-values for the different demographics (age & sex)
                if len(set(age)) != 1:
                    age_test = [float(age[idx]) for idx in test_index]
                    age_train = [float(age[idx]) for idx in train_index] + sup_train_age
                    _, p_age = ttest_ind(age_test, age_train, nan_policy="omit")
                else:
                    p_age = 1

                if len(set(sex)) != 1:
                    sex_test = [sex_dict[sex[idx]] for idx in test_index]
                    sex_train = [sex_dict[sex[idx]] for idx in train_index] + sup_train_sex
                    _, p_sex = chi2(sex_test, sex_train)
                else:
                    p_sex = 1

                logger.debug(f"p_age={p_age:.2f}, p_sex={p_sex:.4f}")

                if p_sex >= p_sex_threshold and p_age >= p_age_threshold:
                    flag_selection = False
                    test_df = baseline_df.loc[test_index]
                    train_df = baseline_df.loc[train_index]
                    if supplementary_train_df is not None:
                        train_df = pd.concat([train_df, supplementary_train_df])
                        train_df.reset_index(drop=True, inplace=True)
                n_try += 1

        logger.info(f"Split for diagnosis {diagnosis} was found after {n_try} trials.")
    else:
        idx = np.arange(len(baseline_df))
        idx_test = np.random.choice(idx, size=n_test, replace=False)
        idx_test.sort()
        idx_train = complementary_list(idx, idx_test)
        test_df = baseline_df.loc[idx_test]
        train_df = baseline_df.loc[idx_train]

    return train_df, test_df
def split_diagnoses(
    formatted_data_path,
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    categorical_split_variable=None,
    ignore_demographics=False,
    verbose=0,
):
    """
    Performs a single split for each label independently on the subject level.
    The train folder will contain two lists per diagnosis (baseline and longitudinal),
    whereas the test folder will only include the list of baseline sessions.

    The age and sex distributions between the two sets must be non-significant
    (according to the T-test and chi-square test).

    Args:
        formatted_data_path (str): Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        n_test (float): If >= 1, number of subjects to put in the set with name 'subset_name'.
            If < 1, proportion of subjects to put in the set with name 'subset_name'.
            If 0, no training set is created and the whole dataset is considered as one set
            with name 'subset_name'.
        subset_name (str): Name of the subset that is complementary to train.
        MCI_sub_categories (bool): If True, manages MCI sub-categories to avoid data leakage.
        p_age_threshold (float): The threshold used for the T-test on age distributions.
        p_sex_threshold (float): The threshold used for the chi-square test on sex distributions.
        categorical_split_variable (str): Name of a categorical variable used to perform a stratified split.
        ignore_demographics (bool): If True the diagnoses are split without taking into account
            the demographics distributions (age, sex).
        verbose (int): Level of verbosity.

    Returns:
        writes three files per <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train/<label>.tsv
            - formatted_data_path/train/<label>_baseline.tsv
            - formatted_data_path/<subset_name>/<label>_baseline.tsv
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_test": n_test,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "p_age_threshold": p_age_threshold,
            "p_sex_threshold": p_sex_threshold,
            "categorical_split_variable": categorical_split_variable,
            "ignore_demographics": ignore_demographics,
        },
        filename="split.json",
    )

    # Read files
    results_path = formatted_data_path

    train_path = path.join(results_path, "train")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    if n_test > 0:
        os.makedirs(train_path)

    if categorical_split_variable is None:
        categorical_split_variable = "diagnosis"

    test_path = path.join(results_path, subset_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    diagnosis_df_paths = os.listdir(results_path)
    diagnosis_df_paths = [x for x in diagnosis_df_paths if x.endswith(".tsv")]
    diagnosis_df_paths = [
        x for x in diagnosis_df_paths if not x.endswith("_baseline.tsv")
    ]

    MCI_special_treatment = False

    if "MCI.tsv" in diagnosis_df_paths and n_test > 0:
        if MCI_sub_categories:
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found. "
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline session must be kept first, otherwise all sessions would be mixed together
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), sep="\t")
        interest_columns = diagnosis_df.columns.values
        diagnosis = diagnosis_df_path.split(".")[0]
        logger.info(f"Running split for diagnosis {diagnosis}")
        if n_test > 0:
            train_df, test_df = create_split(
                diagnosis,
                diagnosis_df,
                categorical_split_variable,
                n_test=n_test,
                p_age_threshold=p_age_threshold,
                p_sex_threshold=p_sex_threshold,
                ignore_demographics=ignore_demographics,
            )
            # Save baseline splits
            train_df.to_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )
            test_df.to_csv(
                path.join(test_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )

            long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
            long_train_df.to_csv(
                path.join(train_path, f"{diagnosis}.tsv"), sep="\t", index=False
            )
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(
                path.join(test_path, f"{diagnosis}.tsv"), sep="\t", index=False
            )
        else:
            baseline_df = extract_baseline(diagnosis_df)
            test_df = baseline_df[interest_columns]
            test_df.to_csv(
                path.join(test_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(
                path.join(test_path, f"{diagnosis}.tsv"), sep="\t", index=False
            )

    if MCI_special_treatment:
        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        baseline_df = extract_baseline(MCI_df, set_index=False)

        if n_test > 1:
            n_test = int(n_test)
        else:
            n_test = int(n_test * len(baseline_df))

        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path
        )
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with "
                "MCI subcategories."
            )

        # Construction of the supplementary train set
        supplementary_train_df = pd.DataFrame()
        for diagnosis in supplementary_diagnoses:
            sup_baseline_train_df = pd.read_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"), sep="\t"
            )
            supplementary_train_df = pd.concat(
                [supplementary_train_df, sup_baseline_train_df]
            )
            sub_df = (
                supplementary_train_df.reset_index()
                .groupby("participant_id")["session_id"]
                .nunique()
            )
            logger.debug(
                f"supplementary_train_df {len(sub_df)} subjects, "
                f"{len(supplementary_train_df)} scans"
            )

        supplementary_train_df.reset_index(drop=True, inplace=True)

        # MCI selection
        MCI_df.reset_index(inplace=True)
        baseline_df = extract_baseline(MCI_df)
        train_df, test_df = create_split(
            "MCI",
            baseline_df,
            categorical_split_variable,
            n_test=n_test,
            p_age_threshold=p_age_threshold,
            p_sex_threshold=p_sex_threshold,
            ignore_demographics=ignore_demographics,
            supplementary_train_df=supplementary_train_df,
        )

        # Write selection of MCI
        train_df.to_csv(path.join(train_path, "MCI_baseline.tsv"), sep="\t", index=False)
        test_df.to_csv(path.join(test_path, "MCI_baseline.tsv"), sep="\t", index=False)

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(path.join(train_path, "MCI.tsv"), sep="\t", index=False)
        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        long_test_df.to_csv(path.join(test_path, "MCI.tsv"), sep="\t", index=False)
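# Hedged usage sketch for the single split above (the folder name is an
# assumption): reserve 100 subjects per label for the test set while
# checking that age and sex distributions stay balanced between train
# and test.
def _example_single_split(labels_dir="data/labels_lists"):
    split_diagnoses(
        labels_dir,
        n_test=100,
        subset_name="test",
        p_age_threshold=0.80,
        p_sex_threshold=0.80,
    )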
def load_data_test(test_path, diagnoses_list, baseline=True, multi_cohort=False):
    """
    Load data not managed by split_manager.

    Args:
        test_path (str): path to the test TSV files / split directory / TSV file for multi-cohort.
        diagnoses_list (List[str]): list of the diagnoses wanted in case of split_dir or multi-cohort.
        baseline (bool): If True, only baseline sessions are used (split_dir handling only).
        multi_cohort (bool): If True, considers a multi-cohort setting.
    """
    # TODO: compute baseline sessions on-the-fly to manage the TSV file case
    if multi_cohort:
        if not test_path.endswith(".tsv"):
            raise ClinicaDLArgumentError(
                "If multi_cohort is given, the TSV_DIRECTORY argument should be a path to a TSV file."
            )
        else:
            tsv_df = pd.read_csv(test_path, sep="\t")
            check_multi_cohort_tsv(tsv_df, "labels")
            test_df = pd.DataFrame()
            found_diagnoses = set()
            for idx in range(len(tsv_df)):
                cohort_name = tsv_df.loc[idx, "cohort"]
                cohort_path = tsv_df.loc[idx, "path"]
                cohort_diagnoses = (
                    tsv_df.loc[idx, "diagnoses"].replace(" ", "").split(",")
                )
                if bool(set(cohort_diagnoses) & set(diagnoses_list)):
                    target_diagnoses = list(set(cohort_diagnoses) & set(diagnoses_list))
                    cohort_test_df = load_data_test_single(
                        cohort_path, target_diagnoses, baseline=baseline
                    )
                    cohort_test_df["cohort"] = cohort_name
                    test_df = pd.concat([test_df, cohort_test_df])
                    found_diagnoses = found_diagnoses | (
                        set(cohort_diagnoses) & set(diagnoses_list)
                    )

            if found_diagnoses != set(diagnoses_list):
                raise ValueError(
                    f"The diagnoses found in the multi-cohort dataset {found_diagnoses} "
                    f"do not correspond to the diagnoses wanted {set(diagnoses_list)}."
                )
            test_df.reset_index(inplace=True, drop=True)
    else:
        if test_path.endswith(".tsv"):
            tsv_df = pd.read_csv(test_path, sep="\t")
            multi_col = {"cohort", "path"}
            if multi_col.issubset(tsv_df.columns.values):
                raise ClinicaDLConfigurationError(
                    "To use the multi-cohort framework, please add 'multi_cohort=true' in your "
                    "configuration file or the '--multi_cohort' flag to the command line."
                )
        test_df = load_data_test_single(test_path, diagnoses_list, baseline=baseline)
        test_df["cohort"] = "single"

    return test_df
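# Hedged sketch (illustrative content) of the multi-cohort labels TSV read
# by load_data_test: one row per cohort with "cohort", "path" and a
# comma-separated "diagnoses" column listing the labels available there.
import pandas as pd

def _write_example_labels_tsv(tsv_path="labels_paths.tsv"):
    labels_df = pd.DataFrame(
        {
            "cohort": ["ADNI", "AIBL"],
            "path": ["/data/ADNI/labels/test", "/data/AIBL/labels/test"],
            "diagnoses": ["AD,CN", "CN"],
        }
    )
    labels_df.to_csv(tsv_path, sep="\t", index=False)
    return tsv_path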
def get_labels(
    merged_tsv: str,
    missing_mods: str,
    results_path: str,
    diagnoses: List[str],
    modality: str = "t1w",
    restriction_path: str = None,
    time_horizon: int = 36,
    variables_of_interest: List[str] = None,
    remove_smc: bool = True,
):
    """
    Writes one TSV file per label in the diagnoses argument based on merged_tsv and missing_mods.

    Args:
        merged_tsv: Path to the file obtained by the command clinica iotools merge-tsv.
        missing_mods: Path to the folder where the outputs of clinica iotools check-missing-modalities are.
        results_path: Path to the folder where TSV files are extracted.
        diagnoses: Labels that must be extracted from merged_tsv.
        modality: Modality used to select sessions. Sessions which do not include the modality will be excluded.
        restriction_path: Path to a TSV file containing the sessions that can be included.
        time_horizon: Time horizon to analyse the stability of MCI subjects.
        variables_of_interest: Columns that should be kept in the output TSV files.
        remove_smc: If True, SMC participants are removed from the lists.
    """
    commandline_to_json(
        {
            "output_dir": results_path,
            "merged_tsv": merged_tsv,
            "missing_mods": missing_mods,
            "diagnoses": diagnoses,
            "modality": modality,
            "restriction_path": restriction_path,
            "time_horizon": time_horizon,
            "variables_of_interest": variables_of_interest,
            "remove_smc": remove_smc,
        },
        filename="getlabels.json",
    )

    # Reading files
    bids_df = pd.read_csv(merged_tsv, sep="\t")
    bids_df.set_index(["participant_id", "session_id"], inplace=True)
    variables_list = ["diagnosis"]
    try:
        variables_list.append(find_label(bids_df.columns.values, "age"))
        variables_list.append(find_label(bids_df.columns.values, "sex"))
    except ValueError:
        logger.warning("The age or sex values were not found in the dataset.")
    if variables_of_interest is not None:
        variables_set = set(variables_of_interest) | set(variables_list)
        variables_list = list(variables_set)
        if not set(variables_list).issubset(set(bids_df.columns.values)):
            raise ClinicaDLArgumentError(
                f"The variables asked by the user {variables_of_interest} do not "
                f"exist in the data set."
            )

    list_files = os.listdir(missing_mods)
    missing_mods_dict = {}

    for file in list_files:
        filename, fileext = path.splitext(file)
        if fileext == ".tsv":
            session = filename.split("_")[-1]
            missing_mods_df = pd.read_csv(path.join(missing_mods, file), sep="\t")
            if len(missing_mods_df) == 0:
                raise ClinicaDLTSVError(
                    f"Given TSV file at {path.join(missing_mods, file)} loads an empty DataFrame."
                )
            missing_mods_df.set_index("participant_id", drop=True, inplace=True)
            missing_mods_dict[session] = missing_mods_df

    # Creating results path
    os.makedirs(results_path, exist_ok=True)

    # Remove SMC patients
    if remove_smc:
        if "diagnosis_bl" in bids_df.columns.values:  # Retro-compatibility
            bids_df = bids_df[~(bids_df.diagnosis_bl == "SMC")]
        if "diagnosis_sc" in bids_df.columns.values:
            bids_df = bids_df[~(bids_df.diagnosis_sc == "SMC")]

    # Adding the field baseline_diagnosis
    bids_copy_df = copy(bids_df)
    bids_copy_df["baseline_diagnosis"] = pd.Series(
        np.zeros(len(bids_df)), index=bids_df.index
    )
    for subject, subject_df in bids_df.groupby(level=0):
        baseline_diagnosis = subject_df.loc[(subject, "ses-M00"), "diagnosis"]
        bids_copy_df.loc[subject, "baseline_diagnosis"] = baseline_diagnosis

    bids_df = copy(bids_copy_df)

    time_MCI_df = None
    if "AD" in diagnoses:
        logger.info("Beginning the selection of AD label")
        output_df = stable_selection(bids_df, diagnosis="AD")
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "AD.tsv"), sep="\t")
        sub_df = (
            diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique()
        )
        logger.info(
            f"Found {len(sub_df)} AD subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "BV" in diagnoses:
        logger.info("Beginning the selection of BV label")
        output_df = stable_selection(bids_df, diagnosis="BV")
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "BV.tsv"), sep="\t")
        sub_df = (
            diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique()
        )
        logger.info(
            f"Found {len(sub_df)} BV subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "CN" in diagnoses:
        logger.info("Beginning the selection of CN label")
        output_df = stable_selection(bids_df, diagnosis="CN")
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "CN.tsv"), sep="\t")
        sub_df = (
            diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique()
        )
        logger.info(
            f"Found {len(sub_df)} CN subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "MCI" in diagnoses:
        logger.info("Beginning the selection of MCI label")
        MCI_df = mci_stability(
            bids_df, 10**4
        )  # Remove rMCI independently from the time horizon
        output_df = diagnosis_removal(MCI_df, diagnosis_list=["rMCI"])
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        # Relabelling everything as MCI
        output_df.diagnosis = ["MCI"] * len(output_df)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        sub_df = (
            diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique()
        )
        logger.info(
            f"Found {len(sub_df)} MCI subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "sMCI" in diagnoses:
        logger.info("Beginning the selection of sMCI label")
        time_MCI_df = mci_stability(bids_df, time_horizon)
        output_df = diagnosis_removal(time_MCI_df, diagnosis_list=["rMCI", "pMCI"])
        output_df = output_df[output_df.diagnosis == "sMCI"]
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "sMCI.tsv"), sep="\t")
        sub_df = (
            diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique()
        )
        logger.info(
            f"Found {len(sub_df)} sMCI subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "pMCI" in diagnoses:
        logger.info("Beginning the selection of pMCI label")
        if time_MCI_df is None:
            time_MCI_df = mci_stability(bids_df, time_horizon)
        output_df = time_MCI_df[time_MCI_df.diagnosis == "pMCI"]
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "pMCI.tsv"), sep="\t")
        sub_df = (
            diagnosis_df.reset_index().groupby("participant_id")["session_id"].nunique()
        )
        logger.info(
            f"Found {len(sub_df)} pMCI subjects for a total of {len(diagnosis_df)} sessions\n"
        )
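# Hedged usage sketch for get_labels (the paths are assumptions): extract
# AD and CN labels from the Clinica merged TSV, keeping only sessions that
# include a T1w image.
def _example_get_labels():
    get_labels(
        merged_tsv="data/ADNI_BIDS.tsv",
        missing_mods="data/missing_mods",
        results_path="data/labels_lists",
        diagnoses=["AD", "CN"],
        modality="t1w",
    )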
def check_and_complete(options, random_search=False):
    """
    This function initializes missing fields with default values.

    Some fields are mandatory and cannot be initialized by default;
    an error is raised if they are missing.

    Args:
        options: (dict) the options used for training.
        random_search: (bool) If True, the options are checked for the mandatory values of random search.
    """

    def set_default(params_dict, default_dict):
        for name, default_value in default_dict.items():
            if name not in params_dict:
                params_dict[name] = default_value

    default_values = {
        "accumulation_steps": 1,
        "baseline": False,
        "batch_size": 2,
        "compensation": "memory",
        "data_augmentation": False,
        "diagnoses": ["AD", "CN"],
        "dropout": 0,
        "epochs": 20,
        "evaluation_steps": 0,
        "learning_rate": 4,
        "loss": "default",
        "multi": False,
        "multi_cohort": False,
        "n_splits": 0,
        "n_proc": 2,
        "optimizer": "Adam",
        "unnormalize": False,
        "patience": 0,
        "predict_atlas_intensities": [],
        "split": [],
        "seed": None,
        "selection_metrics": ["loss"],
        "tolerance": 0.0,
        "deterministic": False,
        "transfer_learning_path": "",
        "transfer_learning_selection": "best_loss",
        "gpu": True,
        "wd_bool": True,
        "weight_decay": 4,
        "sampler": "random",
    }
    mode_default_values = {
        "patch": {
            "patch_size": 50,
            "stride_size": 50,
            "use_extracted_patches": False,
        },
        "roi": {
            "roi_list": [],
            "uncropped_roi": False,
            "use_extracted_roi": False,
        },
        "slice": {
            "discarded_slices": 20,
            "slice_direction": 0,
            "use_extracted_slices": False,
        },
        "image": {},
    }
    task_default_values = {
        "classification": {
            "label": "diagnosis",
            "selection_threshold": 0,
        },
        "regression": {
            "label": "age",
        },
    }
    if random_search:
        default_values["d_reduction"] = "MaxPooling"
        default_values["network_normalization"] = "BatchNorm"
        default_values["channels_limit"] = 512
        default_values["n_conv"] = 1

    set_default(options, default_values)

    mandatory_arguments = [
        "network_task",
        "mode",
        "tsv_path",
        "caps_directory",
        "preprocessing",
    ]
    if random_search:
        mandatory_arguments += ["n_convblocks", "first_conv_width", "n_fcblocks"]

    for argument in mandatory_arguments:
        if argument not in options:
            raise ClinicaDLArgumentError(
                f"The argument {argument} must be specified in the parameters."
            )

    if random_search:
        for mode, mode_dict in mode_default_values.items():
            set_default(options, mode_dict)
        if options["network_task"] not in task_default_values:
            raise NotImplementedError(
                f"The task default arguments corresponding to {options['network_task']} were not implemented."
            )
        task_dict = task_default_values[options["network_task"]]
        set_default(options, task_dict)
    else:
        if options["mode"] not in mode_default_values:
            raise NotImplementedError(
                f"The mode default arguments corresponding to {options['mode']} were not implemented."
            )
        if options["network_task"] not in task_default_values:
            raise NotImplementedError(
                f"The task default arguments corresponding to {options['network_task']} were not implemented."
            )
        mode_dict = mode_default_values[options["mode"]]
        task_dict = task_default_values[options["network_task"]]
        set_default(options, mode_dict)
        set_default(options, task_dict)
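# Minimal sketch of how check_and_complete fills defaults: user-provided
# values win, missing keys fall back to the default dictionaries declared
# above (batch_size 2, epochs 20, label "diagnosis" for classification).
# The paths in the options dict are illustrative.
def _example_check_and_complete():
    options = {
        "network_task": "classification",
        "mode": "image",
        "tsv_path": "data/labels_lists/train",
        "caps_directory": "data/caps",
        "preprocessing": "t1-linear",
        "batch_size": 8,  # overrides the default of 2
    }
    check_and_complete(options)
    assert options["batch_size"] == 8  # user value kept
    assert options["epochs"] == 20  # default injected
    assert options["label"] == "diagnosis"  # task-specific default
    return options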
def check_gpu():
    import torch

    if not torch.cuda.is_available():
        raise ClinicaDLArgumentError(
            "No GPU is available. Please add the --no-gpu flag to run on CPU."
        )
def quality_check(
    caps_dir: str,
    output_path: str,
    tsv_path: str = None,
    threshold: float = 0.5,
    batch_size: int = 1,
    n_proc: int = 0,
    gpu: bool = True,
):
    logger = getLogger("clinicadl")

    if not output_path.endswith(".tsv"):
        raise ClinicaDLArgumentError(f"Output path {output_path} must be a TSV file.")

    # Fetch QC model
    home = str(Path.home())
    cache_clinicadl = join(home, ".cache", "clinicadl", "models")
    url_aramis = "https://aramislab.paris.inria.fr/files/data/models/dl/qc/"
    logger.info("Downloading quality check model.")
    FILE1 = RemoteFileStructure(
        filename="resnet18.pth.tar",
        url=url_aramis,
        checksum="a97a781be3820b06424fe891ec405c78b87ad51a27b6b81614dbdb996ce60104",
    )

    makedirs(cache_clinicadl, exist_ok=True)
    model_file = join(cache_clinicadl, FILE1.filename)

    if not exists(model_file):
        try:
            model_file = fetch_file(FILE1, cache_clinicadl)
        except IOError as err:
            print("Unable to download required model for QC process:", err)

    # Load QC model
    logger.debug("Loading quality check model.")
    model = resnet_qc_18()
    model.load_state_dict(torch.load(model_file))
    model.eval()
    if gpu:
        logger.debug("Working on GPU.")
        model.cuda()

    # Transform caps_dir into a dict
    caps_dict = CapsDataset.create_caps_dict(caps_dir, multi_cohort=False)

    # Load DataFrame
    logger.debug("Loading data to check.")
    df = load_and_check_tsv(tsv_path, caps_dict, dirname(abspath(output_path)))

    dataset = QCDataset(caps_dir, df)
    dataloader = DataLoader(
        dataset, num_workers=n_proc, batch_size=batch_size, pin_memory=True
    )

    columns = ["participant_id", "session_id", "pass_probability", "pass"]
    qc_df = pd.DataFrame(columns=columns)
    softmax = torch.nn.Softmax(dim=1)
    logger.info(f"Quality check will be performed over {len(dataset)} images.")

    for data in dataloader:
        logger.debug(f"Processing subject {data['participant_id']}.")
        inputs = data["image"]
        if gpu:
            inputs = inputs.cuda()
        outputs = softmax(model(inputs))

        for idx, sub in enumerate(data["participant_id"]):
            pass_probability = outputs[idx, 1].item()
            row = [
                [
                    sub,
                    data["session_id"][idx],
                    pass_probability,
                    pass_probability > threshold,
                ]
            ]
            logger.debug(f"Quality score is {pass_probability}.")
            row_df = pd.DataFrame(row, columns=columns)
            # DataFrame.append is deprecated since pandas 1.4; use pd.concat instead
            qc_df = pd.concat([qc_df, row_df])

    qc_df.sort_values("pass_probability", ascending=False, inplace=True)
    qc_df.to_csv(output_path, sep="\t", index=False)
    logger.info(f"Results are stored at {output_path}.")
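# Hedged usage sketch for quality_check (paths are illustrative): score
# every t1-linear image of a CAPS folder on CPU and write the sorted pass
# probabilities to a TSV file.
def _example_quality_check():
    quality_check(
        caps_dir="data/caps",
        output_path="data/quality_check.tsv",
        threshold=0.5,
        batch_size=4,
        gpu=False,
    )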