Example #1
def FeatureConverter(feat_in, toolbox, config, feat_out):
    """
    Convert features as extracted by a third-party toolbox to WORC format.

    Parameters
    ----------
    feat_in: string
        Path to input feature file as outputted by the feature extraction
        toolbox.

    toolbox: string
        Name of toolbox from which features are extracted.

    config: string
        Path to .ini file containing the configuration for this function.

    feat_out: string
        Path to .hdf5 file to which the converted features should be saved.

    """
    # Convert input features
    if toolbox == 'PREDICT':
        convert_PREDICT(feat_in, feat_out)
    elif toolbox == 'PyRadiomics':
        convert_pyradiomics(feat_in, feat_out)
    else:
        raise WORCexceptions.WORCKeyError(f'Toolbox {toolbox} not recognized.')
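
A minimal usage sketch for the converter above; the file paths are hypothetical, and the toolbox name must be one of the two values handled ('PREDICT' or 'PyRadiomics').

feat_in = '/data/patient_001/features_PREDICT.hdf5'   # hypothetical input file
config = '/data/worc_config.ini'                      # hypothetical config file
feat_out = '/data/patient_001/features_WORC.hdf5'     # hypothetical output file

# Convert PREDICT features to the WORC .hdf5 format
FeatureConverter(feat_in=feat_in, toolbox='PREDICT', config=config,
                 feat_out=feat_out)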
    def __init__(self,
                 method='robust_z_score',
                 skip_features=None,
                 verbose=False):
        """Initialize object.

        Parameters
        ------------
        method: string
            Name of scaler used: robust_z_score, z_score, robust, or minmax
        skip_features: list of strings
            If any of these elements occur as substring in a feature label,
            this feature is excluded.

        """
        self.method = method
        self.skip_features = skip_features
        self.verbose = verbose

        if method not in accepted_scalers:
            raise ae.WORCKeyError(f'{method} is not a ' +
                                  'valid scaling method. Should be any of ' +
                                  f'{accepted_scalers}.')

        self.included_feature_indices = list()
        self.excluded_feature_indices = list()
    def fit(self, X_train, feature_labels=None):
        """Fit the scaler."""
        # Determine whether features should be skipped
        if feature_labels is None or self.skip_features is None or not self.skip_features:
            # Nothing should be skipped
            X_train_scaling = X_train

            self.included_feature_indices = range(0, X_train.shape[1])
            self.excluded_feature_indices = list()

        else:
            # Skip part of features in scaling
            if self.verbose:
                print(
                    f'\t Excluding features containing: {self.skip_features}')

            # Determine indices of excluded features
            included_feature_indices = []
            excluded_feature_indices = []
            for fnum, i in enumerate(feature_labels):
                if not any(e in i for e in self.skip_features):
                    included_feature_indices.append(fnum)
                else:
                    excluded_feature_indices.append(fnum)

            # Actually exclude the features
            X_train_scaling = [
                np.asarray(i)[included_feature_indices].tolist()
                for i in X_train
            ]

            self.included_feature_indices = included_feature_indices
            self.excluded_feature_indices = excluded_feature_indices

        # Fit the actual scaler
        if self.method == 'robust_z_score':
            scaler = RobustStandardScaler().fit(X_train_scaling)
        elif self.method == 'z_score':
            scaler = StandardScaler().fit(X_train_scaling)
        elif self.method == 'robust':
            scaler = RobustScaler().fit(X_train_scaling)
        elif self.method == 'minmax':
            scaler = MinMaxScaler().fit(X_train_scaling)
        else:
            raise ae.WORCKeyError(f'{self.method} is not a ' +
                                  'valid scaling method. Should be any of ' +
                                  f'{accepted_scalers}.')

        self.scaler = scaler
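
A minimal sketch of how this scaler might be used; the class name WORCScaler and its transform method are taken from their use in the fit_and_score example further down this page, while the toy data and feature label names are made up.

import numpy as np

# Toy data: 4 patients, 3 features (labels are hypothetical)
X_train = np.array([[1.0, 10.0, 0.5],
                    [2.0, 12.0, 0.1],
                    [3.0,  9.0, 0.9],
                    [4.0, 11.0, 0.3]])
feature_labels = ['tf_shape_volume', 'tf_histogram_mean', 'semf_age']

# Exclude features whose label contains 'semf_' from scaling
scaler = WORCScaler(method='z_score', skip_features=['semf_'])
scaler.fit(X_train, feature_labels=feature_labels)
X_train_scaled = scaler.transform(X_train)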
Example #4
def load_config(config_file_path):
    """
    Load the .ini config file and parse the settings for WORC.

    Args:
        config_file_path (String): path of the .ini config file

    Returns:
        settings_dict (dict): dict with the loaded settings
    """
    if not os.path.exists(config_file_path):
        e = f'File {config_file_path} does not exist!'
        raise ae.WORCKeyError(e)

    settings = configparser.ConfigParser()
    settings.read(config_file_path)

    settings_dict = {'ComBat': dict()}

    # Convert settings
    settings_dict['ComBat']['batch'] =\
        [str(item).strip() for item in
         settings['ComBat']['batch'].split(',')]

    settings_dict['ComBat']['mod'] =\
        [str(item).strip() for item in
         settings['ComBat']['mod'].split(',')]

    settings_dict['ComBat']['par'] =\
        settings['ComBat'].getint('par')

    settings_dict['ComBat']['eb'] =\
        settings['ComBat'].getint('eb')

    settings_dict['ComBat']['language'] =\
        str(settings['ComBat']['language'])

    settings_dict['ComBat']['matlab'] =\
        str(settings['ComBat']['matlab'])

    settings_dict['ComBat']['per_feature'] =\
        int(settings['ComBat']['per_feature'])

    settings_dict['ComBat']['excluded_features'] =\
        [str(item).strip() for item in
         settings['ComBat']['excluded_features'].split(',')]

    return settings_dict
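
A hedged sketch of a matching configuration file and call; the [ComBat] keys mirror exactly what load_config reads above, while the file path and values are only illustrative.

# Hypothetical contents of combat_config.ini:
#
#   [ComBat]
#   batch = Hospital
#   mod = Age, Sex
#   par = 1
#   eb = 1
#   language = python
#   matlab = /usr/local/bin/matlab
#   per_feature = 0
#   excluded_features = semf_, df_

settings = load_config('combat_config.ini')    # hypothetical path
print(settings['ComBat']['batch'])             # ['Hospital']
print(settings['ComBat']['mod'])               # ['Age', 'Sex']
print(settings['ComBat']['par'])               # 1 (parsed via getint)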
Example #5
def load_labels(label_file, label_type):
    """Loads the label data from a label file

    Args:
        label_file (string): The path to the label file
        label_type (list): List of the names of the labels to load

    Returns:
        dict: A dict containing 'patient_IDs', 'label' and
            'label_name'
    """
    if not os.path.exists(label_file):
        raise ae.WORCKeyError(f'File {label_file} does not exist!')

    _, extension = os.path.splitext(label_file)
    if extension == '.txt':
        label_names, patient_IDs, label_status = load_label_txt(
            label_file)
    elif extension == '.csv':
        label_names, patient_IDs, label_status = load_label_csv(
            label_file)
    elif extension == '.ini':
        label_names, patient_IDs, label_status = load_label_XNAT(
            label_file)
    else:
        raise ae.WORCIOError(extension + ' is not a valid label file extension.')

    print("Label names to extract: " + str(label_type))
    labels = list()
    for i_label in label_type:
        label_index = np.where(label_names == i_label)[0]
        if label_index.size == 0:
            raise ae.WORCValueError('Could not find label: ' + str(i_label))
        else:
            labels.append(label_status[:, label_index])

    label_data = dict()
    label_data['patient_IDs'] = patient_IDs
    label_data['label'] = labels
    label_data['label_name'] = label_type

    return label_data
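
A short usage sketch; the file path and label names are hypothetical, and the label file may be a .txt, .csv, or .ini (XNAT) file as handled above.

label_file = '/data/pinfo.csv'      # hypothetical label file
label_type = ['Malignant']          # hypothetical label name(s) to extract

label_data = load_labels(label_file, label_type)
patient_IDs = label_data['patient_IDs']
labels = label_data['label']        # one array per requested label
print(f'Loaded {len(patient_IDs)} patients for labels {label_data["label_name"]}.')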
Example #6
def main():
    parser = argparse.ArgumentParser(description='Radiomics classification')
    parser.add_argument('-ed', '--ed', metavar='ed',
                        dest='ed', type=str, required=True,
                        help='Estimator data in (HDF)')
    parser.add_argument('-tt', '--tt', metavar='tt',
                        dest='tt', type=str, required=True,
                        help='Train- and testdata in (HDF)')
    parser.add_argument('-para', '--para', metavar='para',
                        dest='para', type=str, required=True,
                        help='Parameters (JSON)')
    parser.add_argument('-out', '--out', metavar='out',
                        dest='out', type=str, required=True,
                        help='Output: fitted estimator (HDF)')
    parser.add_argument('-verbose', '--verbose', metavar='verbose',
                        nargs='+', dest='verbose', type=str, required=False,
                        default=None, help='verbose')
    args = parser.parse_args()

    # Convert lists into strings
    if type(args.ed) is list:
        args.ed = ''.join(args.ed)
    if type(args.tt) is list:
        args.tt = ''.join(args.tt)
    if type(args.para) is list:
        args.para = ''.join(args.para)
    if type(args.out) is list:
        args.out = ''.join(args.out)
    if type(args.verbose) is list:
        args.verbose = ''.join(args.verbose)

    # Read the data
    data = pd.read_hdf(args.ed)
    traintest = pd.read_hdf(args.tt)
    with open(args.para, 'rb') as fp:
        para = json.load(fp)

    # Check whether verbose is given or not
    if args.verbose is None:
        args.verbose = False
    elif args.verbose == 'False':
        args.verbose = False
    elif args.verbose == 'True':
        args.verbose = True
    else:
        raise ae.WORCKeyError(f'{args.verbose} is not a valid verbose option!')

    # Run the tool
    n_cores = 1
    ret = Parallel(
        n_jobs=n_cores, verbose=args.verbose,
        pre_dispatch=2*n_cores
    )(delayed(fit_and_score)(X=data['X'], y=data['y'],
                             scoring=data['scoring'],
                             train=traintest['train'],
                             test=traintest['test'], verbose=args.verbose,
                             parameters=parameters,
                             fit_params=data['fit_params'],
                             return_train_score=data['return_train_score'],
                             return_parameters=data['return_parameters'],
                             return_n_test_samples=data['return_n_test_samples'],
                             return_times=data['return_times'],
                             return_estimator=data['return_estimator'],
                             error_score=data['error_score'],
                             return_all=False)
      for parameters in para.values())

    source_labels = ['RET']

    source_data =\
        pd.Series([ret],
                  index=source_labels,
                  name='Fit and Score Output')
    source_data.to_hdf(args.out, 'RET')
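
An illustrative command line for the script above; the script and file names are hypothetical, but each option corresponds to one of the argparse arguments defined in main().

# python fitandscore_tool.py -ed estimator_data.hdf5 \
#                            -tt traintest_split.hdf5 \
#                            -para parameters.json \
#                            -out fitted_estimator.hdf5 \
#                            -verbose False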
def nocrossval(config,
               label_data_train,
               label_data_test,
               image_features_train,
               image_features_test,
               param_grid=None,
               use_fastr=False,
               fastr_plugin=None,
               ensemble={'Use': False},
               modus='singlelabel'):
    """Constructs multiple individual classifiers based on the label settings.

    Arguments:
        config (Dict): Dictionary with config settings
        label_data_train, label_data_test (Dict): should each contain:
            patient_ids (list): ids of the patients, used to keep track of
                test and training sets, and label data
            label (list): List of lists, where each list contains the
                label status for that patient for each label
            label_name (list): Contains the different names that are stored
                in the label object
        image_features_train, image_features_test (numpy array): Consist of a
            tuple of two lists for each patient:
            (feature_values, feature_labels)

        ensemble: dictionary, optional
                Contains the configuration for constructing an ensemble.

        modus: string, default 'singlelabel'
                Determine whether one-vs-all classification (or regression) for
                each single label is used ('singlelabel') or if multilabel
                classification is performed ('multilabel').

    Returns:
        classifier_data (pandas dataframe)
    """

    patient_ids_train = label_data_train['patient_IDs']
    label_value_train = label_data_train['label']
    label_name_train = label_data_train['label_name']

    patient_ids_test = label_data_test['patient_IDs']
    if 'label' in label_data_test.keys():
        label_value_test = label_data_test['label']
    else:
        label_value_test = [None] * len(patient_ids_test)

    logfilename = os.path.join(os.getcwd(), 'classifier.log')
    logging.basicConfig(filename=logfilename, level=logging.DEBUG)

    classifier_labelss = dict()

    logging.debug('Starting classifier')

    # Determine modus
    if modus == 'singlelabel':
        print('Performing Single class classification.')
        logging.debug('Performing Single class classification.')
    elif modus == 'multilabel':
        print('Performing Multi label classification.')
        logging.debug('Performing Multi label classification.')
        label_name_train = [label_name_train]
    else:
        m = ('{} is not a valid modus!').format(modus)
        logging.debug(m)
        raise ae.WORCKeyError(m)

    # We only need one label instance, assuming they are all the same
    feature_labels = image_features_train[0][1]
    for i_name in label_name_train:

        save_data = list()

        random_seed = np.random.randint(5000)

        # Use the predefined training and test sets
        X_train = image_features_train
        X_test = image_features_test
        if modus == 'singlelabel':
            Y_train = label_value_train.ravel()
            Y_test = label_value_test.ravel()
        else:
            # Sklearn multiclass requires rows to be objects/patients
            Y_train = label_value_train
            Y_train_temp = np.zeros((Y_train.shape[1], Y_train.shape[0]))
            for n_patient in range(0, Y_train.shape[1]):
                for n_label in range(0, Y_train.shape[0]):
                    Y_train_temp[n_patient, n_label] = Y_train[n_label,
                                                               n_patient]
            Y_train = Y_train_temp

            Y_test = label_value_test
            Y_test_temp = np.zeros((Y_test.shape[1], Y_test.shape[0]))
            for n_patient in range(0, Y_test.shape[1]):
                for n_label in range(0, Y_test.shape[0]):
                    Y_test_temp[n_patient, n_label] = Y_test[n_label,
                                                             n_patient]
            Y_test = Y_test_temp

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        trained_classifier =\
            random_search_parameters(features=X_train,
                                     labels=Y_train,
                                     param_grid=param_grid,
                                     n_cores=n_cores,
                                     **config['HyperOptimization'])

        # Create an ensemble if required
        # NOTE: removed to keep memory and storage usage low
        # trained_classifier.create_ensemble(X_train, Y_train, method=ensemble['Use'])

        # Extract the feature values
        X_train = np.asarray([x[0] for x in X_train])
        X_test = np.asarray([x[0] for x in X_test])

        temp_save_data = (trained_classifier, X_train, X_test, Y_train, Y_test,
                          patient_ids_train, patient_ids_test, random_seed)

        save_data.append(temp_save_data)

        [classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
         patient_ID_train_set, patient_ID_test_set, seed_set] =\
            zip(*save_data)

        panda_labels = [
            'classifiers', 'X_train', 'X_test', 'Y_train', 'Y_test', 'config',
            'patient_ID_train', 'patient_ID_test', 'random_seed',
            'feature_labels'
        ]

        panda_data_temp =\
            pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
                       Y_test_set, config, patient_ID_train_set,
                       patient_ID_test_set, seed_set, feature_labels],
                      index=panda_labels,
                      name='Constructed crossvalidation')

        i_name = ''.join(i_name)
        classifier_labelss[i_name] = panda_data_temp

    panda_data = pd.DataFrame(classifier_labelss)

    return panda_data
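
A hedged call sketch for nocrossval; the config, label data dictionaries, image features, and param_grid are assumed to be prepared as described in the docstring, and the output file name is illustrative.

panda_data = nocrossval(config=config,
                        label_data_train=label_data_train,
                        label_data_test=label_data_test,
                        image_features_train=image_features_train,
                        image_features_test=image_features_test,
                        param_grid=param_grid,
                        modus='singlelabel')
panda_data.to_hdf('trained_classifier.hdf5', 'EstimatorData')   # hypothetical output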
def crossval(config,
             label_data,
             image_features,
             param_grid=None,
             use_fastr=False,
             fastr_plugin=None,
             tempsave=False,
             fixedsplits=None,
             ensemble={'Use': False},
             outputfolder=None,
             modus='singlelabel'):
    """Constructs multiple individual classifiers based on the label settings.

    Parameters
    ----------
    config: dict, mandatory
            Dictionary with config settings. See the Github Wiki for the
            available fields and formatting.

    label_data: dict, mandatory
            Should contain the following:
            patient_ids (list): ids of the patients, used to keep track of
                test and training sets, and label data
            label (list): List of lists, where each list contains the
                label status for that patient for each label
            label_name (list): Contains the different names that are stored
                in the label object

    image_features: numpy array, mandatory
            Consists of a tuple of two lists for each patient:
            (feature_values, feature_labels)

    param_grid: dictionary, optional
            Contains the parameters and their values which are used in the
            grid or randomized search hyperparameter optimization. See the
            construct_classifier function for some examples.

    use_fastr: boolean, default False
            If False, parallel execution through Joblib is used for fast
            execution of the hyperparameter optimization. Especially suited
            for execution on multicore (H)PCs. The settings used are
            specified in the config.ini file in the IOparser folder, which you
            can adjust to your system.

            If True, fastr is used to split the hyperparameter optimization in
            separate jobs. Parameters for the splitting can be specified in the
            config file. Especially suited for clusters.

    fastr_plugin: string, default None
            Determines which plugin is used for fastr executions.
            When None, uses the default plugin from the fastr config.

    tempsave: boolean, default False
            If True, create a .hdf5 file after each cross-validation iteration
            containing the classifier and results from that split. This is
            written to the GSOut folder in your fastr output mount. If False,
            only the result of all combined cross-validations is saved to a
            .hdf5 file; this combined file is also written when tempsave is
            True.

    fixedsplits: string, optional
            By default, random-split cross-validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .csv file containing fixed splits to be used. See the Github Wiki
            for the format.

    ensemble: dictionary, optional
            Contains the configuration for constructing an ensemble.

    modus: string, default 'singlelabel'
            Determine whether one-vs-all classification (or regression) for
            each single label is used ('singlelabel') or if multilabel
            classification is performed ('multilabel').

    Returns
    ----------
    panda_data: pandas dataframe
            Contains all information on the trained classifier.

    """
    # Process input data
    patient_ids = label_data['patient_IDs']
    label_value = label_data['label']
    label_name = label_data['label_name']

    if outputfolder is None:
        outputfolder = os.getcwd()

    logfilename = os.path.join(outputfolder, 'classifier.log')
    print("Logging to file " + str(logfilename))

    # Cross-validation iteration to start with
    start = 0
    save_data = list()
    if tempsave:
        tempfolder = os.path.join(outputfolder, 'tempsave')
        if not os.path.exists(tempfolder):
            # No previous tempsaves
            os.makedirs(tempfolder)
        else:
            # Previous tempsaves, start where we left off
            tempsaves = glob.glob(os.path.join(tempfolder, 'tempsave_*.hdf5'))
            start = len(tempsaves)

            # Load previous tempsaves and add to save data
            tempsaves.sort()
            for t in tempsaves:
                t = pd.read_hdf(t)
                t = t['Constructed crossvalidation']
                temp_save_data = (t.trained_classifier, t.X_train, t.X_test,
                                  t.Y_train, t.Y_test, t.patient_ID_train,
                                  t.patient_ID_test, t.random_seed)

                save_data.append(temp_save_data)
    else:
        tempfolder = None

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    logging.basicConfig(filename=logfilename, level=logging.DEBUG)
    crossval_type = config['CrossValidation']['Type']
    n_iterations = config['CrossValidation']['N_iterations']
    test_size = config['CrossValidation']['test_size']
    fixed_seed = config['CrossValidation']['fixed_seed']

    classifier_labelss = dict()
    logging.debug('Starting fitting of estimators.')

    # We only need one label instance, assuming they are all the same
    feature_labels = image_features[0][1]

    # Check if we need to use fixedsplits:
    if fixedsplits is not None and '.csv' in fixedsplits:
        fixedsplits = pd.read_csv(fixedsplits, header=0)

    if modus == 'singlelabel':
        print('Performing single-class classification.')
        logging.debug('Performing single-class classification.')
    elif modus == 'multilabel':
        print('Performing multi-label classification.')
        logging.debug('Performing multi-label classification.')
        label_value = [label_value]
        label_name = [label_name]
    else:
        m = ('{} is not a valid modus!').format(modus)
        logging.debug(m)
        raise ae.WORCKeyError(m)

    for i_class, i_name in zip(label_value, label_name):
        if not tempsave:
            save_data = list()

        if crossval_type == 'random_split':
            print('Performing random-split cross-validations.')
            logging.debug('Performing random-split cross-validations.')
            save_data =\
                random_split_cross_validation(image_features=image_features,
                                              feature_labels=feature_labels,
                                              classes=i_class,
                                              patient_ids=patient_ids,
                                              n_iterations=n_iterations,
                                              param_grid=param_grid,
                                              config=config,
                                              modus=modus,
                                              test_size=test_size,
                                              start=start,
                                              save_data=save_data,
                                              tempsave=tempsave,
                                              tempfolder=tempfolder,
                                              fixedsplits=fixedsplits,
                                              fixed_seed=fixed_seed,
                                              use_fastr=use_fastr,
                                              fastr_plugin=fastr_plugin)
        elif crossval_type == 'LOO':
            print('Performing leave-one-out cross-validations.')
            logging.debug('Performing leave-one-out cross-validations.')
            save_data =\
                LOO_cross_validation(image_features=image_features,
                                     feature_labels=feature_labels,
                                     classes=i_class,
                                     patient_ids=patient_ids,
                                     param_grid=param_grid,
                                     config=config,
                                     modus=modus,
                                     test_size=test_size,
                                     start=start,
                                     save_data=save_data,
                                     tempsave=tempsave,
                                     tempfolder=tempfolder,
                                     fixedsplits=fixedsplits,
                                     fixed_seed=fixed_seed,
                                     use_fastr=use_fastr,
                                     fastr_plugin=fastr_plugin)
        else:
            raise ae.WORCKeyError(
                f'{crossval_type} is not a recognized cross-validation type.')

        [classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
         patient_ID_train_set, patient_ID_test_set, seed_set] =\
            zip(*save_data)

        # Convert to lists
        classifiers = list(classifiers)
        X_train_set = list(X_train_set)
        X_test_set = list(X_test_set)
        Y_train_set = list(Y_train_set)
        Y_test_set = list(Y_test_set)
        patient_ID_train_set = list(patient_ID_train_set)
        patient_ID_test_set = list(patient_ID_test_set)
        seed_set = list(seed_set)

        panda_labels = [
            'classifiers', 'X_train', 'X_test', 'Y_train', 'Y_test', 'config',
            'patient_ID_train', 'patient_ID_test', 'random_seed',
            'feature_labels'
        ]

        panda_data_temp =\
            pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
                       Y_test_set, config, patient_ID_train_set,
                       patient_ID_test_set, seed_set, feature_labels],
                      index=panda_labels,
                      name='Constructed crossvalidation')

        if modus == 'singlelabel':
            i_name = ''.join(i_name)
        elif modus == 'multilabel':
            i_name = ','.join(i_name)

        classifier_labelss[i_name] = panda_data_temp

    panda_data = pd.DataFrame(classifier_labelss)

    return panda_data
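
A sketch of the config entries that crossval and the helper functions below read; the keys come from the code above, while the values shown are only examples.

config_example = {
    'General': {'Joblib_ncores': 4},
    'CrossValidation': {
        'Type': 'random_split',    # or 'LOO'
        'N_iterations': 100,
        'test_size': 0.2,
        'fixed_seed': False,
    },
    # Passed as **kwargs to random_search_parameters; the contents depend on
    # your WORC setup and are not shown here.
    'HyperOptimization': {},
}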
def random_split_cross_validation(image_features,
                                  feature_labels,
                                  classes,
                                  patient_ids,
                                  n_iterations,
                                  param_grid,
                                  config,
                                  modus,
                                  test_size,
                                  start=0,
                                  save_data=None,
                                  tempsave=False,
                                  tempfolder=None,
                                  fixedsplits=None,
                                  fixed_seed=False,
                                  use_fastr=None,
                                  fastr_plugin=None):
    """Cross-validation in which data is randomly split in each iteration.

    Due to options of doing single-label and multi-label classification,
    stratified splitting, and regression, we use a manual loop instead
    of the default scikit-learn object.

    Parameters
    ------------

    Returns
    ------------

    """
    print('Starting random-split cross-validation.')
    logging.debug('Starting random-split cross-validation.')
    if save_data is None:
        # Start from zero, thus empty list of previous data
        save_data = list()

    for i in range(start, n_iterations):
        print(('Cross-validation iteration {} / {} .').format(
            str(i + 1), str(n_iterations)))
        logging.debug(('Cross-validation iteration {} / {} .').format(
            str(i + 1), str(n_iterations)))
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print(f'\t Time: {timestamp}.')
        logging.debug(f'\t Time: {timestamp}.')
        if fixed_seed:
            random_seed = i**2
        else:
            random_seed = np.random.randint(5000)

        t = time.time()

        # Split into test and training set, where the percentage of each
        # label is maintained
        if any(clf in regressors for clf in param_grid['classifiers']):
            # We cannot do a stratified shuffle split with regression
            stratify = None
        else:
            if modus == 'singlelabel':
                classes_temp = stratify = classes.ravel()
            elif modus == 'multilabel':
                # Create a stratification object from the labels
                # Label = 0 means no label equals one
                # Other label numbers refer to the label name that is 1
                stratify = list()
                for pnum in range(0, len(classes[0])):
                    plabel = 0
                    for lnum, slabel in enumerate(classes):
                        if slabel[pnum] == 1:
                            plabel = lnum + 1
                    stratify.append(plabel)

                # Sklearn multiclass requires rows to be objects/patients
                classes_temp = np.zeros((classes.shape[1], classes.shape[0]))
                for n_patient in range(0, classes.shape[1]):
                    for n_label in range(0, classes.shape[0]):
                        classes_temp[n_patient, n_label] = classes[n_label,
                                                                   n_patient]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        if fixedsplits is None:
            # Use Random Split. Split per patient, not per sample
            unique_patient_ids, unique_indices =\
                np.unique(np.asarray(patient_ids), return_index=True)
            if any(clf in regressors for clf in param_grid['classifiers']):
                unique_stratify = None
            else:
                unique_stratify = [stratify[i] for i in unique_indices]

            try:
                unique_PID_train, unique_PID_test\
                    = train_test_split(unique_patient_ids,
                                       test_size=test_size,
                                       random_state=random_seed,
                                       stratify=unique_stratify)
            except ValueError as e:
                e = str(e) + ' Increase the size of your validation set.'
                raise ae.WORCValueError(e)

            # Check for all ids if they are in test or training
            indices_train = list()
            indices_test = list()
            patient_ID_train = list()
            patient_ID_test = list()
            for num, pid in enumerate(patient_ids):
                if pid in unique_PID_train:
                    indices_train.append(num)

                    # Make sure we get a unique ID
                    if pid in patient_ID_train:
                        n = 1
                        while str(pid + '_' + str(n)) in patient_ID_train:
                            n += 1
                        pid = str(pid + '_' + str(n))
                    patient_ID_train.append(pid)
                else:
                    indices_test.append(num)

                    # Make sure we get a unique ID
                    if pid in patient_ID_test:
                        n = 1
                        while str(pid + '_' + str(n)) in patient_ID_test:
                            n += 1
                        pid = str(pid + '_' + str(n))
                    patient_ID_test.append(pid)

            # Split features and labels accordingly
            X_train = [image_features[i] for i in indices_train]
            X_test = [image_features[i] for i in indices_test]
            if modus == 'singlelabel':
                Y_train = classes_temp[indices_train]
                Y_test = classes_temp[indices_test]
            elif modus == 'multilabel':
                Y_train = classes_temp[indices_train, :]
                Y_test = classes_temp[indices_test, :]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        else:
            # Use predefined splits
            train = fixedsplits[str(i) + '_train'].values
            test = fixedsplits[str(i) + '_test'].values

            # Convert the numbers to the correct indices
            ind_train = list()
            for j in train:
                success = False
                for num, p in enumerate(patient_ids):
                    if j == p:
                        ind_train.append(num)
                        success = True
                if not success:
                    raise ae.WORCIOError("Patient " + str(j).zfill(3) +
                                         " is not included!")

            ind_test = list()
            for j in test:
                success = False
                for num, p in enumerate(patient_ids):
                    if j == p:
                        ind_test.append(num)
                        success = True
                if not success:
                    raise ae.WORCIOError("Patient " + str(j).zfill(3) +
                                         " is not included!")

            X_train = [image_features[i] for i in ind_train]
            X_test = [image_features[i] for i in ind_test]

            patient_ID_train = patient_ids[ind_train]
            patient_ID_test = patient_ids[ind_test]

            if modus == 'singlelabel':
                Y_train = classes_temp[ind_train]
                Y_test = classes_temp[ind_test]
            elif modus == 'multilabel':
                Y_train = classes_temp[ind_train, :]
                Y_test = classes_temp[ind_test, :]
            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        trained_classifier = random_search_parameters(
            features=X_train,
            labels=Y_train,
            param_grid=param_grid,
            n_cores=n_cores,
            random_seed=random_seed,
            **config['HyperOptimization'])

        # We only want to save the feature values and one label array
        X_train = [x[0] for x in X_train]
        X_test = [x[0] for x in X_test]

        temp_save_data = (trained_classifier, X_train, X_test, Y_train, Y_test,
                          patient_ID_train, patient_ID_test, random_seed)

        save_data.append(temp_save_data)

        # Create a temporary save
        if tempsave:
            panda_labels = [
                'trained_classifier', 'X_train', 'X_test', 'Y_train', 'Y_test',
                'config', 'patient_ID_train', 'patient_ID_test', 'random_seed',
                'feature_labels'
            ]

            panda_data_temp =\
                pd.Series([trained_classifier, X_train, X_test, Y_train,
                           Y_test, config, patient_ID_train,
                           patient_ID_test, random_seed, feature_labels],
                          index=panda_labels,
                          name='Constructed crossvalidation')

            panda_data = pd.DataFrame(panda_data_temp)
            n = 0
            filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.hdf5')
            while os.path.exists(filename):
                n += 1
                filename = os.path.join(tempfolder,
                                        'tempsave_' + str(i + n) + '.hdf5')

            panda_data.to_hdf(filename, 'EstimatorData')
            del panda_data, panda_data_temp

        # Print elapsed time
        elapsed = int((time.time() - t) / 60.0)
        print(f'\t Fitting took {elapsed} minutes.')
        logging.debug(f'\t Fitting took {elapsed} minutes.')

    return save_data
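
A standalone sketch of the multi-label stratification encoding used in the loop above: each patient is assigned 0 when no label is positive, otherwise the 1-based index of its positive label. The label matrix is made up.

import numpy as np

# Rows are labels, columns are patients, as in the 'classes' array above
classes = np.array([[1, 0, 0, 0],
                    [0, 0, 1, 0]])

stratify = []
for pnum in range(classes.shape[1]):
    plabel = 0
    for lnum in range(classes.shape[0]):
        if classes[lnum, pnum] == 1:
            plabel = lnum + 1
    stratify.append(plabel)

print(stratify)   # [1, 0, 2, 0]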
def LOO_cross_validation(image_features,
                         feature_labels,
                         classes,
                         patient_ids,
                         param_grid,
                         config,
                         modus,
                         test_size,
                         start=0,
                         save_data=None,
                         tempsave=False,
                         tempfolder=None,
                         fixedsplits=None,
                         fixed_seed=False,
                         use_fastr=None,
                         fastr_plugin=None):
    """Cross-validation in which each sample is once used as the test set.

    Mostly based on the default sklearn object.

    Parameters
    ------------

    Returns
    ------------

    """
    print('Starting leave-one-out cross-validation.')
    logging.debug('Starting leave-one-out cross-validation.')
    cv = LeaveOneOut()
    n_splits = cv.get_n_splits(image_features)

    if save_data is None:
        # Start from zero, thus empty list of previous data
        save_data = list()

    for i, (indices_train,
            indices_test) in enumerate(cv.split(image_features)):
        if i < start:
            continue

        print(('Cross-validation iteration {} / {} .').format(
            str(i + 1), str(n_splits)))
        logging.debug(('Cross-validation iteration {} / {} .').format(
            str(i + 1), str(n_splits)))
        timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print(f'\t Time: {timestamp}.')
        logging.debug(f'\t Time: {timestamp}.')
        if fixed_seed:
            random_seed = i**2
        else:
            random_seed = np.random.randint(5000)

        t = time.time()

        # Split features and labels accordingly
        X_train = [image_features[j] for j in indices_train]
        X_test = [image_features[j] for j in indices_test]
        patient_ID_train = [patient_ids[j] for j in indices_train]
        patient_ID_test = [patient_ids[j] for j in indices_test]

        if modus == 'singlelabel':
            # Simply use the given class labels
            classes_temp = classes.ravel()

            # Split in training and testing
            Y_train = classes_temp[indices_train]
            Y_test = classes_temp[indices_test]

        elif modus == 'multilabel':
            # Sklearn multiclass requires rows to be objects/patients
            classes_temp = np.zeros((classes.shape[1], classes.shape[0]))
            for n_patient in range(0, classes.shape[1]):
                for n_label in range(0, classes.shape[0]):
                    classes_temp[n_patient, n_label] = classes[n_label,
                                                               n_patient]

            # Split in training and testing
            Y_train = classes_temp[indices_train, :]
            Y_test = classes_temp[indices_test, :]

        else:
            raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

        # Find best hyperparameters and construct classifier
        config['HyperOptimization']['use_fastr'] = use_fastr
        config['HyperOptimization']['fastr_plugin'] = fastr_plugin
        n_cores = config['General']['Joblib_ncores']
        trained_classifier = random_search_parameters(
            features=X_train,
            labels=Y_train,
            param_grid=param_grid,
            n_cores=n_cores,
            random_seed=random_seed,
            **config['HyperOptimization'])

        # We only want to save the feature values and one label array
        X_train = [x[0] for x in X_train]
        X_test = [x[0] for x in X_test]

        temp_save_data = (trained_classifier, X_train, X_test, Y_train, Y_test,
                          patient_ID_train, patient_ID_test, random_seed)

        save_data.append(temp_save_data)

        # Create a temporary save
        if tempsave:
            panda_labels = [
                'trained_classifier', 'X_train', 'X_test', 'Y_train', 'Y_test',
                'config', 'patient_ID_train', 'patient_ID_test', 'random_seed',
                'feature_labels'
            ]

            panda_data_temp =\
                pd.Series([trained_classifier, X_train, X_test, Y_train,
                           Y_test, config, patient_ID_train,
                           patient_ID_test, random_seed, feature_labels],
                          index=panda_labels,
                          name='Constructed crossvalidation')

            panda_data = pd.DataFrame(panda_data_temp)
            n = 0
            filename = os.path.join(tempfolder, 'tempsave_' + str(i) + '.hdf5')
            while os.path.exists(filename):
                n += 1
                filename = os.path.join(tempfolder,
                                        'tempsave_' + str(i + n) + '.hdf5')

            panda_data.to_hdf(filename, 'EstimatorData')
            del panda_data, panda_data_temp

        # Print elapsed time
        elapsed = int((time.time() - t) / 60.0)
        print(f'\t Fitting took {elapsed} minutes.')
        logging.debug(f'\t Fitting took {elapsed} minutes.')

    return save_data
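
A minimal illustration of the sklearn LeaveOneOut splitter that drives the loop above: each iteration uses exactly one sample as the test set.

from sklearn.model_selection import LeaveOneOut

samples = ['patient_0', 'patient_1', 'patient_2']
cv = LeaveOneOut()
print(cv.get_n_splits(samples))     # 3
for train_idx, test_idx in cv.split(samples):
    print(train_idx, test_idx)      # [1 2] [0], then [0 2] [1], then [0 1] [2]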
Example #11
def fit_and_score(X,
                  y,
                  scoring,
                  train,
                  test,
                  parameters,
                  fit_params=None,
                  return_train_score=True,
                  return_n_test_samples=True,
                  return_times=True,
                  return_parameters=False,
                  return_estimator=False,
                  error_score='raise',
                  verbose=True,
                  return_all=True):
    """Fit an estimator to a dataset and score the performance.

    The following
    methods can currently be applied as preprocessing before fitting, in
    this order:
    0. Apply OneHotEncoder
    1. Apply feature imputation
    2. Select features based on feature type group (e.g. shape, histogram).
    3. Scale features with e.g. z-scoring.
    4. Apply feature selection based on variance of feature among patients.
    5. Univariate statistical testing (e.g. t-test, Wilcoxon).
    6. Use Relief feature selection.
    7. Select features based on a fit with a LASSO model.
    8. Select features using PCA.
    9. Resampling
    10. If a SingleLabel classifier is used for a MultiLabel problem,
        a OneVsRestClassifier is employed around it.

    All of the steps are optional.

    Parameters
    ----------
    X: array, mandatory
            Array containing for each object (rows) the feature values
            (1st column) and the associated feature labels (2nd column).

    y: list, mandatory
            List containing the labels of the objects.

    scoring: string or sklearn scorer, mandatory
            Function used as optimization criterion for the hyperparameter
            optimization.

    train: list, mandatory
            Indices of the objects to be used as training set.

    test: list, mandatory
            Indices of the objects to be used as testing set.

    parameters: dictionary, mandatory
            Contains the settings used for the above preprocessing functions
            and the fitting. TODO: Create a default object and show the
            fields.

    fit_params:dictionary, default None
            Parameters supplied to the estimator for fitting. See the SKlearn
            site for the parameters of the estimators.

    return_train_score: boolean, default True
            Save the training score to the final SearchCV object.

    return_n_test_samples: boolean, default True
            Save the number of times each sample was used in the test set
            to the final SearchCV object.

    return_times: boolean, default True
            Save the time spent for each fit to the final SearchCV object.

    return_parameters: boolean, default False
            Return the parameters used in the final fit to the final SearchCV
            object.

    return_estimator : bool, default=False
        Whether to return the fitted estimator.

    error_score: numeric or "raise" by default
            Value to assign to the score if an error occurs in estimator
            fitting. If set to "raise", the error is raised. If a numeric
            value is given, FitFailedWarning is raised. This parameter
            does not affect the refit step, which will always raise the error.

    verbose: boolean, default=True
            If True, print intermediate progress to command line. Warnings are
            always printed.

    return_all: boolean, default=True
            If False, only the ret object containing the performance will be
            returned. If True, the ret object plus all fitted objects will be
            returned.

    Returns
    ----------
    Depending on the return_all input parameter, either only ret or all objects
    below are returned.

    ret: list
        Contains optionally the train_scores and the test_scores,
        fit_time, score_time, parameters_est
        and parameters_all.

    GroupSel: WORC GroupSel Object
        Either None if the groupwise feature selection is not used, or
        the fitted object.

    VarSel: WORC VarSel Object
        Either None if the variance threshold feature selection is not used, or
        the fitted object.

    SelectModel: WORC SelectModel Object
        Either None if the feature selection based on a fitted model is not
        used, or the fitted object.

    feature_labels: list
        Labels of the features. Only one list is returned, not one per
        feature object, as we assume all samples have the same feature names.

    scaler: scaler object
        Either None if feature scaling is not used, or
        the fitted object.

    encoder: WORC Encoder Object
        Either None if feature OneHotEncoding is not used, or
        the fitted object.

    imputer: WORC Imputer Object
        Either None if feature imputation is not used, or
        the fitted object.

    pca: WORC PCA Object
        Either None if PCA based feature selection is not used, or
        the fitted object.

    StatisticalSel: WORC StatisticalSel Object
        Either None if the statistical test feature selection is not used, or
        the fitted object.

    ReliefSel: WORC ReliefSel Object
        Either None if the RELIEF feature selection is not used, or
        the fitted object.

    Sampler: WORC ObjectSampler Object
        Either None if no resampling is used, or an ObjectSampler object


    """
    # We copy the parameter object so we can alter it and keep the original
    if verbose:
        print("\n")
        print('#######################################')
        print('Starting fit and score of new workflow.')
    para_estimator = parameters.copy()
    estimator = cc.construct_classifier(para_estimator)

    # Check the scorer
    scorers, __ = check_multimetric_scoring(estimator, scoring=scoring)

    para_estimator = delete_cc_para(para_estimator)

    # Get random seed from parameters
    random_seed = para_estimator['random_seed']
    del para_estimator['random_seed']

    # X is a tuple: split in two arrays
    feature_values = np.asarray([x[0] for x in X])
    feature_labels = np.asarray([x[1] for x in X])

    # Split in train and testing
    X_train, y_train = _safe_split(estimator, feature_values, y, train)
    X_test, y_test = _safe_split(estimator, feature_values, y, test, train)
    train = np.arange(0, len(y_train))
    test = np.arange(len(y_train), len(y_train) + len(y_test))

    # Set some defaults for if a part fails and we return a dummy
    fit_time = np.inf
    score_time = np.inf
    Sampler = None
    encoder = None
    imputer = None
    scaler = None
    GroupSel = None
    SelectModel = None
    pca = None
    StatisticalSel = None
    VarSel = None
    ReliefSel = None
    if isinstance(scorers, dict):
        test_scores = {name: np.nan for name in scorers}
        if return_train_score:
            train_scores = test_scores.copy()
    else:
        test_scores = error_score
        if return_train_score:
            train_scores = error_score

    # Initialize dummy return object for when fit and scoring fails: sklearn defaults
    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(para_estimator)
    if return_estimator:
        ret.append(estimator)

    # Additional to sklearn defaults: return all parameters
    ret.append(parameters)
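
    # Each preprocessing block below reads its own keys from para_estimator
    # (e.g. 'OneHotEncoding', 'Imputation', 'SelectGroups', 'FeatureScaling',
    # 'Featsel_Variance', 'ReliefUse', 'SelectFromModel') and deletes them
    # afterwards, so that only the estimator parameters remain for fitting.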

    # ------------------------------------------------------------------------
    # OneHotEncoder
    if 'OneHotEncoding' in para_estimator.keys():
        if para_estimator['OneHotEncoding'] == 'True':
            if verbose:
                print('Applying OneHotEncoding, will ignore unknowns.')
            feature_labels_tofit =\
                para_estimator['OneHotEncoding_feature_labels_tofit']
            encoder =\
                OneHotEncoderWrapper(handle_unknown='ignore',
                                     feature_labels_tofit=feature_labels_tofit,
                                     verbose=verbose)
            encoder.fit(X_train, feature_labels)

            if encoder.encoder is not None:
                # Encoder is fitted
                feature_labels = encoder.encoder.encoded_feature_labels
                X_train = encoder.transform(X_train)
                X_test = encoder.transform(X_test)

        del para_estimator['OneHotEncoding']
        del para_estimator['OneHotEncoding_feature_labels_tofit']

    # Delete the object if we do not need to return it
    if not return_all:
        del encoder

    # ------------------------------------------------------------------------
    # Feature imputation
    if 'Imputation' in para_estimator.keys():
        if para_estimator['Imputation'] == 'True':
            imp_type = para_estimator['ImputationMethod']
            if verbose:
                print(f'Imputing NaN with {imp_type}.')
            imp_nn = para_estimator['ImputationNeighbours']

            imputer = Imputer(missing_values=np.nan,
                              strategy=imp_type,
                              n_neighbors=imp_nn)
            imputer.fit(X_train)

            original_shape = X_train.shape
            X_train = imputer.transform(X_train)
            imputed_shape = X_train.shape
            X_test = imputer.transform(X_test)

            if original_shape != imputed_shape:
                removed_features = original_shape[1] - imputed_shape[1]
                raise ae.WORCValueError(
                    f'Several features ({removed_features}) were np.NaN for all objects. Hence, imputation was not possible. Either make sure this is correct and turn off imputation, or correct the features.'
                )

        del para_estimator['Imputation']
        del para_estimator['ImputationMethod']
        del para_estimator['ImputationNeighbours']

    # Delete the object if we do not need to return it
    if not return_all:
        del imputer

    # Remove any NaN feature values if these are still left after imputation
    X_train = replacenan(X_train,
                         verbose=verbose,
                         feature_labels=feature_labels[0])
    X_test = replacenan(X_test,
                        verbose=verbose,
                        feature_labels=feature_labels[0])

    # ------------------------------------------------------------------------
    # Groupwise feature selection
    if 'SelectGroups' in para_estimator:
        if verbose:
            print("Selecting groups of features.")
        del para_estimator['SelectGroups']
        # TODO: more elegant way to solve this
        feature_groups = [
            'shape_features', 'histogram_features', 'orientation_features',
            'texture_gabor_features', 'texture_glcm_features',
            'texture_gldm_features', 'texture_glcmms_features',
            'texture_glrlm_features', 'texture_glszm_features',
            'texture_gldzm_features', 'texture_ngtdm_features',
            'texture_ngldm_features', 'texture_lbp_features', 'dicom_features',
            'semantic_features', 'coliage_features', 'vessel_features',
            'phase_features', 'fractal_features', 'location_features',
            'rgrd_features', 'original_features', 'wavelet_features',
            'log_features'
        ]

        # First take out the toolbox selection, which is a list
        toolboxes = para_estimator['toolbox']
        del para_estimator['toolbox']

        # Check per feature group if the parameter is present
        parameters_featsel = dict()
        for group in feature_groups:
            if group not in para_estimator:
                # Default: do use the group, except for texture features
                if group == 'texture_features':
                    value = 'False'
                else:
                    value = 'True'
            else:
                value = para_estimator[group]
                del para_estimator[group]

            parameters_featsel[group] = value

        # Fit groupwise feature selection object
        GroupSel = SelectGroups(parameters=parameters_featsel,
                                toolboxes=toolboxes)
        GroupSel.fit(feature_labels[0])
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))

        # Transform all objects accordingly
        X_train = GroupSel.transform(X_train)
        X_test = GroupSel.transform(X_test)
        if verbose:
            print("\t New Length: " + str(len(X_train[0])))
        feature_labels = GroupSel.transform(feature_labels)

    # Delete the object if we do not need to return it
    if not return_all:
        del GroupSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably all feature groups were set to False. Parameters:'
            )
            print(parameters)

        # Delete the non-used fields
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # ------------------------------------------------------------------------
    # Feature scaling
    if verbose and para_estimator['FeatureScaling'] != 'None':
        print('Fitting scaler and transforming features, method ' +
              f'{para_estimator["FeatureScaling"]}.')

    scaling_method = para_estimator['FeatureScaling']
    if scaling_method == 'None':
        scaler = None
    else:
        skip_features = para_estimator['FeatureScaling_skip_features']
        n_skip_feat = len([
            i for i in feature_labels[0] if any(e in i for e in skip_features)
        ])
        if n_skip_feat == len(X_train[0]):
            # Don't need to scale any features
            if verbose:
                print(
                    '[WORC Warning] Skipping scaling, only skip features selected.'
                )
            scaler = None
        else:
            scaler = WORCScaler(method=scaling_method,
                                skip_features=skip_features)
            scaler.fit(X_train, feature_labels[0])

    if scaler is not None:
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    del para_estimator['FeatureScaling']

    # Delete the object if we do not need to return it
    if not return_all:
        del scaler

    # --------------------------------------------------------------------
    # Feature selection based on variance
    if para_estimator['Featsel_Variance'] == 'True':
        if verbose:
            print("Selecting features based on variance.")
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))
        try:
            X_train, feature_labels, VarSel =\
                selfeat_variance(X_train, feature_labels)
            X_test = VarSel.transform(X_test)
        except ValueError:
            if verbose:
                print(
                    '[WARNING]: No features meet the selected Variance threshold! Skipping selection.'
                )
        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    del para_estimator['Featsel_Variance']

    # Delete the object if we do not need to return it
    if not return_all:
        del VarSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably your features have too little variance. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # --------------------------------------------------------------------
    # Relief feature selection, possibly multiclass.
    # Needs to be done after scaling!
    if 'ReliefUse' in para_estimator.keys():
        if para_estimator['ReliefUse'] == 'True':
            if verbose:
                print("Selecting features using relief.")

            # Get parameters from para_estimator
            n_neighbours = para_estimator['ReliefNN']
            sample_size = para_estimator['ReliefSampleSize']
            distance_p = para_estimator['ReliefDistanceP']
            numf = para_estimator['ReliefNumFeatures']

            # Fit RELIEF object
            ReliefSel = SelectMulticlassRelief(n_neighbours=n_neighbours,
                                               sample_size=sample_size,
                                               distance_p=distance_p,
                                               numf=numf,
                                               random_state=random_seed)
            ReliefSel.fit(X_train, y)
            if verbose:
                print("\t Original Length: " + str(len(X_train[0])))

            # Transform all objects accordingly
            X_train = ReliefSel.transform(X_train)
            X_test = ReliefSel.transform(X_test)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))
            feature_labels = ReliefSel.transform(feature_labels)

        del para_estimator['ReliefUse']
        del para_estimator['ReliefNN']
        del para_estimator['ReliefSampleSize']
        del para_estimator['ReliefDistanceP']
        del para_estimator['ReliefNumFeatures']

    # Delete the object if we do not need to return it
    if not return_all:
        del ReliefSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably RELIEF could not properly select features. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # ------------------------------------------------------------------------
    # Perform feature selection using a model
    if 'SelectFromModel' in para_estimator.keys(
    ) and para_estimator['SelectFromModel'] == 'True':
        model = para_estimator['SelectFromModel_estimator']
        if verbose:
            print(f"Selecting features using model {model}.")

        if model == 'Lasso':
            # Use lasso model for feature selection
            alpha = para_estimator['SelectFromModel_lasso_alpha']
            selectestimator = Lasso(alpha=alpha)

        elif model == 'LR':
            # Use logistic regression model for feature selection
            selectestimator = LogisticRegression()

        elif model == 'RF':
            # Use random forest model for feature selection
            n_estimators = para_estimator['SelectFromModel_n_trees']
            selectestimator = RandomForestClassifier(n_estimators=n_estimators)
        else:
            raise ae.WORCKeyError(
                f'Model {model} is not known for SelectFromModel. Use Lasso, LR, or RF.'
            )

        # Prefit model
        selectestimator.fit(X_train, y_train)

        # Use fit to select optimal features
        SelectModel = SelectFromModel(selectestimator, prefit=True)
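        # With prefit=True, SelectFromModel only thresholds the coefficients or
        # feature importances of the estimator fitted above; it does not refit.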
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))

        X_train_temp = SelectModel.transform(X_train)
        if len(X_train_temp[0]) == 0:
            if verbose:
                print(
                    '[WORC WARNING]: No features are selected! Probably your data is too noisy or the selection too strict. Skipping SelectFromModel.'
                )
            SelectModel = None
            parameters['SelectFromModel'] = 'False'
        else:
            X_train = SelectModel.transform(X_train)
            X_test = SelectModel.transform(X_test)
            feature_labels = SelectModel.transform(feature_labels)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))

    if 'SelectFromModel' in para_estimator.keys():
        del para_estimator['SelectFromModel']
        del para_estimator['SelectFromModel_lasso_alpha']
        del para_estimator['SelectFromModel_estimator']
        del para_estimator['SelectFromModel_n_trees']

    # Delete the object if we do not need to return it
    if not return_all:
        del SelectModel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably SelectFromModel could not properly select features. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)

        if return_all:
            return ret, GroupSel, VarSel, SelectModel, feature_labels[
                0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
        else:
            return ret

    # ----------------------------------------------------------------
    # Principal Component Analysis (PCA) dimensionality reduction
    if 'UsePCA' in para_estimator.keys(
    ) and para_estimator['UsePCA'] == 'True':
        if verbose:
            print('Fitting PCA')
            print("\t Original Length: " + str(len(X_train[0])))
        if para_estimator['PCAType'] == '95variance':
            # Select the first components that together explain 95% of the variance
            pca = PCA(n_components=None, random_state=random_seed)
            try:
                pca.fit(X_train)
            except (ValueError, LinAlgError) as e:
                if verbose:
                    print(
                        f'[WARNING]: skipping this setting due to PCA Error: {e}.'
                    )

                if return_all:
                    return ret, GroupSel, VarSel, SelectModel, feature_labels[
                        0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
                else:
                    return ret

            evariance = pca.explained_variance_ratio_
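            # Count how many leading components are needed to reach 95%
            # cumulative explained variance; with n_components=None the ratios
            # sum to 1.0, so the loop terminates.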
            num = 0
            explained = 0
            while explained < 0.95:
                explained += evariance[num]
                num += 1

            # Make a PCA with the determined number of components
            pca = PCA(n_components=num, random_state=random_seed)
            try:
                pca.fit(X_train)
            except (ValueError, LinAlgError) as e:
                if verbose:
                    print(
                        f'[WARNING]: skipping this setting due to PCA Error: {e}.'
                    )

                if return_all:
                    return ret, GroupSel, VarSel, SelectModel, feature_labels[
                        0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
                else:
                    return ret

            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        else:
            # Assume a fixed number of components: cannot be larger than
            # n_samples
            n_components = min(len(X_train), int(para_estimator['PCAType']))

            if n_components >= len(X_train[0]):
                if verbose:
                    print(
                        f"[WORC WARNING] PCA n_components ({n_components})> n_features ({len(X_train[0])}): skipping PCA."
                    )
            else:
                pca = PCA(n_components=n_components, random_state=random_seed)
                pca.fit(X_train)
                X_train = pca.transform(X_train)
                X_test = pca.transform(X_test)

        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    # Delete the object if we do not need to return it
    if not return_all:
        del pca

    if 'UsePCA' in para_estimator.keys():
        del para_estimator['UsePCA']
        del para_estimator['PCAType']

    # --------------------------------------------------------------------
    # Feature selection based on a statistical test
    if 'StatisticalTestUse' in para_estimator.keys():
        if para_estimator['StatisticalTestUse'] == 'True':
            metric = para_estimator['StatisticalTestMetric']
            threshold = para_estimator['StatisticalTestThreshold']
            if verbose:
                print(
                    f"Selecting features based on statistical test. Method {metric}, threshold {round(threshold, 5)}."
                )
                print("\t Original Length: " + str(len(X_train[0])))

            StatisticalSel = StatisticalTestThreshold(metric=metric,
                                                      threshold=threshold)

            StatisticalSel.fit(X_train, y)
            X_train_temp = StatisticalSel.transform(X_train)
            if len(X_train_temp[0]) == 0:
                if verbose:
                    print(
                        '[WORC WARNING]: No features are selected! Probably your statistical test feature selection was too strict. Skipping thresholding.'
                    )
                StatisticalSel = None
                parameters['StatisticalTestUse'] = 'False'
            else:
                X_train = StatisticalSel.transform(X_train)
                X_test = StatisticalSel.transform(X_test)
                feature_labels = StatisticalSel.transform(feature_labels)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))

        del para_estimator['StatisticalTestUse']
        del para_estimator['StatisticalTestMetric']
        del para_estimator['StatisticalTestThreshold']

    # Delete the object if we do not need to return it
    if not return_all:
        del StatisticalSel

    # ------------------------------------------------------------------------
    # Use object resampling
    if 'Resampling_Use' in para_estimator.keys():
        if para_estimator['Resampling_Use'] == 'True':

            # Determine our starting balance
            pos_initial = int(np.sum(y_train))
            neg_initial = int(len(y_train) - pos_initial)
            len_in = len(y_train)

            # Fit ObjectSampler and transform dataset
            # NOTE: need to save random state for this one as well!
            Sampler =\
                ObjectSampler(method=para_estimator['Resampling_Method'],
                              sampling_strategy=para_estimator['Resampling_sampling_strategy'],
                              n_jobs=para_estimator['Resampling_n_cores'],
                              n_neighbors=para_estimator['Resampling_n_neighbors'],
                              k_neighbors=para_estimator['Resampling_k_neighbors'],
                              threshold_cleaning=para_estimator['Resampling_threshold_cleaning'],
                              verbose=verbose)

            try:
                Sampler.fit(X_train, y_train)
                X_train_temp, y_train_temp = Sampler.transform(
                    X_train, y_train)

            except ae.WORCValueError as e:
                message = str(e)
                if verbose:
                    print('[WORC WARNING] Skipping resampling: ' + message)
                Sampler = None
                parameters['Resampling_Use'] = 'False'

            except RuntimeError as e:
                if 'ADASYN is not suited for this specific dataset. Use SMOTE instead.' in str(
                        e):
                    # Rarely occurs, therefore return a performance dummy
                    if verbose:
                        print(
                            f'[WARNING]: {e}. Returning dummies. Parameters: ')
                        print(parameters)
                    para_estimator = delete_nonestimator_parameters(
                        para_estimator)

                    if return_all:
                        return ret, GroupSel, VarSel, SelectModel, feature_labels[
                            0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
                    else:
                        return ret
                else:
                    raise e
            else:
                pos = int(np.sum(y_train_temp))
                neg = int(len(y_train_temp) - pos)
                if pos < 10 or neg < 10:
                    if verbose:
                        print(
                            f'[WORC WARNING] Skipping resampling: too few objects returned in one or both classes (pos: {pos}, neg: {neg}).'
                        )
                    Sampler = None
                    parameters['Resampling_Use'] = 'False'
                else:
                    X_train = X_train_temp
                    y_train = y_train_temp

                    # Notify the user what the resampling did
                    pos = int(np.sum(y_train))
                    neg = int(len(y_train) - pos)
                    if verbose:
                        message = f"Resampling from {len_in} ({pos_initial} pos," +\
                                  f" {neg_initial} neg) to {len(y_train)} ({pos} pos, {neg} neg) patients."
                        print(message)

                    # Also reset train and test indices
                    train = np.arange(0, len(y_train))
                    test = np.arange(len(y_train), len(y_train) + len(y_test))
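                    # These indices are consumed by _fit_and_score below via
                    # the train/test split of the concatenated feature matrix.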

        del para_estimator['Resampling_Use']
        del para_estimator['Resampling_Method']
        del para_estimator['Resampling_sampling_strategy']
        del para_estimator['Resampling_n_neighbors']
        del para_estimator['Resampling_k_neighbors']
        del para_estimator['Resampling_threshold_cleaning']
        del para_estimator['Resampling_n_cores']

    # Delete the object if we do not need to return it
    if not return_all:
        del Sampler

    # ----------------------------------------------------------------
    # Fitting and scoring
    # Only when using fastr this is an entry
    if 'Number' in para_estimator.keys():
        del para_estimator['Number']

    # For safety, delete all non-estimator parameters again
    para_estimator = delete_nonestimator_parameters(para_estimator)
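    # After this point, para_estimator may only contain keyword arguments that
    # the estimator itself accepts, since it is passed to set_params below for
    # the multiclass case.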

    # NOTE: This should move to the construct classifier function, but it is
    # more convenient to handle here due to the hyperparameter search.
    if type(y) is list:
        labellength = 1
    else:
        try:
            labellength = y.shape[1]
        except IndexError:
            labellength = 1

    if labellength > 1 and type(estimator) not in [
            RankedSVM, RandomForestClassifier
    ]:
        # Multiclass, hence employ a multiclass classifier for e.g. SVM, LR
        estimator.set_params(**para_estimator)
        estimator = OneVsRestClassifier(estimator)

    if verbose:
        print(f"Fitting ML method: {parameters['classifiers']}.")

    # Recombine feature values and label for train and test set
    feature_values = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    para_estimator = None

    try:
        ret = _fit_and_score(estimator,
                             feature_values,
                             y,
                             scorers,
                             train,
                             test,
                             verbose,
                             para_estimator,
                             fit_params,
                             return_train_score=return_train_score,
                             return_parameters=return_parameters,
                             return_n_test_samples=return_n_test_samples,
                             return_times=return_times,
                             return_estimator=return_estimator,
                             error_score=error_score)
    except (ValueError, LinAlgError) as e:
        if type(estimator) == LDA:
            if verbose:
                print(
                    f'[WARNING]: skipping this setting due to LDA Error: {e}.')

            if return_all:
                return ret, GroupSel, VarSel, SelectModel, feature_labels[
                    0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
            else:
                return ret
        else:
            raise e

    # Add original parameters to return object
    ret.append(parameters)

    if return_all:
        return ret, GroupSel, VarSel, SelectModel, feature_labels[
            0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
    else:
        return ret
def plot_estimator_performance(prediction,
                               label_data,
                               label_type,
                               crossval_type=None,
                               alpha=0.95,
                               ensemble=None,
                               verbose=True,
                               ensemble_scoring=None,
                               output=None,
                               modus=None,
                               thresholds=None,
                               survival=False,
                               shuffle_estimators=False,
                               bootstrap=None,
                               bootstrap_N=None,
                               overfit_scaler=None):
    """Plot the output of a single estimator, e.g. a SVM.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        Output of the trainclassifier function: either a pandas dataframe
        or a path to an HDF5 file.

    label_data: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    label_type: string, mandatory
        Name of the label to extract from the label data to test the
        estimator on.

    alpha: float, default 0.95
        Significance of confidence intervals.

    ensemble: False, integer or 'Caruana'
        Determine whether an ensemble will be created. If so,
        either provide an integer to determine how many of the
        top performing classifiers should be in the ensemble, or use
        the string "Caruana" to use smart ensembling based on
        Caruana et al. 2004.

    verbose: boolean, default True
        Print intermediate messages.

    ensemble_scoring: string, default None
        Metric to be used for evaluating the ensemble. If None,
        the option set in the prediction object will be used.

    output: string, default 'stats'
        Determines which results are returned. If 'stats', the performance
        statistics of the estimator are returned; if 'scores', the ground
        truths, scores, predictions, and patient IDs are returned.

    thresholds: list of float(s), default None
        If None, use the default sklearn threshold (0.5) on the posteriors to
        obtain a binary prediction. If one threshold is provided, use that one.
        If two thresholds are provided, posterior < thresh[0] maps to 0 and
        posterior > thresh[1] maps to 1; patients in between are excluded.

    Returns
    ----------
    Depending on the output parameter, the following outputs are returned:

    If output == 'stats':
    stats: dictionary
        Contains the confidence intervals of the performance metrics
        and the number of times each patient was classified correctly
        or incorrectly.

    If output == 'scores':
    y_truths: list
        Contains the true label for each object.

    y_scores: list
        Contains the score (e.g. posterior) for each object.

    y_predictions: list
        Contains the predicted label for each object.

    pids: list
        Contains the patient ID/name for each object.
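
    Example
    -------
    A minimal, hypothetical call; the file names and the label name below are
    placeholders rather than anything shipped with WORC:

        performance = plot_estimator_performance(
            'estimator.hdf5', 'patient_labels.txt', 'label1', output='stats')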

    """
    # Load the prediction object if it's a hdf5 file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(
                ('{} is not an existing file!').format(str(prediction)))

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    if label_type is None:
        label_type = keys[0]

    # Load the label data
    if type(label_data) is not dict:
        if os.path.isfile(label_data):
            if type(label_type) is not list:
                # Singlelabel: convert to list
                label_type = [[label_type]]
            label_data = lp.load_labels(label_data, label_type)
        else:
            raise ae.WORCValueError(
                f"Label data {label_data} incorrect: not a dictionary, or file does not exist."
            )

    n_labels = len(label_type)
    patient_IDs = label_data['patient_IDs']
    labels = label_data['label']

    if type(label_type) is list:
        # FIXME: Support for multiple label types not supported yet.
        print(
            '[WORC Warning] Support for multiple label types not supported yet. Taking first label for plot_estimator_performance.'
        )
        label_type = keys[0]

    # Extract the estimators, features and labels
    regression = is_regressor(
        prediction[label_type]['classifiers'][0].best_estimator_)
    feature_labels = prediction[label_type]['feature_labels']

    # Get some configuration variables if present in the prediction
    config = prediction[label_type].config
    if ensemble is None:
        ensemble = int(config['Ensemble']['Use'])

    if modus is None:
        modus = config['Labels']['modus']

    if crossval_type is None:
        crossval_type = config['CrossValidation']['Type']

    if bootstrap is None:
        bootstrap = config['Bootstrap']['Use']

    if bootstrap_N is None:
        bootstrap_N = int(config['Bootstrap']['N_iterations'])

    if overfit_scaler is None:
        overfit_scaler = config['Evaluation']['OverfitScaler']

    ensemble_metric = config['Ensemble']['Metric']

    # Create lists for performance measures
    if not regression:
        sensitivity = list()
        specificity = list()
        precision = list()
        npv = list()
        accuracy = list()
        bca = list()
        auc = list()
        f1_score_list = list()

        if modus == 'multilabel':
            acc_av = list()

            # Also add scoring measures for all single label scores
            sensitivity_single = [list() for _ in range(n_labels)]
            specificity_single = [list() for _ in range(n_labels)]
            precision_single = [list() for _ in range(n_labels)]
            npv_single = [list() for _ in range(n_labels)]
            accuracy_single = [list() for _ in range(n_labels)]
            bca_single = [list() for _ in range(n_labels)]
            auc_single = [list() for _ in range(n_labels)]
            f1_score_list_single = [list() for _ in range(n_labels)]

    else:
        r2score = list()
        MSE = list()
        coefICC = list()
        PearsonC = list()
        PearsonP = list()
        SpearmanC = list()
        SpearmanP = list()

    patient_classification_list = dict()
    percentages_selected = list()

    if output in ['scores', 'decision'] or crossval_type == 'LOO':
        # Keep track of all ground truths and scores
        y_truths = list()
        y_scores = list()
        y_predictions = list()
        pids = list()

    # Extract sample size
    N_1 = float(len(prediction[label_type]['patient_ID_train'][0]))
    N_2 = float(len(prediction[label_type]['patient_ID_test'][0]))
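    # Train (N_1) and test (N_2) set sizes; both are used below when
    # constructing the confidence intervals of the performance metrics.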

    # Convert tuples to lists if required
    if type(prediction[label_type]['X_test']) is tuple:
        prediction[label_type]['X_test'] = list(
            prediction[label_type]['X_test'])
        prediction[label_type]['X_train'] = list(
            prediction[label_type]['X_train'])
        prediction[label_type]['Y_train'] = list(
            prediction[label_type]['Y_train'])
        prediction[label_type]['Y_test'] = list(
            prediction[label_type]['Y_test'])
        prediction[label_type]['patient_ID_test'] = list(
            prediction[label_type]['patient_ID_test'])
        prediction[label_type]['patient_ID_train'] = list(
            prediction[label_type]['patient_ID_train'])
        prediction[label_type]['classifiers'] = list(
            prediction[label_type]['classifiers'])

    # Loop over the test sets, which correspond to cross-validation
    # or bootstrapping iterations
    n_iter = len(prediction[label_type]['Y_test'])
    if bootstrap:
        iterobject = range(0, bootstrap_N)
    else:
        iterobject = range(0, n_iter)

    for i in iterobject:
        print("\n")
        if bootstrap:
            print(f"Bootstrap {i + 1} / {bootstrap_N}.")
        else:
            print(f"Cross-validation {i + 1} / {n_iter}.")

        test_indices = list()

        # When bootstrapping, there is only a single train/test set.
        if bootstrap:
            if i == 0:
                X_test_temp_or = prediction[label_type]['X_test'][0]
                X_train_temp = prediction[label_type]['X_train'][0]
                Y_train_temp = prediction[label_type]['Y_train'][0]
                Y_test_temp_or = prediction[label_type]['Y_test'][0]
                test_patient_IDs_or = prediction[label_type][
                    'patient_ID_test'][0]
                train_patient_IDs = prediction[label_type]['patient_ID_train'][
                    0]
                fitted_model = prediction[label_type]['classifiers'][0]

                # Objects required for first iteration
                test_patient_IDs = test_patient_IDs_or[:]
                X_test_temp = X_test_temp_or[:]
                Y_test_temp = Y_test_temp_or[:]
        else:
            X_test_temp = prediction[label_type]['X_test'][i]
            X_train_temp = prediction[label_type]['X_train'][i]
            Y_train_temp = prediction[label_type]['Y_train'][i]
            Y_test_temp = prediction[label_type]['Y_test'][i]
            test_patient_IDs = prediction[label_type]['patient_ID_test'][i]
            train_patient_IDs = prediction[label_type]['patient_ID_train'][i]
            fitted_model = prediction[label_type]['classifiers'][i]

        # Check which patients are in the test set.
        if output == 'stats' and crossval_type != 'LOO':
            for i_ID in test_patient_IDs:
                # Initiate counting how many times a patient is classified correctly
                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

                # Check if this is exactly the label of the patient within the label file
                if i_ID not in patient_IDs:
                    print(
                        f'[WORC WARNING] Patient {i_ID} is not found in the patient labels, removing underscore.'
                    )
                    i_ID = i_ID.split("_")[0]
                    if i_ID not in patient_IDs:
                        print(
                            f'[WORC WARNING] Did not help, excluding patient {i_ID}.'
                        )
                        continue

                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

        # Extract ground truth
        y_truth = Y_test_temp

        # If required, shuffle estimators for "Random" ensembling
        if shuffle_estimators:
            # Randomly shuffle the estimators
            print('Shuffling estimators for random ensembling.')
            shuffle(fitted_model.cv_results_['params'])

        # If requested, first let the SearchCV object create an ensemble
        if bootstrap and i > 0:
            # For bootstrapping, only do this at the first iteration
            pass
        elif not fitted_model.ensemble:
            # If required, rank according to generalization score instead of mean_validation_score
            if ensemble_metric == 'generalization':
                print('Using generalization score for estimator ranking.')
                indices = fitted_model.cv_results_['rank_generalization_score']
                fitted_model.cv_results_['params'] = [
                    fitted_model.cv_results_['params'][i]
                    for i in indices[::-1]
                ]
            elif ensemble_metric != 'Default':
                raise ae.WORCKeyError(
                    f'Metric {ensemble_metric} is not known: use Default or generalization.'
                )

            # NOTE: Added for backward compatibility
            if not hasattr(fitted_model, 'cv_iter'):
                cv_iter = list(
                    fitted_model.cv.split(X_train_temp, Y_train_temp))
                fitted_model.cv_iter = cv_iter

            # Create the ensemble
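            # create_ensemble is given each sample as a (feature values,
            # feature labels) tuple, so pair every training sample with the
            # feature label list first.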
            X_train_temp = [(x, feature_labels) for x in X_train_temp]
            fitted_model.create_ensemble(X_train_temp,
                                         Y_train_temp,
                                         method=ensemble,
                                         verbose=verbose,
                                         scoring=ensemble_scoring,
                                         overfit_scaler=overfit_scaler)

        # If bootstrap, generate a bootstrapped sample
        if bootstrap and i > 0:
            y_truth, y_prediction, y_score, test_patient_IDs =\
                resample(y_truth_all, y_prediction_all,
                         y_score_all, test_patient_IDs_or)
        else:
            # Create prediction
            y_prediction = fitted_model.predict(X_test_temp)

            if regression:
                y_score = y_prediction
            elif modus == 'multilabel':
                y_score = fitted_model.predict_proba(X_test_temp)
            else:
                y_score = fitted_model.predict_proba(X_test_temp)[:, 1]

            # Create a new binary score based on the thresholds if given
            if thresholds is not None:
                if len(thresholds) == 1:
                    y_prediction = y_score >= thresholds[0]
                elif len(thresholds) == 2:
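                    # With two thresholds, only patients whose posterior falls
                    # outside the fitted [lower, upper] band are kept; the
                    # remaining "uncertain" patients are excluded from scoring.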

                    y_score_temp = list()
                    y_prediction_temp = list()
                    y_truth_temp = list()
                    test_patient_IDs_temp = list()

                    thresholds_val = fit_thresholds(thresholds, fitted_model,
                                                    X_train_temp, Y_train_temp,
                                                    ensemble, ensemble_scoring)
                    for pnum in range(len(y_score)):
                        if y_score[pnum] <= thresholds_val[0] or y_score[
                                pnum] > thresholds_val[1]:
                            y_score_temp.append(y_score[pnum])
                            y_prediction_temp.append(y_prediction[pnum])
                            y_truth_temp.append(y_truth[pnum])
                            test_patient_IDs_temp.append(
                                test_patient_IDs[pnum])

                    perc = float(len(y_prediction_temp)) / float(
                        len(y_prediction))
                    percentages_selected.append(perc)
                    print(
                        f"Selected {len(y_prediction_temp)} from {len(y_prediction)} ({perc}%) patients using two thresholds."
                    )
                    y_score = y_score_temp
                    y_prediction = y_prediction_temp
                    y_truth = y_truth_temp
                    test_patient_IDs = test_patient_IDs_temp
                else:
                    raise ae.WORCValueError(
                        f"Need None, one or two thresholds on the posterior; got {len(thresholds)}."
                    )

            # If all scores are NaN, the classifier cannot do probabilities, thus
            # use hard predictions
            if np.sum(np.isnan(y_score)) == len(y_prediction):
                print(
                    '[WORC Warning] All scores NaN, replacing with prediction.'
                )
                y_score = y_prediction

        if bootstrap and i == 0:
            # Save objects for re-use
            y_truth_all = y_truth[:]
            y_prediction_all = y_prediction[:]
            y_score_all = y_score[:]

        print("Truth: " + str(y_truth))
        print("Prediction: " + str(y_prediction))
        print("Score: " + str(y_score))

        if output == 'stats' and crossval_type != 'LOO':
            # Add if patient was classified correctly or not to counting
            for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction,
                                                     test_patient_IDs):
                if modus == 'multilabel':
                    success = (i_truth == i_predict).all()
                else:
                    success = i_truth == i_predict

                if success:
                    patient_classification_list[i_test_ID]['N_correct'] += 1
                else:
                    patient_classification_list[i_test_ID]['N_wrong'] += 1

        if output in ['decision', 'scores'] or crossval_type == 'LOO':
            # Output the posteriors
            y_scores.append(y_score)
            y_truths.append(y_truth)
            y_predictions.append(y_prediction)
            pids.append(test_patient_IDs)

        elif output == 'stats':
            # Compute statistics
            print('Computing performance statistics.')
            # Compute confusion matrix and use for sensitivity/specificity
            performances = compute_statistics(y_truth, y_score, y_prediction,
                                              modus, regression)

            # Print AUC to keep you up to date
            if not regression:
                if modus == 'singlelabel':
                    accuracy_temp, bca_temp, sensitivity_temp,\
                        specificity_temp, precision_temp, npv_temp,\
                        f1_score_temp, auc_temp = performances
                else:
                    accuracy_temp, sensitivity_temp,\
                        specificity_temp, precision_temp, npv_temp,\
                        f1_score_temp, auc_temp, acc_av_temp,\
                        accuracy_temp_single,\
                        bca_temp_single, sensitivity_temp_single,\
                        specificity_temp_single, precision_temp_single,\
                        npv_temp_single, f1_score_temp_single,\
                        auc_temp_single = performances

                print('AUC: ' + str(auc_temp))

                # Append performance to lists for all cross validations
                accuracy.append(accuracy_temp)
                bca.append(bca_temp)
                sensitivity.append(sensitivity_temp)
                specificity.append(specificity_temp)
                auc.append(auc_temp)
                f1_score_list.append(f1_score_temp)
                precision.append(precision_temp)
                npv.append(npv_temp)

                if modus == 'multilabel':
                    acc_av.append(acc_av_temp)
                    for j in range(n_labels):
                        accuracy_single[j].append(accuracy_temp_single[j])
                        bca_single[j].append(bca_temp_single[j])
                        sensitivity_single[j].append(
                            sensitivity_temp_single[j])
                        specificity_single[j].append(
                            specificity_temp_single[j])
                        auc_single[j].append(auc_temp_single[j])
                        f1_score_list_single[j].append(f1_score_temp_single[j])
                        precision_single[j].append(precision_temp_single[j])
                        npv_single[j].append(npv_temp_single[j])

            else:
                r2score_temp, MSE_temp, coefICC_temp, PearsonC_temp,\
                    PearsonP_temp, SpearmanC_temp,\
                    SpearmanP_temp = performances

                print('R2 Score: ' + str(r2score_temp))
                r2score.append(r2score_temp)
                MSE.append(MSE_temp)
                coefICC.append(coefICC_temp)
                PearsonC.append(PearsonC_temp)
                PearsonP.append(PearsonP_temp)
                SpearmanC.append(SpearmanC_temp)
                SpearmanP.append(SpearmanP_temp)

        # Delete some objects to save memory in cross-validation
        if not bootstrap:
            del fitted_model, X_test_temp, X_train_temp, Y_train_temp
            del Y_test_temp, test_patient_IDs, train_patient_IDs
            prediction[label_type]['X_test'][i] = None
            prediction[label_type]['X_train'][i] = None
            prediction[label_type]['Y_train'][i] = None
            prediction[label_type]['Y_test'][i] = None
            prediction[label_type]['patient_ID_test'][i] = None
            prediction[label_type]['patient_ID_train'][i] = None
            prediction[label_type]['classifiers'][i] = None

    if output in ['scores', 'decision']:
        # Return the scores and true values of all patients
        return y_truths, y_scores, y_predictions, pids

    elif output == 'stats':
        # Compute statistics
        stats = dict()
        output = dict()
        if crossval_type == 'LOO':
            performances = compute_statistics(y_truths, y_scores,
                                              y_predictions, modus, regression)

            if not regression:
                metric_names_single = [
                    'Accuracy', 'BCA', 'Sensitivity', 'Specificity',
                    'Precision', 'NPV', 'F1-score', 'AUC'
                ]
                if modus == 'singlelabel':
                    metric_names = metric_names_single
                elif modus == 'multilabel':
                    metric_names_multi = [
                        'Accuracy', 'Sensitivity', 'Specificity', 'Precision',
                        'NPV', 'F1-score', 'AUC', 'Average Accuracy'
                    ]
                    metric_names = metric_names_multi + metric_names_single

            else:
                # Regression
                metric_names = [
                    'R2-score', 'MSE', 'ICC', 'PearsonC', 'PearsonP',
                    'SpearmanC', 'SpearmanP'
                ]

            # Put all metrics with their names in the statistics dict
            for k, v in zip(metric_names, performances):
                stats[k] = str(v)

            if thresholds is not None:
                if len(thresholds) == 2:
                    # Compute percentage of patients that was selected
                    stats["Percentage Selected"] = str(percentages_selected[0])

            output['Statistics'] = stats

        else:
            # Compute alpha confidence intervals (CIs)
            # FIXME: multilabel performance per single label not included
            # FIXME: multilabel not working in bootstrap
            # FIXME: bootstrap not done in regression
            if not regression:
                metric_names_single = [
                    'Accuracy', 'BCA', 'Sensitivity', 'Specificity',
                    'Precision', 'NPV', 'F1-score', 'AUC'
                ]

                if bootstrap:
                    # Compute once for the real test set the performance
                    X_test_temp = prediction[label_type]['X_test'][0]
                    y_truth = prediction[label_type]['Y_test'][0]
                    y_prediction = fitted_model.predict(X_test_temp)
                    y_score = fitted_model.predict_proba(X_test_temp)[:, 1]

                    performances_test =\
                        metrics.performance_singlelabel(y_truth,
                                                        y_prediction,
                                                        y_score,
                                                        regression)
                    # Aggregate bootstrapped performances
                    performances_bootstrapped =\
                        [accuracy, bca, sensitivity, specificity, precision,
                         npv, f1_score_list, auc]

                    # Compute confidence intervals for all metrics
                    for p in range(len(metric_names_single)):
                        k = metric_names_single[p] + ' 95%'
                        perf = performances_bootstrapped[p]
                        perf_test = performances_test[p]
                        stats[
                            k] = f"{perf_test} {str(compute_confidence_bootstrap(perf, perf_test, N_1, alpha))}"

                else:
                    stats[
                        "Accuracy 95%:"] = f"{np.nanmean(accuracy)} {str(compute_confidence(accuracy, N_1, N_2, alpha))}"
                    stats[
                        "BCA 95%:"] = f"{np.nanmean(bca)} {str(compute_confidence(bca, N_1, N_2, alpha))}"
                    stats[
                        "AUC 95%:"] = f"{np.nanmean(auc)} {str(compute_confidence(auc, N_1, N_2, alpha))}"
                    stats[
                        "F1-score 95%:"] = f"{np.nanmean(f1_score_list)} {str(compute_confidence(f1_score_list, N_1, N_2, alpha))}"
                    stats[
                        "Precision 95%:"] = f"{np.nanmean(precision)} {str(compute_confidence(precision, N_1, N_2, alpha))}"
                    stats[
                        "NPV 95%:"] = f"{np.nanmean(npv)} {str(compute_confidence(npv, N_1, N_2, alpha))}"
                    stats[
                        "Sensitivity 95%:"] = f"{np.nanmean(sensitivity)} {str(compute_confidence(sensitivity, N_1, N_2, alpha))}"
                    stats[
                        "Specificity 95%:"] = f"{np.nanmean(specificity)} {str(compute_confidence(specificity, N_1, N_2, alpha))}"

                    if modus == 'multilabel':
                        stats[
                            "Average Accuracy 95%:"] = f"{np.nanmean(acc_av)} {str(compute_confidence(acc_av, N_1, N_2, alpha))}"

                if thresholds is not None:
                    if len(thresholds) == 2:
                        # Compute percentage of patients that was selected
                        stats[
                            "Percentage Selected 95%:"] = f"{np.nanmean(percentages_selected)} {str(compute_confidence(percentages_selected, N_1, N_2, alpha))}"

                # Extract statistics on how often patients got classified correctly
                rankings = dict()
                alwaysright = dict()
                alwayswrong = dict()
                percentages = dict()
                timesintestset = dict()
                for i_ID in patient_classification_list:
                    percentage_right = patient_classification_list[i_ID][
                        'N_correct'] / float(
                            patient_classification_list[i_ID]['N_test'])

                    if i_ID in patient_IDs:
                        label = labels[0][np.where(i_ID == patient_IDs)]
                    else:
                        # Multiple instances of one patient
                        label = labels[0][np.where(
                            i_ID.split('_')[0] == patient_IDs)]

                    label = label[0][0]
                    percentages[i_ID] = str(label) + ': ' + str(
                        round(percentage_right * 100, 1)) + '%'
                    if percentage_right == 1.0:
                        alwaysright[i_ID] = label
                        print(f"Always Right: {i_ID}, label {label}.")

                    elif percentage_right == 0:
                        alwayswrong[i_ID] = label
                        print(f"Always Wrong: {i_ID}, label {label}.")

                    timesintestset[i_ID] = patient_classification_list[i_ID][
                        'N_test']

                rankings["Always right"] = alwaysright
                rankings["Always wrong"] = alwayswrong
                rankings['Percentages'] = percentages
                rankings['timesintestset'] = timesintestset

                output['Rankings'] = rankings

            else:
                # Regression
                stats[
                    'R2-score 95%: '] = f"{np.nanmean(r2score)} {str(compute_confidence(r2score, N_1, N_2, alpha))}"
                stats[
                    'MSE 95%: '] = f"{np.nanmean(MSE)} {str(compute_confidence(MSE, N_1, N_2, alpha))}"
                stats[
                    'ICC 95%: '] = f"{np.nanmean(coefICC)} {str(compute_confidence(coefICC, N_1, N_2, alpha))}"
                stats[
                    'PearsonC 95%: '] = f"{np.nanmean(PearsonC)} {str(compute_confidence(PearsonC, N_1, N_2, alpha))}"
                stats[
                    'PearsonP 95%: '] = f"{np.nanmean(PearsonP)} {str(compute_confidence(PearsonP, N_1, N_2, alpha))}"
                stats[
                    'SpearmanC 95%: '] = f"{np.nanmean(SpearmanC)} {str(compute_confidence(SpearmanC, N_1, N_2, alpha))}"
                stats[
                    'SpearmanP 95%: '] = f"{np.nanmean(SpearmanP)} {str(compute_confidence(SpearmanP, N_1, N_2, alpha))}"

        # Print all CI's and add to output
        stats = OrderedDict(sorted(stats.items()))
        for k, v in stats.items():
            print(f"{k} : {v}.")

        output['Statistics'] = stats
        return output
def load_config(config_file_path):
    """Load the config ini, parse settings to WORC.

    Args:
        config_file_path (String): path of the .ini config file

    Returns:
        settings_dict (dict): dict with the loaded settings
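
    Example:
        A minimal, hypothetical call; the path below is a placeholder:

            settings_dict = load_config('/path/to/config.ini')
            n_cores = settings_dict['General']['Joblib_ncores']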

    """
    if not os.path.exists(config_file_path):
        e = f'File {config_file_path} does not exist!'
        raise ae.WORCKeyError(e)

    settings = configparser.ConfigParser()
    settings.read(config_file_path)

    settings_dict = {'General': dict(), 'CrossValidation': dict(),
                     'Labels': dict(), 'HyperOptimization': dict(),
                     'Classification': dict(), 'SelectFeatGroup': dict(),
                     'Featsel': dict(), 'FeatureScaling': dict(),
                     'Resampling': dict(), 'Imputation': dict(),
                     'Ensemble': dict(), 'Bootstrap': dict(),
                     'FeatPreProcess': dict(), 'Evaluation': dict(),
                     'OneHotEncoding': dict()}

    settings_dict['General']['cross_validation'] =\
        settings['General'].getboolean('cross_validation')

    settings_dict['General']['Joblib_ncores'] =\
        settings['General'].getint('Joblib_ncores')

    settings_dict['General']['Joblib_backend'] =\
        str(settings['General']['Joblib_backend'])

    settings_dict['General']['tempsave'] =\
        settings['General'].getboolean('tempsave')

    # Feature Scaling
    settings_dict['FeatureScaling']['scale_features'] =\
        settings['FeatureScaling'].getboolean('scale_features')

    settings_dict['FeatureScaling']['scaling_method'] =\
        [str(item).strip() for item in
         settings['FeatureScaling']['scaling_method'].split(',')]

    settings_dict['FeatureScaling']['skip_features'] =\
        [str(item).strip() for item in
         settings['FeatureScaling']['skip_features'].split(',')]

    # Feature selection
    settings_dict['Featsel']['Variance'] =\
        settings['Featsel'].getfloat('Variance')

    settings_dict['Featsel']['SelectFromModel'] =\
        settings['Featsel'].getfloat('SelectFromModel')

    settings_dict['Featsel']['SelectFromModel_lasso_alpha'] =\
        [float(str(item).strip()) for item in
         settings['Featsel']['SelectFromModel_lasso_alpha'].split(',')]

    settings_dict['Featsel']['SelectFromModel_estimator'] =\
        [str(item).strip() for item in
         settings['Featsel']['SelectFromModel_estimator'].split(',')]

    settings_dict['Featsel']['SelectFromModel_n_trees'] =\
        [int(str(item).strip()) for item in
         settings['Featsel']['SelectFromModel_n_trees'].split(',')]

    settings_dict['Featsel']['GroupwiseSearch'] =\
        [str(item).strip() for item in
         settings['Featsel']['GroupwiseSearch'].split(',')]

    settings_dict['Featsel']['UsePCA'] =\
        settings['Featsel'].getfloat('UsePCA')

    settings_dict['Featsel']['PCAType'] =\
        [str(item).strip() for item in
         settings['Featsel']['PCAType'].split(',')]

    settings_dict['Featsel']['StatisticalTestUse'] =\
        settings['Featsel'].getfloat('StatisticalTestUse')

    settings_dict['Featsel']['StatisticalTestMetric'] =\
        [str(item).strip() for item in
         settings['Featsel']['StatisticalTestMetric'].split(',')]

    settings_dict['Featsel']['StatisticalTestThreshold'] =\
        [float(str(item).strip()) for item in
         settings['Featsel']['StatisticalTestThreshold'].split(',')]

    settings_dict['Featsel']['ReliefUse'] =\
        settings['Featsel'].getfloat('ReliefUse')

    settings_dict['Featsel']['ReliefNN'] =\
        [int(str(item).strip()) for item in
         settings['Featsel']['ReliefNN'].split(',')]

    settings_dict['Featsel']['ReliefSampleSize'] =\
        [float(str(item).strip()) for item in
         settings['Featsel']['ReliefSampleSize'].split(',')]

    settings_dict['Featsel']['ReliefDistanceP'] =\
        [int(str(item).strip()) for item in
         settings['Featsel']['ReliefDistanceP'].split(',')]

    settings_dict['Featsel']['ReliefNumFeatures'] =\
        [int(str(item).strip()) for item in
         settings['Featsel']['ReliefNumFeatures'].split(',')]

    settings_dict['FeatPreProcess']['Use'] =\
        [str(settings['FeatPreProcess']['Use'])]

    # Imputation
    settings_dict['Imputation']['use'] =\
        [str(item).strip() for item in
         settings['Imputation']['use'].split(',')]

    settings_dict['Imputation']['strategy'] =\
        [str(item).strip() for item in
         settings['Imputation']['strategy'].split(',')]

    settings_dict['Imputation']['n_neighbors'] =\
        [int(str(item).strip()) for item in
         settings['Imputation']['n_neighbors'].split(',')]

    # OneHotEncoding
    settings_dict['OneHotEncoding']['Use'] =\
        [str(item).strip() for item in
         settings['OneHotEncoding']['Use'].split(',')]

    settings_dict['OneHotEncoding']['feature_labels_tofit'] =\
        [str(item).strip() for item in
         settings['OneHotEncoding']['feature_labels_tofit'].split(',')]

    # General
    settings_dict['General']['FeatureCalculators'] =\
        [str(item).strip() for item in
         settings['General']['FeatureCalculators'].split(',')]

    # Feature selection options
    for key in settings['SelectFeatGroup'].keys():
        settings_dict['SelectFeatGroup'][key] =\
            [str(item).strip() for item in
             settings['SelectFeatGroup'][key].split(',')]

    # Settings for sample processing, i.e. oversampling, undersampling, etc.
    settings_dict['Resampling']['Use'] =\
        settings['Resampling'].getfloat('Use')

    settings_dict['Resampling']['Method'] =\
        [str(item).strip() for item in
         settings['Resampling']['Method'].split(',')]

    settings_dict['Resampling']['sampling_strategy'] =\
        [str(item).strip() for item in
         settings['Resampling']['sampling_strategy'].split(',')]

    settings_dict['Resampling']['n_neighbors'] =\
        [int(str(item).strip()) for item in
         settings['Resampling']['n_neighbors'].split(',')]

    settings_dict['Resampling']['k_neighbors'] =\
        [int(str(item).strip()) for item in
         settings['Resampling']['k_neighbors'].split(',')]

    settings_dict['Resampling']['threshold_cleaning'] =\
        [float(str(item).strip()) for item in
         settings['Resampling']['threshold_cleaning'].split(',')]

    # Classification options
    settings_dict['Classification']['fastr'] =\
        settings['Classification'].getboolean('fastr')

    settings_dict['Classification']['fastr_plugin'] =\
        str(settings['Classification']['fastr_plugin'])

    settings_dict['Classification']['classifiers'] =\
        [str(item).strip() for item in
         settings['Classification']['classifiers'].split(',')]

    settings_dict['Classification']['max_iter'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['max_iter'].split(',')]

    # Specific SVM options
    settings_dict['Classification']['SVMKernel'] =\
        [str(item).strip() for item in
         settings['Classification']['SVMKernel'].split(',')]

    settings_dict['Classification']['SVMC'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['SVMC'].split(',')]

    settings_dict['Classification']['SVMdegree'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['SVMdegree'].split(',')]

    settings_dict['Classification']['SVMcoef0'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['SVMcoef0'].split(',')]

    settings_dict['Classification']['SVMgamma'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['SVMgamma'].split(',')]

    # Specific RF options
    settings_dict['Classification']['RFn_estimators'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['RFn_estimators'].split(',')]
    settings_dict['Classification']['RFmin_samples_split'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['RFmin_samples_split'].split(',')]
    settings_dict['Classification']['RFmax_depth'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['RFmax_depth'].split(',')]

    # Specific LR options
    settings_dict['Classification']['LRpenalty'] =\
        [str(item).strip() for item in
         settings['Classification']['LRpenalty'].split(',')]
    settings_dict['Classification']['LRC'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['LRC'].split(',')]
    settings_dict['Classification']['LR_solver'] =\
        [str(item).strip() for item in
         settings['Classification']['LR_solver'].split(',')]
    settings_dict['Classification']['LR_l1_ratio'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['LR_l1_ratio'].split(',')]

    # Specific LDA/QDA options
    settings_dict['Classification']['LDA_solver'] =\
        [str(item).strip() for item in
         settings['Classification']['LDA_solver'].split(',')]
    settings_dict['Classification']['LDA_shrinkage'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['LDA_shrinkage'].split(',')]
    settings_dict['Classification']['QDA_reg_param'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['QDA_reg_param'].split(',')]

    # ElasticNet options
    settings_dict['Classification']['ElasticNet_alpha'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['ElasticNet_alpha'].split(',')]
    settings_dict['Classification']['ElasticNet_l1_ratio'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['ElasticNet_l1_ratio'].split(',')]

    # SGD (R) options
    settings_dict['Classification']['SGD_alpha'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['SGD_alpha'].split(',')]
    settings_dict['Classification']['SGD_l1_ratio'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['SGD_l1_ratio'].split(',')]
    settings_dict['Classification']['SGD_loss'] =\
        [str(item).strip() for item in
         settings['Classification']['SGD_loss'].split(',')]
    settings_dict['Classification']['SGD_penalty'] =\
        [str(item).strip() for item in
         settings['Classification']['SGD_penalty'].split(',')]

    # Naive Bayes options
    settings_dict['Classification']['CNB_alpha'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['CNB_alpha'].split(',')]

    # AdaBoost
    settings_dict['Classification']['AdaBoost_n_estimators'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['AdaBoost_n_estimators'].split(',')]

    settings_dict['Classification']['AdaBoost_learning_rate'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['AdaBoost_learning_rate'].split(',')]

    # XGBoost options
    settings_dict['Classification']['XGB_boosting_rounds'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['XGB_boosting_rounds'].split(',')]

    settings_dict['Classification']['XGB_max_depth'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['XGB_max_depth'].split(',')]

    settings_dict['Classification']['XGB_learning_rate'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['XGB_learning_rate'].split(',')]

    settings_dict['Classification']['XGB_gamma'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['XGB_gamma'].split(',')]

    settings_dict['Classification']['XGB_min_child_weight'] =\
        [int(str(item).strip()) for item in
         settings['Classification']['XGB_min_child_weight'].split(',')]

    settings_dict['Classification']['XGB_colsample_bytree'] =\
        [float(str(item).strip()) for item in
         settings['Classification']['XGB_colsample_bytree'].split(',')]

    # Cross validation settings
    settings_dict['CrossValidation']['Type'] =\
        str(settings['CrossValidation']['Type'])

    settings_dict['CrossValidation']['N_iterations'] =\
        settings['CrossValidation'].getint('N_iterations')

    settings_dict['CrossValidation']['test_size'] =\
        settings['CrossValidation'].getfloat('test_size')

    settings_dict['CrossValidation']['fixed_seed'] =\
        settings['CrossValidation'].getboolean('fixed_seed')

    # Label settings
    settings_dict['Labels']['label_names'] =\
        [str(item).strip() for item in
         settings['Labels']['label_names'].split(',')]

    settings_dict['Labels']['modus'] =\
        str(settings['Labels']['modus'])

    # Settings for hyper optimization
    settings_dict['HyperOptimization']['scoring_method'] =\
        str(settings['HyperOptimization']['scoring_method'])
    settings_dict['HyperOptimization']['test_size'] =\
        settings['HyperOptimization'].getfloat('test_size')
    settings_dict['HyperOptimization']['N_iter'] =\
        settings['HyperOptimization'].getint('N_iterations')
    settings_dict['HyperOptimization']['n_splits'] =\
        settings['HyperOptimization'].getint('n_splits')
    settings_dict['HyperOptimization']['n_jobspercore'] =\
        int(settings['HyperOptimization']['n_jobspercore'])
    settings_dict['HyperOptimization']['maxlen'] = \
        settings['HyperOptimization'].getint('maxlen')
    settings_dict['HyperOptimization']['ranking_score'] = \
        str(settings['HyperOptimization']['ranking_score'])
    settings_dict['HyperOptimization']['memory'] = \
        str(settings['HyperOptimization']['memory'])

    # Settings for ensembling
    settings_dict['Ensemble']['Use'] =\
        settings['Ensemble'].getint('Use')

    settings_dict['Ensemble']['Metric'] =\
        settings['Ensemble']['Metric']

    # Settings for bootstrapping
    settings_dict['Bootstrap']['Use'] =\
        settings['Bootstrap'].getboolean('Use')

    settings_dict['Bootstrap']['N_iterations'] =\
        settings['Bootstrap'].getint('N_iterations')

    # Settings for evaluation
    settings_dict['Evaluation']['OverfitScaler'] =\
        settings['Evaluation'].getboolean('OverfitScaler')

    return settings_dict
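
# A minimal configparser sketch (not a complete WORC config) illustrating the
# accessors used above: getint/getfloat/getboolean plus comma-separated values
# split and cast by hand. The section values below are purely illustrative.
import configparser

demo = configparser.ConfigParser()
demo.read_string("""
[CrossValidation]
N_iterations = 100
test_size = 0.2
fixed_seed = False

[Classification]
SVMKernel = poly, linear, rbf
""")

n_iterations = demo['CrossValidation'].getint('N_iterations')   # 100
test_size = demo['CrossValidation'].getfloat('test_size')       # 0.2
fixed_seed = demo['CrossValidation'].getboolean('fixed_seed')   # False
kernels = [str(item).strip() for item in
           demo['Classification']['SVMKernel'].split(',')]      # ['poly', 'linear', 'rbf']
print(n_iterations, test_size, fixed_seed, kernels)
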
Example #14
def plot_SVM(prediction, label_data, label_type, show_plots=False,
             alpha=0.95, ensemble=False, verbose=True,
             ensemble_scoring=None, output='stats',
             modus='singlelabel'):
    '''
    Plot the output of a single binary estimator, e.g. a SVM.

    Parameters
    ----------
    prediction: pandas dataframe or string, mandatory
        output of trainclassifier function, either a pandas dataframe
        or a HDF5 file

    label_data: string, mandatory
        Contains the path referring to a .txt file containing the
        patient label(s) and value(s) to be used for learning. See
        the Github Wiki for the format.

    label_type: string, mandatory
        Name of the label to extract from the label data to test the
        estimator on.

    show_plots: Boolean, default False
        Determine whether matplotlib performance plots are made.

    alpha: float, default 0.95
        Significance of confidence intervals.

    ensemble: False, integer or 'Caruana'
        Determine whether an ensemble will be created. If so,
        either provide an integer to determine how many of the
        top performing classifiers should be in the ensemble, or use
        the string "Caruana" to use smart ensembling based on
        Caruana et al. 2004.

    verbose: boolean, default True
        Plot intermediate messages.

    ensemble_scoring: string, default None
        Metric to be used for evaluating the ensemble. If None,
        the option set in the prediction object will be used.

    output: string, default stats
        Determine which results are returned. If 'stats', the statistics of the
        estimator are returned. If 'scores' or 'decision', the ground truths,
        scores, predictions and patient IDs per cross-validation are returned.

    Returns
    ----------
    Depending on the output parameters, the following outputs are returned:

    If output == 'stats':
    stats: dictionary
        Contains the confidence intervals of the performance metrics
        and the number of times each patient was classified correctly
        or incorrectly.

    If output == 'scores':
    y_truths: list
        Contains the true label for each object.

    y_scores: list
        Contains the score (e.g. posterior) for each object.

    y_predictions: list
        Contains the predicted label for each object.

    PIDs: list
        Contains the patient ID/name for each object.
    '''

    # Load the prediction object if it's a hdf5 file
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)
        else:
            raise ae.WORCIOError(('{} is not an existing file!').format(str(prediction)))

    # Select the estimator from the pandas dataframe to use
    keys = prediction.keys()
    SVMs = list()
    if label_type is None:
        label_type = keys[0]

    # Load the label data
    if type(label_data) is not dict:
        if os.path.isfile(label_data):
            if type(label_type) is not list:
                # Singlelabel: convert to list
                label_type = [[label_type]]
            label_data = lp.load_labels(label_data, label_type)

    patient_IDs = label_data['patient_IDs']
    labels = label_data['label']

    if type(label_type) is list:
        # FIXME: Multiple label types are not supported yet.
        print('[WORC Warning] Multiple label types are not yet supported. Taking the first label for plot_SVM.')
        label_type = keys[0]

    # Extract the estimators, features and labels
    SVMs = prediction[label_type]['classifiers']
    regression = is_regressor(SVMs[0].best_estimator_)
    Y_test = prediction[label_type]['Y_test']
    X_test = prediction[label_type]['X_test']
    X_train = prediction[label_type]['X_train']
    Y_train = prediction[label_type]['Y_train']
    feature_labels = prediction[label_type]['feature_labels']

    # Create lists for performance measures
    sensitivity = list()
    specificity = list()
    precision = list()
    accuracy = list()
    auc = list()
    f1_score_list = list()
    patient_classification_list = dict()
    if output in ['scores', 'decision']:
        # Keep track of all ground truths and scores
        y_truths = list()
        y_scores = list()
        y_predictions = list()
        PIDs = list()

    # Loop over the test sets, which probably correspond with cross validation
    # iterations
    for i in range(0, len(Y_test)):
        print("\n")
        print(("Cross validation {} / {}.").format(str(i + 1), str(len(Y_test))))
        test_patient_IDs = prediction[label_type]['patient_ID_test'][i]
        train_patient_IDs = prediction[label_type]['patient_ID_train'][i]
        X_test_temp = X_test[i]
        X_train_temp = X_train[i]
        Y_train_temp = Y_train[i]
        Y_test_temp = Y_test[i]
        test_indices = list()

        # Check which patients are in the test set.
        for i_ID in test_patient_IDs:
            test_indices.append(np.where(patient_IDs == i_ID)[0][0])

            # Initiate counting how many times a patient is classified correctly
            if i_ID not in patient_classification_list:
                patient_classification_list[i_ID] = dict()
                patient_classification_list[i_ID]['N_test'] = 0
                patient_classification_list[i_ID]['N_correct'] = 0
                patient_classification_list[i_ID]['N_wrong'] = 0

            patient_classification_list[i_ID]['N_test'] += 1

        # Extract ground truth
        y_truth = Y_test_temp

        # If requested, first let the SearchCV object create an ensemble
        if ensemble:
            # NOTE: Added for backwards compatibility
            if not hasattr(SVMs[i], 'cv_iter'):
                cv_iter = list(SVMs[i].cv.split(X_train_temp, Y_train_temp))
                SVMs[i].cv_iter = cv_iter

            # Create the ensemble
            X_train_temp = [(x, feature_labels) for x in X_train_temp]
            SVMs[i].create_ensemble(X_train_temp, Y_train_temp,
                                    method=ensemble, verbose=verbose,
                                    scoring=ensemble_scoring)

        # Create prediction
        y_prediction = SVMs[i].predict(X_test_temp)

        if regression:
            y_score = y_prediction
        else:
            y_score = SVMs[i].predict_proba(X_test_temp)[:, 1]

        print("Truth: " + str(y_truth))
        print("Prediction: " + str(y_prediction))

        # Add if patient was classified correctly or not to counting
        for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction, test_patient_IDs):
            if modus == 'multilabel':
                success = (i_truth == i_predict).all()
            else:
                success = i_truth == i_predict

            if success:
                patient_classification_list[i_test_ID]['N_correct'] += 1
            else:
                patient_classification_list[i_test_ID]['N_wrong'] += 1

        if output in ['decision', 'scores']:
            # Output the posteriors
            y_scores.append(y_score)
            y_truths.append(y_truth)
            y_predictions.append(y_prediction)
            PIDs.append(test_patient_IDs)

        elif output == 'stats':
            # Compute statistics
            # Compute confusion matrix and use for sensitivity/specificity
            if modus == 'singlelabel':
                # Compute singlelabel performance metrics
                if not regression:
                    accuracy_temp, sensitivity_temp, specificity_temp,\
                        precision_temp, f1_score_temp, auc_temp =\
                        metrics.performance_singlelabel(y_truth,
                                                        y_prediction,
                                                        y_score,
                                                        regression)
                else:
                    r2score, MSE, coefICC, PearsonC, PearsonP, SpearmanC,\
                        SpearmanP =\
                        metrics.performance_singlelabel(y_truth,
                                                        y_prediction,
                                                        y_score,
                                                        regression)

            elif modus == 'multilabel':
                # Convert class objects to single label per patient
                y_truth_temp = list()
                y_prediction_temp = list()
                for yt, yp in zip(y_truth, y_prediction):
                    label = np.where(yt == 1)
                    if len(label) > 1:
                        raise ae.WORCNotImplementedError('Multiclass classification evaluation is not supported in WORC.')

                    y_truth_temp.append(label[0][0])
                    label = np.where(yp == 1)
                    y_prediction_temp.append(label[0][0])

                y_truth = y_truth_temp
                y_prediction = y_prediction_temp

                # Compute multilabel performance metrics
                accuracy_temp, sensitivity_temp, specificity_temp,\
                    precision_temp, f1_score_temp, auc_temp =\
                    metrics.performance_multilabel(y_truth,
                                                   y_prediction,
                                                   y_score)

            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

            # Print AUC to keep you up to date
            print('AUC: ' + str(auc_temp))

            # Append performance to lists for all cross validations
            accuracy.append(accuracy_temp)
            sensitivity.append(sensitivity_temp)
            specificity.append(specificity_temp)
            auc.append(auc_temp)
            f1_score_list.append(f1_score_temp)
            precision.append(precision_temp)

    if output in ['scores', 'decision']:
        # Return the scores and true values of all patients
        return y_truths, y_scores, y_predictions, PIDs
    elif output == 'stats':
        # Compute statistics
        # Extract sample size
        N_1 = float(len(train_patient_IDs))
        N_2 = float(len(test_patient_IDs))

        # Compute alpha confidence intervals
        stats = dict()
        stats["Accuracy 95%:"] = str(compute_CI.compute_confidence(accuracy, N_1, N_2, alpha))

        stats["AUC 95%:"] = str(compute_CI.compute_confidence(auc, N_1, N_2, alpha))

        stats["F1-score 95%:"] = str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, alpha))

        stats["Precision 95%:"] = str(compute_CI.compute_confidence(precision, N_1, N_2, alpha))

        stats["Sensitivity 95%: "] = str(compute_CI.compute_confidence(sensitivity, N_1, N_2, alpha))

        stats["Specificity 95%:"] = str(compute_CI.compute_confidence(specificity, N_1, N_2, alpha))

        # Print the confidence intervals to keep the user up to date
        for name, interval in stats.items():
            print(name + interval)

        # Extract statistics on how often patients got classified correctly
        alwaysright = dict()
        alwayswrong = dict()
        percentages = dict()
        for i_ID in patient_classification_list:
            percentage_right = patient_classification_list[i_ID]['N_correct'] / float(patient_classification_list[i_ID]['N_test'])

            if i_ID in patient_IDs:
                label = labels[0][np.where(i_ID == patient_IDs)]
            else:
                # Multiple instances of one patient
                label = labels[0][np.where(i_ID.split('_')[0] == patient_IDs)]

            label = label[0][0]
            percentages[i_ID] = str(label) + ': ' + str(round(percentage_right * 100, 2)) + '%'
            if percentage_right == 1.0:
                alwaysright[i_ID] = label
                print(("Always Right: {}, label {}").format(i_ID, label))

            elif percentage_right == 0:
                alwayswrong[i_ID] = label
                print(("Always Wrong: {}, label {}").format(i_ID, label))

        stats["Always right"] = alwaysright
        stats["Always wrong"] = alwayswrong
        stats['Percentages'] = percentages

        if show_plots:
            # Plot the performance metrics in boxplots
            import matplotlib.pyplot as plt

            boxplot_metrics = [('Accuracy', accuracy), ('AUC', auc),
                               ('Precision', precision),
                               ('Sensitivity', sensitivity),
                               ('Specificity', specificity)]
            for ylabel, values in boxplot_metrics:
                plt.figure()
                plt.boxplot(values)
                plt.ylim([-0.05, 1.05])
                plt.ylabel(ylabel)
                plt.tick_params(
                    axis='x',          # changes apply to the x-axis
                    which='both',      # both major and minor ticks are affected
                    bottom=False,      # ticks along the bottom edge are off
                    top=False,         # ticks along the top edge are off
                    labelbottom=False)  # labels along the bottom edge are off
                plt.tight_layout()
                plt.show()

        return stats
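
# A hedged usage sketch for plot_SVM. The file names and label name below are
# hypothetical: 'estimator.hdf5' stands for the .hdf5 output of the WORC
# trainclassifier step and 'pinfo.txt' for a patient label file.
stats = plot_SVM(prediction='estimator.hdf5',
                 label_data='pinfo.txt',
                 label_type='imaginary_label_1',
                 alpha=0.95,
                 ensemble=50,
                 output='stats')
for name, value in stats.items():
    print(name, value)
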
def load_config(config_file_path):
    """ Parse a WORC configuration file.

    Arguments:
        config_file_path: path to the configuration file to be parsed.

    Returns:
        settings_dict: dictionary containing all parsed settings.
    """
    if not os.path.exists(config_file_path):
        e = f'File {config_file_path} does not exist!'
        raise ae.WORCKeyError(e)

    settings = configparser.ConfigParser()
    settings.read(config_file_path)

    settings_dict = {
        'Preprocessing': dict(),
        'ImageFeatures': dict(),
        'General': dict()
    }

    # General settings
    settings_dict['ImageFeatures']['image_type'] =\
        [str(item).strip() for item in
         settings['ImageFeatures']['image_type'].split(',')]

    settings_dict['General']['AssumeSameImageAndMaskMetadata'] =\
        settings['General'].getboolean('AssumeSameImageAndMaskMetadata')

    # Detect incorrect spacing
    settings_dict['Preprocessing']['CheckSpacing'] =\
        settings['Preprocessing'].getboolean('CheckSpacing')

    # Clipping
    settings_dict['Preprocessing']['Clipping'] =\
        settings['Preprocessing'].getboolean('Clipping')

    settings_dict['Preprocessing']['Clipping_Range'] =\
        [float(item) for item in
         settings['Preprocessing']['Clipping_Range'].split(',')]

    if len(settings_dict['Preprocessing']['Clipping_Range']) != 2:
        raise ae.WORCValueError(
            f"Clipping range should be two floats split by a comma, got {settings['Preprocessing']['Clipping_Range']}."
        )

    # Normalization
    settings_dict['Preprocessing']['Normalize'] =\
        settings['Preprocessing'].getboolean('Normalize')

    settings_dict['Preprocessing']['Normalize_ROI'] =\
        str(settings['Preprocessing']['Normalize_ROI'])

    settings_dict['Preprocessing']['ROIdilate'] =\
        str(settings['Preprocessing']['ROIdilate'])

    settings_dict['Preprocessing']['ROIDetermine'] =\
        str(settings['Preprocessing']['ROIDetermine'])

    settings_dict['Preprocessing']['ROIdilateradius'] =\
        int(settings['Preprocessing']['ROIdilateradius'])

    settings_dict['Preprocessing']['Method'] =\
        str(settings['Preprocessing']['Method'])

    # Bias Correction
    settings_dict['Preprocessing']['BiasCorrection'] =\
        settings['Preprocessing'].getboolean('BiasCorrection')

    settings_dict['Preprocessing']['BiasCorrection_Mask'] =\
        settings['Preprocessing'].getboolean('BiasCorrection_Mask')

    # Re-orientation
    settings_dict['Preprocessing']['CheckOrientation'] =\
        settings['Preprocessing'].getboolean('CheckOrientation')

    settings_dict['Preprocessing']['OrientationPrimaryAxis'] =\
        str(settings['Preprocessing']['OrientationPrimaryAxis'])

    # Resampling
    settings_dict['Preprocessing']['Resampling'] =\
        settings['Preprocessing'].getboolean('Resampling')

    settings_dict['Preprocessing']['Resampling_spacing'] =\
        [float(item) for item in
         settings['Preprocessing']['Resampling_spacing'].split(',')]

    if len(settings_dict['Preprocessing']['Resampling_spacing']) != 3:
        s = settings_dict['Preprocessing']['Resampling_spacing']
        raise ae.WORCValueError(
            f'Resampling spacing should be three elements, got {s}')

    return settings_dict
Example #16
def findlabeldata(patientinfo, label_type, filenames=None,
                  objects=None, pids=None):
    """
    Load the label data and match it to the image features.

    Args:
        patientinfo (string): file with patient label data
        label_type (string): name of the label read out from patientinfo
        filenames (list): names of the patient feature files, used for matching
        objects (np.array or list): array of objects you want to order as well
        pids (list): patient IDs, used for matching if no filenames are given

    Returns:
        label_data (dict): contains patient ids, their labels and the label name
    """
    # Get the labels and patient IDs
    label_data_temp = load_labels(patientinfo, label_type)
    label_data = dict()
    patient_IDs = list()
    label_value = list()
    for i_len in range(len(label_data_temp['label_name'])):
        label_value.append(list())

    # Check per feature file / pid if there is a match in the label data
    if filenames:
        iterator = filenames
    elif pids:
        iterator = pids
    else:
        raise ae.WORCValueError('Either input pids or filenames for label matching!')

    objects_out = list()
    for i_feat, feat in enumerate(iterator):
        ifound = 0
        matches = list()
        for i_num, i_patient in enumerate(label_data_temp['patient_IDs']):
            if i_patient.lower() in str(feat).lower():

                # Match: add the patient ID to the ID's and to the matches
                patient_IDs.append(i_patient)
                matches.append(i_patient)

                # If there are feature files given, add it to the list
                if objects is not None:
                    objects_out.append(objects[i_feat])

                # For each label that we have, add the value to the label list
                for i_len in range(len(label_data_temp['label_name'])):
                    label_value[i_len].append(label_data_temp['label'][i_len][i_num])

                # Calculate how many matches we found for this (feature) file: should be one
                ifound += 1

        if ifound > 1:
            message = ('Multiple matches ({}) found in labeling for feature file {}.').format(str(matches), str(feat))
            raise ae.WORCValueError(message)

        elif ifound == 0:
            message = ('No entry found in labeling for feature file {}.').format(str(feat))
            raise ae.WORCKeyError(message)

    # Convert to arrays
    for i_len in range(len(label_value)):
        label_value[i_len] = np.asarray(label_value[i_len])

    label_data['patient_IDs'] = np.asarray(patient_IDs)
    label_data['label'] = np.asarray(label_value)
    label_data['label_name'] = label_data_temp['label_name']

    return label_data, objects_out
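
# A hedged usage sketch for findlabeldata. The label file, label name and
# feature file names are hypothetical; matching relies on each patient ID
# occurring as a substring of the corresponding file name.
feature_files = ['features_patient-001.hdf5', 'features_patient-002.hdf5']
label_data, _ = findlabeldata(patientinfo='pinfo.txt',
                              label_type='imaginary_label_1',
                              filenames=feature_files)
print(label_data['patient_IDs'], label_data['label'])
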
def compute_statistics(y_truth, y_score, y_prediction, modus, regression):
    """Compute statistics on predictions."""
    if modus == 'singlelabel':
        # Compute singlelabel performance metrics
        if not regression:
            return metrics.performance_singlelabel(y_truth, y_prediction,
                                                   y_score, regression)

        else:
            return metrics.performance_singlelabel(y_truth, y_prediction,
                                                   y_score, regression)

    elif modus == 'multilabel':
        # Convert class objects to single label per patient
        y_truth_temp = list()
        y_prediction_temp = list()
        for yt, yp in zip(y_truth, y_prediction):
            label = np.where(yt == 1)
            if len(label) > 1:
                raise ae.WORCNotImplementedError(
                    'Multiclass classification evaluation is not supported in WORC.'
                )

            y_truth_temp.append(label[0][0])
            label = np.where(yp == 1)
            y_prediction_temp.append(label[0][0])

        y_truth = y_truth_temp
        y_prediction = y_prediction_temp

        # Compute multilabel performance metrics
        predictions_multilabel =\
            metrics.performance_multilabel(y_truth,
                                           y_prediction,
                                           y_score)

        # Compute all single label performance metrics as well
        n_labels = len(np.unique(y_truth))
        for i_label in range(n_labels):
            y_truth_single = [i == i_label for i in y_truth]
            y_prediction_single = [i == i_label for i in y_prediction]
            y_score_single = y_score[:, i_label]

            predictions_singlelabel_temp =\
                metrics.performance_singlelabel(y_truth_single,
                                                y_prediction_single,
                                                y_score_single,
                                                regression)

            if i_label == 0:
                predictions_singlelabel =\
                    [[i] for i in predictions_singlelabel_temp]
            else:
                for num, metric in enumerate(predictions_singlelabel_temp):
                    predictions_singlelabel[num].append(metric)

        output = predictions_multilabel + predictions_singlelabel
        return output

    else:
        raise ae.WORCKeyError(f'{modus} is not a valid modus!')
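
# Minimal sketch of compute_statistics for a single-label, non-regression fold
# using toy arrays instead of real predictions; numpy is assumed to be
# imported as np, as in the code above.
y_truth_demo = np.asarray([0, 1, 1, 0, 1])
y_prediction_demo = np.asarray([0, 1, 0, 0, 1])
y_score_demo = np.asarray([0.2, 0.9, 0.4, 0.1, 0.8])  # posterior of class 1

performance = compute_statistics(y_truth_demo, y_score_demo, y_prediction_demo,
                                 modus='singlelabel', regression=False)
# For this case the return value is whatever metrics.performance_singlelabel
# yields, e.g. accuracy, sensitivity, specificity, precision, F1 and AUC.
print(performance)
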
def construct_classifier(config):
    """Interface to create classification

    Different classifications can be created using this common interface

    Parameters
    ----------
        config: dict, mandatory
                Contains the required config settings. See the Github Wiki for
                all available fields.

    Returns:
        Constructed classifier
    """

    # NOTE: Function is not working anymore for regression: need
    # to move param grid creation to the create_param_grid function
    max_iter = config['max_iter']
    if 'SVM' in config['classifiers']:
        # Support Vector Machine
        classifier = construct_SVM(config)

    elif config['classifiers'] == 'SVR':
        # Support Vector Regression
        classifier = construct_SVM(config, True)

    elif config['classifiers'] == 'RF':
        # Random forest kernel
        classifier = RandomForestClassifier(
            verbose=0,
            class_weight='balanced',
            n_estimators=config['RFn_estimators'],
            min_samples_split=config['RFmin_samples_split'],
            max_depth=config['RFmax_depth'])

    elif config['classifiers'] == 'RFR':
        # Random forest kernel regression
        classifier = RandomForestRegressor(
            verbose=0,
            n_estimators=config['RFn_estimators'],
            min_samples_split=config['RFmin_samples_split'],
            max_depth=config['RFmax_depth'])

    elif config['classifiers'] == 'ElasticNet':
        # Elastic Net Regression
        classifier = ElasticNet(alpha=config['ElasticNet_alpha'],
                                l1_ratio=config['ElasticNet_l1_ratio'],
                                max_iter=max_iter)

    elif config['classifiers'] == 'Lasso':
        # LASSO Regression
        param_grid = {'alpha': scipy.stats.uniform(loc=1.0, scale=0.5)}
        classifier = Lasso(max_iter=max_iter)

    elif config['classifiers'] == 'SGD':
        # Stochastic Gradient Descent classifier
        classifier = SGDClassifier(max_iter=config['max_iter'],
                                   alpha=config['SGD_alpha'],
                                   l1_ratio=config['SGD_l1_ratio'],
                                   loss=config['SGD_loss'],
                                   penalty=config['SGD_penalty'])

    elif config['classifiers'] == 'SGDR':
        # Stochastic Gradient Descent regressor
        classifier = SGDRegressor(max_iter=config['max_iter'],
                                  alpha=config['SGD_alpha'],
                                  l1_ratio=config['SGD_l1_ratio'],
                                  loss=config['SGD_loss'],
                                  penalty=config['SGD_penalty'])

    elif config['classifiers'] == 'LR':
        # Logistic Regression
        classifier = LogisticRegression(max_iter=max_iter,
                                        penalty=config['LRpenalty'],
                                        C=config['LRC'])
    elif config['classifiers'] == 'GaussianNB':
        # Naive Bayes classifier using Gaussian distributions
        classifier = GaussianNB()

    elif config['classifiers'] == 'ComplementNB':
        # Complement Naive Bayes classifier
        classifier = ComplementNB()

    elif config['classifiers'] == 'LDA':
        # Linear Discriminant Analysis
        if config['LDA_solver'] == 'svd':
            # Shrinkage does not work with svd solver
            shrinkage = None
        else:
            shrinkage = config['LDA_shrinkage']

        classifier = LDA(solver=config['LDA_solver'], shrinkage=shrinkage)

    elif config['classifiers'] == 'QDA':
        # Quadratic Discriminant Analysis
        classifier = QDA(reg_param=config['QDA_reg_param'])
    else:
        message = ('Classifier {} unknown.').format(str(config['classifiers']))
        raise ae.WORCKeyError(message)

    return classifier
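
# A hedged sketch of calling construct_classifier with a hand-written config
# dict; the hyperparameter values below are illustrative, not WORC defaults.
example_config = {
    'classifiers': 'RF',
    'max_iter': 10000,
    'RFn_estimators': 200,
    'RFmin_samples_split': 2,
    'RFmax_depth': 5,
}
clf = construct_classifier(example_config)
print(clf)
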
def load_config(config_file_path):
    """ Parse a segmentix configuration file.

    Arguments:
        config_file_path: path to the configuration file to be parsed.

    Returns:
        settings_dict: dictionary containing all parsed settings.
    """
    if not os.path.exists(config_file_path):
        e = f'File {config_file_path} does not exist!'
        raise ae.WORCKeyError(e)

    settings = configparser.ConfigParser()
    settings.read(config_file_path)

    settings_dict = {'Segmentix': dict(), 'Preprocessing': dict()}

    # Segmentation settings
    settings_dict['Segmentix']['type'] =\
        str(settings['Segmentix']['segtype'])

    settings_dict['Segmentix']['mask'] =\
        str(settings['Segmentix']['mask'])

    settings_dict['Segmentix']['radius'] =\
        int(settings['Segmentix']['segradius'])

    settings_dict['Segmentix']['N_blobs'] =\
        int(settings['Segmentix']['N_blobs'])

    settings_dict['Segmentix']['fillholes'] =\
        settings['Segmentix'].getboolean('fillholes')

    settings_dict['Segmentix']['remove_small_objects'] =\
        settings['Segmentix'].getboolean('remove_small_objects')

    settings_dict['Segmentix']['min_object_size'] =\
        int(settings['Segmentix']['min_object_size'])

    settings_dict['Segmentix']['AssumeSameImageAndMaskMetadata'] =\
        settings['General'].getboolean('AssumeSameImageAndMaskMetadata')

    # Check spacing
    settings_dict['Preprocessing']['CheckSpacing'] =\
        settings['Preprocessing'].getboolean('CheckSpacing')

    # Re-orientation
    settings_dict['Preprocessing']['CheckOrientation'] =\
        settings['Preprocessing'].getboolean('CheckOrientation')

    settings_dict['Preprocessing']['OrientationPrimaryAxis'] =\
        str(settings['Preprocessing']['OrientationPrimaryAxis'])

    # Resampling
    settings_dict['Preprocessing']['Resampling'] =\
        settings['Preprocessing'].getboolean('Resampling')

    settings_dict['Preprocessing']['Resampling_spacing'] =\
        [float(item) for item in
         settings['Preprocessing']['Resampling_spacing'].split(',')]

    if len(settings_dict['Preprocessing']['Resampling_spacing']) != 3:
        s = settings_dict['Preprocessing']['Resampling_spacing']
        raise ae.WORCValueError(
            f'Resampling spacing should be three elements, got {s}')

    return settings_dict
def construct_classifier(config):
    """Interface to create classification.

    Different classifications can be created using this common interface

    Parameters
    ----------
        config: dict, mandatory
                Contains the required config settings. See the Github Wiki for
                all available fields.

    Returns:
        Constructed classifier

    """
    # NOTE: Function is not working anymore for regression: need
    # to move param grid creation to the create_param_grid function
    max_iter = config['max_iter']
    if 'SVM' in config['classifiers']:
        # Support Vector Machine
        classifier = construct_SVM(config)

    elif config['classifiers'] == 'SVR':
        # Support Vector Regression
        classifier = construct_SVM(config, True)

    elif config['classifiers'] == 'AdaBoostClassifier':
        # AdaBoost classifier
        learning_rate = config['AdaBoost_learning_rate']
        n_estimators = config['AdaBoost_n_estimators']
        classifier = AdaBoostClassifier(n_estimators=n_estimators,
                                        learning_rate=learning_rate)

    elif config['classifiers'] == 'AdaBoostRegressor':
        # AdaBoost regressor
        learning_rate = config['AdaBoost_learning_rate']
        n_estimators = config['AdaBoost_n_estimators']
        classifier = AdaBoostRegressor(n_estimators=n_estimators,
                                       learning_rate=learning_rate)

    elif config['classifiers'] == 'XGBClassifier':
        # XGB Classifier
        max_depth = config['XGB_max_depth']
        learning_rate = config['XGB_learning_rate']
        gamma = config['XGB_gamma']
        min_child_weight = config['XGB_min_child_weight']
        boosting_rounds = config['XGB_boosting_rounds']
        colsample_bytree = config['XGB_colsample_bytree']
        classifier = XGBClassifier(max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   gamma=gamma,
                                   min_child_weight=min_child_weight,
                                   n_estimators=boosting_rounds,
                                   colsample_bytree=colsample_bytree)

    elif config['classifiers'] == 'XGBRegressor':
        # XGB Regressor
        max_depth = config['XGB_max_depth']
        learning_rate = config['XGB_learning_rate']
        gamma = config['XGB_gamma']
        min_child_weight = config['XGB_min_child_weight']
        boosting_rounds = config['XGB_boosting_rounds']
        colsample_bytree = config['XGB_colsample_bytree']
        classifier = XGBRegressor(max_depth=max_depth,
                                  learning_rate=learning_rate,
                                  gamma=gamma,
                                  min_child_weight=min_child_weight,
                                  n_estimators=boosting_rounds,
                                  colsample_bytree=colsample_bytree)

    elif config['classifiers'] == 'RF':
        # Random forest kernel
        classifier = RandomForestClassifier(
            verbose=0,
            class_weight='balanced',
            n_estimators=config['RFn_estimators'],
            min_samples_split=config['RFmin_samples_split'],
            max_depth=config['RFmax_depth'],
            random_state=config['random_seed'])

    elif config['classifiers'] == 'RFR':
        # Random forest kernel regression
        classifier = RandomForestRegressor(
            verbose=0,
            n_estimators=config['RFn_estimators'],
            min_samples_split=config['RFmin_samples_split'],
            max_depth=config['RFmax_depth'],
            random_state=config['random_seed'])

    elif config['classifiers'] == 'ElasticNet':
        # Elastic Net Regression
        classifier = ElasticNet(alpha=config['ElasticNet_alpha'],
                                l1_ratio=config['ElasticNet_l1_ratio'],
                                max_iter=max_iter,
                                random_state=config['random_seed'])

    elif config['classifiers'] == 'Lasso':
        # LASSO Regression
        classifier = Lasso(max_iter=max_iter,
                           random_state=config['random_seed'])

    elif config['classifiers'] == 'SGD':
        # Stochastic Gradient Descent classifier
        classifier = SGDClassifier(max_iter=config['max_iter'],
                                   alpha=config['SGD_alpha'],
                                   l1_ratio=config['SGD_l1_ratio'],
                                   loss=config['SGD_loss'],
                                   penalty=config['SGD_penalty'],
                                   random_state=config['random_seed'])

    elif config['classifiers'] == 'SGDR':
        # Stochastic Gradient Descent regressor
        classifier = SGDRegressor(max_iter=config['max_iter'],
                                  alpha=config['SGD_alpha'],
                                  l1_ratio=config['SGD_l1_ratio'],
                                  loss=config['SGD_loss'],
                                  penalty=config['SGD_penalty'],
                                  random_state=config['random_seed'])

    elif config['classifiers'] == 'LR':
        # Logistic Regression
        if config['LRpenalty'] == 'elasticnet' or config['LRpenalty'] == 'l1':
            # saga solver required for elasticnet
            if config['LR_solver'] != 'saga':
                p = config['LRpenalty']
                print(f"[WORC Warning] {p} penalty requires saga " +\
                      f"solver, got {config['LR_solver']}. Changing solver.")
                config['LR_solver'] = 'saga'

        classifier = LogisticRegression(max_iter=max_iter,
                                        penalty=config['LRpenalty'],
                                        solver=config['LR_solver'],
                                        l1_ratio=config['LR_l1_ratio'],
                                        C=config['LRC'],
                                        random_state=config['random_seed'])
    elif config['classifiers'] == 'GaussianNB':
        # Naive Bayes classifier using Gaussian distributions
        classifier = GaussianNB()

    elif config['classifiers'] == 'ComplementNB':
        # Complement Naive Bayes classifier
        classifier = ComplementNB()

    elif config['classifiers'] == 'LDA':
        # Linear Discriminant Analysis
        if config['LDA_solver'] == 'svd':
            # Shrinkage does not work with svd solver
            shrinkage = None
        else:
            shrinkage = config['LDA_shrinkage']

        classifier = LDA(solver=config['LDA_solver'], shrinkage=shrinkage)

    elif config['classifiers'] == 'QDA':
        # Quadratic Discriminant Analysis
        classifier = QDA(reg_param=config['QDA_reg_param'])
    else:
        message = ('Classifier {} unknown.').format(str(config['classifiers']))
        raise ae.WORCKeyError(message)

    return classifier
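
# Sketch of the logistic regression branch above: an 'elasticnet' penalty
# forces the 'saga' solver. All values below are illustrative.
lr_config = {
    'classifiers': 'LR',
    'max_iter': 10000,
    'LRpenalty': 'elasticnet',
    'LR_solver': 'lbfgs',   # will be switched to 'saga' with a warning
    'LR_l1_ratio': 0.5,
    'LRC': 1.0,
    'random_seed': 42,
}
lr_classifier = construct_classifier(lr_config)
print(lr_classifier.solver)  # 'saga'
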
Example #21
def createfixedsplits(label_file=None,
                      label_type=None,
                      patient_IDs=None,
                      test_size=0.2,
                      N_iterations=1,
                      regression=False,
                      stratify=None,
                      modus='singlelabel',
                      output=None):
    '''
    Create fixed splits for a cross validation.
    '''
    # Check whether input is valid
    if patient_IDs is None:
        if label_file is not None and label_type is not None:
            # Read the label file
            label_data = load_labels(label_file, label_type)
            patient_IDs = label_data['patient_IDs']

            # Create the stratification object
            if modus == 'singlelabel':
                stratify = label_data['label']
            elif modus == 'multilabel':
                # Create a stratification object from the labels
                # Label = 0 means no label equals one
                # Other label numbers refer to the label name that is 1
                stratify = list()
                labels = label_data['label']
                for pnum in range(0, len(labels[0])):
                    plabel = 0
                    for lnum, slabel in enumerate(labels):
                        if slabel[pnum] == 1:
                            plabel = lnum + 1
                    stratify.append(plabel)

            else:
                raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))
        else:
            raise ae.WORCIOError(
                'Either a label file and label type or patient_IDs need to be provided!'
            )

    pd_dict = dict()
    for i in range(N_iterations):
        print(f'Splitting iteration {i + 1} / {N_iterations}')
        # Create a random seed for the splitting
        random_seed = np.random.randint(5000)

        # Define stratification
        unique_patient_IDs, unique_indices =\
            np.unique(np.asarray(patient_IDs), return_index=True)
        if regression:
            unique_stratify = None
        else:
            unique_stratify = [stratify[i] for i in unique_indices]

        # Split; raise an error when the dataset is too small for the split ratios
        try:
            unique_PID_train, indices_PID_test\
                = train_test_split(unique_patient_IDs,
                                   test_size=test_size,
                                   random_state=random_seed,
                                   stratify=unique_stratify)
        except ValueError as e:
            e = str(e) + ' Increase the size of your test set.'
            raise ae.WORCValueError(e)

        # Check for all IDs if they are in test or training
        indices_train = list()
        indices_test = list()
        patient_ID_train = list()
        patient_ID_test = list()
        for num, pid in enumerate(patient_IDs):
            if pid in unique_PID_train:
                indices_train.append(num)

                # Make sure we get a unique ID
                if pid in patient_ID_train:
                    n = 1
                    while str(pid + '_' + str(n)) in patient_ID_train:
                        n += 1
                    pid = str(pid + '_' + str(n))
                patient_ID_train.append(pid)
            else:
                indices_test.append(num)

                # Make sure we get a unique ID
                if pid in patient_ID_test:
                    n = 1
                    while str(pid + '_' + str(n)) in patient_ID_test:
                        n += 1
                    pid = str(pid + '_' + str(n))
                patient_ID_test.append(pid)

        # Add to train object
        pd_dict[str(i) + '_train'] = patient_ID_train

        # Test object has to be same length as training object
        extras = [""] * (len(patient_ID_train) - len(patient_ID_test))
        patient_ID_test.extend(extras)
        pd_dict[str(i) + '_test'] = patient_ID_test

    # Convert into pandas dataframe for easy use and conversion
    df = pd.DataFrame(pd_dict)

    # Write output if required
    if output is not None:
        print("Writing Output.")
        df.to_csv(output)

    return df
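
# A hedged usage sketch for createfixedsplits with toy patient IDs and labels;
# 'fixedsplits.csv' is a hypothetical output path. When patient_IDs is passed
# directly, stratify must be supplied alongside it (unless regression=True).
example_patient_IDs = np.asarray(['pat-001', 'pat-002', 'pat-003', 'pat-004',
                                  'pat-005', 'pat-006', 'pat-007', 'pat-008'])
example_stratify = [0, 1, 0, 1, 0, 1, 0, 1]
splits = createfixedsplits(patient_IDs=example_patient_IDs,
                           stratify=example_stratify,
                           test_size=0.25,
                           N_iterations=2,
                           output='fixedsplits.csv')
print(splits)
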
Example #22
def crossval(config, label_data, image_features,
             param_grid=None, use_fastr=False,
             fastr_plugin=None, tempsave=False,
             fixedsplits=None, ensemble={'Use': False}, outputfolder=None,
             modus='singlelabel'):
    """
    Constructs multiple individual classifiers based on the label settings

    Parameters
    ----------
    config: dict, mandatory
            Dictionary with config settings. See the Github Wiki for the
            available fields and formatting.

    label_data: dict, mandatory
            Should contain the following:
            patient_IDs (list): IDs of the patients, used to keep track of test and
                     training sets, and label data
            label (list): List of lists, where each list contains the
                                   label status for that patient for each
                                   label
            label_name (list): Contains the different names that are stored
                                  in the label object

    image_features: numpy array, mandatory
            Consists of a tuple of two lists for each patient:
            (feature_values, feature_labels)

    param_grid: dictionary, optional
            Contains the parameters and their values which are used in the
            grid or randomized search hyperparameter optimization. See the
            construct_classifier function for some examples.

    use_fastr: boolean, default False
            If False, parallel execution through Joblib is used for fast
            execution of the hyperparameter optimization. Especially suited
            for execution on multicore (H)PCs. The settings used are
            specified in the config.ini file in the IOparser folder, which you
            can adjust to your system.

            If True, fastr is used to split the hyperparameter optimization in
            separate jobs. Parameters for the splitting can be specified in the
            config file. Especially suited for clusters.

    fastr_plugin: string, default None
            Determines which plugin is used for fastr executions.
            When None, uses the default plugin from the fastr config.

    tempsave: boolean, default False
            If True, create a .hdf5 file after each cross-validation iteration
            containing the classifier and results from that split. This is
            written to the GSout folder in your fastr output mount. Regardless
            of this setting, the result of all combined cross-validations is
            saved to a single .hdf5 file.

    fixedsplits: string, optional
            By default, random split cross validation is used to train and
            evaluate the machine learning methods. Optionally, you can provide
            a .xlsx file containing fixed splits to be used. See the Github Wiki
            for the format.

    ensemble: dictionary, optional
            Contains the configuration for constructing an ensemble.

    modus: string, default 'singlelabel'
            Determine whether one-vs-all classification (or regression) for
            each single label is used ('singlelabel') or if multilabel
            classification is performed ('multilabel').

    Returns
    ----------
    panda_data: pandas dataframe
            Contains all information on the trained classifier.

    """
    if tempsave:
        import fastr


    # Define all possible regressors
    regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']

    # Process input data
    patient_IDs = label_data['patient_IDs']
    label_value = label_data['label']
    label_name = label_data['label_name']

    if outputfolder is None:
        logfilename = os.path.join(os.getcwd(), 'classifier.log')
    else:
        logfilename = os.path.join(outputfolder, 'classifier.log')
    print("Logging to file " + str(logfilename))

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    logging.basicConfig(filename=logfilename, level=logging.DEBUG)
    N_iterations = config['CrossValidation']['N_iterations']
    test_size = config['CrossValidation']['test_size']

    classifier_labelss = dict()
    logging.debug('Starting classifier')

    # We only need one label instance, assuming they are all the same
    feature_labels = image_features[0][1]

    # Check if we need to use fixedsplits:
    if fixedsplits is not None and '.xlsx' in fixedsplits:
        # fixedsplits = '/home/mstarmans/Settings/RandomSufflingOfData.xlsx'
        wb = xlrd.open_workbook(fixedsplits)
        wb = wb.sheet_by_index(1)

    if modus == 'singlelabel':
        print('Performing Single class classification.')
        logging.debug('Performing Single class classification.')
    elif modus == 'multilabel':
        print('Performing Multi label classification.')
        logging.debug('Performing Multi label classification.')
        label_value = [label_value]
        label_name = [label_name]
    else:
        m = ('{} is not a valid modus!').format(modus)
        logging.debug(m)
        raise ae.WORCKeyError(m)

    for i_class, i_name in zip(label_value, label_name):
        if modus == 'singlelabel':
            i_class_temp = i_class.ravel()

        save_data = list()

        for i in range(0, N_iterations):
            print(('Cross validation iteration {} / {} .').format(str(i + 1), str(N_iterations)))
            logging.debug(('Cross validation iteration {} / {} .').format(str(i + 1), str(N_iterations)))
            random_seed = np.random.randint(5000)

            # Split into test and training set, where the percentage of each
            # label is maintained
            if any(clf in regressors for clf in param_grid['classifiers']):
                # We cannot do a stratified shuffle split with regression
                stratify = None
            else:
                if modus == 'singlelabel':
                    stratify = i_class_temp
                elif modus == 'multilabel':
                    # Create a stratification object from the labels
                    # Label = 0 means no label equals one
                    # Other label numbers refer to the label name that is 1
                    stratify = list()
                    for pnum in range(0, len(i_class[0])):
                        plabel = 0
                        for lnum, slabel in enumerate(i_class):
                            if slabel[pnum] == 1:
                                plabel = lnum + 1
                        stratify.append(plabel)

                    # Sklearn multiclass requires rows to be objects/patients
                    # i_class = i_class.reshape(i_class.shape[1], i_class.shape[0])
                    i_class_temp = np.zeros((i_class.shape[1], i_class.shape[0]))
                    for n_patient in range(0, i_class.shape[1]):
                        for n_label in range(0, i_class.shape[0]):
                            i_class_temp[n_patient, n_label] = i_class[n_label, n_patient]
                else:
                    raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

            if fixedsplits is None:
                # Use Random Split. Split per patient, not per sample
                unique_patient_IDs, unique_indices =\
                    np.unique(np.asarray(patient_IDs), return_index=True)
                if any(clf in regressors for clf in param_grid['classifiers']):
                    unique_stratify = None
                else:
                    unique_stratify = [stratify[i] for i in unique_indices]

                try:
                    unique_PID_train, indices_PID_test\
                        = train_test_split(unique_patient_IDs,
                                           test_size=test_size,
                                           random_state=random_seed,
                                           stratify=unique_stratify)
                except ValueError as e:
                    e = str(e) + ' Increase the size of your validation set.'
                    raise ae.WORCValueError(e)

                # Check for all IDs if they are in test or training
                indices_train = list()
                indices_test = list()
                patient_ID_train = list()
                patient_ID_test = list()
                for num, pid in enumerate(patient_IDs):
                    if pid in unique_PID_train:
                        indices_train.append(num)

                        # Make sure we get a unique ID
                        if pid in patient_ID_train:
                            n = 1
                            while str(pid + '_' + str(n)) in patient_ID_train:
                                n += 1
                            pid = str(pid + '_' + str(n))
                        patient_ID_train.append(pid)
                    else:
                        indices_test.append(num)

                        # Make sure we get a unique ID
                        if pid in patient_ID_test:
                            n = 1
                            while str(pid + '_' + str(n)) in patient_ID_test:
                                n += 1
                            pid = str(pid + '_' + str(n))
                        patient_ID_test.append(pid)

                # Split features and labels accordingly
                X_train = [image_features[i] for i in indices_train]
                X_test = [image_features[i] for i in indices_test]
                if modus == 'singlelabel':
                    Y_train = i_class_temp[indices_train]
                    Y_test = i_class_temp[indices_test]
                elif modus == 'multilabel':
                    Y_train = i_class_temp[indices_train, :]
                    Y_test = i_class_temp[indices_test, :]
                else:
                    raise ae.WORCKeyError('{} is not a valid modus!'.format(modus))

            else:
                # Use pre defined splits
                indices = wb.col_values(i)
                indices = [int(j) for j in indices[1:]]  # First element is "Iteration x"
                train = indices[0:121]
                test = indices[121:]

                # Convert the numbers to the correct indices
                ind_train = list()
                for j in train:
                    success = False
                    for num, p in enumerate(patient_IDs):
                        if str(j).zfill(3) == p[0:3]:
                            ind_train.append(num)
                            success = True
                    if not success:
                        raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")

                ind_test = list()
                for j in test:
                    success = False
                    for num, p in enumerate(patient_IDs):
                        if str(j).zfill(3) == p[0:3]:
                            ind_test.append(num)
                            success = True
                    if not success:
                        raise ae.WORCIOError("Patient " + str(j).zfill(3) + " is not included!")

                X_train = np.asarray(image_features)[ind_train].tolist()
                Y_train = np.asarray(i_class_temp)[ind_train].tolist()
                patient_ID_train = patient_IDs[ind_train]
                X_test = np.asarray(image_features)[ind_test].tolist()
                Y_test = np.asarray(i_class_temp)[ind_test].tolist()
                patient_ID_test = patient_IDs[ind_test]
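                # Note: indexing patient_IDs with a list of indices as done
                # above assumes patient_IDs is a numpy array; a plain Python
                # list would raise a TypeError here.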

            # Find the best hyperparameters and construct the classifier
            config['HyperOptimization']['use_fastr'] = use_fastr
            config['HyperOptimization']['fastr_plugin'] = fastr_plugin
            n_cores = config['General']['Joblib_ncores']
            trained_classifier = random_search_parameters(features=X_train,
                                                          labels=Y_train,
                                                          param_grid=param_grid,
                                                          n_cores=n_cores,
                                                          **config['HyperOptimization'])
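            # random_search_parameters is assumed to draw random samples from
            # param_grid, evaluate them on internal train/validation splits
            # (through joblib or fastr, depending on use_fastr), and return a
            # fitted search object containing the ranked workflows.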

            # Create an ensemble if required
            if ensemble['Use']:
                trained_classifier.create_ensemble(X_train, Y_train)
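            # create_ensemble is assumed to combine a number of the top-ranked
            # workflows from the search into a single ensemble and refit them
            # on the full training set.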

            # Only keep the feature values; the feature labels are identical
            # for every sample and are stored once separately at the end
            X_train = [x[0] for x in X_train]
            X_test = [x[0] for x in X_test]

            temp_save_data = (trained_classifier, X_train, X_test, Y_train,
                              Y_test, patient_ID_train, patient_ID_test, random_seed)

            save_data.append(temp_save_data)

            # Create a temporary save
            if tempsave:
                panda_labels = ['trained_classifier', 'X_train', 'X_test', 'Y_train', 'Y_test',
                                'config', 'patient_ID_train', 'patient_ID_test',
                                'random_seed']

                panda_data_temp =\
                    pd.Series([trained_classifier, X_train, X_test, Y_train,
                               Y_test, config, patient_ID_train,
                               patient_ID_test, random_seed],
                              index=panda_labels,
                              name='Constructed crossvalidation')

                panda_data = pd.DataFrame(panda_data_temp)
                n = 0
                filename = os.path.join(fastr.config.mounts['tmp'], 'GSout', 'RS_' + str(i) + '.hdf5')
                while os.path.exists(filename):
                    n += 1
                    filename = os.path.join(fastr.config.mounts['tmp'], 'GSout', 'RS_' + str(i + n) + '.hdf5')

                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))

                panda_data.to_hdf(filename, 'SVMdata')
                del panda_data, panda_data_temp
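                # A temporary save like this can be loaded back with pandas,
                # for example (illustrative sketch):
                #
                #   saved = pd.read_hdf(filename, 'SVMdata')
                #   clf = saved['Constructed crossvalidation']['trained_classifier']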

        # Gather the results of all cross-validation iterations
        [classifiers, X_train_set, X_test_set, Y_train_set, Y_test_set,
         patient_ID_train_set, patient_ID_test_set, seed_set] =\
            zip(*save_data)

        panda_labels = ['classifiers', 'X_train', 'X_test', 'Y_train', 'Y_test',
                        'config', 'patient_ID_train', 'patient_ID_test',
                        'random_seed', 'feature_labels']

        panda_data_temp =\
            pd.Series([classifiers, X_train_set, X_test_set, Y_train_set,
                       Y_test_set, config, patient_ID_train_set,
                       patient_ID_test_set, seed_set, feature_labels],
                      index=panda_labels,
                      name='Constructed crossvalidation')

        if modus == 'singlelabel':
            i_name = ''.join(i_name)
        elif modus == 'multilabel':
            i_name = ','.join(i_name)

        classifier_labelss[i_name] = panda_data_temp

    panda_data = pd.DataFrame(classifier_labelss)

    return panda_data
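# Illustrative sketch only: the DataFrame returned above has one column per
# label name and the panda_labels as its index, where each field holds one
# entry per cross-validation iteration. The mock below uses a hypothetical
# label name ('Label1') and dummy values purely to demonstrate how such a
# result can be inspected; it is not produced by the code above.
import pandas as pd

mock_cv = pd.DataFrame({
    'Label1': pd.Series({'classifiers': ('clf_0', 'clf_1'),
                         'Y_test': ([0, 1, 1], [1, 0, 0]),
                         'random_seed': (42, 43)})
})

cv_results = mock_cv['Label1']
for iteration, (y_test, seed) in enumerate(zip(cv_results['Y_test'],
                                               cv_results['random_seed'])):
    print(f'Iteration {iteration}: seed {seed}, {len(y_test)} test samples')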