Example #1
def fold_maker(X, fold_choice='default', n_fold=4, n_groups=2):
    if fold_choice == 'default':
        folds = KFold(n_splits=n_fold, shuffle=False)
        fold_iter = folds.split(X)
        fold_iter = shuffle_group(fold_iter)
    elif fold_choice == 'earthquake':
        earthquake_id = data_loader.load_earthquake_id()
        group_kfold = LeaveOneGroupOut()
        fold_iter = group_kfold.split(X, groups=earthquake_id)
        # fold_iter = shuffle_group(fold_iter)
        # fold_iter = min_valid_filter(fold_iter)
    elif fold_choice == 'eqCombo':
        earthquake_id = eqComboMaker(n_fold)
        group_kfold = LeaveOneGroupOut()
        fold_iter = group_kfold.split(X, groups=earthquake_id)
        fold_iter = shuffle_group(fold_iter)
    elif fold_choice == 'k-earthquake':
        earthquake_id = data_loader.load_earthquake_id()
        group_kfold = LeavePGroupsOut(n_groups=n_groups)
        fold_iter = group_kfold.split(X, groups=earthquake_id)
        fold_iter = min_valid_filter(fold_iter)
    elif fold_choice == 'customize':
        fold = CVPipe()
        fold_iter = fold.fold_iter(num_fold=n_fold, mini_quake_prob=0.3)
    else:
        raise AttributeError(f"Not support CV {fold_choice} yet...")

    return (list(fold_iter), fold_choice)
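The helpers above (shuffle_group, min_valid_filter, data_loader, eqComboMaker, CVPipe) are project-specific and not shown. A minimal, self-contained sketch of the LeaveOneGroupOut pattern that the 'earthquake' branch wraps, with toy data standing in for the real features and earthquake ids:

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.arange(12).reshape(6, 2)                # toy feature matrix
earthquake_id = np.array([0, 0, 1, 1, 2, 2])   # one group id per sample
logo = LeaveOneGroupOut()
for train_idx, valid_idx in logo.split(X, groups=earthquake_id):
    # each fold holds out exactly one earthquake
    print(train_idx, valid_idx)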
Example #2
def getProbsThread(nthread, clf, data, label, allAuthors, modeldir, saveModel):
    crossval = LeaveOneGroupOut()

    crossval.get_n_splits(groups=label)

    prob_per_author = [[0] * (len(allAuthors)) for i in range(len(allAuthors))]

    scores = Parallel(n_jobs=nthread)(
        delayed(getProbsTrainTest)(clf, data, label, train, test, modeldir,
                                   saveModel)
        for train, test in crossval.split(data, label, groups=label))

    for train, test in crossval.split(data, label, groups=label):
        anAuthor = int(label[test[0]])
        train_data_label = label[train]
        trainAuthors = list(set(train_data_label))
        # test_data_label = label[test]
        nTestDoc = len(scores)  # len(test_data_label)
        for j in range(nTestDoc):
            for i in range(len(trainAuthors)):
                try:
                    prob_per_author[anAuthor][int(
                        trainAuthors[i])] += scores[anAuthor - 1][j][i]
                except IndexError:
                    continue

        for i in range(len(trainAuthors)):
            prob_per_author[anAuthor][int(trainAuthors[i])] /= nTestDoc
    return prob_per_author
Example #3
def ATE_fuzzy_subsets(Xg,
                      yg,
                      ys,
                      groups,
                      n_subsets,
                      subset_size,
                      n_jobs,
                      t,
                      rotated_data=False):

    # Make subsets
    if subset_size == 1:
        gss = LeaveOneGroupOut()
        subsets = gss.split(Xg, yg, groups)
    elif subset_size < 14:
        gss = GroupShuffleSplit(n_subsets,
                                test_size=subset_size,
                                random_state=0)
        subsets = gss.split(Xg, yg, groups)
    else:  # subset_size = 14
        subsets = [(None, np.arange(len(Xg)))]

    output = Parallel(n_jobs=n_jobs)(
        delayed(ATE_Fuzzy_Matching)(Xg[idx], yg[idx], ys[idx], t)
        for _, idx in subsets)
    # Save a 2D array with ATEs and n_employees
    if rotated_data:
        np.savetxt(
            fname=f'../results/ATE_subsets/fuzzy/{subset_size}DB_rotated.csv',
            X=np.array(output))
    else:
        np.savetxt(fname=f'../results/ATE_subsets/fuzzy/{subset_size}DB.csv',
                   X=np.array(output))
    return
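When subset_size is between 2 and 13, GroupShuffleSplit draws n_subsets random subsets of whole groups; an integer test_size is interpreted as a number of groups, not samples. A small hedged sketch of that behaviour with made-up groups:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.zeros((8, 1))
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])
gss = GroupShuffleSplit(n_splits=2, test_size=2, random_state=0)
for _, idx in gss.split(X, groups=groups):
    print(np.unique(groups[idx]))  # two whole groups per subset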
Example #4
 def pls_cv(self,ncomp_range=range(1,21),plot=False,verbose=False,
            osc_params=(10,1)):
     # Separating X from Y for PLS
     X=self.df[self.freqs].to_numpy()
     Y=self.df[self.y_name].to_numpy().reshape(-1, 1)
     sample_std=np.std(self.df[self.y_name])
     
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")
     
     # Array for storing CV errors
     cv_RMSE_all=np.zeros([len(folds),len(ncomp_range)])
     i=0
     for train, val in folds:
         # If OSC model specified
         if len(osc_params)==2:
             osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
             osc.fit(X[train], Y[train])
             X_train_osc=osc.X_osc
             X_val_osc=osc.transform(X[val])
         j=0
         for ncomp in ncomp_range:
             pls = PLSRegression(n_components=ncomp,scale=False)
             if len(osc_params)==2:
                 pls.fit(X_train_osc, Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                     Y[val], pls.predict(X_val_osc))**0.5
             else:
                 pls.fit(X[train], Y[train])
                 cv_RMSE_all[i,j]=metrics.mean_squared_error(
                         Y[val], pls.predict(X[val]))**0.5
             j=j+1
         i=i+1
     # Printing and plotting CV results
     cv_RMSE_ncomp=np.mean(cv_RMSE_all,axis=0)
     cv_RPD_ncomp=sample_std/cv_RMSE_ncomp
     if plot:
         fig = plt.figure(figsize=(12,8))
         plt.gca().xaxis.grid(True)
         plt.xticks(ncomp_range)
         plt.ylabel("RPD")
         plt.xlabel("Number of components")
         plt.plot(ncomp_range,cv_RPD_ncomp)
     # Best model
     rpd_best=max(cv_RPD_ncomp)
     ncomp_best=ncomp_range[cv_RMSE_ncomp.argmin()]
     if verbose:
         print("Best RMSE: ",min(cv_RMSE_ncomp))
         print("Best RPD: ",max(cv_RPD_ncomp))
         print("Number of latent components: ",ncomp_range[cv_RMSE_ncomp.argmin()])
     return (ncomp_best,rpd_best)
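RPD here is the residual predictive deviation: the standard deviation of the reference values divided by the cross-validated RMSE, so maximizing RPD and minimizing RMSE select the same component count. A tiny worked example of the two figures:

import numpy as np

y_val = np.array([1.0, 2.0, 3.0, 4.0])
y_hat = np.array([1.1, 1.9, 3.2, 3.8])
rmse = np.mean((y_val - y_hat) ** 2) ** 0.5  # same formula as above
rpd = np.std(y_val) / rmse
print(rmse, rpd)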
Example #5
 def fssregression_cv(self,inner_cv="kfold",inner_cv_param=5,maxvar=2,verbose=False,
                      osc_params=(10,1)):
     #inner CV can be "kfold" or "none"
     # Separating X from Y for PLS
     X=self.df[self.freqs]
     Y=self.df[self.y_name]
     
     # Create list for selected variables
     best_vars=[]
     reg = FSSRegression(inner_cv,inner_cv_param,maxvar)
     # CV based on measurement day
     if self.cval=="MD":
         cv = LeaveOneGroupOut()
         folds=list(cv.split(X=X,y=Y,groups=self.df[self.date_name]))
     # kfold CV
     elif self.cval=="kfold":
         cv = KFold(n_splits=self.cval_param)
         folds=list(cv.split(X))
     else:
         raise InputError("Invalid CV type!")  
     i=0
     #Array for cv values
     cv_RMSE_all=np.zeros([len(folds)])
     for train,val in folds:
         # If OSC model specified
         if len(osc_params)==2:
             osc=OSC(nicomp=osc_params[0],ncomp=osc_params[1])
             # FSSR needs column names, so it uses pandas, but osc uses numpy arrays
             osc.fit(X.iloc[train].to_numpy(), Y.iloc[train].to_numpy().reshape(-1,1))
             X_train_osc=pd.DataFrame(data=osc.X_osc,columns=self.freqs)
             X_val_osc=pd.DataFrame(data=osc.transform(X.iloc[val].to_numpy()),columns=self.freqs)
             # Fit and predict
             reg.fit(X_train_osc, Y.iloc[train])
             cv_RMSE_all[i]=metrics.mean_squared_error(
                     Y.iloc[val], reg.predict(X_val_osc))**0.5
             best_vars.append(reg.bestvar) 
         else:
             reg.fit(X.iloc[train], Y.iloc[train])
             cv_RMSE_all[i]=metrics.mean_squared_error(
                     Y.iloc[val], reg.predict(X.iloc[val]))**0.5
             best_vars.append(reg.bestvar)        
         i=i+1
     cv_RMSE=np.mean(cv_RMSE_all)
     rpd=np.std(self.df[self.y_name])/cv_RMSE
     if verbose:
         print("RMSE: ",cv_RMSE)
         print("RPD: ",rpd)
         print("Selected freqs: ",best_vars)
         k=0
         for day in self.df[self.date_name].unique():    
             print("Date: {0}, Measurements: {1:.0f}, RMSE: {2:.2f}, selected vars: {3}"
                   .format(
                     np.datetime_as_string(day,unit='D'),
                     sum(self.df[self.date_name]==day),
                         cv_RMSE_all[k],
                         len(best_vars[k])))
             k=k+1
           
     return(rpd)
Example #6
def ATE_subsets(Xs,
                Xg,
                ys,
                yg,
                groups,
                n_subsets,
                subset_size,
                n_splits,
                n_jobs,
                matched_data,
                rotated_data=False):
    def foo(Xs, Xg, ys, yg):

        # For each subset, make splits
        # random_state has no effect without shuffle=True and raises an
        # error in recent scikit-learn versions, so it is omitted here
        kf = KFold(n_splits=n_splits)
        # Store ITEs
        ite = []
        # For each split
        for idx1, idx2 in kf.split(Xs):
            # Init models
            ms = HistGradientBoostingRegressor()
            mg = HistGradientBoostingClassifier()
            # Train models
            ms.fit(Xs[idx1], ys[idx1])
            mg.fit(Xg[idx1], yg[idx1])
            # Make estimates on test set
            ite.append(
                AIPW_estimator(ms, mg, Xs[idx2], Xg[idx2], ys[idx2], yg[idx2]))
        # Return mean ite and n_employees
        return np.concatenate(ite).mean(), len(Xs)

    # Make subsets
    if subset_size == 1:
        gss = LeaveOneGroupOut()
        subsets = gss.split(Xs, ys, groups)
    elif subset_size < 14:
        gss = GroupShuffleSplit(n_subsets,
                                test_size=subset_size,
                                random_state=0)
        subsets = gss.split(Xs, ys, groups)
    else:  # subset_size = 14
        subsets = [(None, np.arange(len(Xs)))]

    output = Parallel(n_jobs=n_jobs)(
        delayed(foo)(Xs[idx], Xg[idx], ys[idx], yg[idx]) for _, idx in subsets)
    # Save a 2D array with ATEs and n_employees
    if rotated_data:
        np.savetxt(
            fname=
            f'../results/ATE_subsets/model/{subset_size}DB_matched={matched_data}_rotated.csv',
            X=np.array(output))
    else:
        np.savetxt(
            fname=
            f'../results/ATE_subsets/model/{subset_size}DB_matched={matched_data}.csv',
            X=np.array(output))
    return output
Example #7
    def make_leave_out(X, y=None, p=5, strategy=None, group=None):
        ### strategy = None / 'group'

        # group strategy
        if strategy == 'group':
            splitter = LeaveOneGroupOut() if p == 1 else LeavePGroupsOut(p)
            if group is None:
                raise ValueError('Please provide the group parameter.')
            idx_generator = splitter.split(X, y=y, groups=group)
        # no specific strategy
        else:
            splitter = LeaveOneOut() if p == 1 else LeavePOut(p)
            idx_generator = splitter.split(X, y=y)
        return idx_generator
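The p parameter switches between the one-out and P-out variants; for grouped data the number of splits grows combinatorially with p. A quick hedged check of the fold counts:

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut

groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])
print(LeaveOneGroupOut().get_n_splits(groups=groups))           # 4 folds
print(LeavePGroupsOut(n_groups=2).get_n_splits(groups=groups))  # C(4, 2) = 6 folds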
Example #8
    def train_kfold(self, save2disk=False):

        # Build Model
        self._build()

        # Training with cross validation
        self.logger.log_info('Cross Validation training with LeaveOneGroupOut sklearn method')
        logo = LeaveOneGroupOut()
        self._create_groups()
        for train_index, test_index in logo.split(self.x_train, self.y_train, self.groups):
            x_train, x_test = self.x_train[train_index], self.x_train[test_index]
            y_train, y_test = self.y_train[train_index], self.y_train[test_index]

            self._define_jobs(self.clf, self.config["train"]["n_jobs"])
            self.clf.fit(x_train, y_train)
            self._predict(x_test)

            fold_acc = accuracy_score(y_test, self.predictions)
            self.val_fold_scores_.append(fold_acc)

        print('Mean validation accuracy is {}'.format(np.mean(self.val_fold_scores_)))

        # Save model
        if save2disk:
            self._save_model()
        return self.val_fold_scores_
Example #9
def loocv_split(
    tracker: Tracker[T],
) -> Iterator[
    Tuple[
        Tuple[List[Config[T]], List[Performance]],
        Tuple[List[Config[T]], List[Performance]],
    ]
]:
    """
    Iterates over the configurations and associated performances obtained from
    a collector. For each item it yields, it leaves out
    configurations/performances for a single dataset for testing and provides
    the configurations/performances for training.

    Args:
        tracker: The tracker to retrieve data from.

    Yields:
        The training data, including configurations and performances, and the test data, including
        configurations and performances.
    """
    data = tracker.get_evaluations()
    groups = [c.dataset.name() for c in data.configurations]

    # Split the data according to the datasets
    loocv = LeaveOneGroupOut()
    for I_train, I_test in loocv.split(data.configurations, groups=groups):
        X_train = [data.configurations[i] for i in I_train]
        y_train = [data.performances[i] for i in I_train]

        X_test = [data.configurations[i] for i in I_test]
        y_test = [data.performances[i] for i in I_test]

        yield (X_train, y_train), (X_test, y_test)
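The grouping trick above also works with plain Python lists, since LeaveOneGroupOut only needs the sample count of X and an array-like of group labels. A hedged stand-alone sketch with dataset names as groups:

from sklearn.model_selection import LeaveOneGroupOut

configs = ['cfg1', 'cfg2', 'cfg3', 'cfg4', 'cfg5']
groups = ['ds_a', 'ds_a', 'ds_b', 'ds_b', 'ds_c']  # dataset per config
for tr, te in LeaveOneGroupOut().split(configs, groups=groups):
    print([configs[i] for i in tr], '->', [configs[i] for i in te])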
Example #10
class DKULeaveOneGroupOut(object):
    def __init__(self, column_name):
        self.column_name = column_name
        self.splitter = LeaveOneGroupOut()

    def set_column_labels(self, column_labels):
        self.column_labels = column_labels

    def get_n_splits(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))

        groups_array = X[:, column_idx]

        ret = self.splitter.get_n_splits(X, y, groups_array)
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))

        groups_array = X[:, column_idx]

        return self.splitter.split(X, y, groups_array)
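A hedged usage sketch for the wrapper above; the column labels and group column name are illustrative. Note that the group labels are read out of a column of X itself:

import numpy as np

X = np.array([[1.0, 'siteA'], [2.0, 'siteA'],
              [3.0, 'siteB'], [4.0, 'siteB']], dtype=object)
y = np.array([0, 1, 0, 1])
cv = DKULeaveOneGroupOut(column_name='site')
cv.set_column_labels(['value', 'site'])
print(cv.get_n_splits(X, y))  # prints and returns 2
for train, test in cv.split(X, y):
    print(train, test)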
Example #11
File: model.py Project: g-chi/lmic-poverty
def train_final_model(training_data):
    """ calculate the accuracy of the model by leave one country out
    @param training_data_dict dict: training data
    @return regressor list: r2 of all the 50 countries
    """
    xgb_model = xgb.XGBRegressor(n_jobs=-1, random_state=123)
    xgb_params = {
        'max_depth': [1, 3, 5, 10, 15, 20, 30],
        'min_child_weight': [1, 3, 5, 7, 10],
    }
    training_data_final = shuffle(training_data)
    logo = LeaveOneGroupOut()
    cv = logo.split(np.array(training_data_final[all_features]),
                    groups=training_data_final.group)
    regressor = GridSearchCV(xgb_model,
                             xgb_params,
                             scoring={
                                 'r2': pearson_r2_score,
                                 'mse': mean_squared_error_score
                             },
                             cv=cv,
                             refit='mse')
    regressor.fit(np.array(training_data_final[feature_names]),
                  np.array(training_data_final['rwi']))
    return regressor
Example #12
def logo_cv(clf_type,
            data_sets: 'List[GroupedDataSet]',
            n_jobs=-1,
            parallel_verbose=1,
            persist=True):
    """
    Parallel leave-one-group-out cross validation.
    :param clf_type:
    :param data_sets:
    :param n_jobs:
    :param parallel_verbose:
    :param persist:
    :return:
    """
    log.info('Starting leave-one-group-out cv for {!s} sets'.format(
        len(data_sets)))

    parallel = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)

    logo = LeaveOneGroupOut()
    stats_list = parallel(
        delayed(_fit_and_score)(clf, domains, labels, train_index, test_index,
                                -1, data_set_id, -1)
        for domains, labels, groups, data_set_id, clf in
        _grouped_data_sets_generator(data_sets, clf_type)
        for train_index, test_index in logo.split(
            domains, labels, groups=groups))
    where = settings.EVAL_FOLDER + '/' + 'logo_cv_{!s}_{!s}sets_{!s}.pkl'.format(
        clf_type, len(data_sets), settings.NOW_STR)
    return _serialize_cv_results(stats_list, persist, where)
Example #13
class EnhancedLeaveOneGroupOut(LeaveOneGroupOut):
    def __init__(self, return_validate: bool = True):
        super().__init__()
        self.return_validate = return_validate
        if self.return_validate:
            self.validate_splitter = LeaveOneGroupOut()

    def split(self, X, y=None, groups=None):
        if groups is None and y is not None:
            groups = self._generate_sequential_groups(y)
        n_splits = super().get_n_splits(groups=groups)
        for train, test in super().split(X, y, groups):
            if self.return_validate:
                n_repeat = np.random.randint(1, n_splits)
                validate_iter = self.validate_splitter.split(
                    X[train], y[train], groups[train])
                for i in range(n_repeat):
                    train_ind, validate_ind = next(validate_iter)
                yield train[train_ind], train[validate_ind], test
            else:
                yield train, test

    def _generate_sequential_groups(self, y):
        labels = np.unique(y)
        groups = np.zeros((len(y)))
        inds = [y == label for label in labels]
        n_labels = [np.sum(ind) for ind in inds]
        if len(np.unique(n_labels)) > 1:
            warnings.warn(
                "y is not balanced; the generated groups are not balanced either.",
                RuntimeWarning)
        for ind, n_label in zip(inds, n_labels):
            groups[ind] = np.arange(n_label)
        return groups
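A hedged usage sketch: with return_validate=True the splitter yields (train, validate, test) index triples, with the validation block drawn by the nested LeaveOneGroupOut; balanced y lets _generate_sequential_groups build the groups automatically:

import numpy as np

X = np.random.randn(12, 3)
y = np.repeat([0, 1, 2], 4)  # balanced: 4 samples per class
cv = EnhancedLeaveOneGroupOut(return_validate=True)
for train, validate, test in cv.split(X, y):
    print(len(train), len(validate), len(test))  # 6 3 3 on every fold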
Example #14
def cal_merit_lda(spectra: np.ndarray, wavenumbers: np.ndarray,
                  replicate: np.ndarray, label: np.ndarray):
    """
    Benchmark of replicate EMSC correction based on LDA classification
    :param spectra: ndarray of shape [n_samples, n_channels]
    :param wavenumbers: ndarray of shape [n_channels]
    :param replicate: ndarray of shape [n_samples] 
    :param label: ndarray of shape [n_samples] 
    :return: mean sensitivity of leave-one-replicate-out cross-validation
    """

    logo = LeaveOneGroupOut()

    res_true = []
    res_pred = []
    for train, test in logo.split(spectra, label, groups=replicate):
        tmp_model = LinearDiscriminantAnalysis()
        tmp_model.fit(spectra[train], label[train])
        res_pred = np.append(res_pred, tmp_model.predict(spectra[test]))
        res_true = np.append(res_true, label[test])

    c_m = confusion_matrix(res_true, res_pred, labels=np.unique(label))

    res = np.mean(np.diag(c_m) / np.sum(c_m, axis=1))

    return res
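The merit returned above is the macro-averaged recall ("mean sensitivity"): the diagonal of the confusion matrix divided by the row sums, then averaged across classes. A two-class check of the formula:

import numpy as np

c_m = np.array([[8, 2],
                [1, 9]])
print(np.mean(np.diag(c_m) / np.sum(c_m, axis=1)))  # (8/10 + 9/10) / 2 = 0.85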
Example #15
def cross_val(data_dir, model_dir, save_model):
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # model_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/learningtorank/models/unfiltered/'

    x, y, words, qids, rake, groups = (np.array(l) for l in load_data(data_dir))
    logo = LeaveOneGroupOut()

    for train_index, test_index in logo.split(x, y, groups):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        q_train, q_test = qids[train_index], qids[test_index]
        metric = pyltr.metrics.NDCG(len(x_test))
        if save_model:
            model = pyltr.models.LambdaMART(
                metric=metric,
                n_estimators=2000,
                learning_rate=0.03,
                query_subsample=0.5
            )
            model.fit(x_train, y_train, q_train)
            pickle.dump(model,
                        open('%sLambdaMART2/lambdaMART_model_%s.sav' % (model_dir, q_test[0].replace('.txt', '')),
                             'wb'))
        else:
            model = pickle.load(
                open('%sLambdaMART/lambdaMART_model_%s.sav' % (model_dir, q_test[0].replace('.txt', '')), 'rb'))
        pred_test = model.predict(x_test)
        print('%s' % metric.calc_mean(q_test, y_test, pred_test))
Example #16
def LOSOCVPerformance(features, labels, subjects, clf):
    """Return the confusion matrix for the classifier.

  Using leave-one-subject-out cross validation.

  Args:
    features: (np.array) 2D Array, n_samples X n_features. The feature matrix.
    labels: (np.array) Class labels
    subjects: (np.array) The subject id that the datapoint came from
    clf: (sklearn classifier) The model to evaluate.

  Returns:
    A 3x3 confusion matrix.
  """
    class_names = ['bike', 'walk', 'run']
    logo = LeaveOneGroupOut()
    cm = np.zeros((3, 3), dtype='int')
    for train_ind, test_ind in logo.split(features, labels, subjects):
        X_train, y_train = features[train_ind], labels[train_ind]
        X_test, y_test = features[test_ind], labels[test_ind]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        c = confusion_matrix(y_test, y_pred, labels=class_names)
        cm += c
    return cm
Example #17
def leave_one_patient_out_cross_validation(data_idx, patient_deid):
    print(data_idx.shape, patient_deid.shape)
    logo = LeaveOneGroupOut()
    # cv_rand_idx = np.random.permutation(len_data)
    cv_split_list = list(logo.split(X=data_idx, groups=patient_deid.squeeze()))

    return cv_split_list
Example #18
def compute_crossvalidated_r2(fmri_runs, design_matrices, alpha, loglabel, logcsvwriter):
    
    def log(r2_train, r2_test):
        """ just logging stats per fold to a csv file """
        logcsvwriter.writerow([loglabel, alpha, 'training', np.mean(r2_train), np.std(r2_train), np.min(r2_train), np.max(r2_train)])
        logcsvwriter.writerow([loglabel, alpha, 'test', np.mean(r2_test), np.std(r2_test), np.min(r2_test), np.max(r2_test)])
    
    r2_train = None  # array to contain the r2 values (1 row per fold, 1 column per voxel)
    r2_test = None
    
    logo = LeaveOneGroupOut()
    for train, test in logo.split(fmri_runs, groups=range(1, 10)):
        fmri_data = np.vstack([fmri_runs[i] for i in train])
        predictors = np.vstack([design_matrices[i] for i in train])
        model = Ridge(alpha=alpha).fit(predictors, fmri_data)
            
        rsquares_training = clean_rscores(r2_score(fmri_data, 
                                                   model.predict(predictors), multioutput='raw_values'), 
                                          .0, .99)
        test_run = test[0]
        rsquares_test = clean_rscores(r2_score(fmri_runs[test_run], 
                                               model.predict(design_matrices[test_run]), multioutput='raw_values'),
                                      .0, .99)
        
        log(rsquares_training, rsquares_test)

        r2_train = rsquares_training if r2_train is None else np.vstack([r2_train, rsquares_training])    
        r2_test = rsquares_test if r2_test is None else np.vstack([r2_test, rsquares_test])
        
    return (np.mean(r2_train, axis=0), np.mean(r2_test, axis=0))
Example #19
    def objfunction(self, x):
        criteria = {0: 'gini', 1: 'entropy'}
        criterion = criteria[x[-1]]
        X, Y = df[xfeatures].values, df['sleep'].values

        logo = LeaveOneGroupOut()
        grp = df['id'].values
        scores = []
        for train, test in logo.split(X, Y, grp):
            model = DecisionTreeClassifier(max_depth=int(x[0]),
                                           criterion=criterion,
                                           min_samples_split=int(x[1]),
                                           max_leaf_nodes=int(x[2]),
                                           min_impurity_decrease=int(x[3]),
                                           min_samples_leaf=int(x[4]))
            x_train, x_test = X[train], X[test]
            y_train, y_test = Y[train], Y[test]
            model.fit(x_train, y_train)
            scores.append(metrics.accuracy_score(y_test,
                                                 model.predict(x_test)))

        print(colored('Features:', 'blue'), colored(x, 'green'))
        print(colored('Accuracy:', 'green'),
              colored(np.mean(scores) * 100, 'blue'))
        return 1 - np.mean(scores)
Example #20
    def train(self, train, validate=None, test=None):
        assert validate is None
        logging.info("KFoldLayer::train hash={}".format(
            hash_of_pandas_df(train)))
        folds = LeaveOneGroupOut()
        groups = np.floor(
            np.linspace(0, 1, len(train), False) * self.n_splits).astype(int)
        y_pred = pd.Series(index=train.index)
        self.models = []
        local_cvs = []
        for fold_, (trn_idx_, val_idx_) in enumerate(
                folds.split(train.values, groups, groups)):
            trn_idx = train.iloc[trn_idx_].index
            val_idx = train.iloc[val_idx_].index

            logging.info(f"fold {fold_}")
            train_part, validate_part = self.split_train(
                train, trn_idx, val_idx)
            model = self.model_factory(**self.model_kwargs)
            cv = model.train(train_part, validate_part, test)
            local_cvs.append(cv)
            self.models.append(model)

            logging.info(f"KFoldLayer::train fold {fold_} local cv: {cv:.4f}")
            y_pred.loc[val_idx] = model.predict(validate_part).values

        total_cv = self.loss(y_pred, train['target'], train['weight'])
        logging.info(
            f"KFoldLayer::train total cv: {total_cv:.4f} local_cvs: {local_cvs}"
        )
        return total_cv
Example #21
def gridsearch(x,
               y_target,
               subjects,
               cross_v,
               experiment,
               clf_method,
               nor_method,
               cv_start,
               njobs=1):

    note = '{}_{}_{}_{}cvstart'.format(experiment, nor_method, clf_method,
                                       cv_start)
    svm_parameters = [{
        'kernel': ['linear'],
        'C': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    }]

    logis_parameters = [{
        'penalty': ['l1', 'l2'],
        'C': [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    }]
    clf = LogisticRegression if clf_method == 'logis' else SVC
    params = logis_parameters if clf_method == 'logis' else svm_parameters

    train, test = cross_v
    x_train = x[train]
    y_train = y_target[train]
    x_test = x[test]
    y_test = y_target[test]
    subjects_train = subjects[train]
    if experiment == 'meg':
        # gkf = GroupKFold(n_splits=7)
        # grid_cv = list(gkf.split(x_train, y_train, subjects_train))
        logo = LeaveOneGroupOut()
        grid_cv = list(logo.split(x_train, y_train, subjects_train))

    else:
        # leave 2 subjects out for fmri in gridsearch
        gkf = GroupKFold(n_splits=45)
        grid_cv = list(gkf.split(x_train, y_train, subjects_train))

        # logo = LeaveOneGroupOut()
        # grid_cv = list(logo.split(x_train, y_train, subjects_train))
        # print('logo')
    grid_clf = GridSearchCV(clf(), params, cv=grid_cv, n_jobs=njobs)
    grid_clf.fit(x_train, y_train)
    print('best params', grid_clf.best_params_)
    joblib.dump(grid_clf.best_params_,
                result_dir + '/gridtables/{}cv_gridbest.pkl'.format(note))
    # grid_bestpara = joblib.load(result_dir + 'joblib.pkl')
    grid_csv = pd.DataFrame.from_dict(grid_clf.cv_results_)
    with open(result_dir + '/gridtables/{}cv_gridtable.csv'.format(note),
              'w') as f:
        grid_csv.to_csv(f)

    pre = grid_clf.predict(x_test)
    score_op = accuracy_score(y_test, pre)
    print('optimal transport accuracy of {}th split:'.format(cv_start),
          score_op)
    return score_op
Example #22
def qsar_classification(emb, groups, labels):
    """Helper function that fits and scores a SVM classifier on the extracted molecular
    descriptor in a leave-one-group-out cross-validation manner.

    Args:
        emb: Embedding (molecular descriptor) that is used as input for the SVM
        groups: Array or list with n_samples entries defining the fold membership for the
        cross-validation.
        labels: Target values of the QSAR task.
    Returns:
        The mean accuracy, F1-score, ROC-AUC and precision-recall-AUC of the cross-validation.
    """
    acc = []
    f1 = []
    roc_auc = []
    pr_auc = []
    logo = LeaveOneGroupOut()
    clf = SVC(kernel='rbf', C=5.0, probability=True)
    for train_index, test_index in logo.split(emb, groups=groups):
        clf.fit(emb[train_index], labels[train_index])
        y_pred = clf.predict(emb[test_index])
        y_pred_prob = clf.predict_proba(emb[test_index])[:, 1]
        y_true = labels[test_index]
        precision, recall, t = precision_recall_curve(y_true, y_pred_prob)
        acc.append(accuracy_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        roc_auc.append(roc_auc_score(y_true, y_pred_prob))
        pr_auc.append(auc(recall, precision))
    return np.mean(acc), np.mean(f1), np.mean(roc_auc), np.mean(pr_auc)
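A hedged smoke test for the helper with synthetic data; the real emb, groups and labels come from the surrounding project. Each group here contains both classes, so all four metrics are well defined in every fold:

import numpy as np

rng = np.random.RandomState(0)
emb = rng.randn(30, 8)
groups = np.repeat([0, 1, 2], 10)  # three folds of ten molecules
labels = np.tile([0, 1], 15)       # both classes in every fold
print(qsar_classification(emb, groups, labels))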
Example #23
def qsar_regression(emb, groups, labels):
    """Helper function that fits and scores a SVM regressor on the extracted molecular
    descriptor in a leave-one-group-out cross-validation manner.

    Args:
        emb: Embedding (molecular descriptor) that is used as input for the SVM
        groups: Array or list with n_samples entries defining the fold membership for the
        cross-validation.
        labels: Target values of the QSAR task.
    Returns:
        The mean R2, Spearman correlation, MSE and MAE of the cross-validation.
    """
    r2 = []
    r = []
    mse = []
    mae = []
    logo = LeaveOneGroupOut()
    clf = SVR(kernel='rbf', C=5.0)
    for train_index, test_index in logo.split(emb, groups=groups):
        clf.fit(emb[train_index], labels[train_index])
        y_pred = clf.predict(emb[test_index])
        y_true = labels[test_index]
        r2.append(r2_score(y_true, y_pred))
        r.append(spearmanr(y_true, y_pred)[0])
        mse.append(mean_squared_error(y_true, y_pred))
        mae.append(mean_absolute_error(y_true, y_pred))
    return np.mean(r2), np.mean(r), np.mean(mse), np.mean(mae)
Example #24
def classify_loso(X, y, group, clf):
    """ Main classification function to train and test a ml model with Leave one subject out

        Args:
            X (numpy matrix): this is the feature matrix with row being a data point
            y (numpy vector): this is the label vector with row belonging to a data point
            group (numpy vector): this is the group vector (which is a the participant id)
            clf (sklearn classifier): this is a classifier made in sklearn with fit, transform and predict functionality

        Returns:
            accuracies (list): the accuracy for each left-out participant
            cms (numpy matrix): the 2x2 confusion matrices summed across folds
    """
    logo = LeaveOneGroupOut()

    accuracies = []
    cms = np.zeros((2, 2))
    for train_index, test_index in logo.split(X, y, group):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        with joblib.parallel_backend('loky'):
            clf.fit(X_train, y_train)
        y_hat = clf.predict(X_test)

        acc = accuracy_score(y_test, y_hat)
        cm = confusion_matrix(y_test, y_hat)

        accuracies.append(acc)
        cms = np.add(cms, cm)
    return accuracies, cms
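A hedged smoke test for classify_loso with synthetic data; any sklearn estimator exposing fit/predict should work in place of the illustrative LogisticRegression:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.randn(40, 5)
y = np.tile([0, 1], 20)              # both classes in every fold
group = np.repeat(np.arange(4), 10)  # 4 participants, 10 samples each
accuracies, cms = classify_loso(X, y, group, LogisticRegression())
print(np.mean(accuracies))
print(cms)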
Example #25
def permutation_test(X, y, group, clf, num_permutation=1000):
    """ Helper function to validate that a classifier is performing higher than chance

        Args:
            X (numpy matrix): this is the feature matrix with row being a data point
            y (numpy vector): this is the label vector with row belonging to a data point
            group (numpy vector): this is the group vector (which is a the participant id)
            clf (sklearn classifier): this is a classifier made in sklearn with fit, transform and predict functionality
            num_permutation (int): the number of times to permute y
        Returns:
            accuracies (float): the unpermuted cross-validation score
            permutation_scores (numpy array): the score for each permutation of y
            p_value (float): the empirical p-value of the unpermuted score

    """

    logo = LeaveOneGroupOut()
    train_test_splits = logo.split(X, y, group)

    with joblib.parallel_backend('loky'):
        (accuracies, permutation_scores,
         p_value) = permutation_test_score(clf,
                                           X,
                                           y,
                                           groups=group,
                                           cv=train_test_splits,
                                           n_permutations=num_permutation,
                                           verbose=num_permutation,
                                           n_jobs=-1)

    return accuracies, permutation_scores, p_value
Example #26
def basari_hesapla(giris, cikis, CustomerID):
    # Leave-one-person-out cross-validation
    logo = LeaveOneGroupOut()
    # Support vector classifier
    clf = SVC(C=1, gamma=0.2, kernel='rbf')
    #clf = RandomForestClassifier(criterion='entropy',n_estimators=60)
    toplamBasari = 0
    toplamFSkor = 0

    for train_index, test_index in logo.split(giris, cikis, CustomerID):
        # Separate the training and test data
        X_train, X_test = giris[train_index, :], giris[test_index, :]
        y_train, y_test = cikis.iloc[train_index], cikis.iloc[test_index]

        # Train the model.
        clf.fit(X_train, y_train)
        # Ask the model for predictions.
        pred_y = clf.predict(X_test)

        # Score the predictions.
        toplamBasari += accuracy_score(y_test, pred_y)
        toplamFSkor += f1_score(y_test, pred_y)
    # Mean accuracy = total accuracy / number of folds
    return toplamBasari / logo.get_n_splits(
        giris, cikis, CustomerID), toplamFSkor / logo.get_n_splits(
            giris, cikis, CustomerID)
Example #27
def cross_val(data_dir, model_dir, save_model):
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # model_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/learningtorank/models/unfiltered/'

    x, y, words, qids, rake, groups = (np.array(l)
                                       for l in load_data(data_dir))
    scorer = NDCGScorer(k=90)
    logo = LeaveOneGroupOut()

    for train_index, test_index in logo.split(x, y, groups):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        q_train, q_test = qids[train_index], qids[test_index]
        if save_model:
            model = AdaRank(max_iter=100, estop=10, scorer=scorer)
            model.fit(x_train, y_train, q_train)
            pickle.dump(
                model,
                open(
                    '%sAdaRank/adaRank_model_%s.sav' %
                    (model_dir, q_test[0].replace('.txt', '')), 'wb'))
        else:
            model = pickle.load(
                open(
                    '%sAdaRank/adaRank_model_%s.sav' %
                    (model_dir, q_test[0].replace('.txt', '')), 'rb'))
        pred_test = model.predict(x_test, q_test)
        metric = pyltr.metrics.NDCG(len(x_test))
        print('%s' % metric.calc_mean(q_test, y_test, pred_test))
Example #28
def leave_one_group_out(data_tbl,
                        features,
                        group_name,
                        method='random_forest',
                        n_estimators=100,
                        n_neighbours=21,
                        n_components=10,
                        seed=None):
    scores = []
    classifier_score = 0
    classifier = get_classifier(data_tbl,
                                features,
                                method=method,
                                n_estimators=n_estimators,
                                n_neighbours=n_neighbours,
                                n_components=n_components,
                                seed=seed)
    leave_one_group = LeaveOneGroupOut()
    for train_index, test_index in leave_one_group.split(data_tbl,
                                                         y=features,
                                                         groups=group_name):
        X_train, X_test = data_tbl[train_index], data_tbl[test_index]
        y_train, y_test = features[train_index], features[test_index]
        classifier.fit(X_train, y_train)
        score = classifier.score(X_test, y_test)  # score once, reuse below
        scores.append(score)
        # track the best-scoring fold and hold on to its split
        if classifier_score < score:
            classifier_score = score
            X_train_best, X_test_best, y_train_best, y_test_best = X_train, X_test, y_train, y_test
    return classifier.fit(
        X_train_best,
        y_train_best), np.mean(scores), np.std(scores), X_test_best, y_test_best
Example #29
def train_test(model, X, y, groups):
    """
    do cross validation and get the cv_results
    :param model: model after hyperparameter tuning
    :param X: array like,training x
    :param y: array like, training y
    :param score: estimated score, could be "r2",'neg_mean_absolute_error','neg_mean_squared_error'
    :return:
    """
    ytest = np.array([])
    ypred = np.array([])
    logo = LeaveOneGroupOut()
    for train_index, test_index in logo.split(X, y, groups=groups):
        x_train = X[train_index]
        y_train = y[train_index]
        reg = model.estimator.fit(x_train, y_train)
        y_test = y[test_index]
        x_test = X[test_index]
        y_pred = reg.predict(x_test)
        ytest = np.append(ytest, y_test)
        ypred = np.append(ypred, y_pred)
    estimator_name = model.model_name
    test_score = r2_score(ytest, ypred)
    r = stats.pearsonr(ytest, ypred)
    fitted_estimator = model.estimator.fit(X, y)

    return estimator_name, test_score, r, ytest, ypred, fitted_estimator
Example #30
    def get_sets(self, test_size=0.2):

        if not self.time_series:
            # Use train_test_split if the data is not time series data
            rand_state = np.random.randint(0, 100)
            img_train, img_test, label_train, label_test = train_test_split(
                self.images,
                self.image_labels,
                test_size=test_size,
                random_state=rand_state)
        else:
            # Divide the data into 1/test_size groups to minimize the probability
            # that a time series will span the train set-test set split
            n_groups = math.ceil(1. / test_size)
            group_size = math.ceil(len(self.images) / n_groups)
            groups = np.repeat(np.arange(n_groups),
                               group_size)[:len(self.images)]
            logo = LeaveOneGroupOut()
            for train_idx, test_idx in logo.split(self.images,
                                                  y=self.image_labels,
                                                  groups=groups):
                img_train, img_test, label_train, label_test = self.images[
                    train_idx], self.images[test_idx], self.image_labels[
                        train_idx], self.image_labels[test_idx]

        return img_train, img_test, label_train, label_test
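Note that the loop above overwrites the four variables on every fold, so only the final LeaveOneGroupOut fold (the last contiguous block as test set) is actually returned. A hedged sketch of just the grouping arithmetic:

import math
import numpy as np

test_size = 0.2
n_samples = 10
n_groups = math.ceil(1. / test_size)
group_size = math.ceil(n_samples / n_groups)
groups = np.repeat(np.arange(n_groups), group_size)[:n_samples]
print(groups)  # [0 0 1 1 2 2 3 3 4 4]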
Example #31
File: lgo.py Project: aatapa/RLScore
def lgo_core(X, y, groups, regparam):
    logo = LeaveOneGroupOut()
    rls = RLS(X,y, regparam=regparam, kernel="GaussianKernel", gamma=0.01)
    errors = []
    for train, test in logo.split(X, y, groups=groups):
        p = rls.holdout(test)
        e = sqerror(y[test], p)
        errors.append(e)
    return np.mean(errors)
Example #32
File: lgo.py Project: aatapa/RLScore
def lgo_sklearn(X, y, groups, regparam):
    logo = LeaveOneGroupOut()
    errors = []
    for train, test in logo.split(X, y, groups=groups):
        rls = KernelRidge(kernel="rbf", gamma=0.01)
        rls.fit(X[train], y[train])
        p = rls.predict(X[test])
        e = sqerror(y[test], p)       
        errors.append(e)
    return np.mean(errors)
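Examples #31 and #32 compute the same leave-one-group-out error: RLScore's holdout reuses the single fitted model, while the sklearn version refits KernelRidge for every fold. sqerror is RLScore's squared-error measure; a hedged stand-in for running the snippets outside RLScore:

import numpy as np

def sqerror(y_true, y_pred):
    # assumed equivalent of rlscore.measure.sqerror (mean squared error)
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean((y_true - y_pred) ** 2)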
Example #33
def get_pred_cv(lm, x, y_true, groups, use_logs):
    cv = LeaveOneGroupOut()
    preds = []
    for train_ix, test_ix in cv.split(x, y_true, groups):
        x_train = x.iloc[train_ix]
        y_train = y_true.iloc[train_ix]
        x_test = x.iloc[test_ix]
        if use_logs:
            lm.fit(np.log(x_train), np.log(y_train))
            arr = np.exp(lm.predict(np.log(x_test)))
        else:
            lm.fit(x_train, y_train)
            arr = lm.predict(x_test)
        preds.append(pd.Series(arr, index=x_test.index))
    # concatenate the per-fold predictions (pd.Series.append was removed
    # in pandas 2.0)
    y_pred = pd.concat(preds, verify_integrity=True)
    return y_pred.sort_index()
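A hedged usage sketch for get_pred_cv; the frame, targets and group ids are invented, and any sklearn regressor with fit/predict fits the lm slot:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

x = pd.DataFrame({'area': [30., 45., 60., 75., 90., 105.]})
y_true = pd.Series([100., 150., 190., 240., 290., 330.])
groups = pd.Series([0, 0, 1, 1, 2, 2])
print(get_pred_cv(LinearRegression(), x, y_true, groups, use_logs=False))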
Example #34
for i, n in enumerate(sorted(names)):
    roi_name=fold+'mni4060/asymroi_'+smt+'_'+n+'.npz'            
    roi=np.load(roi_name)['roi']
    roi=roi[:,motor_label-1]
    roi_imp=roi[mask_imp]
    roi_imag=roi[mask_imag]
    roi_imp_all=np.vstack((roi_imp_all,roi_imp))
    roi_imag_all=np.vstack((roi_imag_all,roi_imag))
    y_imp_all=np.append(y_imp_all,y_imp)
    y_imag_all=np.append(y_imag_all,y_imag)
    groups=np.append(groups,np.ones(len(y_imp))*i)
result_cv_tr_imp=[] 
result_cv_tr_imag=[]   
pipeline = Pipeline([('scale', scaler),('svm', svm)])    
from sklearn.model_selection import LeaveOneGroupOut
logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(roi_imp_all, y_imp_all, groups):
    X_train, X_test = roi_imp_all[train_index], roi_imag_all[test_index]
    y_train, y_test = y_imp_all[train_index], y_imp_all[test_index]
    pipeline.fit(X_train,y_train)
    prediction = pipeline.predict(X_test)  
    result_cv_tr_imp.append(accuracy_score(prediction,y_test))
    
    X_train, X_test = roi_imag_all[train_index], roi_imp_all[test_index]
    y_train, y_test = y_imp_all[train_index], y_imp_all[test_index]
    pipeline.fit(X_train,y_train)
    prediction = pipeline.predict(X_test)  
    result_cv_tr_imag.append(accuracy_score(prediction,y_test))

from scipy.stats import ttest_1samp
tt,p=ttest_1samp(np.array(result_cv_tr_imag),0.5)
Example #35
def test_generalization_across_time():
    """Test time generalization decoding."""
    from sklearn.svm import SVC
    # KernelRidge is used for testing 1) regression analyses 2) n-dimensional
    # predictions.
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import roc_auc_score, mean_squared_error

    epochs = make_epochs()
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    if check_version('sklearn', '0.18'):
        from sklearn.model_selection import (KFold, StratifiedKFold,
                                             ShuffleSplit, LeaveOneGroupOut)
        cv = LeaveOneGroupOut()
        cv_shuffle = ShuffleSplit()
        # XXX we cannot pass any other parameters than X and y to cv.split
        # so we have to build it before hand
        cv_lolo = [(train, test) for train, test in cv.split(
                   y_4classes, y_4classes, y_4classes)]

        # With sklearn >= 0.17, `clf` can be identified as a regressor, and
        # the scoring metrics can therefore be automatically assigned.
        scorer_regress = None
    else:
        from sklearn.cross_validation import (KFold, StratifiedKFold,
                                              ShuffleSplit, LeaveOneLabelOut)
        cv_shuffle = ShuffleSplit(len(epochs))
        cv_lolo = LeaveOneLabelOut(y_4classes)

        # With sklearn < 0.17, `clf` cannot be identified as a regressor, and
        # therefore the scoring metrics cannot be automatically assigned.
        scorer_regress = mean_squared_error
    # Test default running
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
                 "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)
    # test different predict function:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_method='decision_function')
    gat.fit(epochs)
    # With classifier, the default cv is StratifiedKFold
    assert_true(gat.cv_.__class__ == StratifiedKFold)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    gat.predict_method = 'predict_proba'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 2))
    gat.predict_method = 'foo'
    assert_raises(NotImplementedError, gat.predict, epochs)
    gat.predict_method = 'predict'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs, no score>",
                 "%s" % gat)
    gat.score(epochs)
    assert_true(gat.scorer_.__name__ == 'accuracy_score')
    # check clf / predict_method combinations for which the scoring metrics
    # cannot be inferred.
    gat.scorer = None
    gat.predict_method = 'decision_function'
    assert_raises(ValueError, gat.score, epochs)
    # Check specifying y manually
    gat.predict_method = 'predict'
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs,\n scored "
                 "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15>",
                 "%s" % gat.train_times_)
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15 x 15>",
                 "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10
    gat_ = copy.deepcopy(gat)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] ==
                gat.y_true_.shape[0] ==
                len(gat.y_pred_[0][0]) == 14)
    # ---  number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # ---  length training size
    assert_true(len(gat.train_times_['slices']) == 15 ==
                np.shape(gat.estimators_)[0])
    # ---  length testing sizes
    assert_true(len(gat.test_times_['slices']) == 15 ==
                np.shape(gat.scores_)[0])
    assert_true(len(gat.test_times_['slices'][0]) == 15 ==
                np.shape(gat.scores_)[1])

    # Test score_mode
    gat.score_mode = 'foo'
    assert_raises(ValueError, gat.score, epochs)
    gat.score_mode = 'fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15, 5])
    gat.score_mode = 'mean-sample-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.score_mode = 'mean-fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.predict_mode = 'mean-prediction'
    with warnings.catch_warnings(record=True) as w:
        gat.score(epochs)
        assert_true(any("score_mode changed from " in str(ww.message)
                        for ww in w))

    # Test longer time window
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    with warnings.catch_warnings(record=True):  # not vectorizing
        scores = gat.score(epochs)
    assert_true(isinstance(scores, np.ndarray))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)
    # Decim training steps
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # training time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    train_times = dict(start=0.090, stop=0.250)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_lolo, train_times=train_times)
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal([tim for ttime in gat.test_times_['times']
                        for tim in ttime], gat.train_times_['times'])
    # Test generalization across conditions
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_mode='mean-prediction', cv=2)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    with warnings.catch_warnings(record=True):
        # There are some empty test folds because of n_trials
        gat.predict(epochs[7:])
        gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(start=999.)
    assert_raises(ValueError, gat_.fit, epochs)
    # --- impossible slices
    gat_.train_times = dict(step=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=999.)
    assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    with warnings.catch_warnings(record=True):  # no epochs in fold
        assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)
    # --- irregular length training and testing times
    # 2 estimators, the first one is trained on two successive time samples
    # whereas the second one is trained on a single time sample.
    train_times = dict(slices=[[0, 1], [1]])
    # The first estimator is tested once, the second estimator is tested on
    # two successive time samples.
    test_times = dict(slices=[[[0, 1]], [[0], [1]]])
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times=train_times,
                                       test_times=test_times)
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.score(epochs)
    assert_array_equal(np.shape(gat.y_pred_[0]), [1, len(epochs), 1])
    assert_array_equal(np.shape(gat.y_pred_[1]), [2, len(epochs), 1])
    # check cannot Automatically infer testing times for adhoc training times
    gat.test_times = None
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(clf=svc, predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    with use_log_level('error'):
        assert_raises(ValueError, gat.score, epochs2)
        gat.score(epochs)
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that error if cv is not partition
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='cross-validation')
    gat.fit(epochs)
    assert_raises(ValueError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='mean-prediction')
    gat.fit(epochs)
    gat.predict(epochs)

    # Test that gets error if train on one dataset, test on another, and don't
    # specify appropriate cv:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime()
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Make CV with some empty train and test folds:
    # --- empty test fold(s) should warn when gat.predict()
    gat._cv_splits[0] = [gat._cv_splits[0][0], np.empty(0)]
    with warnings.catch_warnings(record=True) as w:
        gat.predict(epochs)
        assert_true(len(w) > 0)
        assert_true(any('do not have any test epochs' in str(ww.message)
                        for ww in w))
    # --- empty train fold(s) should raise when gat.fit()
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=[([0], [1]), ([], [0])])
    assert_raises(ValueError, gat.fit, epochs[:2])

    # Check that still works with classifier that output y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    if check_version('sklearn', '0.17'):  # no is_regressor before v0.17
        with warnings.catch_warnings(record=True):  # dep
            gat = GeneralizationAcrossTime(clf=KernelRidge(), cv=2)
        epochs.crop(None, epochs.times[2])
        gat.fit(epochs)
        # With regression the default cv is KFold and not StratifiedKFold
        assert_true(gat.cv_.__class__ == KFold)
        gat.score(epochs)
        # with regression the default scoring metrics is mean squared error
        assert_true(gat.scorer_.__name__ == 'mean_squared_error')

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)
    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear', probability=True)
    reg = KernelRidge()

    def scorer_proba(y_true, y_pred):
        return roc_auc_score(y_true, y_pred[:, 0])

    # We re testing 3 scenario: default, classifier + predict_proba, regressor
    scorers = [None, scorer_proba, scorer_regress]
    predict_methods = [None, 'predict_proba', None]
    clfs = [svc, svc, reg]
    # Test all combinations
    for clf, predict_method, scorer in zip(clfs, predict_methods, scorers):
        for y in ys:
            for n_class in n_classes:
                for predict_mode in ['cross-validation', 'mean-prediction']:
                    # Cannot use AUC for n_class > 2
                    if (predict_method == 'predict_proba' and n_class != 2):
                        continue

                    y_ = y % n_class

                    with warnings.catch_warnings(record=True):
                        gat = GeneralizationAcrossTime(
                            cv=2, clf=clf, scorer=scorer,
                            predict_mode=predict_mode)
                        gat.fit(epochs, y=y_)
                        gat.score(epochs, y=y_)

                    # Check that scorer is correctly defined manually and
                    # automatically.
                    scorer_name = gat.scorer_.__name__
                    if scorer is None:
                        if is_classifier(clf):
                            assert_equal(scorer_name, 'accuracy_score')
                        else:
                            assert_equal(scorer_name, 'mean_squared_error')
                    else:
                        assert_equal(scorer_name, scorer.__name__)