Example #1
def getProbsThread(nthread, clf, data, label, allAuthors, modeldir, saveModel):
    crossval = LeaveOneGroupOut()

    crossval.get_n_splits(groups=label)

    prob_per_author = [[0] * (len(allAuthors)) for i in range(len(allAuthors))]

    scores = Parallel(n_jobs=nthread)(
        delayed(getProbsTrainTest)(clf, data, label, train, test, modeldir,
                                   saveModel)
        for train, test in crossval.split(data, label, groups=label))

    for train, test in crossval.split(data, label, groups=label):
        anAuthor = int(label[test[0]])
        train_data_label = label[train]
        trainAuthors = list(set(train_data_label))
        # test_data_label = label[test]
        nTestDoc = len(scores)  # len(test_data_label)
        for j in range(nTestDoc):
            for i in range(len(trainAuthors)):
                try:
                    prob_per_author[anAuthor][int(
                        trainAuthors[i])] += scores[anAuthor - 1][j][i]
                except IndexError:
                    continue

        for i in range(len(trainAuthors)):
            prob_per_author[anAuthor][int(trainAuthors[i])] /= nTestDoc
    return prob_per_author
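# Usage sketch (not from the original project): the function above farms
# LeaveOneGroupOut folds out to joblib workers. The same pattern is shown below
# in isolation with a toy scorer; `fit_and_score`, `data`, `label` and `groups`
# are hypothetical stand-ins, not part of the original code.
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.svm import LinearSVC

def fit_and_score(clf, data, label, train, test):
    model = clone(clf).fit(data[train], label[train])
    return model.score(data[test], label[test])

data = np.random.rand(40, 5)
label = np.tile([0, 1], 20)                  # binary labels
groups = np.repeat(np.arange(4), 10)         # 4 document groups of 10 samples
crossval = LeaveOneGroupOut()
scores = Parallel(n_jobs=2)(
    delayed(fit_and_score)(LinearSVC(), data, label, train, test)
    for train, test in crossval.split(data, label, groups=groups))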
Example #2
def basari_hesapla(giris, cikis, CustomerID):
    # Per-person cross-validation
    logo = LeaveOneGroupOut()
    # Support vector classifier
    clf = SVC(C=1, gamma=0.2, kernel='rbf')
    #clf = RandomForestClassifier(criterion='entropy',n_estimators=60)
    toplamBasari = 0
    toplamFSkor = 0

    for train_index, test_index in logo.split(giris, cikis, CustomerID):
        # Split the training and test data
        X_train, X_test = giris[train_index, :], giris[test_index, :]
        y_train, y_test = cikis.iloc[train_index], cikis.iloc[test_index]

        # Train the model.
        clf.fit(X_train, y_train)
        # Get predictions from the model.
        pred_y = clf.predict(X_test)

        # Compute the accuracy of the predictions.
        toplamBasari += accuracy_score(y_test, pred_y)
        toplamFSkor += f1_score(y_test, pred_y)
    # Average accuracy = total accuracy / number of splits
    return toplamBasari / logo.get_n_splits(
        giris, cikis, CustomerID), toplamFSkor / logo.get_n_splits(
            giris, cikis, CustomerID)
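# Call sketch (assumed data shapes, not from the original project): `giris` is
# an (n_samples, n_features) NumPy array, `cikis` a pandas Series of binary
# labels, and `CustomerID` an array of per-sample customer ids, matching how
# they are indexed inside basari_hesapla above.
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import LeaveOneGroupOut

rng = np.random.RandomState(0)
giris = rng.rand(60, 4)
cikis = pd.Series(np.tile([0, 1], 30))
CustomerID = np.repeat(np.arange(6), 10)     # 6 customers, 10 samples each

ortalama_basari, ortalama_fskor = basari_hesapla(giris, cikis, CustomerID)
print(ortalama_basari, ortalama_fskor)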
Example #3
def logistic_logo(features, grades, groups, standard=False, seed=42, use_intercept=False):
    """Calculates logistic regression with leave-one-group-out split and L2 regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    standard : bool
        Choice whether to center features by the mean of training split.
        Defaults to false, since whitened PCA is assumed to be centered.
    seed : int
        Random seed used in the model.
    use_intercept : bool
        Choice whether to use intercept term on the model.
        If the model does not provide very powerful predictions, it is better to center them by the intercept.
    groups : ndarray
        Patients groups. Used in leave-one-group-out split.
    Returns
    -------
    Array of model predictions, model coefficients and model intercept term.
    """

    # Lists
    predictions, coefs, intercepts = [], [], []
    # Leave one out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(features, grades, groups)
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    for train_idx, test_idx in logo.split(features, grades, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Logistic regression
        model = LogisticRegression(solver='newton-cg', max_iter=1000, random_state=seed, fit_intercept=use_intercept)
        model.fit(x_train, y_train)

        # Predicted score
        p = model.predict_proba(x_test)
        predictions.extend(p[:, 1])  # Add the positive predictions to list
        # Save weights
        coefs.append(model.coef_)
        intercepts.append(model.intercept_)

    # Average coefficients
    coefs = np.mean(np.array(coefs), axis=0).squeeze()
    intercepts = np.mean(np.array(intercepts), axis=0).squeeze()

    return np.array(predictions), coefs, intercepts
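# Usage sketch (synthetic data, not from the original project). The imports
# below are the ones the function body relies on; `grades` is kept binary so
# the positive-class probability in predict_proba(...)[:, 1] is meaningful.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut

rng = np.random.RandomState(0)
features = rng.randn(30, 5)
grades = np.tile([0, 1], 15)
groups = np.repeat(np.arange(10), 3)         # 10 patients, 3 samples each

preds, coefs, intercept = logistic_logo(features, grades, groups)
print(preds.shape, coefs.shape)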
Example #4
    def preProcessTrainVal(features, labels, groups, K_FOLD = 2):

        # split the data into a training set and a validation set
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()

        print(logo.get_n_splits(features, labels, groups))
Example #5
class DKULeaveOneGroupOut(object):
    def __init__(self, column_name):
        self.column_name = column_name
        self.splitter = LeaveOneGroupOut()
        pass

    def set_column_labels(self, column_labels):
        self.column_labels = column_labels

    def get_n_splits(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))

        groups_array = X[:, column_idx]

        ret = self.splitter.get_n_splits(X, y, groups_array)
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))

        groups_array = X[:, column_idx]

        return self.splitter.split(X, y, groups_array)
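# Usage sketch (hypothetical column names and data, not part of the original
# plugin): the group id travels as a column inside X, and set_column_labels()
# must be called before splitting.
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.column_stack([np.random.rand(12, 3), np.repeat([0, 1, 2, 3], 3)])
y = np.random.randint(0, 2, size=12)

splitter = DKULeaveOneGroupOut(column_name="site")
splitter.set_column_labels(["f1", "f2", "f3", "site"])
print(splitter.get_n_splits(X, y))           # 4 groups -> 4 splits
for train_idx, test_idx in splitter.split(X, y):
    pass                                     # train on X[train_idx], test on X[test_idx]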
Example #6
def perform_kNearestNeighbours(Xn, yn, nSess=1):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups = groups)

    total_samples = Xn.shape[0]
    n_young_samples = int(total_samples/2)
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    decifunc = np.zeros((n_folds, 2, 2))
    ylabel = np.zeros((n_folds, 2, 2))
    ngood = np.zeros(n_folds)
    folds_iter = 0

    print("\nClassify using K-nearest neigbours method:")
    print(" Performing leave one subject out cross fold with %d outer_folds"
          " and %d inner_folds" % (n_folds, n_folds-1))

    
    # For each iteration sessions of one subject are left out, the
    # classifier is trained with the sessions of the other subjects and,
    # the classifier is tested against the data of the left out subject.
    yn_toUse = label_binarize(yn, classes=range(3))[:, :-1]

    kNeigh = KNeighborsClassifier(n_neighbors=1, weights='uniform', leaf_size=40)
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        # X_t_test and y_test are used for calculating classifier
        # accuracy for this iteration
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        pgrid = { "n_neighbors": np.arange(1, n_folds, 1),
                  "leaf_size": [40, 50, 60],
                  "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
                  "weights": ["uniform", "distance"],
                }
        # Inner LOOCV fold to tune the hyper parameters of the classifier
        inner_fold = LeaveOneGroupOut()
        gridclf = GridSearchCV(estimator = kNeigh, param_grid = pgrid, refit=True,
                               cv = inner_fold)
        g = gridclf.fit(X_t_train, y_train, groups = groups[train_index])
        ngood[folds_iter] = gridclf.best_params_.get('n_neighbors')
        actual_[folds_iter] = y_test
        predict_[folds_iter] = gridclf.predict(X_t_test)
        ylabel[folds_iter] = yn_toUse[test_index]
        decifunc[folds_iter] = gridclf.predict_proba(X_t_test)
        folds_iter += 1

    # Calculate the accuracy of the classifier
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(" Classification accuracy =", (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    ylabel = ylabel.reshape(total_samples, 2)
    decifunc = decifunc.reshape(total_samples, 2)
    print(' roc_auc_score =', roc_auc_score(ylabel, decifunc))
Example #7
    def get_val_splitter(self):
        if self.splitter == "predefined":
            return self.__get_predefined_splitter()
        elif self.splitter == "loso":
            loso = LeaveOneGroupOut()
            return loso.get_n_splits(groups=self.train_data['student_id'])
        elif self.splitter == 'kfold':
            return KFold(5).get_n_splits(groups=self.train_y)
        else:
            return self.__get_predefined_splitter()
Example #8
def checkForOutliers(Xin):
    n_samples = Xin.shape[0]
    yin = np.ones(n_samples)

    groups = np.zeros(n_samples)
    groups_iter = np.arange(0, len(groups), 2)
    for i in groups_iter:
        groups[i:i + 2] = (i / 2)

    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)

    outliers_fraction = 0.1
    rng = np.random.RandomState(42)

    # Run IsolationForest and LocalOutlierFactor classifiers
    classifiers = {
        "Isolation Forest":
        IsolationForest(max_samples=n_samples - 2,
                        contamination=outliers_fraction,
                        random_state=rng),
        "Local Outlier Factor":
        LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
    }

    folds_iter_if = 0
    outlier_list_if = np.zeros((n_folds, 5))
    folds_iter_lof = 0
    outlier_list_lof = np.zeros((n_folds, 5))

    # Perform LeaveOneOutCrossFold and identify the outliers
    for train_index, outlier_index in logo_fold.split(Xin, yin, groups):
        X_train = Xin[train_index]
        y_train = yin[train_index]
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            if clf_name == "Local Outlier Factor":
                y_pred = clf.fit_predict(X_train)
                n_errors = (y_pred != y_train).sum()
                outliers_idx = np.argsort(y_pred)[0:n_errors]
                outlier_list_lof[folds_iter_lof] = outliers_idx
                folds_iter_lof += 1
            else:
                clf = clf.fit(X_train)
                y_pred = clf.predict(X_train)
                n_errors = (y_pred != y_train).sum()
                outliers_idx = np.argsort(y_pred)[0:n_errors]
                outlier_list_if[folds_iter_if] = outliers_idx
                folds_iter_if += 1

    print('\nLocal Outlier Factor:')
    print(outlier_list_lof)

    print('\nIsolation Forest:')
    print(outlier_list_if)
Example #9
def get_cv(k_fold, groups, X, y):
    if groups is None:
        ### Personal CV
        skf = StratifiedKFold(n_splits=k_fold, shuffle=True)
        n_split = skf.get_n_splits(X, y)
        cv = skf.split(X, y)
    else:
        ### Group (leave one subject out)
        logo = LeaveOneGroupOut()
        n_split = logo.get_n_splits(X, y, groups)
        cv = logo.split(X, y, groups=groups)
    return cv, n_split
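# Usage sketch (synthetic data, not from the original project): get_cv falls
# back to a per-sample stratified K-fold when no group vector is supplied.
import numpy as np
from sklearn.model_selection import StratifiedKFold, LeaveOneGroupOut

X = np.random.rand(24, 4)
y = np.tile([0, 1], 12)
subjects = np.repeat(np.arange(6), 4)

cv, n_split = get_cv(k_fold=5, groups=subjects, X=X, y=y)   # leave-one-subject-out
print(n_split)                                              # 6
cv, n_split = get_cv(k_fold=5, groups=None, X=X, y=y)       # stratified 5-fold
print(n_split)                                              # 5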
Example #10
def identify_top_features(Xn, yn, nSess=1):
    features_a = []
    tscores_a = []
    pval_a = []

    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)

    print("\nIdentify the signifcant features:")

    # Perform LOOCV to identify the most significant features
    # For each iteration sessions of one subject are left out, the
    # most significant features are identified using the sessions of the
    # remaining subjects.
    print(" Performing Leave one subject out cross fold(#folds = %d)" %
          n_folds)

    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        X_train, X_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        x1_idx = np.argwhere(y_train == 0).flatten()
        x2_idx = np.argwhere(y_train == 1).flatten()
        x1 = X_train[x1_idx]
        x2 = X_train[x2_idx]
        top_features, tscore, pval = get_ttest_scores(x1, x2)
        features_a.append(top_features)
        tscores_a.append(tscore)
        pval_a.append(pval)

    # Pick the intersection of the features across all the iterations
    top_features = np.array(
        list(reduce(set.intersection, [set(item) for item in features_a])))

    nfeatures = top_features.shape[0]
    top_features_tscores = np.zeros(nfeatures)
    top_features_pval = np.zeros(nfeatures)

    # Get the t-scores and p-values of the significant features
    iter = 0
    for tf in top_features:
        for i, v in enumerate(features_a):
            if tf in v:
                i1 = np.where(v == tf)
                top_features_tscores[iter] = tscores_a[i][i1]
                top_features_pval[iter] = pval_a[i][i1]
                iter += 1
                break
    # Sort the features based on the ttest value
    sorted_idx = np.argsort(np.abs(top_features_tscores))[::-1]
    return top_features[sorted_idx], top_features_tscores[
        sorted_idx], top_features_pval[sorted_idx]
Example #11
class LeaveOneSubjectOut():
    def __init__(self, subjects_indexes):
        self.subjects_indexes = subjects_indexes
        self.splitter = LeaveOneGroupOut()

    def split(self, X=None, y=None, groups=None):
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.get_n_splits(X, y, groups)
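# Usage sketch (hypothetical subject indices, not from the original project):
# because the subject vector is stored on the splitter, it can be passed as
# `cv` to utilities that never forward a `groups` argument.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

X = np.random.rand(20, 3)
y = np.tile([0, 1], 10)
subjects = np.repeat(np.arange(5), 4)

loso = LeaveOneSubjectOut(subjects)
print(loso.get_n_splits())                   # 5
scores = cross_val_score(LogisticRegression(), X, y, cv=loso)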
Example #12
def classify_loso_model_selection(X, y, group, gs):
    """ This do classification using LOSO while also doing model selection using LOSO

        Args:
            X (numpy matrix): this is the feature matrix with row being a data point
            y (numpy vector): this is the label vector with row belonging to a data point
            group (numpy vector): this is the group vector (which is a the participant id)
            gs (sklearn GridSearchCV): this is a gridsearch object that will output the best model

        Returns:
            accuracies (list): the accuracy at for each leave one out participant
    """

    logo = LeaveOneGroupOut()

    accuracies = []
    f1s = []
    cms = []

    best_params = []

    num_folds = logo.get_n_splits(X, y,
                                  group)  # keep track of how many folds left
    for train_index, test_index in logo.split(X, y, group):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        group_train, group_test = group[train_index], group[test_index]

        print(f"Number of folds left: {num_folds}")

        with joblib.parallel_backend('loky'):
            gs.fit(X_train, y_train, groups=group_train)

        y_hat = gs.predict(X_test)

        accuracy = accuracy_score(y_test, y_hat)
        f1 = f1_score(y_test, y_hat)
        cm = confusion_matrix(y_test, y_hat)

        accuracies.append(accuracy)
        f1s.append(f1)
        cms.append(cm)

        best_params.append(gs.best_params_)

        num_folds = num_folds - 1
    return accuracies, f1s, cms, best_params
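# Sketch of building the `gs` argument (assumed estimator and grid, not from
# the original project): the inner model selection is also leave-one-group-out,
# so GridSearchCV gets cv=LeaveOneGroupOut() and the participant ids are
# forwarded through gs.fit(..., groups=group_train) inside the function above.
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
from sklearn.svm import SVC

X = np.random.rand(40, 6)
y = np.tile([0, 1], 20)
group = np.repeat(np.arange(10), 4)          # 10 participants, 4 epochs each

gs = GridSearchCV(SVC(), param_grid={'C': [0.1, 1.0, 10.0]}, cv=LeaveOneGroupOut())
accuracies, f1s, cms, best_params = classify_loso_model_selection(X, y, group, gs)
print(np.mean(accuracies))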
Example #13
class LeaveOneClusterOut():
    """
	Wrapper for sklearn LeaveOneGroupOut splitter
	Stores clusters as attribute (rather than fit param) as a workaround to enable LOCO-CV in mlxtend SequentialFeatureSelector
	
	Args:
		clusters: list of cluster labels for observations (n-vector)
	"""
    def __init__(self, clusters):
        self.logo = LeaveOneGroupOut()
        self.clusters = clusters

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.logo.get_n_splits(groups=self.clusters)

    def split(self, X, y=None, groups=None):
        return self.logo.split(X, y, groups=self.clusters)
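# Usage sketch of the mlxtend workaround named in the docstring (assumed data;
# the SequentialFeatureSelector arguments are the common ones and may differ
# between mlxtend versions).
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut

X = np.random.rand(30, 6)
y = np.tile([0, 1], 15)
clusters = np.repeat(np.arange(5), 6)

loco = LeaveOneClusterOut(clusters)          # clusters ride along inside the splitter
sfs = SequentialFeatureSelector(LogisticRegression(), k_features=3, cv=loco)
sfs = sfs.fit(X, y)
print(sfs.k_feature_idx_)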
Example #14
def perform_elm(Xn, yn, nSess=1, kernelType='linear'):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups = groups)

    total_samples = Xn.shape[0]
    n_young_samples = int(total_samples/2)
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    decifunc_gri = np.zeros((n_folds, 2))
    folds_iter = 0

    print('\nClassify using ELM: (%s)' % kernelType)
    print(" Performing leave one subject out cross fold with %d outer_folds"
          " and %d inner_folds" % (n_folds, n_folds-1))

    # For each iteration sessions of one subject are left out, the
    # classifier is trained with the sessions of the other subjects and,
    # the classifier is tested against the data of the left out subject.

    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        
        inner_fold = LeaveOneGroupOut()
        pgrid = { "n_hidden": np.arange(10, 300, 10),
                "rbf_width": np.arange(0.1, 0.5, 0.05)
                }
        elmc_ = ELMClassifier(n_hidden=10, random_state=42, rbf_width=0.1, activation_func=kernelType, binarizer=LabelBinarizer(0, 1))
        gridclf = GridSearchCV(estimator = elmc_, param_grid = pgrid, refit=True,
                               cv = inner_fold)
        g = gridclf.fit(X_t_train, y_train, groups = groups[train_index])
        actual_[folds_iter] = y_test
        predict_[folds_iter] = gridclf.predict(X_t_test)
        decifunc_gri[folds_iter] = gridclf.decision_function(X_t_test).reshape(2,)
        folds_iter += 1
            
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(" Classification accuracy =", (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    decifunc_gri = decifunc_gri.reshape(total_samples,)
    print(' roc_auc_score =', roc_auc_score(actual, decifunc_gri))
    
Example #15
def perform_leastSquareLinearClassifier(Xn, yn, nSess=1):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups = groups)

    total_samples = Xn.shape[0]
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    decifunc_gri = np.zeros((n_folds, 2))
    folds_iter = 0

    print("\nClassify using Linear Classifier:")
    print(" Performing leave one subject out cross fold with %d outer_folds"
          % (n_folds))

    linearReg = linear_model.LinearRegression(normalize=True)
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        # X_t_test and y_test are used for calculating classifier
        # accuracy for this iteration
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        linearReg.fit(X_t_train, y_train)
        pred_ = linearReg.predict(X_t_test)
        predict_[folds_iter] = pred_[:]>0
        decifunc_gri[folds_iter] = linearReg._decision_function(X_t_test)
        actual_[folds_iter] = y_test
        folds_iter += 1

    # Calculate the accuracy of the classifier
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(' Classification accuracy =', (n_success / (total_samples)) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    decifunc_gri = decifunc_gri.reshape(total_samples,)
    print(' roc_auc_score =', roc_auc_score(actual, decifunc_gri))
Example #16
    def findParametersAndEvaluate(self,
                                  data,
                                  strategy,
                                  label_name,
                                  group=None,
                                  dataset=None,
                                  cv=5):

        self.strategy = strategy
        self.results = {}

        print('-------------------------------')
        print(' STEP : Finding Parameters & Evaluate Models')
        print('-------------------------------')

        self.label_name_check(label_name)
        #print(self.labelset.columns)

        # store performance data for each strategy

        if (strategy == 'train_test_split' or strategy == 'all'):

            self.train_test = dict()
            for model in self.models.keys():
                self.train_test[model] = None

            print('===> Evaluation strategy: Train and Test Split   ')
            X_train, X_test, y_train, y_test = train_test_split(
                data,
                self.label_set[label_name],
                train_size=.7,
                random_state=self.seed)

            print('===> Parameters find-> Start')
            for model in self.models.keys():
                if model == 'vot':
                    continue
                if not self.configured:
                    gd = GridSearchCV(self.models[model],
                                      self.params[model],
                                      cv=cv,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(X_train, y_train)
                    print('       Parameters for ', model, ': ',
                          gd.best_params_)
                    self.models[model] = gd.best_estimator_

            print('===> Parameters find-> End')

            test_performances = dict()
            print('===> Test data performance[RMSE] ')
            for model in self.models.keys():
                self.models[model].fit(X_train, y_train)
                test_performances[model] = mean_squared_error(
                    y_test, self.models[model].predict(X_test), squared=False)
                #print('       Model[',model,']:',test_performances[model])
                self.train_test[model] = test_performances[model]
            print(self.train_test)

            self.results['train_test'] = self.train_test

        if (strategy == 'cross_val' or strategy == 'all'):

            self.cross_val = dict()
            cross_val = dict()
            for model in self.models.keys():
                self.cross_val[model] = None

            print('==============================================')
            print('Evaluation strategy: Cross Validation')
            print('==============================================')
            for model in self.models.keys():

                if model != 'vot' and not self.configured:
                    print('    ==> Finding params for ', model)
                    gd = GridSearchCV(self.models[model],
                                      self.params[model],
                                      cv=10,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(data, self.label_set[label_name])
                    print('        Parameters: ', gd.best_params_)
                    self.models[model] = gd.best_estimator_

                cross_val[model] = cross_val_score(
                    self.models[model],
                    data,
                    self.label_set[label_name],
                    scoring='neg_root_mean_squared_error',
                    cv=cv)
                #print('  Score[',model,']:',cross_val_scores[model])

                cross_val_mean = -1 * statistics.mean(cross_val[model])
                cross_val_var = statistics.variance(cross_val[model])

                self.cross_val[model] = [cross_val_mean, cross_val_var]

            self.results['cross_val'] = self.cross_val

        if (strategy == 'leave_one_group_out' or strategy == 'all'):

            self.leave_group = dict()
            for model in self.models.keys():
                self.leave_group[model] = None

            print('==============================================')
            print('Evaluation strategy: Leave one group out')
            print('==============================================')

            logo = LeaveOneGroupOut()
            n_splits = logo.get_n_splits(groups=group)

            error = dict()

            for model in self.models.keys():
                error[model] = [None] * n_splits

            k = 0
            for train_index, test_index in logo.split(
                    data, self.label_set[label_name], group):
                #print(test_index)

                X_train, y_train = data.iloc[train_index], self.label_set[
                    label_name][train_index]
                X_test, y_test = data.iloc[test_index], self.label_set[
                    label_name][test_index]

                for model in self.models.keys():

                    if model != 'vot' and not self.configured:

                        print('    ==> Finding params for ', model)
                        gd = GridSearchCV(
                            self.models[model],
                            self.params[model],
                            cv=10,
                            scoring='neg_root_mean_squared_error')
                        gd.fit(X_train, y_train)
                        print('        Parameters: ', gd.best_params_)
                        estimator = gd.best_estimator_

                        self.models[model] = estimator

                    self.models[model].fit(X_train, y_train)
                    error[model][k] = mean_squared_error(
                        y_test,
                        self.models[model].predict(X_test),
                        squared=False)

                    #print('    Model[',model,']:',error[model])

                k = k + 1

            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])
                self.leave_group[model] = [err_mean, err_var]

            self.results['leave_group'] = self.leave_group

        if (strategy == 'leave_one_dataset_out' or strategy == 'all'):
            self.leave_dataset = dict()
            for model in self.models.keys():
                self.leave_dataset[model] = None

            print('==============================================')
            print('Evaluation strategy: Leave one dataset out')
            print('==============================================')

            logo = LeaveOneGroupOut()
            n_splits = logo.get_n_splits(groups=dataset)

            error = dict()

            for model in self.models.keys():
                error[model] = [None] * n_splits

            k = 0

            for train_index, test_index in logo.split(
                    data, self.label_set[label_name], dataset):

                X_train, y_train = data.iloc[train_index], self.label_set[
                    label_name][train_index]
                X_test, y_test = data.iloc[test_index], self.label_set[
                    label_name][test_index]

                for model in self.models.keys():

                    if model != 'vot' and not self.configured:

                        print('    ==> Finding params for ', model)
                        gd = GridSearchCV(
                            self.models[model],
                            self.params[model],
                            cv=10,
                            scoring='neg_root_mean_squared_error')
                        gd.fit(X_train, y_train)
                        #print('        Parameters: ',gd.best_params_)
                        estimator = gd.best_estimator_

                        self.models[model] = estimator

                    self.models[model].fit(X_train, y_train)

                    error[model][k] = mean_squared_error(
                        y_test,
                        self.models[model].predict(X_test),
                        squared=False)

                    #print('    Model[',model,']:',error[model])
                k = k + 1
            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])

                self.leave_dataset[model] = [err_mean, err_var]

            self.results['leave_dataset'] = self.leave_dataset

        if (strategy == 'sorted_stratified' or strategy == 'all'):

            self.stratified = dict()
            for model in self.models.keys():
                self.stratified[model] = None

            # idea from https://scottclowe.com/2016-03-19-stratified-regression-partitions/
            print('==============================================')
            print('Evaluation strategy: Sorted Stratification')
            print('==============================================')

            label_df = pd.DataFrame(self.label_set)

            indices = label_df.sort_values(by=[label_name]).index.tolist()
            splits = dict()

            error = dict()
            for model in self.models.keys():
                error[model] = [None] * cv

            for i in range(cv):
                splits[i] = list()

            for i in range(len(indices)):
                if i % cv == 0:
                    pick = random.sample(range(cv), cv)
                cur_pick = pick.pop()
                splits[cur_pick].append(indices[i])

            for i in range(cv):
                test_index = splits[i]
                train_index = []
                for j in range(cv):
                    if j != i:
                        train_index = train_index + splits[j]

                ##########################################

                # Code to training model on sorted stratified set
                X_train, y_train = data.iloc[train_index], self.label_set[
                    label_name][train_index]
                X_test, y_test = data.iloc[test_index], self.label_set[
                    label_name][test_index]

                for model in self.models.keys():
                    if model != 'vot' and not self.configured:

                        print('    ==> Finding params for ', model)
                        gd = GridSearchCV(
                            self.models[model],
                            self.params[model],
                            cv=10,
                            scoring='neg_root_mean_squared_error')
                        gd.fit(X_train, y_train)
                        print('        Parameters: ', gd.best_params_)
                        estimator = gd.best_estimator_
                        self.models[model] = estimator

                    self.models[model].fit(X_train, y_train)

                    error[model][i] = mean_squared_error(
                        y_test,
                        self.models[model].predict(X_test),
                        squared=False)

                    #print('    Model[',model,']:',error[model])

            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])
                self.stratified[model] = [err_mean, err_var]
                ##########################################

            self.results['stratified'] = self.stratified

        if not self.results:
            print('Unsupported evaluation strategy')
            return None

        return self.results

        # Preparing dataframe with results for report generation
        """        
lr = lm.LinearRegression()


##### CbS + LOGO #####

Sex_ctrl_30 = pd.get_dummies(sex_ctrl_30)
assert Sex_ctrl_30.shape == (313, 2)
Residuals_ctrl_30_bySite = np.array([X_ctrl_30_bySite[:, j] - lr.fit(Sex_ctrl_30, X_ctrl_30_bySite[:, j]).predict(Sex_ctrl_30) for j in range(X_ctrl_30_bySite.shape[1])]).T
assert Residuals_ctrl_30_bySite.shape == (313, 162)

X = Residuals_ctrl_30_bySite
y = age_ctrl_30
groups = site_ctrl_30
logo = LeaveOneGroupOut()
assert logo.get_n_splits(X, y, groups) == 10

param_grid = {'alpha': 10. ** np.arange(-5, 5)}
model = GridSearchCV(lm.Ridge(max_iter=10000, tol = 0.0001, random_state = 42), param_grid, cv=10)
scaler = StandardScaler()

y_test_pred = np.zeros(len(y))
for train, test in logo.split(X, y, groups):
    X_train, X_test, y_train, y_test = X[train, :], X[test, :], y[train], y[test]
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    model.fit(X_train_s, y_train)
    y_test_pred[test] = model.predict(X_test_s)
    
print("Test r2:%.2f" % metrics.r2_score(y, y_test_pred)) # Test r2:-26.94
print(model.best_params_) # {'alpha': 100.0}
Example #18
def regress_logo(features, grades, groups, method='ridge', standard=False, use_intercept=True, convert='none', alpha=1.0):
    """Calculates linear regression with leave-one-group-out split and L2 regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    method : str
        Regression model used. Defaults to ridge regression, but lasso is also possible. Ridge seems to perform better.
    standard : bool
        Choice whether to center features by the mean of training split.
        Defaults to false, since whitened PCA is assumed to be centered.
    use_intercept : bool
        Choice whether to use intercept term on the model.
        If the model does not provide very powerful predictions, it is better to center them by the intercept.
    groups : ndarray
        Patients groups. Used in leave-one-group-out split.
    convert : str
        Possibility to predict exp or log of ground truth. Defaults to no conversion.
    alpha : float
        Regularization coefficient. c^-1
    Returns
    -------
    Array of model predictions, model coefficients and model intercept term.
    """

    # Convert grades
    if convert == 'exp':
        grades = np.exp(grades)
    elif convert == 'log':
        grades = np.log(grades)
    else:
        pass

    # Lists
    predictions, coefs, intercepts = [], [], []

    # Leave one out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(features, grades, groups)
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    for train_idx, test_idx in logo.split(features, grades, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Linear regression
        if method == 'ridge':
            model = Ridge(alpha=alpha, normalize=True, random_state=42, fit_intercept=use_intercept)
        elif method == 'lasso':
            model = Lasso(alpha=alpha, normalize=True, random_state=42, fit_intercept=use_intercept)
        else:
            model = LinearRegression(normalize=True, fit_intercept=use_intercept, n_jobs=-1)
        model.fit(x_train, y_train)

        # Predicted score
        predictions.append(model.predict(x_test))
        # Save weights
        coefs.append(model.coef_)
        intercepts.append(model.intercept_)

    predictions_flat = []
    for group in predictions:
        for p in group:
            predictions_flat.append(p)

    # Convert grades back
    if convert == 'exp':
        predictions = np.log(np.array(predictions_flat))
    elif convert == 'log':
        predictions = np.exp(np.array(predictions_flat))
    else:
        predictions = np.array(predictions_flat)

    return predictions, np.mean(np.array(coefs), axis=0), np.mean(np.array(intercepts), axis=0)
Example #19
def pca_regress_pipeline_log(features, grades, groups, n_components=0.9, solver='full', whitening=True, standard=False,
                             seed=42, mod_coefs=True, alpha=0.1, grade_name='', savepath=None):

    feature_names = ['Center +', 'Center -', 'Large U-1', 'Large U-2', 'Large U-3', 'Large U-4', 'Large U-5', 'Large U-6',
                     'Large U-7', 'Large N-U', 'Small U-1', 'Small U-2', 'Small U-3', 'Small U-4', 'Small U-5', 'Small U-6', 'Small U-7',
                     'Small N-U', 'Radial U-0', 'Radial U-1', 'Radial U-2', 'Radial U-3', 'Radial U-4', 'Radial U-5', 'Radial U-6',
                     'Radial U-7', 'Radial U-8', 'Radial N-U']
    grades_log = grades

    # Fit PCA to full data
    pca = PCA(n_components=n_components, svd_solver=solver, whiten=whitening, random_state=seed)
    pca.fit(features)

    # Leave one out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(features, grades_log, groups)
    logo.get_n_splits(groups=groups)  # 'groups' is always required
    all_shap_values, all_shap_values_lin = [], []
    for train_idx, test_idx in logo.split(features, grades_log, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades_log[train_idx], grades_log[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Logistic regression
        model = LogisticRegression(solver='newton-cg', max_iter=1000, random_state=seed, fit_intercept=False)
        model.fit(pca.transform(x_train), y_train > 1)

        model_lin = Ridge(alpha=alpha, normalize=True, random_state=seed, fit_intercept=True)
        model_lin.fit(pca.transform(x_train), y_train)

        # Predicted score (for logistic regression)
        p = model.predict_proba(pca.transform(x_test))
        p_lin = model_lin.predict(pca.transform(x_test))

        # Merge PCA into the linear model
        if mod_coefs:
            coef = (pca.components_.T / pca.singular_values_) @ model.coef_.T * np.sqrt(pca.n_samples_ - 1)
            coef_lin = (pca.components_.T / pca.singular_values_) @ model_lin.coef_.T * np.sqrt(pca.n_samples_ - 1)

            # Update models
            model.coef_ = coef.T
            model_lin.coef_ = coef_lin.T

            p2_lin = model_lin.predict(x_test)
            p2 = model.predict_proba(x_test)

            # Inference
            p_inf = (x_test @ coef).squeeze()
            p_inf = (1 + np.exp(-p_inf)) ** -1

            eps = 1.0e-10
            assert np.sum(np.abs(p - p2)) < eps, 'LOGReg results are not equal'
            assert np.sum(np.abs(p_inf - p[:, 1])) < eps, 'LOGReg results are not equal'
            assert np.sum(np.abs(p_lin - p2_lin)) < eps, 'LINReg results are not equal'
        else:  # Otherwise run PCA
            x_train = pca.transform(x_train)
            x_test = pca.transform(x_test)

        # Interpretability

        # Logistic regression
        explainer = shap.LinearExplainer(model, x_train, feature_dependence='correlation', nsamples=x_train.shape[0])
        shap_values = explainer.shap_values(x_test)

        # Linear regression
        explainer_lin = shap.LinearExplainer(model_lin, x_train, feature_dependence='correlation',
                                             nsamples=x_train.shape[0])
        shap_values_lin = explainer_lin.shap_values(x_test)

        # Append prediction
        all_shap_values.append(shap_values)
        all_shap_values_lin.append(shap_values_lin)

    # Combine shap values and plot the summary
    all_shap_values = np.vstack(all_shap_values)
    all_shap_values_lin = np.vstack(all_shap_values_lin)

    # Inverse PCA for the model without PCA
    if not mod_coefs:
        all_shap_values = pca.inverse_transform(all_shap_values)
        all_shap_values_lin = pca.inverse_transform(all_shap_values_lin)

    # Force plot
    # shap.force_plot(explainer.expected_value, all_shap_values, features)
    # plt.show()

    # Summary plots
    shap.summary_plot(all_shap_values, features, show=False, feature_names=feature_names)
    # plt.title(f'Logistic Regression ({grade_name})')
    if savepath is not None:
        plt.savefig(f'{savepath}{grade_name}_logistic_cov.png', transparent=False, bbox_inches='tight')
        plt.show()
    else:
        plt.show()
    shap.summary_plot(all_shap_values_lin, features, show=False, feature_names=feature_names)
    # plt.title(f'Linear Ridge Regression ({grade_name})')
    if savepath is not None:
        plt.savefig(f'{savepath}{grade_name}_linear_cov.png', transparent=False, bbox_inches='tight')
        plt.show()
    else:
        plt.show()
Example #20
def rforest_logo(features, grades, groups, standard=False, seed=42, n_trees=50, tree_depth=None, savepath=None, zone=''):
    """Calculates logistic regression with leave-one-group-out split and L2 regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    standard : bool
        Choice whether to center features by the mean of training split.
        Defaults to false, since whitened PCA is assumed to be centered.
    seed : int
        Random seed used in the model.
    n_trees : int
        Number of trees in the Random Forest
    tree_depth : int
        Maximum depth of the individual tree.
    groups : ndarray
        Patients groups. Used in leave-one-group-out split.
    savepath : str
        Path to save the model.
    zone : str
        Zone that is graded.
    Returns
    -------
    Array of model predictions, averaged feature importances and a zero intercept placeholder.
    """

    # Lists
    predictions, coefs, intercepts, models = [], [], [], []
    # Leave one out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(features, grades, groups)
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    for train_idx, test_idx in logo.split(features, grades, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Random forest classifier
        model = RandomForestClassifier(n_estimators=n_trees, random_state=seed, max_depth=tree_depth, n_jobs=12)
        model.fit(x_train, y_train)

        # Predicted score
        p = model.predict_proba(x_test)
        predictions.append(p)

        # Save weights
        coefs.append(model.feature_importances_)  # Importance of PCA components is returned
        intercepts.append(0.0)  # No intercept in RF
        models.append(model)

    predictions_flat = []
    for group in predictions:
        for p in group:
            predictions_flat.append(p)

    if savepath is not None:
        Path(savepath + '/models/').mkdir(exist_ok=True)
        filename = savepath + '/models/' + strftime(f'RF_model_{zone}_%Y_%m_%d_%H_%M_%S.sav')
        dump(models, filename)

    return np.array(predictions_flat)[:, 1], np.mean(np.array(coefs), axis=0).squeeze(), np.mean(np.array(intercepts), axis=0).squeeze()
Example #21
    saver = tf.train.Saver()

## This is the same as the Deep Neural Network session part

# Same as the DNN part
n_epochs = 20
batch_size = 30

# Leave one out cross validation - group making
groups = []
for i in range(1, 13):
    group = [i] * 120
    for i in group:
        groups.append(i)
logo = LeaveOneGroupOut()
logo.get_n_splits(X_data, Y_data, groups)

looop = []
times = 1
for train_index, test_index in logo.split(X_data, Y_data, groups):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_data[train_index], X_data[test_index]
    y_train, y_test = Y_data[train_index], Y_data[test_index]
    with tf.Session() as sess:
        init.run()
        accuracy_test = []
        for epoch in range(n_epochs):
            i = 0
            for batch in range(len(X_train) // batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]
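# Note (not part of the original script): assuming the same layout of 12
# subjects x 120 trials, the nested loops that build `groups` above can be
# written more compactly with NumPy.
import numpy as np
groups = list(np.repeat(np.arange(1, 13), 120))   # [1]*120 + [2]*120 + ... + [12]*120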
Example #22
def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    unique_instance_str = str(uuid.uuid1())

    X_file = req.files['X_file']
    y_file = req.files['y_file']
    Wk_file = req.files['Wk_file']

    tempFilePath = tempfile.gettempdir()
    staging_dir = tempFilePath + '/staging/' + unique_instance_str

    if not os.path.exists(staging_dir):
        logging.info('Creating ' + staging_dir)
        os.makedirs(staging_dir)

    X_file.save(staging_dir + '/X.parquet')
    y_file.save(staging_dir + '/y.parquet')
    Wk_file.save(staging_dir + '/Wk.parquet')

    X = pd.read_parquet(staging_dir + '/X.parquet')
    X = X.reindex(sorted(X.columns), axis=1)
    y = pd.read_parquet(staging_dir + '/y.parquet').iloc[:, 0]
    Week = pd.read_parquet(staging_dir + '/Wk.parquet').iloc[:, 0]

    os.remove(staging_dir + '/X.parquet')
    os.remove(staging_dir + '/y.parquet')
    os.remove(staging_dir + '/Wk.parquet')
    os.rmdir(staging_dir)

    logo = LeaveOneGroupOut()
    n_splits = logo.get_n_splits(groups=Week)

    r2_total = 0
    mae_total = 0
    rmse_total = 0
    logging.info('Beginning CV.')
    c = 0

    target_splits = 3
    n_actual_splits = 0
    nth_split = 0

    for train_index, test_index in logo.split(X, y, Week):

        cv_prob = max(0, (target_splits - n_actual_splits) /
                      (n_splits - nth_split))
        nth_split += 1

        if np.random.rand() > cv_prob:
            continue

        logging.info('Split {}.'.format(c))

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = np.asarray(y_train).ravel()
        y_test = np.asarray(y_test).ravel()

        test_model = XGBRegressor()
        test_model.fit(X_train, y_train)
        y_pred = test_model.predict(X_test)

        r2_total += r2_score(y_true=y_test, y_pred=y_pred)
        mae_total += mean_absolute_error(y_true=y_test, y_pred=y_pred)
        rmse_total += np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
        n_actual_splits += 1
        c += 1

    avg_sales = y.mean()
    r2 = r2_total / n_actual_splits
    mae = mae_total / n_actual_splits
    mpe = mae / avg_sales
    rmse = rmse_total / n_actual_splits

    del X
    del y
    del Week

    gc.collect()

    outp = {
        'avg_sales': float(avg_sales),
        'r2_score': float(r2),
        'mae_score': float(mae),
        'mpe_score': float(mpe),
        'rmse_score': float(rmse)
    }

    return func.HttpResponse(
        json.dumps(outp),
        mimetype='application/json',
    )
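# Note on the fold subsampling above (same selection logic in isolation, with
# assumed counts): each remaining fold is kept with probability
# (folds still needed) / (folds left), which yields exactly `target_splits`
# folds whenever enough splits remain.
import numpy as np
n_splits, target_splits = 52, 3
kept, nth_split = 0, 0
for _ in range(n_splits):
    cv_prob = max(0, (target_splits - kept) / (n_splits - nth_split))
    nth_split += 1
    if np.random.rand() > cv_prob:
        continue
    kept += 1
print(kept)                                  # always 3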
Example #23
## store all in pickle dumps
pickle.dump(segments, open( "segments_90_acc.p","wb"))
pickle.dump(labels, open( "labels_90_acc.p","wb"))
pickle.dump(subjects, open( "subjects_90_acc.p","wb"))

# segments = pickle.load(open('segments_90_acc.p','rb'))
# labels = pickle.load(open('labels_90_acc.p', 'rb'))
# subjects = pickle.load(open('subjects_90_acc.p','rb'))

numOfRows = segments.shape[1]
numOfColumns = segments.shape[2]

groups = np.array(subjects)
logo = LeaveOneGroupOut()
logo.get_n_splits(segments, labels, groups)

# reshaping the data for network input
reshapedSegments = segments.reshape(segments.shape[0], numOfRows, numOfColumns,1)
# categorically defining the classes of the activities
labels = np.asarray(pd.get_dummies(labels),dtype = np.int8)

# # open a file, where you stored the pickled data
# # dump information to that file
# segments = pickle.load(segments)
# labels = pickle.load(labels)


# ==================================================================================

# splitting in training and testing data
Example #24
def main(args, pipe=False):
    '''
    Checks passed arguments and performs requested actions.
    '''
    if not pipe:
        parser = argparse.ArgumentParser(
            description='Classify call segments as positive or negative.')
        parser.add_argument('-f',
                            '--features',
                            dest='feat_loc',
                            required=True,
                            help='Path to CSV feature file.')
        parser.add_argument(
            '-o',
            '--out',
            dest='out_loc',
            required=True,
            help='Path to where classification summary should be saved.')
        parser.add_argument('--hmm',
                            dest='hmm_flag',
                            action='store_true',
                            help='Classify with a Hidden Markov Model.')
        parser.add_argument('--rf',
                            dest='rf_flag',
                            action='store_true',
                            help='Classify with a random forest.')
        parser.add_argument('--n_components',
                            dest='n_components',
                            help='Number of components for the HMM.')
        parser.add_argument('--n_mix',
                            dest='n_mix',
                            help='Number of Gaussian mixtures for the HMM.')
        parser.add_argument(
            '--n_estimators',
            dest='n_estimators',
            help='Number of tree estimators for the random forest.')
        args = parser.parse_args()

    if args.hmm_flag or args.rf_flag:
        # store scores from all runs to calc stats
        hmm_chunk_scores = []
        hmm_overall_scores = []
        rf_chunk_scores = []
        rf_overall_scores = []

        # split data for leave-one-group(call)-out validation
        data, labels, ids = sep_data_labels(args.feat_loc)
        logo = LeaveOneGroupOut()
        curr_split = 1
        num_splits = logo.get_n_splits(data, labels, ids)

        # loop through all cross validation folds
        for train_index, test_index in logo.split(data, labels, ids):
            print('Split ' + str(curr_split) + ' out of ' + str(num_splits))
            data_train, data_test = data[train_index], data[test_index]
            labels_train, labels_test = labels[train_index], labels[test_index]
            # classify with the selected models
            if args.hmm_flag:
                if args.n_components:
                    n_components = int(args.n_components)
                else:
                    n_components = 2
                if args.n_mix:
                    n_mix = int(args.n_mix)
                else:
                    n_mix = 2
                hmm_model = HmmMorency(n_components=n_components, n_mix=n_mix)
                chunk_scores, call_score = train_and_test(
                    hmm_model, data_train, data_test, labels_train,
                    labels_test)
                hmm_chunk_scores.append(chunk_scores)
                hmm_overall_scores.append(call_score)

            if args.rf_flag:
                if args.n_estimators:
                    n_estimators = int(args.n_estimators)
                else:
                    n_estimators = 100
                rf_model = RandomForestClassifier(n_estimators=n_estimators,
                                                  n_jobs=-1,
                                                  random_state=10)
                chunk_scores, call_score = train_and_test(
                    rf_model, data_train, data_test, labels_train, labels_test)
                rf_chunk_scores.append(chunk_scores)
                rf_overall_scores.append(call_score)

            curr_split += 1

        # evaluate the scores for all models
        out_file = os.path.join(args.out_loc, 'results.txt')
        if args.hmm_flag:
            score_stats(
                'hmm, mix: ' + str(n_mix) + ' states: ' + str(n_components),
                hmm_chunk_scores, hmm_overall_scores, out_file)
        if args.rf_flag:
            score_stats('random forest, estimators: ' + str(n_estimators),
                        rf_chunk_scores, rf_overall_scores, out_file)

    else:
        sys.exit(
            'Must choose at least one classification method. (--hmm, --rf)')
Example #25
def learn(X: (dict, pd.DataFrame),
          y: (dict, pd.Series),
          data_folder: str,
          groups: list = None,
          test_split: float = None,
          name: str = None):
    '''
	This function trains either a classification or regression random forest model. It is able to handle
	either a singular pandas DataFrame or a dictionary of pandas DataFrames. If the input is a singular
	pandas DataFrame, the rows will be split into a training and testing dataset using test_split (0 - 1).
	If the input is a dictionary of pandas DataFrames, a leave-one-out method will be used to verify the
	model's accuracy.

	Inputs:

	X: a dictionary of pandas DataFrames or a singular pandas DataFrame

	y: a dictionary of pandas Series or a singular pandas Series

	data_folder: the location of where to save the output

	groups: a list of the trial names
			NOTE: this is only required if the X/y input is a dictionary

	test_split: the decimal percentage to split the training and testing datasets
				NOTE: this is only required if the X/y input is not a dictionary

	name: the name of the trial
		  NOTE: this is only required if the X/y input is not a dictionary

	Alex Woodall

	Auckland Bioengineering Institute

	08/04/2020

	'''

    if 'force' in data_folder or 'time' in data_folder:
        mode = 'regression'

    elif 'binary' in data_folder:
        mode = 'classification'
    else:
        # Avoid an undefined `mode` further down
        raise ValueError("data_folder must contain 'force', 'time' or 'binary'")

    if type(X) is pd.DataFrame:
        # Learning using one trial (or a combination into a DataFrame rather than a dictionary of DataFrames)

        if mode == 'classification':
            # Split into training and testing
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_split)

            # Create classifier and train
            cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)
            cl.fit(X_train, y_train)

            # Predict on classifier and convert to a pandas series, save output
            y_predict = cl.predict(X_test)
            y_predict = pd.Series(y_predict, index=X_test.index)

            y_predict.to_csv("{}y_predict.csv".format(data_folder),
                             index=True,
                             header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder),
                          index=True,
                          header=True)

            # Print score and confusion matrix
            score = roc_auc_score(y_test, y_predict)
            conf_mat = confusion_matrix(y_test, y_predict)

            print("Roc auc = {}\n".format(score))
            print(conf_mat)

        elif mode == 'regression':
            # Split into training and testing
            split_int = int(len(X) * (1 - test_split))

            X_train = X.head(split_int)
            y_train = y.head(split_int)
            X_test = X.tail(len(X) - split_int)
            y_test = y.tail(len(X) - split_int)

            # Create regressor and train
            rg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
            rg.fit(X_train, y_train)

            # Predict
            y_predict = rg.predict(X_test)

            # Filter force array
            ''' Filter force plate data at 60 Hz '''
            analog_frequency = 1000
            cut_off = 60  # Derie (2017), Robberechts et al (2019)
            order = 2  # Weyand (2017), Robberechts et al (2019)
            b_f, a_f = signal.butter(N=order,
                                     Wn=cut_off / (analog_frequency / 2),
                                     btype='low')

            new_F = signal.filtfilt(b_f, a_f, y_predict)
            ''' Rezero filtered forces'''
            threshold = 50  # 20 N
            filter_plate = rezero_filter(original_fz=new_F,
                                         threshold=threshold)

            y_predict = filter_plate * new_F

            # Convert output into a pandas series and save
            y_predict = pd.Series(y_predict, index=X_test.index)
            y_predict.to_csv("{}y_predict.csv".format(data_folder),
                             index=True,
                             header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder),
                          index=True,
                          header=True)

            # Calculate R2 score and print
            score = r2_score(y_test, y_predict)
            print("R2 = {}\n".format(score))

            # Plot result
            plt.plot(y_test.tail(1000), 'k', label='True data')
            plt.plot(y_predict.tail(1000), 'r', label='Estimate data')
            plt.legend()
            plt.ylabel('Force (N)')
            plt.xlabel('Time (ms)')
            plt.title('Estimated data for {}'.format(name))

            # Save figure
            score = round(score, 4)

            plt.savefig('{}{}_{}.png'.format(data_folder, name,
                                             '_'.join(str(score).split('.'))))
            plt.show()

    elif type(X) is dict:

        # Create leave one group out split
        group_num = np.arange(len(groups))
        logo = LeaveOneGroupOut()
        logo.get_n_splits(groups=group_num)

        if mode == 'classification':
            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for classification\n\n")
            f.close()

            roc = []
            # Train on n - 1 groups, test on 1. Repeat for all
            for train_index, test_index in logo.split(X=X, groups=group_num):
                cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))

                # Concatenate all held-in trials into one training set
                # (pd.concat replaces DataFrame.append, which newer pandas versions removed)
                X_train = pd.concat(
                    [X[groups[index]] for index in train_index],
                    ignore_index=True)
                y_train = pd.concat(
                    [y[groups[index]] for index in train_index],
                    ignore_index=True)

                cl.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = cl.predict(X_test)
                y_estimate_test = pd.Series(y_estimate_test,
                                            index=X_test.index)

                roc.append(roc_auc_score(y_test, y_estimate_test))

                conf = confusion_matrix(y_test, y_estimate_test)

                np.savetxt("{}y_estimate_conf_{}.txt".format(
                    data_folder, groups[test_index[0]]),
                           conf,
                           delimiter='\t',
                           fmt='%i')

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(
                    groups[test_index[0]], round(roc[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]),
                                       index=True,
                                       header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open(
                    "{}{}_cl.pkl".format(data_folder, groups[test_index[0]]),
                    "wb")
                pickle.dump(cl, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage roc auc score: {}".format(
                round(statistics.mean(roc), 4)))
            f.close()

        elif mode == 'regression':

            # Number of estimators could be varied per task; currently the force
            # and time regression tasks both use the same value
            n_estimators = 10

            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for regression\n\n")
            f.close()

            r2 = []

            for train_index, test_index in logo.split(X=X, groups=group_num):
                rg = RandomForestRegressor(n_estimators=n_estimators,
                                           n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))

                # Concatenate all held-in trials into one training set
                # (pd.concat replaces DataFrame.append, which newer pandas versions removed)
                X_train = pd.concat(
                    [X[groups[index]] for index in train_index],
                    ignore_index=True)
                y_train = pd.concat(
                    [y[groups[index]] for index in train_index],
                    ignore_index=True)

                rg.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = rg.predict(X_test)

                # Round estimate to a whole number
                y_estimate_test = np.around(y_estimate_test)

                # Any negative number = -1
                y_estimate_test[y_estimate_test < 0] = -1

                y_estimate_test = pd.Series(y_estimate_test,
                                            index=X_test.index)

                r2.append(r2_score(y_test, y_estimate_test))

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(
                    groups[test_index[0]], round(r2[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]),
                                       index=True,
                                       header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open(
                    "{}{}_rg.pkl".format(data_folder, groups[test_index[0]]),
                    "wb")
                pickle.dump(rg, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage R^2 score: {}".format(
                round(statistics.mean(r2), 4)))
            f.close()

    else:
        print("X should be of type dict or pd.DataFrame")

        return

    return
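
# The regression branch above calls a helper named rezero_filter that is defined
# elsewhere in the project. A minimal sketch of what such a helper could look like,
# assuming it only builds a 0/1 mask that zeroes the filtered vertical force wherever
# it falls below the contact threshold (the real implementation may also clean up
# whole swing-phase regions between threshold crossings):
def rezero_filter_sketch(original_fz, threshold=50):
    """Return a 0/1 mask: 0 where force < threshold (N), 1 elsewhere."""
    import numpy as np
    force = np.asarray(original_fz, dtype=float)
    mask = np.ones_like(force)
    mask[force < threshold] = 0
    return mask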
예제 #26
0
def RF_classifier(X_data,Y_data,options=None):
    from sklearn.ensemble import RandomForestClassifier

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch   = False
    GS_settings  = None
    randomsearch = False
    RS_settings  = None
    accuracy = False
    cv_type = 'logo'
    scoring = 'f1'

    if (options is not None):

        if (("RF_parameters" in options)==True):
            params = options['RF_parameters']

        if (("grid_search" in options)==True):
            from sklearn.model_selection import GridSearchCV
            gridsearch = True
            GS_params   = options['grid_search']['parameter_grid']
            if (("settings" in options['grid_search'])==True): GS_settings = options['grid_search']['settings'] 

        if (("random_search" in options)==True):
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            randomsearch = True
            RS_params, RS_Nmax   = convert_param_dist(options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if (("settings" in options['random_search'])==True): RS_settings = options['random_search']['settings'] 

        if(randomsearch==True and gridsearch==True): quit('********** Stopping! grid_search and random_search both set *********')

        if (("accuracy" in options)==True):
            accuracy = options['accuracy']
            if (accuracy==True):
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import precision_recall_curve, auc, f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix
                from cfd2ml.utilities import print_cm

        if (("scoring" in options)==True):
            scoring = options['scoring']

        if (("cv_type" in options)==True):
            cv_type = options['cv_type']

    ##############
    # Prepare data
    ##############
    if(cv_type=='logo'): groups = X_data['group']
    X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header  = Y_data.name

    nX = X_headers.size
    print('\nFeatures:')
    for i in range(0,nX):
        print('%d/%d: %s' %(i+1,nX,X_headers[i]) )
    print('\nTarget: ', Y_header)
  
    ########################
    # Prepare other settings
    ########################
    # Setting cross-validation type (either leave-one-group-out or stratified 10-fold)
    if(cv_type=='logo'):
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif(cv_type=='kfold'):
        from sklearn.model_selection import StratifiedKFold
        print('\nUsing 10-fold cross validation')
        k_fold = StratifiedKFold(n_splits=10, random_state=42,shuffle=True)
        cv = k_fold.split(X_data,Y_data)

    #########################
    # Training the classifier
    #########################
    # TODO TODO TODO - improve accuracy by using balanced or weighted random forest
    # (see https://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf)
    if(gridsearch==True):
        # Finding optimal hyperparameters with GridSearchCV
        print('\n Performing GridSearchCV to find optimal hyperparameters for random forest classifier')
        clf = RandomForestClassifier(**params,random_state=42)
        if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
        GS_clf = GridSearchCV(estimator=clf, param_grid=GS_params, cv=cv, scoring=scoring, verbose=2, **GS_settings)  # 'iid' dropped (removed in scikit-learn 0.24)
        GS_clf.fit(X_data,Y_data)

        # Write out results to file
        scores_df = pd.DataFrame(GS_clf.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')

        # Pick out best results
        best_params = GS_clf.best_params_
        best_score  = GS_clf.best_score_
        clf = GS_clf.best_estimator_  # (this clf has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    elif(randomsearch==True):
        # Finding optimal hyperparameters with RandomSearchCV
        print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest classifier')
        clf = RandomForestClassifier(**params,random_state=42)
        if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
        RS_clf = RandomizedSearchCV(estimator=clf, param_distributions=RS_params, cv=cv, scoring=scoring, verbose=2, error_score=np.nan, **RS_settings)  # 'iid' dropped (removed in scikit-learn 0.24)
        RS_clf.fit(X_data,Y_data)
        
        # Write out results to file
        scores_df = pd.DataFrame(RS_clf.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')

        # Pick out best results
        best_params = RS_clf.best_params_
        best_score  = RS_clf.best_score_
        clf = RS_clf.best_estimator_  # (this clf has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)


    else:
        # Train RF classifier with hyperparameters given by user
        print('\nTraining random forest classifier with given hyperparameters')
        clf = RandomForestClassifier(**params)
        clf.fit(X_data,Y_data)

    # Cross validation accuracy metrics
    if(accuracy==True):
        print('\nPerforming cross validation to determine train and test accuracy/error, and precision-recall curves')

        #TODO - capability to decide on probability threshold, and predict with chosen threshold

        # Get generator object depending on cv strategy
        if (cv_type=='logo'): 
            cv = logo.split(X_data,Y_data,groups)
        elif(cv_type=='kfold'):
            cv = k_fold.split(X_data,Y_data)  # Need to regen "Generator" object

        fig1, ax1 = plt.subplots()

        # Init lists
        y_real   = []
        y_proba  = []
        train_f1 = []
        test_f1  = []
        train_A  = []
        test_A   = []
        train_BA = []
        test_BA  = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train a fresh, unfitted copy of the classifier for this fold so that
            # clf itself (returned at the end) keeps its fit on the full dataset
            from sklearn.base import clone
            clf_cv = clone(clf)
            clf_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = clf_cv.predict(X_train)
            Y_pred_test  = clf_cv.predict(X_test )

            # F1 scores
            f1score = f1_score(Y_test , Y_pred_test)
            train_f1.append(f1_score(Y_train, Y_pred_train) )
            test_f1.append(f1score)
            # Accuracy scores
            Ascore = accuracy_score(Y_test , Y_pred_test)
            train_A.append(accuracy_score(Y_train, Y_pred_train) )
            test_A.append(Ascore)
            # Balanced accuracy scores
            BAscore = balanced_accuracy_score(Y_test , Y_pred_test)
            train_BA.append(balanced_accuracy_score(Y_train, Y_pred_train) )
            test_BA.append(BAscore)

            # Print validation scores (training scores are stored to print mean later, but not printed for each fold)
            if(cv_type=='logo'):
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif(cv_type=='kfold'):
                print('\nFold = ', i)
            print('-------------------')
            print('F1 score = %.2f %%' %(f1score*100) )
            print('Total error = %.2f %%' %((1.0-Ascore)*100) )
            print('Per-class error = %.2f %%' %((1.0-BAscore)*100) )

            # Print confusion matrix for this fold
            print('Confusion matrix:')
            confuse_mat = confusion_matrix(Y_test, Y_pred_test)
            print_cm(confuse_mat, ['Off','On'])
            
            # Prediction probability based on X_test (used for precision-recall curves)
            pred_proba = clf_cv.predict_proba(X_test)
            precision, recall, _ = precision_recall_curve(Y_test, pred_proba[:,1])
            lab = 'Fold %d AUC=%.4f' % (i+1, auc(recall, precision))
            ax1.step(recall, precision, label=lab)
            y_real.append(Y_test)
            y_proba.append(pred_proba[:,1])

            i += 1

        # Calculate errors from accuracies
        train_TE = 1.0 -  np.array(train_A)
        test_TE  = 1.0 -  np.array(test_A)
        train_CAE = 1.0 - np.array(train_BA)
        test_CAE  = 1.0 - np.array(test_BA)

        # Print performance scores
        print('\nMean training scores:')
        print('F1 score = %.2f %%' %(np.mean(train_f1)*100) )
        print('Total error = %.2f %%' %(np.mean(train_TE)*100) )
        print('Per-class error = %.2f %%' %(np.mean(train_CAE)*100) )
    
        print('\nMean validation scores:')
        print('F1 score = %.2f %%' %(np.mean(test_f1)*100) )
        print('Total error = %.2f %%' %(np.mean(test_TE)*100) )
        print('Per-class error = %.2f %%' %(np.mean(test_CAE)*100) )

        
        # Average precision-recall over folds, and plot curves
        y_real = np.concatenate(y_real)
        y_proba = np.concatenate(y_proba)
        precision, recall, _ = precision_recall_curve(y_real, y_proba)
        lab = 'Overall AUC=%.4f' % (auc(recall, precision))
        ax1.step(recall, precision, label=lab, lw=2, color='black')
        ax1.set_xlabel('Recall')
        ax1.set_ylabel('Precision')
        ax1.legend(loc='lower left', fontsize='small')
        

        plt.show()

    return clf
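
# RF_classifier above re-materialises the LOGO splits with logo.split(...) each time a
# CV iterator is needed, because the generator it returns can only be consumed once.
# scikit-learn also accepts the splitter object itself as cv, with the group labels
# forwarded at fit time. A minimal self-contained sketch of that pattern (all names
# below are demo placeholders):
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

X_demo = np.random.rand(60, 4)               # 60 samples, 4 features
y_demo = np.random.randint(0, 2, size=60)    # binary target
g_demo = np.repeat(np.arange(6), 10)         # 6 groups of 10 samples each

demo_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid={'n_estimators': [50, 100]},
                           cv=LeaveOneGroupOut(), scoring='f1')
demo_search.fit(X_demo, y_demo, groups=g_demo)   # groups are routed to the splitter
print(demo_search.best_params_, round(demo_search.best_score_, 3))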
예제 #27
0
#    plt.plot(gamma_traces[i], '^-')
#    plt.legend(['gamma'+str(j) for j in range(i//2+2)])
#    plt.savefig('gamma'+str(i)+'_{}_{}_{}.png'.format(cf.reg_strength, cf.threshold, cf.warmup), format='png', dpi=800)
#    plt.show()

if cf.dataset == 'PPG_Dalia':
    # retrain and cross-validate
    result = rgkf.RandomGroupKFold_split(groups, 4, cf.a)
    for train_index, test_val_index in result:
        X_train, X_val_test = X[train_index], X[test_val_index]
        y_train, y_val_test = y[train_index], y[test_val_index]
        activity_train, activity_val_test = activity[train_index], activity[
            test_val_index]

        logo = LeaveOneGroupOut()
        logo.get_n_splits(
            groups=groups[test_val_index])  # 'groups' is always required
        for validate_index, test_index in logo.split(X_val_test, y_val_test,
                                                     groups[test_val_index]):
            X_validate, X_test = X_val_test[validate_index], X_val_test[
                test_index]
            y_validate, y_test = y_val_test[validate_index], y_val_test[
                test_index]
            activity_validate, activity_test = activity_val_test[
                validate_index], activity_val_test[test_index]
            groups_val = groups[test_val_index]
            k = groups_val[test_index][0]

            # init
            try:
                del model
            except NameError:
                pass
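
# RandomGroupKFold_split above is a project-specific helper whose implementation is
# not shown here. A minimal sketch of one way such a "random grouped k-fold" could be
# written (an assumption, not the project's actual code): shuffle the unique group
# labels with a seed, partition them into n_splits chunks, and hold each chunk out in
# turn so that no subject appears in both the training and held-out indices.
import numpy as np

def random_group_kfold_split_sketch(groups, n_splits, seed):
    """Yield (train_idx, heldout_idx) pairs with whole groups held out per fold."""
    groups = np.asarray(groups)
    rng = np.random.RandomState(seed)
    shuffled_groups = rng.permutation(np.unique(groups))
    for held_out in np.array_split(shuffled_groups, n_splits):
        test_mask = np.isin(groups, held_out)
        yield np.where(~test_mask)[0], np.where(test_mask)[0]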
예제 #28
0
def perform_svm(Xn, yn, nSess=1, kernelType='linear'):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups = groups)

    total_samples = Xn.shape[0]
    n_young_samples = int(total_samples/2)
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    scores = np.zeros(n_folds)
    decifunc_gri = np.zeros((n_folds, 2))
    cgood = np.zeros(n_folds)
    ggood = np.zeros(n_folds)
    folds_iter = 0

    svm = SVC(kernel = kernelType, class_weight = 'balanced',
              decision_function_shape = 'ovo', probability = True)
    print('\nClassify using SVM: (%s)' % kernelType)
    print(" Performing leave one subject out cross fold with %d outer_folds"
          " and %d inner_folds" % (n_folds, n_folds-1))
    
    # Even while training (tuning the hyperparameters of) the classifier,
    # one more subject's data is left out for each training iteration,
    # so two (outer and inner) leave-one-group-out loops are run.
    folds_iter = 0
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        # X_t_test and y_test are used for calculating classifier
        # accuracy for this iteration
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        nc = X_t_train.shape[1]
        X_t_std = np.std(X_t_train)
        gamma = 1 / (nc * X_t_std)
        a = svm.set_params(gamma = gamma)
        pgrid = { "C": [0.1, 1, 10, 1e2],
                "gamma": np.arange(0.01, 0.1, 0.01)
                }
        # Inner LOOCV fold to tune the hyper parameters of the classifier
        inner_fold = LeaveOneGroupOut()
        gridclf = GridSearchCV(estimator = svm, param_grid = pgrid, refit=True,
                               cv = inner_fold)
        g = gridclf.fit(X_t_train, y_train, groups = groups[train_index])
        cgood[folds_iter] = gridclf.best_params_.get('C')
        ggood[folds_iter] = gridclf.best_params_.get('gamma')
        scores[folds_iter] = gridclf.score(X_t_test, y_test)
        actual_[folds_iter] = y_test
        predict_[folds_iter] = gridclf.predict(X_t_test)
        decifunc_gri[folds_iter] = gridclf.decision_function(X_t_test)
        folds_iter += 1

    # Calculate the accuracy of the classifier
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(" Classification accuracy =", (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    '''
    print("Mean of scores:", np.mean(scores))
    scoremax_idx = np.argmax(scores)
    print("Max. of C(score max):", cgood[scoremax_idx])
    print("Max. of gamma(score max):", ggood[scoremax_idx])
    '''
    decifunc_gri = decifunc_gri.reshape(total_samples,)
    print(' roc_auc_score =', roc_auc_score(actual, decifunc_gri))
    tar = tarfile.open(tarfile_name, "r:gz")
    tar.extractall()
    tar.close()
    # build list of beta maps
    subj_flist = glob.glob("sub-{:02d}/beta*.nii.gz".format(current_subject))
    subj_flist.sort()
    beta_flist.extend(subj_flist)
    # build list of corresponding label and subject number
    y.extend(np.array(labels_df['label']))
    subj_vect.extend(current_subject * np.ones(len(subj_flist), dtype=int))

chance_level = 1. / len(np.unique(y))

# set up leave-one-subject-out cross-validation
loso = LeaveOneGroupOut()
n_splits = loso.get_n_splits(groups=subj_vect)

# read image data
print("Reading beta maps from all the subjects...")
fmri_nii_list = []
for beta_path in beta_flist:
    beta_nii = nb.load(beta_path)
    fmri_nii_list.append(beta_nii)

print("Concatenating the data from all the subjects...")
fmri_img = concat_imgs(fmri_nii_list)

# reading brain mask
mask_nii = nb.load("brain_mask.nii.gz")

# running searchlight decoding
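
# The snippet stops just before the searchlight itself. A sketch of how the
# leave-one-subject-out searchlight could be run with nilearn's SearchLight,
# assuming the objects built above (fmri_img, y, subj_vect, mask_nii, loso) and
# an assumed sphere radius of 6 mm:
from nilearn.decoding import SearchLight

searchlight = SearchLight(mask_img=mask_nii,
                          radius=6.0,          # sphere radius in mm (assumed value)
                          estimator='svc',
                          cv=loso,             # leave-one-subject-out splitter
                          scoring='accuracy',
                          n_jobs=-1,
                          verbose=1)
searchlight.fit(fmri_img, y, groups=subj_vect)  # groups are forwarded to loso.split
# searchlight.scores_ then holds the cross-validated accuracy for each voxel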
예제 #30
0
data = scipy.io.loadmat(filepath)

y = data["label"]
x = data["X"]

x = np.array([x[i] for i in range(len(y)) if y[i][0] == 3 or y[i][0] == 4])
y = np.array([y[i][0] for i in range(len(y)) if y[i][0] == 3 or y[i][0] == 4])

subjects = []
subject_N = 51
for i in range(subject_N):
    for j in range(80):
        subjects.append(i)

logo = LeaveOneGroupOut()
logo.get_n_splits(x, y, subjects)

parameters = [0.04, 0.2, 1, 5, 25, 125, 625, 3125]

accuracy_test = [[0] * subject_N for i in range(len(parameters))]

for j in tqdm.tqdm(range(len(parameters))):
    i = 0
    for train_index, test_index in tqdm.tqdm(logo.split(x, y, subjects),
                                             leave=False):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = SVC(kernel="rbf", C=parameters[j], gamma="scale")
        model.fit(x_train, y_train)

        pred_test = model.predict(x_test)
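        # Hypothetical continuation -- the original snippet is truncated here.
        # Presumably the held-out subject's accuracy is recorded so the mean
        # accuracy for each C value can be compared afterwards (accuracy_score
        # is assumed to have been imported from sklearn.metrics earlier):
        accuracy_test[j][i] = accuracy_score(y_test, pred_test)
        i += 1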