def getProbsThread(nthread, clf, data, label, allAuthors, modeldir, saveModel):
    crossval = LeaveOneGroupOut()
    crossval.get_n_splits(groups=label)
    prob_per_author = [[0] * len(allAuthors) for i in range(len(allAuthors))]
    scores = Parallel(n_jobs=nthread)(
        delayed(getProbsTrainTest)(clf, data, label, train, test, modeldir, saveModel)
        for train, test in crossval.split(data, label, groups=label))
    for train, test in crossval.split(data, label, groups=label):
        anAuthor = int(label[test[0]])
        train_data_label = label[train]
        trainAuthors = list(set(train_data_label))
        # test_data_label = label[test]
        nTestDoc = len(scores)  # len(test_data_label)
        for j in range(nTestDoc):
            for i in range(len(trainAuthors)):
                try:
                    prob_per_author[anAuthor][int(trainAuthors[i])] += scores[anAuthor - 1][j][i]
                except IndexError:
                    continue
        for i in range(len(trainAuthors)):
            prob_per_author[anAuthor][int(trainAuthors[i])] /= nTestDoc
    return prob_per_author
def basari_hesapla(giris, cikis, CustomerID):
    # Per-customer (leave-one-group-out) cross-validation
    logo = LeaveOneGroupOut()
    # Support vector classifier
    clf = SVC(C=1, gamma=0.2, kernel='rbf')
    # clf = RandomForestClassifier(criterion='entropy', n_estimators=60)
    toplamBasari = 0
    toplamFSkor = 0
    for train_index, test_index in logo.split(giris, cikis, CustomerID):
        # Split into training and test data
        X_train, X_test = giris[train_index, :], giris[test_index, :]
        y_train, y_test = cikis.iloc[train_index], cikis.iloc[test_index]
        # Train the model
        clf.fit(X_train, y_train)
        # Ask the model for predictions
        pred_y = clf.predict(X_test)
        # Accumulate the scores of the predictions
        toplamBasari += accuracy_score(y_test, pred_y)
        toplamFSkor += f1_score(y_test, pred_y)
    # Mean accuracy = total accuracy / number of splits
    n_splits = logo.get_n_splits(giris, cikis, CustomerID)
    return toplamBasari / n_splits, toplamFSkor / n_splits
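# A minimal usage sketch for basari_hesapla on hypothetical synthetic data (the
# data and sizes here are illustrative assumptions, not from the original code).
# Each customer forms one group, so every fold holds out all rows of one CustomerID.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
giris = rng.rand(60, 4)                    # features
cikis = pd.Series(rng.randint(0, 2, 60))   # binary labels
CustomerID = np.repeat(np.arange(6), 10)   # 6 customers, 10 rows each

ort_basari, ort_fskor = basari_hesapla(giris, cikis, CustomerID)
print('mean accuracy: %.3f, mean F1: %.3f' % (ort_basari, ort_fskor))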
def logistic_logo(features, grades, groups, standard=False, seed=42, use_intercept=False):
    """Calculates logistic regression with leave-one-group-out split and L2 regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    groups : ndarray
        Patient groups. Used in the leave-one-group-out split.
    standard : bool
        Choice whether to center features by the mean of the training split.
        Defaults to False, since whitened PCA is assumed to be centered.
    seed : int
        Random seed used in the model.
    use_intercept : bool
        Choice whether to use an intercept term in the model. If the model does not
        provide very powerful predictions, it is better to center them by the intercept.

    Returns
    -------
    Array of model predictions, model coefficients and model intercept term.
    """
    # Lists
    predictions, coefs, intercepts = [], [], []

    # Leave-one-group-out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    for train_idx, test_idx in logo.split(features, grades, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Logistic regression
        model = LogisticRegression(solver='newton-cg', max_iter=1000, random_state=seed,
                                   fit_intercept=use_intercept)
        model.fit(x_train, y_train)

        # Predicted score
        p = model.predict_proba(x_test)
        predictions.extend(p[:, 1])  # Add the positive-class probabilities to the list

        # Save weights
        coefs.append(model.coef_)
        intercepts.append(model.intercept_)

    # Average coefficients
    coefs = np.mean(np.array(coefs), axis=0).squeeze()
    intercepts = np.mean(np.array(intercepts), axis=0).squeeze()
    return np.array(predictions), coefs, intercepts
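# A hedged sketch of calling logistic_logo (synthetic data; these shapes and the
# alternating-label construction are illustrative assumptions). Grades must be
# binary for LogisticRegression, and groups mark the patient for each sample, so
# each fold here holds out one of the 10 patients.
import numpy as np

rng = np.random.RandomState(42)
features = rng.randn(40, 5)
grades = np.tile([0, 1], 20)          # guarantees both classes in every training fold
groups = np.repeat(np.arange(10), 4)  # 10 patients, 4 samples each

preds, coefs, intercept = logistic_logo(features, grades, groups)
print(preds.shape, coefs.shape)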
def preProcessTrainVal(features, labels, groups, K_FOLD=2):
    # Split the data into a training set and a validation set
    from sklearn.model_selection import LeaveOneGroupOut
    logo = LeaveOneGroupOut()
    print(logo.get_n_splits(features, labels, groups))
class DKULeaveOneGroupOut(object):
    def __init__(self, column_name):
        self.column_name = column_name
        self.splitter = LeaveOneGroupOut()

    def set_column_labels(self, column_labels):
        self.column_labels = column_labels

    def get_n_splits(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError:
            raise Exception("Column %s not found among %s" % (self.column_name, self.column_labels))
        groups_array = X[:, column_idx]
        ret = self.splitter.get_n_splits(X, y, groups_array)
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError:
            raise Exception("Column %s not found among %s" % (self.column_name, self.column_labels))
        groups_array = X[:, column_idx]
        return self.splitter.split(X, y, groups_array)
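# Hypothetical usage sketch for the wrapper above: the group column lives inside
# X itself, and the wrapper looks it up by name via the column labels set on the
# object (the 'site'/'value' names and data here are illustrative assumptions).
import numpy as np

X = np.array([[0.1, 0], [0.2, 0], [0.3, 1], [0.4, 1], [0.5, 2], [0.6, 2]])
y = np.array([0, 1, 0, 1, 0, 1])

splitter = DKULeaveOneGroupOut('site')
splitter.set_column_labels(['value', 'site'])
for train_idx, test_idx in splitter.split(X, y):
    print(train_idx, test_idx)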
def perform_kNearestNeighbours(Xn, yn, nSess=1):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)
    total_samples = Xn.shape[0]
    n_young_samples = int(total_samples / 2)
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    decifunc = np.zeros((n_folds, 2, 2))
    ylabel = np.zeros((n_folds, 2, 2))
    ngood = np.zeros(n_folds)
    folds_iter = 0
    print("\nClassify using K-nearest neighbours method:")
    print(" Performing leave one subject out cross fold with %d outer_folds"
          " and %d inner_folds" % (n_folds, n_folds - 1))
    # For each iteration the sessions of one subject are left out, the
    # classifier is trained with the sessions of the other subjects and
    # tested against the data of the left-out subject.
    yn_toUse = label_binarize(yn, classes=range(3))[:, :-1]
    kNeigh = KNeighborsClassifier(n_neighbors=1, weights='uniform', leaf_size=40)
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        # X_t_test and y_test are used for calculating classifier
        # accuracy for this iteration
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        pgrid = {
            "n_neighbors": np.arange(1, n_folds, 1),
            "leaf_size": [40, 50, 60],
            "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
            "weights": ["uniform", "distance"],
        }
        # Inner LOOCV fold to tune the hyperparameters of the classifier
        inner_fold = LeaveOneGroupOut()
        gridclf = GridSearchCV(estimator=kNeigh, param_grid=pgrid, refit=True, cv=inner_fold)
        g = gridclf.fit(X_t_train, y_train, groups=groups[train_index])
        ngood[folds_iter] = gridclf.best_params_.get('n_neighbors')
        actual_[folds_iter] = y_test
        predict_[folds_iter] = gridclf.predict(X_t_test)
        ylabel[folds_iter] = yn_toUse[test_index]
        decifunc[folds_iter] = gridclf.predict_proba(X_t_test)
        folds_iter += 1
    # Calculate the accuracy of the classifier
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(" Classification accuracy =", (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    ylabel = ylabel.reshape(total_samples, 2)
    decifunc = decifunc.reshape(total_samples, 2)
    print(' roc_auc_score =', roc_auc_score(ylabel, decifunc))
def get_val_splitter(self):
    if self.splitter == "predefined":
        return self.__get_predefined_splitter()
    elif self.splitter == "loso":
        # Return the splitter object itself; get_n_splits() would only return the
        # fold count, not something usable as a cv argument. Callers must still
        # pass groups=self.train_data['student_id'] when splitting.
        loso = LeaveOneGroupOut()
        loso.get_n_splits(groups=self.train_data['student_id'])
        return loso
    elif self.splitter == 'kfold':
        # KFold ignores groups entirely, so none are passed here.
        return KFold(5)
    else:
        return self.__get_predefined_splitter()
def checkForOutliers(Xin):
    n_samples = Xin.shape[0]
    yin = np.ones(n_samples)
    groups = np.zeros(n_samples)
    groups_iter = np.arange(0, len(groups), 2)
    for i in groups_iter:
        groups[i:i + 2] = (i / 2)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)
    outliers_fraction = 0.1
    rng = np.random.RandomState(42)
    # Run IsolationForest and LocalOutlierFactor classifiers
    classifiers = {
        "Isolation Forest": IsolationForest(max_samples=n_samples - 2,
                                            contamination=outliers_fraction,
                                            random_state=rng),
        "Local Outlier Factor": LocalOutlierFactor(n_neighbors=35,
                                                   contamination=outliers_fraction)
    }
    folds_iter_if = 0
    outlier_list_if = np.zeros((n_folds, 5))
    folds_iter_lof = 0
    outlier_list_lof = np.zeros((n_folds, 5))
    # Perform leave-one-group-out cross validation and identify the outliers
    for train_index, outlier_index in logo_fold.split(Xin, yin, groups):
        X_train = Xin[train_index]
        y_train = yin[train_index]
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            if clf_name == "Local Outlier Factor":
                y_pred = clf.fit_predict(X_train)
                n_errors = (y_pred != y_train).sum()
                outliers_idx = np.argsort(y_pred)[0:n_errors]
                outlier_list_lof[folds_iter_lof] = outliers_idx
                folds_iter_lof += 1
            else:
                clf = clf.fit(X_train)
                y_pred = clf.predict(X_train)
                n_errors = (y_pred != y_train).sum()
                outliers_idx = np.argsort(y_pred)[0:n_errors]
                outlier_list_if[folds_iter_if] = outliers_idx
                folds_iter_if += 1
    print('\nLocal Outlier Factor:')
    print(outlier_list_lof)
    print('\nIsolation Forest:')
    print(outlier_list_if)
def get_cv(k_fold, groups, X, y):
    if groups is None:
        ### Personal CV
        skf = StratifiedKFold(n_splits=k_fold, shuffle=True)
        n_split = skf.get_n_splits(X, y)
        cv = skf.split(X, y)
    else:
        ### Group (leave one subject out)
        logo = LeaveOneGroupOut()
        n_split = logo.get_n_splits(X, y, groups)
        cv = logo.split(X, y, groups=groups)
    return cv, n_split
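# Sketch of consuming get_cv (hypothetical data, illustrative shapes): the returned
# generator can be passed straight to cross_val_score via its cv argument, since
# scikit-learn accepts any iterable of (train, test) index pairs there.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

X = np.random.RandomState(0).randn(30, 3)
y = np.tile([0, 1], 15)
groups = np.repeat(np.arange(5), 6)

cv, n_split = get_cv(k_fold=5, groups=groups, X=X, y=y)
scores = cross_val_score(LogisticRegression(), X, y, cv=cv)
print(n_split, scores.mean())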
def identify_top_features(Xn, yn, nSess=1):
    features_a = []
    tscores_a = []
    pval_a = []
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)
    print("\nIdentify the significant features:")
    # Perform LOOCV to identify the most significant features.
    # For each iteration the sessions of one subject are left out and the
    # most significant features are identified using the sessions of the
    # remaining subjects.
    print(" Performing leave one subject out cross fold (#folds = %d)" % n_folds)
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        X_train, X_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        x1_idx = np.argwhere(y_train == 0).flatten()
        x2_idx = np.argwhere(y_train == 1).flatten()
        x1 = X_train[x1_idx]
        x2 = X_train[x2_idx]
        top_features, tscore, pval = get_ttest_scores(x1, x2)
        features_a.append(top_features)
        tscores_a.append(tscore)
        pval_a.append(pval)
    # Pick the intersection of the features across all the iterations
    top_features = np.array(
        list(reduce(set.intersection, [set(item) for item in features_a])))
    nfeatures = top_features.shape[0]
    top_features_tscores = np.zeros(nfeatures)
    top_features_pval = np.zeros(nfeatures)
    # Get the t-scores and p-values of the significant features
    idx = 0  # renamed from 'iter' to avoid shadowing the builtin
    for tf in top_features:
        for i, v in enumerate(features_a):
            if tf in v:
                i1 = np.where(v == tf)
                top_features_tscores[idx] = tscores_a[i][i1]
                top_features_pval[idx] = pval_a[i][i1]
                idx += 1
                break
    # Sort the features based on the t-test value
    sorted_idx = np.argsort(np.abs(top_features_tscores))[::-1]
    return (top_features[sorted_idx], top_features_tscores[sorted_idx],
            top_features_pval[sorted_idx])
class LeaveOneSubjectOut():
    def __init__(self, subjects_indexes):
        self.subjects_indexes = subjects_indexes
        self.splitter = LeaveOneGroupOut()

    def split(self, X=None, y=None, groups=None):
        # 'is None' rather than '== None': the latter is an elementwise
        # comparison for numpy arrays and breaks the truth test
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.get_n_splits(X, y, groups)
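# Hedged usage sketch (synthetic data, illustrative sizes): because the wrapper
# stores the subject indexes itself, it can be handed to cross_val_score without
# passing groups explicitly.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

X = np.random.RandomState(1).randn(24, 4)
y = np.tile([0, 1], 12)
subjects = np.repeat(np.arange(4), 6)  # 4 subjects, 6 samples each

cv = LeaveOneSubjectOut(subjects_indexes=subjects)
print(cross_val_score(SVC(), X, y, cv=cv))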
def classify_loso_model_selection(X, y, group, gs):
    """Classification using LOSO, with model selection also done using LOSO.

    Args:
        X (numpy matrix): the feature matrix, one row per data point
        y (numpy vector): the label vector, one row per data point
        group (numpy vector): the group vector (the participant id)
        gs (sklearn GridSearchCV): a grid-search object that will output the best model

    Returns:
        accuracies (list): the accuracy for each left-out participant
    """
    logo = LeaveOneGroupOut()

    accuracies = []
    f1s = []
    cms = []
    best_params = []

    num_folds = logo.get_n_splits(X, y, group)  # keep track of how many folds are left

    for train_index, test_index in logo.split(X, y, group):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        group_train, group_test = group[train_index], group[test_index]
        print(f"Number of folds left: {num_folds}")

        with joblib.parallel_backend('loky'):
            gs.fit(X_train, y_train, groups=group_train)

        y_hat = gs.predict(X_test)

        accuracy = accuracy_score(y_test, y_hat)
        f1 = f1_score(y_test, y_hat)
        cm = confusion_matrix(y_test, y_hat)

        accuracies.append(accuracy)
        f1s.append(f1)
        cms.append(cm)
        best_params.append(gs.best_params_)

        num_folds = num_folds - 1

    return accuracies, f1s, cms, best_params
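# A sketch of wiring up the gs argument (hypothetical data; assumes the function
# above with its numpy/sklearn/joblib imports is in scope). The inner GridSearchCV
# also splits by participant, which is why the function forwards
# groups=group_train to gs.fit.
import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
from sklearn.svm import SVC

X = np.random.RandomState(0).randn(40, 6)
y = np.tile([0, 1], 20)
group = np.repeat(np.arange(5), 8)  # 5 participants, 8 samples each

gs = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=LeaveOneGroupOut())
accuracies, f1s, cms, best_params = classify_loso_model_selection(X, y, group, gs)
print(np.mean(accuracies))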
class LeaveOneClusterOut():
    """Wrapper for the sklearn LeaveOneGroupOut splitter.

    Stores clusters as an attribute (rather than a fit param) as a workaround
    to enable LOCO-CV in the mlxtend SequentialFeatureSelector.

    Args:
        clusters: list of cluster labels for observations (n-vector)
    """

    def __init__(self, clusters):
        self.logo = LeaveOneGroupOut()
        self.clusters = clusters

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.logo.get_n_splits(groups=self.clusters)

    def split(self, X, y=None, groups=None):
        return self.logo.split(X, y, groups=self.clusters)
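# Usage sketch (hypothetical data): the clusters are fixed at construction time,
# so the object can be passed anywhere a cv splitter is expected, e.g.
# cross_val_score, without threading a groups argument through the caller.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

X = np.random.RandomState(2).randn(30, 4)
y = np.tile([0, 1], 15)
clusters = np.repeat(np.arange(5), 6)

loco = LeaveOneClusterOut(clusters)
print(cross_val_score(LogisticRegression(), X, y, cv=loco))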
def perform_elm(Xn, yn, nSess=1, kernelType='linear'):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)
    total_samples = Xn.shape[0]
    n_young_samples = int(total_samples / 2)
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    decifunc_gri = np.zeros((n_folds, 2))
    folds_iter = 0
    print('\nClassify using ELM: (%s)' % kernelType)
    print(" Performing leave one subject out cross fold with %d outer_folds"
          " and %d inner_folds" % (n_folds, n_folds - 1))
    # For each iteration the sessions of one subject are left out, the
    # classifier is trained with the sessions of the other subjects and
    # tested against the data of the left-out subject.
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        inner_fold = LeaveOneGroupOut()
        pgrid = {
            "n_hidden": np.arange(10, 300, 10),
            "rbf_width": np.arange(0.1, 0.5, 0.05)
        }
        elmc_ = ELMClassifier(n_hidden=10, random_state=42, rbf_width=0.1,
                              activation_func=kernelType, binarizer=LabelBinarizer(0, 1))
        gridclf = GridSearchCV(estimator=elmc_, param_grid=pgrid, refit=True, cv=inner_fold)
        g = gridclf.fit(X_t_train, y_train, groups=groups[train_index])
        actual_[folds_iter] = y_test
        predict_[folds_iter] = gridclf.predict(X_t_test)
        decifunc_gri[folds_iter] = gridclf.decision_function(X_t_test).reshape(2,)
        folds_iter += 1
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(" Classification accuracy =", (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    decifunc_gri = decifunc_gri.reshape(total_samples,)
    print(' roc_auc_score =', roc_auc_score(actual, decifunc_gri))
def perform_leastSquareLinearClassifier(Xn, yn, nSess=1):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)
    total_samples = Xn.shape[0]
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    decifunc_gri = np.zeros((n_folds, 2))
    folds_iter = 0
    print("\nClassify using Linear Classifier:")
    print(" Performing leave one subject out cross fold with %d outer_folds" % (n_folds))
    # Note: the 'normalize' parameter was removed in scikit-learn 1.2; there a
    # StandardScaler pipeline is needed instead.
    linearReg = linear_model.LinearRegression(normalize=True)
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        # X_t_test and y_test are used for calculating classifier
        # accuracy for this iteration
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        linearReg.fit(X_t_train, y_train)
        pred_ = linearReg.predict(X_t_test)
        predict_[folds_iter] = pred_[:] > 0
        # For LinearRegression the decision values are simply the predictions;
        # the original called the private _decision_function, which newer
        # scikit-learn versions no longer expose.
        decifunc_gri[folds_iter] = linearReg.predict(X_t_test)
        actual_[folds_iter] = y_test
        folds_iter += 1
    # Calculate the accuracy of the classifier
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(' Classification accuracy =', (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    decifunc_gri = decifunc_gri.reshape(total_samples,)
    print(' roc_auc_score =', roc_auc_score(actual, decifunc_gri))
def findParametersAndEvaluate(self, data, strategy, label_name, group=None, dataset=None, cv=5):
    self.strategy = strategy
    self.results = {}
    print('-------------------------------')
    print(' STEP : Finding Parameters & Evaluate Models')
    print('-------------------------------')
    self.label_name_check(label_name)
    # print(self.labelset.columns)

    # Validate the strategy up front. (Previously the 'unsupported' branch was
    # chained onto the last if-block, so valid strategies such as
    # 'train_test_split' wrongly fell through to it and returned None.)
    supported = ('train_test_split', 'cross_val', 'leave_one_group_out',
                 'leave_one_dataset_out', 'sorted_stratified', 'all')
    if strategy not in supported:
        print('Unsupported evaluation strategy')
        return None

    # store performance data for each strategy
    if strategy == 'train_test_split' or strategy == 'all':
        self.train_test = dict()
        for model in self.models.keys():
            self.train_test[model] = None
        print('===> Evaluation strategy: Train and Test Split')
        X_train, X_test, y_train, y_test = train_test_split(
            data, self.label_set[label_name], train_size=.7, random_state=self.seed)
        print('===> Parameters find -> Start')
        for model in self.models.keys():
            if model == 'vot':
                continue
            if not self.configured:
                gd = GridSearchCV(self.models[model], self.params[model], cv=cv,
                                  scoring='neg_root_mean_squared_error')
                gd.fit(X_train, y_train)
                print(' Parameters for ', model, ': ', gd.best_params_)
                self.models[model] = gd.best_estimator_
        print('===> Parameters find -> End')
        test_performances = dict()
        print('===> Test data performance [RMSE]')
        for model in self.models.keys():
            self.models[model].fit(X_train, y_train)
            test_performances[model] = mean_squared_error(
                y_test, self.models[model].predict(X_test), squared=False)
            # print(' Model[', model, ']:', test_performances[model])
            self.train_test[model] = test_performances[model]
        print(self.train_test)
        self.results['train_test'] = self.train_test

    if strategy == 'cross_val' or strategy == 'all':
        self.cross_val = dict()
        cross_val = dict()
        for model in self.models.keys():
            self.cross_val[model] = None
        print('==============================================')
        print('Evaluation strategy: Cross Validation')
        print('==============================================')
        for model in self.models.keys():
            if model != 'vot' and not self.configured:
                print(' ==> Finding params for ', model)
                gd = GridSearchCV(self.models[model], self.params[model], cv=10,
                                  scoring='neg_root_mean_squared_error')
                gd.fit(data, self.label_set[label_name])
                print(' Parameters: ', gd.best_params_)
                self.models[model] = gd.best_estimator_
            cross_val[model] = cross_val_score(
                self.models[model], data, self.label_set[label_name],
                scoring='neg_root_mean_squared_error', cv=cv)
            # print(' Score[', model, ']:', cross_val[model])
            cross_val_mean = -1 * statistics.mean(cross_val[model])
            cross_val_var = statistics.variance(cross_val[model])
            self.cross_val[model] = [cross_val_mean, cross_val_var]
        self.results['cross_val'] = self.cross_val

    if strategy == 'leave_one_group_out' or strategy == 'all':
        self.leave_group = dict()
        for model in self.models.keys():
            self.leave_group[model] = None
        print('==============================================')
        print('Evaluation strategy: Leave one group out')
        print('==============================================')
        logo = LeaveOneGroupOut()
        n_splits = logo.get_n_splits(groups=group)
        error = dict()
        for model in self.models.keys():
            error[model] = [None] * n_splits
        k = 0
        for train_index, test_index in logo.split(data, self.label_set[label_name], group):
            # print(test_index)
            X_train, y_train = data.iloc[train_index], self.label_set[label_name][train_index]
            X_test, y_test = data.iloc[test_index], self.label_set[label_name][test_index]
            for model in self.models.keys():
                if model != 'vot' and not self.configured:
                    print(' ==> Finding params for ', model)
                    gd = GridSearchCV(self.models[model], self.params[model], cv=10,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(X_train, y_train)
                    print(' Parameters: ', gd.best_params_)
                    self.models[model] = gd.best_estimator_
                self.models[model].fit(X_train, y_train)
                error[model][k] = mean_squared_error(
                    y_test, self.models[model].predict(X_test), squared=False)
                # print(' Model[', model, ']:', error[model])
            k = k + 1
        for model in self.models.keys():
            err_mean = statistics.mean(error[model])
            err_var = statistics.variance(error[model])
            self.leave_group[model] = [err_mean, err_var]
        self.results['leave_group'] = self.leave_group

    if strategy == 'leave_one_dataset_out' or strategy == 'all':
        self.leave_dataset = dict()
        for model in self.models.keys():
            self.leave_dataset[model] = None
        print('==============================================')
        print('Evaluation strategy: Leave one dataset out')
        print('==============================================')
        logo = LeaveOneGroupOut()
        n_splits = logo.get_n_splits(groups=dataset)
        error = dict()
        for model in self.models.keys():
            error[model] = [None] * n_splits
        k = 0
        for train_index, test_index in logo.split(data, self.label_set[label_name], dataset):
            X_train, y_train = data.iloc[train_index], self.label_set[label_name][train_index]
            X_test, y_test = data.iloc[test_index], self.label_set[label_name][test_index]
            for model in self.models.keys():
                if model != 'vot' and not self.configured:
                    print(' ==> Finding params for ', model)
                    gd = GridSearchCV(self.models[model], self.params[model], cv=10,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(X_train, y_train)
                    # print(' Parameters: ', gd.best_params_)
                    self.models[model] = gd.best_estimator_
                self.models[model].fit(X_train, y_train)
                error[model][k] = mean_squared_error(
                    y_test, self.models[model].predict(X_test), squared=False)
                # print(' Model[', model, ']:', error[model])
            k = k + 1
        for model in self.models.keys():
            err_mean = statistics.mean(error[model])
            err_var = statistics.variance(error[model])
            self.leave_dataset[model] = [err_mean, err_var]
        self.results['leave_dataset'] = self.leave_dataset

    if strategy == 'sorted_stratified' or strategy == 'all':
        self.stratified = dict()
        for model in self.models.keys():
            self.stratified[model] = None
        # idea from https://scottclowe.com/2016-03-19-stratified-regression-partitions/
        print('==============================================')
        print('Evaluation strategy: Sorted Stratification')
        print('==============================================')
        label_df = pd.DataFrame(self.label_set)
        indices = label_df.sort_values(by=[label_name]).index.tolist()
        splits = dict()
        error = dict()
        for model in self.models.keys():
            error[model] = [None] * cv
        for i in range(cv):
            splits[i] = list()
        for i in range(len(indices)):
            if i % cv == 0:
                pick = random.sample(range(cv), cv)
            cur_pick = pick.pop()
            splits[cur_pick].append(indices[i])
        for i in range(cv):
            test_index = splits[i]
            train_index = []
            for j in range(cv):
                if j != i:
                    train_index = train_index + splits[j]
            ##########################################
            # Train models on the sorted stratified split
            X_train, y_train = data.iloc[train_index], self.label_set[label_name][train_index]
            X_test, y_test = data.iloc[test_index], self.label_set[label_name][test_index]
            for model in self.models.keys():
                if model != 'vot' and not self.configured:
                    print(' ==> Finding params for ', model)
                    gd = GridSearchCV(self.models[model], self.params[model], cv=10,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(X_train, y_train)
                    print(' Parameters: ', gd.best_params_)
                    self.models[model] = gd.best_estimator_
                self.models[model].fit(X_train, y_train)
                error[model][i] = mean_squared_error(
                    y_test, self.models[model].predict(X_test), squared=False)
                # print(' Model[', model, ']:', error[model])
        for model in self.models.keys():
            err_mean = statistics.mean(error[model])
            err_var = statistics.variance(error[model])
            self.stratified[model] = [err_mean, err_var]
        ##########################################
        self.results['stratified'] = self.stratified

    return self.results

# Preparing dataframe with results for report generation
lr = lm.LinearRegression()

##### CbS + LOGO #####
Sex_ctrl_30 = pd.get_dummies(sex_ctrl_30)
assert Sex_ctrl_30.shape == (313, 2)

Residuals_ctrl_30_bySite = np.array(
    [X_ctrl_30_bySite[:, j] - lr.fit(Sex_ctrl_30, X_ctrl_30_bySite[:, j]).predict(Sex_ctrl_30)
     for j in range(X_ctrl_30_bySite.shape[1])]).T
assert Residuals_ctrl_30_bySite.shape == (313, 162)

X = Residuals_ctrl_30_bySite
y = age_ctrl_30
groups = site_ctrl_30

logo = LeaveOneGroupOut()
assert logo.get_n_splits(X, y, groups) == 10

param_grid = {'alpha': 10. ** np.arange(-5, 5)}
model = GridSearchCV(lm.Ridge(max_iter=10000, tol=0.0001, random_state=42), param_grid, cv=10)
scaler = StandardScaler()

y_test_pred = np.zeros(len(y))
for train, test in logo.split(X, y, groups):
    X_train, X_test, y_train, y_test = X[train, :], X[test, :], y[train], y[test]
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    model.fit(X_train_s, y_train)
    y_test_pred[test] = model.predict(X_test_s)

print("Test r2:%.2f" % metrics.r2_score(y, y_test_pred))  # Test r2:-26.94
print(model.best_params_)  # {'alpha': 100.0}
def regress_logo(features, grades, groups, method='ridge', standard=False, use_intercept=True,
                 convert='none', alpha=1.0):
    """Calculates linear regression with leave-one-group-out split and L2 regularization.

    Parameters
    ----------
    features : ndarray
        Input features used in creating regression model.
    grades : ndarray
        Ground truth for the model.
    groups : ndarray
        Patient groups. Used in the leave-one-group-out split.
    method : str
        Regression model used. Defaults to ridge regression, but lasso is also
        possible. Ridge seems to perform better.
    standard : bool
        Choice whether to center features by the mean of the training split.
        Defaults to False, since whitened PCA is assumed to be centered.
    use_intercept : bool
        Choice whether to use an intercept term in the model. If the model does not
        provide very powerful predictions, it is better to center them by the intercept.
    convert : str
        Possibility to predict exp or log of the ground truth. Defaults to no conversion.
    alpha : float
        Regularization coefficient (inverse of C).

    Returns
    -------
    Array of model predictions, model coefficients and model intercept term.
    """
    # Convert grades
    if convert == 'exp':
        grades = np.exp(grades)
    elif convert == 'log':
        grades = np.log(grades)

    # Lists
    predictions, coefs, intercepts = [], [], []

    # Leave-one-group-out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    for train_idx, test_idx in logo.split(features, grades, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Linear regression (note: 'normalize' was removed in scikit-learn 1.2)
        if method == 'ridge':
            model = Ridge(alpha=alpha, normalize=True, random_state=42, fit_intercept=use_intercept)
        elif method == 'lasso':
            model = Lasso(alpha=alpha, normalize=True, random_state=42, fit_intercept=use_intercept)
        else:
            model = LinearRegression(normalize=True, fit_intercept=use_intercept, n_jobs=-1)
        model.fit(x_train, y_train)

        # Predicted score
        predictions.append(model.predict(x_test))

        # Save weights
        coefs.append(model.coef_)
        intercepts.append(model.intercept_)

    # Flatten the per-group predictions
    predictions_flat = []
    for group in predictions:
        for p in group:
            predictions_flat.append(p)

    # Convert grades back
    if convert == 'exp':
        predictions = np.log(np.array(predictions_flat))
    elif convert == 'log':
        predictions = np.exp(np.array(predictions_flat))
    else:
        predictions = np.array(predictions_flat)

    return predictions, np.mean(np.array(coefs), axis=0), np.mean(np.array(intercepts), axis=0)
def pca_regress_pipeline_log(features, grades, groups, n_components=0.9, solver='full',
                             whitening=True, standard=False, seed=42, mod_coefs=True,
                             alpha=0.1, grade_name='', savepath=None):
    feature_names = ['Center +', 'Center -',
                     'Large U-1', 'Large U-2', 'Large U-3', 'Large U-4', 'Large U-5',
                     'Large U-6', 'Large U-7', 'Large N-U',
                     'Small U-1', 'Small U-2', 'Small U-3', 'Small U-4', 'Small U-5',
                     'Small U-6', 'Small U-7', 'Small N-U',
                     'Radial U-0', 'Radial U-1', 'Radial U-2', 'Radial U-3', 'Radial U-4',
                     'Radial U-5', 'Radial U-6', 'Radial U-7', 'Radial U-8', 'Radial N-U']
    grades_log = grades

    # Fit PCA to full data
    pca = PCA(n_components=n_components, svd_solver=solver, whiten=whitening, random_state=seed)
    pca.fit(features)

    # Leave-one-group-out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    all_shap_values, all_shap_values_lin = [], []
    for train_idx, test_idx in logo.split(features, grades_log, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades_log[train_idx], grades_log[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Logistic regression (plus a linear ridge model on the same split)
        model = LogisticRegression(solver='newton-cg', max_iter=1000, random_state=seed,
                                   fit_intercept=False)
        model.fit(pca.transform(x_train), y_train > 1)
        model_lin = Ridge(alpha=alpha, normalize=True, random_state=seed, fit_intercept=True)
        model_lin.fit(pca.transform(x_train), y_train)

        # Predicted score (for logistic regression)
        p = model.predict_proba(pca.transform(x_test))
        p_lin = model_lin.predict(pca.transform(x_test))

        # Merge PCA into the linear model
        if mod_coefs:
            coef = (pca.components_.T / pca.singular_values_) @ model.coef_.T * np.sqrt(pca.n_samples_ - 1)
            coef_lin = (pca.components_.T / pca.singular_values_) @ model_lin.coef_.T * np.sqrt(pca.n_samples_ - 1)

            # Update models
            model.coef_ = coef.T
            model_lin.coef_ = coef_lin.T
            p2_lin = model_lin.predict(x_test)
            p2 = model.predict_proba(x_test)

            # Inference
            p_inf = (x_test @ coef).squeeze()
            p_inf = (1 + np.exp(-p_inf)) ** -1

            eps = 1.0e-10
            assert np.sum(np.abs(p - p2)) < eps, 'LOGReg results are not equal'
            assert np.sum(np.abs(p_inf - p[:, 1])) < eps, 'LOGReg results are not equal'
            assert np.sum(np.abs(p_lin - p2_lin)) < eps, 'LINReg results are not equal'
        else:
            # Otherwise run PCA
            x_train = pca.transform(x_train)
            x_test = pca.transform(x_test)

        # Interpretability
        # Logistic regression
        explainer = shap.LinearExplainer(model, x_train, feature_dependence='correlation',
                                         nsamples=x_train.shape[0])
        shap_values = explainer.shap_values(x_test)
        # Linear regression
        explainer_lin = shap.LinearExplainer(model_lin, x_train, feature_dependence='correlation',
                                             nsamples=x_train.shape[0])
        shap_values_lin = explainer_lin.shap_values(x_test)

        # Append predictions
        all_shap_values.append(shap_values)
        all_shap_values_lin.append(shap_values_lin)

    # Combine shap values and plot the summary
    all_shap_values = np.vstack(all_shap_values)
    all_shap_values_lin = np.vstack(all_shap_values_lin)

    # Inverse PCA for the model without PCA merged into the coefficients
    if not mod_coefs:
        all_shap_values = pca.inverse_transform(all_shap_values)
        all_shap_values_lin = pca.inverse_transform(all_shap_values_lin)

    # Force plot
    # shap.force_plot(explainer.expected_value, all_shap_values, features)
    # plt.show()

    # Summary plots
    shap.summary_plot(all_shap_values, features, show=False, feature_names=feature_names)
    # plt.title(f'Logistic Regression ({grade_name})')
    if savepath is not None:
        plt.savefig(f'{savepath}{grade_name}_logistic_cov.png', transparent=False, bbox_inches='tight')
        plt.show()
    else:
        plt.show()
    shap.summary_plot(all_shap_values_lin, features, show=False, feature_names=feature_names)
    # plt.title(f'Linear Ridge Regression ({grade_name})')
    if savepath is not None:
        plt.savefig(f'{savepath}{grade_name}_linear_cov.png', transparent=False, bbox_inches='tight')
        plt.show()
    else:
        plt.show()
def rforest_logo(features, grades, groups, standard=False, seed=42, n_trees=50, tree_depth=None,
                 savepath=None, zone=''):
    """Calculates random forest classification with a leave-one-group-out split.

    Parameters
    ----------
    features : ndarray
        Input features used in creating the model.
    grades : ndarray
        Ground truth for the model.
    groups : ndarray
        Patient groups. Used in the leave-one-group-out split.
    standard : bool
        Choice whether to center features by the mean of the training split.
        Defaults to False, since whitened PCA is assumed to be centered.
    seed : int
        Random seed used in the model.
    n_trees : int
        Number of trees in the random forest.
    tree_depth : int
        Maximum depth of the individual tree.
    savepath : str
        Path to save the model.
    zone : str
        Zone that is graded.

    Returns
    -------
    Array of model predictions, averaged feature importances and (zero) intercept term.
    """
    # Lists
    predictions, coefs, intercepts, models = [], [], [], []

    # Leave-one-group-out split
    logo = LeaveOneGroupOut()
    logo.get_n_splits(groups=groups)  # 'groups' is always required

    for train_idx, test_idx in logo.split(features, grades, groups):
        # Indices
        x_train, x_test = features[train_idx], features[test_idx]
        y_train, y_test = grades[train_idx], grades[test_idx]

        # Center with the training-split mean
        if standard:
            x_test -= x_train.mean(0)
            x_train -= x_train.mean(0)

        # Random forest
        model = RandomForestClassifier(n_estimators=n_trees, random_state=seed,
                                       max_depth=tree_depth, n_jobs=12)
        model.fit(x_train, y_train)

        # Predicted score
        p = model.predict_proba(x_test)
        predictions.append(p)

        # Save weights
        coefs.append(model.feature_importances_)  # Importance of PCA components is returned
        intercepts.append(0.0)  # No intercept in RF
        models.append(model)

    predictions_flat = []
    for group in predictions:
        for p in group:
            predictions_flat.append(p)

    if savepath is not None:
        Path(savepath + '/models/').mkdir(exist_ok=True)
        filename = savepath + '/models/' + strftime(f'RF_model_{zone}_%Y_%m_%d_%H_%M_%S.sav')
        dump(models, filename)

    return (np.array(predictions_flat)[:, 1],
            np.mean(np.array(coefs), axis=0).squeeze(),
            np.mean(np.array(intercepts), axis=0).squeeze())
saver = tf.train.Saver()

## This is the same as the Deep Neural Network session part
n_epochs = 20
batch_size = 30

# Leave one out cross validation - group making
groups = []
for i in range(1, 13):
    group = [i] * 120
    for i in group:
        groups.append(i)

logo = LeaveOneGroupOut()
logo.get_n_splits(X_data, Y_data, groups)

looop = []
times = 1
for train_index, test_index in logo.split(X_data, Y_data, groups):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_data[train_index], X_data[test_index]
    y_train, y_test = Y_data[train_index], Y_data[test_index]
    with tf.Session() as sess:
        init.run()
        accuracy_test = []
        for epoch in range(n_epochs):
            i = 0
            for batch in range(len(X_train) // batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]
def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')
    unique_instance_str = str(uuid.uuid1())

    X_file = req.files['X_file']
    y_file = req.files['y_file']
    Wk_file = req.files['Wk_file']

    tempFilePath = tempfile.gettempdir()
    staging_dir = tempFilePath + '/staging/' + unique_instance_str
    if not os.path.exists(staging_dir):
        logging.info('Creating ' + staging_dir)
        os.makedirs(staging_dir)

    X_file.save(staging_dir + '/X.parquet')
    y_file.save(staging_dir + '/y.parquet')
    Wk_file.save(staging_dir + '/Wk.parquet')

    X = pd.read_parquet(staging_dir + '/X.parquet')
    X = X.reindex(sorted(X.columns), axis=1)
    y = pd.read_parquet(staging_dir + '/y.parquet').iloc[:, 0]
    Week = pd.read_parquet(staging_dir + '/Wk.parquet').iloc[:, 0]

    os.remove(staging_dir + '/X.parquet')
    os.remove(staging_dir + '/y.parquet')
    os.remove(staging_dir + '/Wk.parquet')
    os.rmdir(staging_dir)

    logo = LeaveOneGroupOut()
    n_splits = logo.get_n_splits(groups=Week)

    r2_total = 0
    mae_total = 0
    rmse_total = 0

    logging.info('Beginning CV.')
    c = 0
    target_splits = 3
    n_actual_splits = 0
    nth_split = 0
    for train_index, test_index in logo.split(X, y, Week):
        # Randomly subsample the folds so that roughly target_splits of them
        # are actually evaluated
        cv_prob = max(0, (target_splits - n_actual_splits) / (n_splits - nth_split))
        nth_split += 1
        if np.random.rand() > cv_prob:
            continue
        logging.info('Split {}.'.format(c))
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        y_train = np.asarray(y_train).ravel()
        y_test = np.asarray(y_test).ravel()
        test_model = XGBRegressor()
        test_model.fit(X_train, y_train)
        y_pred = test_model.predict(X_test)
        r2_total += r2_score(y_true=y_test, y_pred=y_pred)
        mae_total += mean_absolute_error(y_true=y_test, y_pred=y_pred)
        rmse_total += np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
        n_actual_splits += 1
        c += 1

    avg_sales = y.mean()
    r2 = r2_total / n_actual_splits
    mae = mae_total / n_actual_splits
    mpe = mae / avg_sales
    rmse = rmse_total / n_actual_splits

    del X
    del y
    del Week
    gc.collect()

    outp = {
        'avg_sales': float(avg_sales),
        'r2_score': float(r2),
        'mae_score': float(mae),
        'mpe_score': float(mpe),
        'rmse_score': float(rmse)
    }
    return func.HttpResponse(
        json.dumps(outp),
        mimetype='application/json',
    )
## store all in pickle dumps
pickle.dump(segments, open("segments_90_acc.p", "wb"))
pickle.dump(labels, open("labels_90_acc.p", "wb"))
pickle.dump(subjects, open("subjects_90_acc.p", "wb"))

# segments = pickle.load(open('segments_90_acc.p', 'rb'))
# labels = pickle.load(open('labels_90_acc.p', 'rb'))
# subjects = pickle.load(open('subjects_90_acc.p', 'rb'))

numOfRows = segments.shape[1]
numOfColumns = segments.shape[2]

groups = np.array(subjects)
logo = LeaveOneGroupOut()
logo.get_n_splits(segments, labels, groups)

# reshaping the data for network input
reshapedSegments = segments.reshape(segments.shape[0], numOfRows, numOfColumns, 1)
# categorically defining the classes of the activities
labels = np.asarray(pd.get_dummies(labels), dtype=np.int8)

# ==================================================================================
# splitting in training and testing data
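# A hedged sketch of the leave-one-subject-out split that the comment above
# suggests comes next. It reuses logo/reshapedSegments/labels/groups from above;
# the network training step itself is omitted and would go inside the loop.
for train_index, test_index in logo.split(reshapedSegments, labels, groups):
    X_train, X_test = reshapedSegments[train_index], reshapedSegments[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    print('held-out subject:', groups[test_index][0],
          'train/test sizes:', X_train.shape[0], X_test.shape[0])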
def main(args, pipe=False):
    '''
    Checks passed arguments and performs requested actions.
    '''
    if not pipe:
        parser = argparse.ArgumentParser(
            description='Classify call segments as positive or negative.')
        parser.add_argument('-f', '--features', dest='feat_loc', required=True,
                            help='Path to CSV feature file.')
        parser.add_argument('-o', '--out', dest='out_loc', required=True,
                            help='Path to where classification summary should be saved.')
        parser.add_argument('--hmm', dest='hmm_flag', action='store_true',
                            help='Classify with a Hidden Markov Model.')
        parser.add_argument('--rf', dest='rf_flag', action='store_true',
                            help='Classify with a random forest.')
        parser.add_argument('--n_components', dest='n_components',
                            help='Number of components for the HMM.')
        parser.add_argument('--n_mix', dest='n_mix',
                            help='Number of Gaussian mixtures for the HMM.')
        parser.add_argument('--n_estimators', dest='n_estimators',
                            help='Number of tree estimators for the random forest.')
        args = parser.parse_args()

    if args.hmm_flag or args.rf_flag:
        # store scores from all runs to calc stats
        hmm_chunk_scores = []
        hmm_overall_scores = []
        rf_chunk_scores = []
        rf_overall_scores = []

        # split data for leave-one-group(call)-out validation
        data, labels, ids = sep_data_labels(args.feat_loc)
        logo = LeaveOneGroupOut()
        curr_split = 1
        num_splits = logo.get_n_splits(data, labels, ids)

        # loop through all cross validation folds
        for train_index, test_index in logo.split(data, labels, ids):
            print('Split ' + str(curr_split) + ' out of ' + str(num_splits))
            data_train, data_test = data[train_index], data[test_index]
            labels_train, labels_test = labels[train_index], labels[test_index]

            # classify with the selected models
            if args.hmm_flag:
                if args.n_components:
                    n_components = int(args.n_components)
                else:
                    n_components = 2
                if args.n_mix:
                    n_mix = int(args.n_mix)
                else:
                    n_mix = 2
                hmm_model = HmmMorency(n_components=n_components, n_mix=n_mix)
                chunk_scores, call_score = train_and_test(
                    hmm_model, data_train, data_test, labels_train, labels_test)
                hmm_chunk_scores.append(chunk_scores)
                hmm_overall_scores.append(call_score)
            if args.rf_flag:
                if args.n_estimators:
                    n_estimators = int(args.n_estimators)
                else:
                    n_estimators = 100
                rf_model = RandomForestClassifier(n_estimators=n_estimators,
                                                  n_jobs=-1, random_state=10)
                chunk_scores, call_score = train_and_test(
                    rf_model, data_train, data_test, labels_train, labels_test)
                rf_chunk_scores.append(chunk_scores)
                rf_overall_scores.append(call_score)
            curr_split += 1

        # evaluate the scores for all models
        out_file = os.path.join(args.out_loc, 'results.txt')
        if args.hmm_flag:
            score_stats('hmm, mix: ' + str(n_mix) + ' states: ' + str(n_components),
                        hmm_chunk_scores, hmm_overall_scores, out_file)
        if args.rf_flag:
            score_stats('random forest, estimators: ' + str(n_estimators),
                        rf_chunk_scores, rf_overall_scores, out_file)
    else:
        sys.exit('Must choose at least one classification method. (--hmm, --rf)')
def learn(X: (dict, pd.DataFrame), y: (dict, pd.Series), data_folder: str, groups: list = None,
          test_split: float = None, name: str = None):
    '''
    This function trains either a classification or regression random forest model.

    It is able to handle either a singular pandas DataFrame or a dictionary of pandas
    DataFrames. If the input is a singular pandas DataFrame, the rows will be split into
    a training and testing dataset using test_split (0 - 1). If the input is a dictionary
    of pandas DataFrames, a leave-one-out method will be used to verify the model's accuracy.

    Inputs:
    X: a dictionary of pandas DataFrames or a singular pandas DataFrame
    y: a dictionary of pandas Series or a singular pandas Series
    data_folder: the location of where to save the output
    groups: a list of the trial names
        NOTE: this is only required if the X/y input is a dictionary
    test_split: the decimal percentage to split the training and testing datasets
        NOTE: this is only required if the X/y input is not a dictionary
    name: the name of the trial
        NOTE: this is only required if the X/y input is not a dictionary

    Alex Woodall
    Auckland Bioengineering Institute
    08/04/2020
    '''
    if 'force' in data_folder or 'time' in data_folder:
        mode = 'regression'
    elif 'binary' in data_folder:
        mode = 'classification'
    else:
        # Fail early rather than hitting a NameError on 'mode' further down
        raise ValueError("data_folder must contain 'force', 'time' or 'binary'")

    if type(X) is pd.DataFrame:
        # Learning using one trial (or a combination in a DataFrame rather than
        # a dictionary of DataFrames)
        if mode == 'classification':
            # Split into training and testing
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split)

            # Create classifier and train
            cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)
            cl.fit(X_train, y_train)

            # Predict on classifier and convert to a pandas Series, save output
            y_predict = cl.predict(X_test)
            y_predict = pd.Series(y_predict, index=X_test.index)
            y_predict.to_csv("{}y_predict.csv".format(data_folder), index=True, header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder), index=True, header=True)

            # Print score and confusion matrix
            score = roc_auc_score(y_test, y_predict)
            conf_mat = confusion_matrix(y_test, y_predict)
            print("Roc auc = {}\n".format(score))
            print(conf_mat)
        elif mode == 'regression':
            # Split into training and testing
            split_int = int(len(X) * (1 - test_split))
            X_train = X.head(split_int)
            y_train = y.head(split_int)
            X_test = X.tail(len(X) - split_int)
            y_test = y.tail(len(X) - split_int)

            # Create regressor and train
            rg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
            rg.fit(X_train, y_train)

            # Predict
            y_predict = rg.predict(X_test)

            ''' Filter force plate data at 60 Hz '''
            analog_frequency = 1000
            cut_off = 60  # Derie (2017), Robberechts et al (2019)
            order = 2  # Weyand (2017), Robberechts et al (2019)
            b_f, a_f = signal.butter(N=order, Wn=cut_off / (analog_frequency / 2), btype='low')
            new_F = signal.filtfilt(b_f, a_f, y_predict)

            ''' Rezero filtered forces '''
            threshold = 50  # 20 N
            filter_plate = rezero_filter(original_fz=new_F, threshold=threshold)
            y_predict = filter_plate * new_F

            # Convert output into a pandas Series and save
            y_predict = pd.Series(y_predict, index=X_test.index)
            y_predict.to_csv("{}y_predict.csv".format(data_folder), index=True, header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder), index=True, header=True)

            # Calculate R2 score and print
            score = r2_score(y_test, y_predict)
            print("R2 = {}\n".format(score))

            # Plot result
            plt.plot(y_test.tail(1000), 'k', label='True data')
            plt.plot(y_predict.tail(1000), 'r', label='Estimate data')
            plt.legend()
            plt.ylabel('Force (N)')
            plt.xlabel('Time (ms)')
            plt.title('Estimated data for {}'.format(name))

            # Save figure
            score = round(score, 4)
            plt.savefig('{}{}_{}.png'.format(data_folder, name, '_'.join(str(score).split('.'))))
            plt.show()
    elif type(X) is dict:
        # Create leave-one-group-out split
        group_num = np.arange(len(groups))
        logo = LeaveOneGroupOut()
        logo.get_n_splits(groups=group_num)

        if mode == 'classification':
            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for classification\n\n")
            f.close()

            roc = []
            # Train on n - 1 groups, test on 1. Repeat for all
            for train_index, test_index in logo.split(X=X, groups=group_num):
                cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))
                for index in train_index:
                    try:
                        X_train = X_train.append(X[groups[index]], ignore_index=True)
                        y_train = y_train.append(y[groups[index]], ignore_index=True)
                    except NameError:
                        X_train = X[groups[index]]
                        y_train = y[groups[index]]
                cl.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = cl.predict(X_test)
                y_estimate_test = pd.Series(y_estimate_test, index=X_test.index)

                roc.append(roc_auc_score(y_test, y_estimate_test))
                conf = confusion_matrix(y_test, y_estimate_test)
                np.savetxt("{}y_estimate_conf_{}.txt".format(data_folder, groups[test_index[0]]),
                           conf, delimiter='\t', fmt='%i')

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(groups[test_index[0]], round(roc[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]), index=True, header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open("{}{}_cl.pkl".format(data_folder, groups[test_index[0]]), "wb")
                pickle.dump(cl, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage roc auc score: {}".format(round(statistics.mean(roc), 4)))
            f.close()
        elif mode == 'regression':
            # Allow for a different number of estimators depending on the task
            if 'force' in data_folder:
                n_estimators = 10
            else:
                n_estimators = 10

            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for regression\n\n")
            f.close()

            r2 = []
            for train_index, test_index in logo.split(X=X, groups=group_num):
                rg = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))
                for index in train_index:
                    try:
                        X_train = X_train.append(X[groups[index]], ignore_index=True)
                        y_train = y_train.append(y[groups[index]], ignore_index=True)
                    except NameError:
                        X_train = X[groups[index]]
                        y_train = y[groups[index]]
                rg.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = rg.predict(X_test)

                # Round estimate to a whole number
                y_estimate_test = np.around(y_estimate_test)

                # Any negative number = -1
                y_estimate_test[y_estimate_test < 0] = -1
                y_estimate_test = pd.Series(y_estimate_test, index=X_test.index)

                r2.append(r2_score(y_test, y_estimate_test))

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(groups[test_index[0]], round(r2[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]), index=True, header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open("{}{}_rg.pkl".format(data_folder, groups[test_index[0]]), "wb")
                pickle.dump(rg, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage R^2 score: {}".format(round(statistics.mean(r2), 4)))
            f.close()
    else:
        print("X should be of type dict or pd.DataFrame")
        return

    return
def RF_classifier(X_data, Y_data, options=None):
    from sklearn.ensemble import RandomForestClassifier

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch = False
    GS_settings = None
    randomsearch = False
    RS_settings = None
    accuracy = False
    cv_type = 'logo'
    scoring = 'f1'
    if options is not None:
        if "RF_parameters" in options:
            params = options['RF_parameters']
        if "grid_search" in options:
            from sklearn.model_selection import GridSearchCV
            gridsearch = True
            GS_params = options['grid_search']['parameter_grid']
            if "settings" in options['grid_search']:
                GS_settings = options['grid_search']['settings']
        if "random_search" in options:
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            randomsearch = True
            RS_params, RS_Nmax = convert_param_dist(options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if "settings" in options['random_search']:
                RS_settings = options['random_search']['settings']
        if randomsearch and gridsearch:
            quit('********** Stopping! grid_search and random_search both set *********')
        if "accuracy" in options:
            accuracy = options['accuracy']
            if accuracy:
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import (precision_recall_curve, auc, f1_score,
                                             accuracy_score, balanced_accuracy_score,
                                             confusion_matrix)
                from cfd2ml.utilities import print_cm
        if "scoring" in options:
            scoring = options['scoring']
        if "cv_type" in options:
            cv_type = options['cv_type']

    ##############
    # Prepare data
    ##############
    if cv_type == 'logo':
        groups = X_data['group']
        X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header = Y_data.name
    nX = X_headers.size
    print('\nFeatures:')
    for i in range(0, nX):
        print('%d/%d: %s' % (i + 1, nX, X_headers[i]))
    print('\nTarget: ', Y_header)

    ########################
    # Prepare other settings
    ########################
    # Setting cross-validation type (either leave-one-group-out or 10-fold)
    if cv_type == 'logo':
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif cv_type == 'kfold':
        from sklearn.model_selection import StratifiedKFold
        print('\nUsing 10-fold cross validation')
        k_fold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
        cv = k_fold.split(X_data, Y_data)

    #########################
    # Training the classifier
    #########################
    # TODO - improve accuracy by using balanced or weighted random forest
    # (see https://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf)
    if gridsearch:
        # Finding optimal hyperparameters with GridSearchCV
        print('\n Performing GridSearchCV to find optimal hyperparameters for random forest classifier')
        clf = RandomForestClassifier(**params, random_state=42)
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        GS_clf = GridSearchCV(estimator=clf, param_grid=GS_params, cv=cv, scoring=scoring,
                              iid=False, verbose=2, **GS_settings)
        GS_clf.fit(X_data, Y_data)
        # Write out results to file
        scores_df = pd.DataFrame(GS_clf.cv_results_)  # .sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')
        # Pick out best results
        best_params = GS_clf.best_params_
        best_score = GS_clf.best_score_
        clf = GS_clf.best_estimator_  # (this clf has been fit to all of the X_data, Y_data)
        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)
    elif randomsearch:
        # Finding optimal hyperparameters with RandomizedSearchCV
        print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest classifier')
        clf = RandomForestClassifier(**params, random_state=42)
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        RS_clf = RandomizedSearchCV(estimator=clf, param_distributions=RS_params, cv=cv,
                                    scoring=scoring, iid=False, verbose=2,
                                    error_score=np.nan, **RS_settings)
        RS_clf.fit(X_data, Y_data)
        # Write out results to file
        scores_df = pd.DataFrame(RS_clf.cv_results_)  # .sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')
        # Pick out best results
        best_params = RS_clf.best_params_
        best_score = RS_clf.best_score_
        clf = RS_clf.best_estimator_  # (this clf has been fit to all of the X_data, Y_data)
        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)
    else:
        # Train RF classifier with hyperparameters given by user
        print('\nTraining random forest classifier with given hyperparameters')
        clf = RandomForestClassifier(**params)
        clf.fit(X_data, Y_data)

    # Cross validation accuracy metrics
    if accuracy:
        print('\nPerforming cross validation to determine train and test accuracy/error, and precision-recall curves')
        # TODO - capability to decide on probability threshold, and predict with chosen threshold
        # Get generator object depending on cv strategy
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        elif cv_type == 'kfold':
            cv = k_fold.split(X_data, Y_data)  # Need to regen "Generator" object

        fig1, ax1 = plt.subplots()

        # Init lists
        y_real = []
        y_proba = []
        train_f1 = []
        test_f1 = []
        train_A = []
        test_A = []
        train_BA = []
        test_BA = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train classifier (note: this rebinds and refits the same estimator
            # object on every fold)
            clf_cv = clf
            clf_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = clf_cv.predict(X_train)
            Y_pred_test = clf_cv.predict(X_test)

            # F1 scores
            f1score = f1_score(Y_test, Y_pred_test)
            train_f1.append(f1_score(Y_train, Y_pred_train))
            test_f1.append(f1score)

            # Accuracy scores
            Ascore = accuracy_score(Y_test, Y_pred_test)
            train_A.append(accuracy_score(Y_train, Y_pred_train))
            test_A.append(Ascore)

            # Balanced accuracy scores
            BAscore = balanced_accuracy_score(Y_test, Y_pred_test)
            train_BA.append(balanced_accuracy_score(Y_train, Y_pred_train))
            test_BA.append(BAscore)

            # Print validation scores (training scores are stored to print the mean
            # later, but not printed for each fold)
            if cv_type == 'logo':
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif cv_type == 'kfold':
                print('\nFold = ', i)
            print('-------------------')
            print('F1 score = %.2f %%' % (f1score * 100))
            print('Total error = %.2f %%' % ((1.0 - Ascore) * 100))
            print('Per-class error = %.2f %%' % ((1.0 - BAscore) * 100))

            # Print confusion matrix for this fold
            print('Confusion matrix:')
            confuse_mat = confusion_matrix(Y_test, Y_pred_test)
            print_cm(confuse_mat, ['Off', 'On'])

            # Prediction probability based on X_test (used for precision-recall curves)
            pred_proba = clf_cv.predict_proba(X_test)
            precision, recall, _ = precision_recall_curve(Y_test, pred_proba[:, 1])
            lab = 'Fold %d AUC=%.4f' % (i + 1, auc(recall, precision))
            ax1.step(recall, precision, label=lab)
            y_real.append(Y_test)
            y_proba.append(pred_proba[:, 1])

            i += 1

        # Calculate errors from accuracies
        train_TE = 1.0 - np.array(train_A)
        test_TE = 1.0 - np.array(test_A)
        train_CAE = 1.0 - np.array(train_BA)
        test_CAE = 1.0 - np.array(test_BA)

        # Print performance scores
        print('\nMean training scores:')
        print('F1 score = %.2f %%' % (np.mean(train_f1) * 100))
        print('Total error = %.2f %%' % (np.mean(train_TE) * 100))
        print('Per-class error = %.2f %%' % (np.mean(train_CAE) * 100))
        print('\nMean validation scores:')
        print('F1 score = %.2f %%' % (np.mean(test_f1) * 100))
        print('Total error = %.2f %%' % (np.mean(test_TE) * 100))
        print('Per-class error = %.2f %%' % (np.mean(test_CAE) * 100))

        # Average precision-recall over folds, and plot curves
        y_real = np.concatenate(y_real)
        y_proba = np.concatenate(y_proba)
        precision, recall, _ = precision_recall_curve(y_real, y_proba)
        lab = 'Overall AUC=%.4f' % (auc(recall, precision))
        ax1.step(recall, precision, label=lab, lw=2, color='black')
        ax1.set_xlabel('Recall')
        ax1.set_ylabel('Precision')
        ax1.legend(loc='lower left', fontsize='small')
        plt.show()

    return clf
# plt.plot(gamma_traces[i], '^-')
# plt.legend(['gamma' + str(j) for j in range(i // 2 + 2)])
# plt.savefig('gamma' + str(i) + '_{}_{}_{}.png'.format(cf.reg_strength, cf.threshold, cf.warmup), format='png', dpi=800)
# plt.show()

if cf.dataset == 'PPG_Dalia':
    # retrain and cross-validate
    result = rgkf.RandomGroupKFold_split(groups, 4, cf.a)
    for train_index, test_val_index in result:
        X_train, X_val_test = X[train_index], X[test_val_index]
        y_train, y_val_test = y[train_index], y[test_val_index]
        activity_train, activity_val_test = activity[train_index], activity[test_val_index]

        logo = LeaveOneGroupOut()
        logo.get_n_splits(groups=groups[test_val_index])  # 'groups' is always required
        for validate_index, test_index in logo.split(X_val_test, y_val_test,
                                                     groups[test_val_index]):
            X_validate, X_test = X_val_test[validate_index], X_val_test[test_index]
            y_validate, y_test = y_val_test[validate_index], y_val_test[test_index]
            activity_validate, activity_test = (activity_val_test[validate_index],
                                                activity_val_test[test_index])
            groups_val = groups[test_val_index]
            k = groups_val[test_index][0]

            # init
            try:
                del model
            except NameError:
                pass  # model does not exist yet on the first iteration
def perform_svm(Xn, yn, nSess=1, kernelType='linear'):
    groups = get_groups(Xn, nSess)
    logo_fold = LeaveOneGroupOut()
    n_folds = logo_fold.get_n_splits(groups=groups)
    total_samples = Xn.shape[0]
    n_young_samples = int(total_samples / 2)
    actual_ = np.zeros((n_folds, 2))
    predict_ = np.zeros((n_folds, 2))
    scores = np.zeros(n_folds)
    decifunc_gri = np.zeros((n_folds, 2))
    cgood = np.zeros(n_folds)
    ggood = np.zeros(n_folds)
    folds_iter = 0
    svm = SVC(kernel=kernelType, class_weight='balanced',
              decision_function_shape='ovo', probability=True)
    print('\nClassify using SVM: (%s)' % kernelType)
    print(" Performing leave one subject out cross fold with %d outer_folds"
          " and %d inner_folds" % (n_folds, n_folds - 1))
    # Even while training (tuning the hyperparameters of) the classifier,
    # one more subject's data is left out for each training iteration.
    # So two (outer and inner) LOOCV folds are run.
    folds_iter = 0
    for train_index, test_index in logo_fold.split(Xn, yn, groups):
        # X_t_test and y_test are used for calculating classifier
        # accuracy for this iteration
        X_t_train, X_t_test = Xn[train_index], Xn[test_index]
        y_train, y_test = yn[train_index], yn[test_index]
        nc = X_t_train.shape[1]
        X_t_std = np.std(X_t_train)
        gamma = 1 / (nc * X_t_std)
        a = svm.set_params(gamma=gamma)
        pgrid = {
            "C": [0.1, 1, 10, 1e2],
            "gamma": np.arange(0.01, 0.1, 0.01)
        }
        # Inner LOOCV fold to tune the hyperparameters of the classifier
        inner_fold = LeaveOneGroupOut()
        gridclf = GridSearchCV(estimator=svm, param_grid=pgrid, refit=True, cv=inner_fold)
        g = gridclf.fit(X_t_train, y_train, groups=groups[train_index])
        cgood[folds_iter] = gridclf.best_params_.get('C')
        ggood[folds_iter] = gridclf.best_params_.get('gamma')
        scores[folds_iter] = gridclf.score(X_t_test, y_test)
        actual_[folds_iter] = y_test
        predict_[folds_iter] = gridclf.predict(X_t_test)
        decifunc_gri[folds_iter] = gridclf.decision_function(X_t_test)
        folds_iter += 1
    # Calculate the accuracy of the classifier
    actual = actual_.reshape(total_samples,)
    predict = predict_.reshape(total_samples,)
    success = (actual == predict)
    n_success = len(success[success == True])
    print(" Classification accuracy =", (n_success / total_samples) * 100, "%")
    print(' Confusion Matrix:\n', confusion_matrix(actual, predict))
    '''
    print("Mean of scores:", np.mean(scores))
    scoremax_idx = np.argmax(scores)
    print("Max. of C(score max):", cgood[scoremax_idx])
    print("Max. of gamma(score max):", ggood[scoremax_idx])
    '''
    decifunc_gri = decifunc_gri.reshape(total_samples,)
    print(' roc_auc_score =', roc_auc_score(actual, decifunc_gri))
tar = tarfile.open(tarfile_name, "r:gz")
tar.extractall()
tar.close()

# build list of beta maps
subj_flist = glob.glob("sub-{:02d}/beta*.nii.gz".format(current_subject))
subj_flist.sort()
beta_flist.extend(subj_flist)

# build list of corresponding labels and subject numbers
y.extend(np.array(labels_df['label']))
subj_vect.extend(current_subject * np.ones(len(subj_flist), dtype=int))

chance_level = 1. / len(np.unique(y))

# set up leave-one-subject-out cross-validation
loso = LeaveOneGroupOut()
n_splits = loso.get_n_splits(groups=subj_vect)

# read image data
print("Reading beta maps from all the subjects...")
fmri_nii_list = []
for beta_path in beta_flist:
    beta_nii = nb.load(beta_path)
    fmri_nii_list.append(beta_nii)
print("Concatenating the data from all the subjects...")
fmri_img = concat_imgs(fmri_nii_list)

# reading brain mask
mask_nii = nb.load("brain_mask.nii.gz")

# running searchlight decoding
data = scipy.io.loadmat(filepath)
y = data["label"]
x = data["X"]
x = np.array([x[i] for i in range(len(y)) if y[i][0] == 3 or y[i][0] == 4])
y = np.array([y[i][0] for i in range(len(y)) if y[i][0] == 3 or y[i][0] == 4])

subjects = []
subject_N = 51
for i in range(subject_N):
    for j in range(80):
        subjects.append(i)

logo = LeaveOneGroupOut()
logo.get_n_splits(x, y, subjects)

parameters = [0.04, 0.2, 1, 5, 25, 125, 625, 3125]
accuracy_test = [[0] * subject_N for i in range(len(parameters))]
for j in tqdm.tqdm(range(len(parameters))):
    i = 0
    for train_index, test_index in tqdm.tqdm(logo.split(x, y, subjects), leave=False):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = SVC(kernel="rbf", C=parameters[j], gamma="scale")
        model.fit(x_train, y_train)
        pred_test = model.predict(x_test)
        # (assumed continuation; the original snippet is truncated here)
        # record the per-subject accuracy for this value of C
        accuracy_test[j][i] = np.mean(pred_test == y_test)
        i += 1