def lgo_core(X, y, groups, regparam):
    logo = LeaveOneGroupOut()
    rls = RLS(X, y, regparam=regparam, kernel="GaussianKernel", gamma=0.01)
    errors = []
    for train, test in logo.split(X, y, groups=groups):
        # holdout() computes the held-out predictions without refitting
        p = rls.holdout(test)
        e = sqerror(y[test], p)
        errors.append(e)
    return np.mean(errors)
def lgo_sklearn(X, y, groups, regparam):
    logo = LeaveOneGroupOut()
    errors = []
    for train, test in logo.split(X, y, groups=groups):
        # alpha=regparam passes the regularization parameter through;
        # otherwise the regparam argument is silently unused
        rls = KernelRidge(alpha=regparam, kernel="rbf", gamma=0.01)
        rls.fit(X[train], y[train])
        p = rls.predict(X[test])
        e = sqerror(y[test], p)
        errors.append(e)
    return np.mean(errors)
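# A minimal end-to-end sketch of the two equivalents above (assumptions: RLS
# and sqerror come from the rlscore package, as the function bodies suggest;
# the toy data is illustrative, not from the original source). The two
# functions should agree up to numerical precision, since RLS.holdout()
# computes the held-out predictions algebraically instead of refitting.
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.kernel_ridge import KernelRidge
from rlscore.learner import RLS
from rlscore.measure import sqerror

rng = np.random.RandomState(0)
X_demo = rng.randn(60, 5)
y_demo = rng.randn(60)
groups_demo = np.repeat(np.arange(6), 10)  # six groups of ten samples
print(lgo_core(X_demo, y_demo, groups_demo, regparam=1.0))
print(lgo_sklearn(X_demo, y_demo, groups_demo, regparam=1.0))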
def get_pred_cv(lm, x, y_true, groups, use_logs):
    """Return out-of-fold predictions from leave-one-group-out CV."""
    cv = LeaveOneGroupOut()
    preds = []
    for train_ix, test_ix in cv.split(x, y_true, groups):
        x_train = x.iloc[train_ix]
        y_train = y_true.iloc[train_ix]
        x_test = x.iloc[test_ix]
        if use_logs:
            # Fit and predict in log space, then transform back
            lm.fit(np.log(x_train), np.log(y_train))
            arr = np.exp(lm.predict(np.log(x_test)))
        else:
            lm.fit(x_train, y_train)
            arr = lm.predict(x_test)
        preds.append(pd.Series(arr, index=x_test.index))
    # verify_integrity guards against overlapping test indices across folds
    return pd.concat(preds, verify_integrity=True).sort_index()
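# Hedged usage sketch (toy data, not from the original source): out-of-fold
# predictions for a linear model across three groups.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

x_demo = pd.DataFrame({"a": np.arange(1.0, 13.0)})
y_demo = pd.Series(np.arange(1.0, 13.0) * 2.0)
groups_demo = np.repeat([0, 1, 2], 4)
y_hat = get_pred_cv(LinearRegression(), x_demo, y_demo, groups_demo,
                    use_logs=False)
print(y_hat)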
def test_generalization_across_time():
    """Test time generalization decoding."""
    from sklearn.svm import SVC
    # KernelRidge is used for testing 1) regression analyses 2) n-dimensional
    # predictions.
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import roc_auc_score, mean_squared_error

    epochs = make_epochs()
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    if check_version('sklearn', '0.18'):
        from sklearn.model_selection import (KFold, StratifiedKFold,
                                             ShuffleSplit, LeaveOneGroupOut)
        cv = LeaveOneGroupOut()
        cv_shuffle = ShuffleSplit()
        # XXX we cannot pass any other parameters than X and y to cv.split
        # so we have to build it before hand
        cv_lolo = [(train, test) for train, test in cv.split(
                   y_4classes, y_4classes, y_4classes)]

        # With sklearn >= 0.17, `clf` can be identified as a regressor, and
        # the scoring metrics can therefore be automatically assigned.
        scorer_regress = None
    else:
        from sklearn.cross_validation import (KFold, StratifiedKFold,
                                              ShuffleSplit, LeaveOneLabelOut)
        cv_shuffle = ShuffleSplit(len(epochs))
        cv_lolo = LeaveOneLabelOut(y_4classes)

        # With sklearn < 0.17, `clf` cannot be identified as a regressor, and
        # therefore the scoring metrics cannot be automatically assigned.
        scorer_regress = mean_squared_error

    # Test default running
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
                 "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)

    # test different predict function:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_method='decision_function')
        gat.fit(epochs)
    # With classifier, the default cv is StratifiedKFold
    assert_true(gat.cv_.__class__ == StratifiedKFold)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    gat.predict_method = 'predict_proba'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 2))
    gat.predict_method = 'foo'
    assert_raises(NotImplementedError, gat.predict, epochs)
    gat.predict_method = 'predict'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs, no score>", "%s" % gat)
    gat.score(epochs)
    assert_true(gat.scorer_.__name__ == 'accuracy_score')
    # check clf / predict_method combinations for which the scoring metrics
    # cannot be inferred.
    gat.scorer = None
    gat.predict_method = 'decision_function'
    assert_raises(ValueError, gat.score, epochs)
    # Check specifying y manually
    gat.predict_method = 'predict'
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs,\n scored "
                 "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15>",
                 "%s" % gat.train_times_)
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15 x 15>",
                 "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10
    gat_ = copy.deepcopy(gat)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] == gat.y_true_.shape[0] ==
                len(gat.y_pred_[0][0]) == 14)
    # --- number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # --- length training size
    assert_true(len(gat.train_times_['slices']) == 15 ==
                np.shape(gat.estimators_)[0])
    # --- length testing sizes
    assert_true(len(gat.test_times_['slices']) == 15 ==
                np.shape(gat.scores_)[0])
    assert_true(len(gat.test_times_['slices'][0]) == 15 ==
                np.shape(gat.scores_)[1])

    # Test score_mode
    gat.score_mode = 'foo'
    assert_raises(ValueError, gat.score, epochs)
    gat.score_mode = 'fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15, 5])
    gat.score_mode = 'mean-sample-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.score_mode = 'mean-fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.predict_mode = 'mean-prediction'
    with warnings.catch_warnings(record=True) as w:
        gat.score(epochs)
        assert_true(any("score_mode changed from " in str(ww.message)
                        for ww in w))

    # Test longer time window
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    with warnings.catch_warnings(record=True):  # not vectorizing
        scores = gat.score(epochs)
    assert_true(isinstance(scores, np.ndarray))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)

    # Decim training steps
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # train time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    train_times = dict(start=0.090, stop=0.250)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_lolo, train_times=train_times)
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal([tim for ttime in gat.test_times_['times']
                        for tim in ttime], gat.train_times_['times'])

    # Test generalization across conditions
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_mode='mean-prediction', cv=2)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    with warnings.catch_warnings(record=True):
        # There are some empty test folds because of n_trials
        gat.predict(epochs[7:])
        gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(start=999.)
    assert_raises(ValueError, gat_.fit, epochs)
    # --- impossible slices
    gat_.train_times = dict(step=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=999.)
    assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    with warnings.catch_warnings(record=True):  # no epochs in fold
        assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)
    # --- irregular length training and testing times
    # 2 estimators, the first one is trained on two successive time samples
    # whereas the second one is trained on a single time sample.
    train_times = dict(slices=[[0, 1], [1]])
    # The first estimator is tested once, the second estimator is tested on
    # two successive time samples.
    test_times = dict(slices=[[[0, 1]], [[0], [1]]])
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times=train_times,
                                       test_times=test_times)
        gat.fit(epochs)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.score(epochs)
    assert_array_equal(np.shape(gat.y_pred_[0]), [1, len(epochs), 1])
    assert_array_equal(np.shape(gat.y_pred_[1]), [2, len(epochs), 1])
    # check cannot Automatically infer testing times for adhoc training times
    gat.test_times = None
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(clf=svc, predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    with use_log_level('error'):
        assert_raises(ValueError, gat.score, epochs2)
    gat.score(epochs)
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that error if cv is not partition
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='cross-validation')
    gat.fit(epochs)
    assert_raises(ValueError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='mean-prediction')
    gat.fit(epochs)
    gat.predict(epochs)

    # Test that gets error if train on one dataset, test on another, and don't
    # specify appropriate cv:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime()
        gat.fit(epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Make CV with some empty train and test folds:
    # --- empty test fold(s) should warn when gat.predict()
    gat._cv_splits[0] = [gat._cv_splits[0][0], np.empty(0)]
    with warnings.catch_warnings(record=True) as w:
        gat.predict(epochs)
    assert_true(len(w) > 0)
    assert_true(any('do not have any test epochs' in str(ww.message)
                    for ww in w))
    # --- empty train fold(s) should raise when gat.fit()
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=[([0], [1]), ([], [0])])
    assert_raises(ValueError, gat.fit, epochs[:2])

    # Check that still works with classifier that output y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    if check_version('sklearn', '0.17'):  # no is_regressor before v0.17
        with warnings.catch_warnings(record=True):  # dep
            gat = GeneralizationAcrossTime(clf=KernelRidge(), cv=2)
        epochs.crop(None, epochs.times[2])
        gat.fit(epochs)
        # With regression the default cv is KFold and not StratifiedKFold
        assert_true(gat.cv_.__class__ == KFold)
        gat.score(epochs)
        # with regression the default scoring metrics is mean squared error
        assert_true(gat.scorer_.__name__ == 'mean_squared_error')

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)
    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear', probability=True)
    reg = KernelRidge()

    def scorer_proba(y_true, y_pred):
        return roc_auc_score(y_true, y_pred[:, 0])

    # We're testing 3 scenarios: default, classifier + predict_proba,
    # regressor
    scorers = [None, scorer_proba, scorer_regress]
    predict_methods = [None, 'predict_proba', None]
    clfs = [svc, svc, reg]
    # Test all combinations
    for clf, predict_method, scorer in zip(clfs, predict_methods, scorers):
        for y in ys:
            for n_class in n_classes:
                for predict_mode in ['cross-validation', 'mean-prediction']:
                    # Cannot use AUC for n_class > 2
                    if (predict_method == 'predict_proba' and n_class != 2):
                        continue

                    y_ = y % n_class

                    with warnings.catch_warnings(record=True):
                        gat = GeneralizationAcrossTime(
                            cv=2, clf=clf, scorer=scorer,
                            predict_mode=predict_mode)
                        gat.fit(epochs, y=y_)
                        gat.score(epochs, y=y_)

                    # Check that scorer is correctly defined manually and
                    # automatically.
                    scorer_name = gat.scorer_.__name__
                    if scorer is None:
                        if is_classifier(clf):
                            assert_equal(scorer_name, 'accuracy_score')
                        else:
                            assert_equal(scorer_name, 'mean_squared_error')
                    else:
                        assert_equal(scorer_name, scorer.__name__)
for eachLine in dat_groups_obj:
    ret = re.findall(r'\d+', eachLine)
    # re.findall returns a (possibly empty) list, never None
    if ret:
        P_I_pairs.append((int(ret[0]), int(ret[1]), int(ret[1])))
P_I_pairs = np.asarray(P_I_pairs)
groups = [a[0] for a in P_I_pairs]

X_sparse, y = load_svmlight_file(dat_file_name)
X_dense = X_sparse.todense()
# remove features in row 6
# x1, x2, x3 = np.hsplit(X_dense, [5, 6])
# X = np.hstack((x1, x3))
X = X_dense

logo = LeaveOneGroupOut()
# regr_dct = MLPRegressor(hidden_layer_sizes=(80, 60, 30, 30, 20, 20, 20),
#                         activation='logistic', solver='adam',
#                         learning_rate_init=0.0001, max_iter=500,
#                         random_state=1)
regr_dct = MLPRegressor(hidden_layer_sizes=(4,), activation='tanh',
                        solver='adam')
# regr_dct = MLPRegressor(hidden_layer_sizes=(100, 80, 60),
#                         activation='logistic', solver='adam')
# regr_dct = tree.DecisionTreeRegressor()
# regr_dct = ensemble.RandomForestRegressor(n_estimators=20)
# regr_dct = gaussian_process.GaussianProcessRegressor(kernel=None)
# regr_dct = neighbors.RadiusNeighborsRegressor(radius=1.0)
# regr_dct = svm.SVR(kernel='rbf')
scores_dct = list()
P_scores = list()
svc = SVC(kernel='linear')

# FEATURE SELECTION
from sklearn.feature_selection import SelectPercentile, f_classif
feature_selection = SelectPercentile(f_classif, percentile=10)

from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
cv = LeaveOneGroupOut()

# Compute the prediction accuracy for the different folds (i.e. session)
# session = behavioral[condition_mask].to_records(index=False)
# print(session.dtype.names)

# NESTED CROSS VALIDATION
nested_cv_scores = cross_val_score(grid, X, y, cv=5)
# cv_scores = cross_val_score(anova_svc, X, conditions)

# Print the results
print("Nested CV score: %.4f" % np.mean(nested_cv_scores))

# Here is the image
coef = svc.coef_
import numpy
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from sklearn.model_selection import LeaveOneGroupOut
# import matplotlib.pyplot as plt

logo = LeaveOneGroupOut()
X = numpy.loadtxt("data_for_keras/FC_forecast.csv", delimiter=",")
Y_int = numpy.loadtxt("data_for_keras/label_forecast.csv", delimiter=",")
grp = numpy.loadtxt("data_for_keras/speaker_group.csv", delimiter=",")

F2con = {}   # save the confusion matrix in each speaker case
F2ACC = {}   # save the accuracy in each speaker case
F2WR = {}    # save the weighted accuracy in each speaker case
F2UWR = {}   # save the unweighted accuracy in each speaker case
F = 0
for train, test in logo.split(X, Y_int, grp):
    callbacks = [EarlyStopping(monitor='val_loss', patience=10)]
    # create model
    F2model = Sequential()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, help='C parameter of SVM',
                        required=True)
    parser.add_argument('--save_estimator', help='Save the estimator',
                        action='store_true', default=False)
    args = parser.parse_args()
    C = args.C
    save_estimator = args.save_estimator

    np.random.seed(21)

    task_name = __file__.split('/')[-1].split('.')[0]
    print('TASK: {}'.format(task_name))
    os.makedirs(os.path.join(results_root, task_name), exist_ok=True)
    name = 'C_{}.npy'.format(C)
    if os.path.exists(os.path.join(results_root, task_name, name)):
        print('{} already exists, skipping...'.format(task_name))
        return 1

    ff_list = []
    y_list = []
    y_logo_list = []
    # Loading Features
    for dataset_name, _ in dataset_ext.items():
        y_same_param = []
        y_logo_same_param = []
        feature_path = glob.glob(
            os.path.join(cooccurrences_root,
                         '{}.npy'.format(dataset_name)))[0]
        dataset_logo_label = dataset_label[dataset_name]
        ff = np.load(feature_path)
        if '_orig' in dataset_name:
            y_same_param += [0] * len(ff)
            y_logo_same_param += [dataset_logo_label] * len(ff)
        elif '_gan' in dataset_name:
            y_same_param += [1] * len(ff)
            y_logo_same_param += [dataset_logo_label] * len(ff)
        ff_list += [ff]
        y_list += y_same_param
        y_logo_list += y_logo_same_param

    ff_list = np.concatenate(ff_list, axis=0)
    y_list = np.array(y_list)
    y_logo_list = np.array(y_logo_list)

    # Subsampling
    sub_idx = np.random.choice(np.arange(len(ff_list)),
                               int(len(ff_list) // 3))
    X = ff_list[sub_idx]
    y = y_list[sub_idx]
    y_logo = y_logo_list[sub_idx]

    # Shuffling training set
    shuffle_idx = np.arange(len(y))
    np.random.shuffle(shuffle_idx)
    X = X[shuffle_idx]
    y = y[shuffle_idx]
    y_logo = y_logo[shuffle_idx]

    # Normalize samples
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    print('C: {}, Total {} samples, Leave-One-Group-Out cv. Feature size: {}'.
          format(C, X.shape[0], X.shape[1]))

    # Create model
    model = LinearSVC(dual=False, C=C)

    # LOGO cv policy
    logo = LeaveOneGroupOut()
    cv = cross_validate(estimator=model, X=X, y=y, groups=y_logo,
                        scoring='balanced_accuracy', cv=logo, verbose=2,
                        return_estimator=save_estimator, n_jobs=-1)

    if save_estimator:
        result_data = {'acc': cv['test_score'],
                       'estimator': cv['estimator'],
                       'X': X,
                       'y': y,
                       'y_logo': y_logo}
    else:
        result_data = {'acc': cv['test_score']}

    np.save(os.path.join(results_root, task_name, name), result_data)

    del X, y, y_logo, model, cv
    return 0
def fit(self, X, y, groups=None):
    """Fit the decoder (learner).

    Parameters
    ----------
    X : list of Niimg-like objects
        See http://nilearn.github.io/manipulating_images/input_output.html
        Data on which model is to be fitted. If this is a list, the affine
        is considered the same for all.

    y : numpy.ndarray of shape=(n_samples) or list of length n_samples
        The dependent variable (age, sex, IQ, yes/no, etc.). Target variable
        to predict. Must have exactly as many elements as 3D images in niimg.

    groups : None
        Group labels for the samples used while splitting the dataset into
        train/test set. Default None.

        Note that this parameter must be specified in some scikit-learn
        cross-validation generators to calculate the number of splits, e.g.
        sklearn.model_selection.LeaveOneGroupOut or
        sklearn.model_selection.LeavePGroupsOut.

        For more details see
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data

    Attributes
    ----------
    `masker_` : instance of NiftiMasker or MultiNiftiMasker
        The NiftiMasker used to mask the data.

    `mask_img_` : Nifti1Image
        Mask computed by the masker object.

    `classes_` : numpy.ndarray
        Classes to predict. For classification only.

    `screening_percentile_` : float
        Screening percentile corrected according to volume of mask, relative
        to the volume of standard brain.

    `coef_` : numpy.ndarray, shape=(n_classes, n_features)
        Contains the mean of the models weight vector across fold for each
        class.

    `coef_img_` : dict of Nifti1Image
        Dictionary containing `coef_` with class names as keys, and `coef_`
        transformed in Nifti1Images as values. In the case of a regression,
        it contains a single Nifti1Image at the key 'beta'.

    `intercept_` : ndarray, shape (nclasses,)
        Intercept (a.k.a. bias) added to the decision function.

    `cv_` : list of pairs of lists
        List of the (n_folds,) folds. For the corresponding fold, each pair
        is composed of two lists of indices, one for the train samples and
        one for the test samples.

    `std_coef_` : numpy.ndarray, shape=(n_classes, n_features)
        Contains the standard deviation of the models weight vector across
        fold for each class. Note that folds are not independent, see
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data

    `std_coef_img_` : dict of Nifti1Image
        Dictionary containing `std_coef_` with class names as keys, and
        `coef_` transformed in Nifti1Image as values. In the case of a
        regression, it contains a single Nifti1Image at the key 'beta'.

    `cv_params_` : dict of lists
        Best point in the parameter grid for each tested fold in the inner
        cross validation loop.

    `cv_scores_` : dict, (classes, n_folds)
        Scores (misclassification) for each parameter, and on each fold
    """
    self.estimator = _check_estimator(self.estimator)
    self.memory_ = _check_memory(self.memory, self.verbose)

    X = self._apply_mask(X)
    X, y = check_X_y(X, y, dtype=np.float64, multi_output=True)

    # Setup scorer
    scorer = check_scoring(self.estimator, self.scoring)

    # Setup cross-validation object. Default is StratifiedKFold when groups
    # is None. If groups is specified but self.cv is not set to custom CV
    # splitter, default is LeaveOneGroupOut. If self.cv is manually set to
    # a CV splitter object do check_cv regardless of groups parameter.
    cv = self.cv
    if (isinstance(cv, int) or cv is None) and groups is not None:
        warnings.warn('groups parameter is specified but '
                      'cv parameter is not set to custom CV splitter. '
                      'Using default object LeaveOneGroupOut().')
        cv_object = LeaveOneGroupOut()
    else:
        cv_object = check_cv(cv, y=y, classifier=self.is_classification)

    self.cv_ = list(cv_object.split(X, y, groups=groups))

    # Define the number of problems to solve. In case of classification this
    # number corresponds to the number of binary problems to solve
    if self.is_classification:
        y = self._binarize_y(y)
    else:
        y = y[:, np.newaxis]
    if self.is_classification and self.n_classes_ > 2:
        n_problems = self.n_classes_
    else:
        n_problems = 1

    # Return a suitable screening percentile according to the mask image
    self.screening_percentile_ = _adjust_screening_percentile(
        self.screening_percentile, self.mask_img_, verbose=self.verbose)

    parallel = Parallel(n_jobs=self.n_jobs, verbose=2 * self.verbose)
    parallel_fit_outputs = parallel(
        delayed(self._cache(_parallel_fit))
        (self.estimator, X, y[:, c], train, test, self.param_grid,
         self.is_classification, scorer, self.mask_img_, c,
         self.screening_percentile_)
        for c, (train, test) in itertools.product(range(n_problems),
                                                  self.cv_))

    coefs, intercepts = self._fetch_parallel_fit_outputs(
        parallel_fit_outputs, y, n_problems)

    # Build the final model (the aggregated one)
    self.coef_ = np.vstack([np.mean(coefs[class_index], axis=0)
                            for class_index in self.classes_])
    self.std_coef_ = np.vstack([np.std(coefs[class_index], axis=0)
                                for class_index in self.classes_])
    self.intercept_ = np.hstack([np.mean(intercepts[class_index], axis=0)
                                 for class_index in self.classes_])
    self.coef_img_, self.std_coef_img_ = self._output_image(
        self.classes_, self.coef_, self.std_coef_)

    if self.is_classification and (self.n_classes_ == 2):
        self.coef_ = self.coef_[0, :][np.newaxis, :]
        self.intercept_ = self.intercept_[0]
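# Hedged usage sketch (commented out; `niimgs`, `labels` and `session_labels`
# are hypothetical stand-ins): when `groups` is passed but `cv` is an int or
# None, fit() above warns and falls back to LeaveOneGroupOut().
# from nilearn.decoding import Decoder
# decoder = Decoder(estimator='svc', cv=None)
# decoder.fit(niimgs, labels, groups=session_labels)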
X_vect = vectorizer.fit_transform(transcript_lem_list)

# TF-IDF
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_vect)

# Regressors (note: rebinding `Ridge` and `SVR` shadows the imported
# estimator classes, so they cannot be instantiated again afterwards)
Ridge = Ridge(alpha=10)
RF = RandomForestRegressor(criterion='mse', max_features='sqrt',
                           random_state=42)
SVR = LinearSVR(C=0.5)

# Leave-One-Interviewer-Out cross-validation
LOGO = LeaveOneGroupOut()
groups = df_labels.Group.values
X_cv = X_tfidf.toarray()
# np_gender = df_labels['Gender'].to_numpy().reshape(-1, 1)
# X_cv_gender = np.concatenate((X_cv, np_gender), axis=1)
y_cv = df_labels['Label'].values

RMSE_list_Rid = []
RMSE_list_RF = []
RMSE_list_SVR = []
for train_index, test_index in LOGO.split(X_cv, y_cv, groups):
    X_train, X_test = X_cv[train_index], X_cv[test_index]
def main():
    parser = argparse.ArgumentParser()

    # Names, paths, logs
    parser.add_argument('--dataset_path',
                        default=f'/home/ICT2000/jondras/datasets/mimicry/segmented_datasets',
                        help='path prefix to dataset directory (excludes the suffix with dataset version e.g. "_v0")')
    parser.add_argument('--dataset_version', default=f'v3',
                        help='version of the dataset (v0|v1|v2|v3)')
    parser.add_argument('--logger_path',
                        default='/home/ICT2000/jondras/deep-virtual-rapport-agent/rapport_model/logs/1569294550_multimodal_multimodal-base-classifier_nod',
                        help='path to logging directory containing the final model')

    # Data parameters
    parser.add_argument('--sequence_length', default=32, type=int,
                        help='maximum length of feature sequences (i.e. window size)')

    # Training and optimization
    parser.add_argument('--loso_cross_validation', default=False,
                        help='load model from subject-independent (leave-one-subject-out (LOSO)) cross-validation')
    parser.add_argument('--fold_num', default=10, type=int,
                        help='number of folds, relevant only for subject-dependent cross-validation')
    parser.add_argument('--gpu_id', default=0, type=int,
                        help='ID of a GPU to use (0|1|2|3)')

    opt = parser.parse_args()

    # Add derived/additional options
    opt.dataset_path = f'{opt.dataset_path}_{opt.dataset_version}'
    # Path to the dataset file with metadata and labels for each sequence
    opt.dataset_file_path = os.path.join(
        opt.dataset_path, f'metadata_labels_{opt.sequence_length}ws.csv')

    # Use the specified GPU
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(opt.gpu_id)
    torch.cuda.set_device(int(opt.gpu_id))

    # Set up stdout logger (for stdout and stderr)
    os.makedirs(opt.logger_path, exist_ok=True)
    logging.basicConfig(filename=os.path.join(opt.logger_path, f'test.log'),
                        level=logging.INFO,
                        format='%(asctime)s %(levelname)s ==> %(message)s')
    sys.stdout = DualLogger('stdout')
    sys.stderr = DualLogger('stderr')

    # Print all args/options/settings
    print(f'{ConsoleColors.CC_GREY}\nTraining and validating models{ConsoleColors.CC_END}')
    for arg in vars(opt):
        print(f'{arg}={ConsoleColors.CC_YELLOW}{str(getattr(opt, arg))}{ConsoleColors.CC_END}')

    # Read metadata+labels file
    metadata_labels = pd.read_csv(opt.dataset_file_path)  # [:100]
    sequence_ids = metadata_labels['sequence_id'].tolist()
    ids = {}
    fold = 0

    # Load model from subject-independent (leave-one-subject-out (LOSO))
    # cross-validation
    if opt.loso_cross_validation:
        # Use listener subject id for subject-independent (LOSO)
        # cross-validation, since the listener is the target subject
        subject_ids = metadata_labels['listener_sid'].tolist()
        opt.fold_num = len(np.unique(subject_ids))
        cross_validator = LeaveOneGroupOut()
        cross_validator_splits = cross_validator.split(sequence_ids,
                                                       groups=subject_ids)
    # Otherwise, load model from subject-dependent k-fold cross-validation
    else:
        cross_validator = KFold(n_splits=opt.fold_num, shuffle=True)
        cross_validator_splits = cross_validator.split(sequence_ids)

    metrics = dict()
    # Cross-validation loop: test on each fold
    for _, test_idx in cross_validator_splits:
        fold_start_time = time.time()
        print(f'\n{ConsoleColors.CC_BOLD}{ConsoleColors.CC_GREEN}fold:{ConsoleColors.CC_END} {fold + 1}/{opt.fold_num}')

        ids['test'] = [sequence_ids[x] for x in test_idx]

        # Checkpoint
        checkpoint = torch.load(os.path.join(opt.logger_path,
                                             f'fold_{fold + 1:02d}',
                                             'model_best.pth.tar'),
                                map_location=device)
        print(f'\t{ConsoleColors.CC_BOLD}best_epoch:{ConsoleColors.CC_END} {checkpoint["epoch"]} \t '
              f'{ConsoleColors.CC_BOLD}best_monitored_metric:{ConsoleColors.CC_END} '
              f'{checkpoint["best_monitored_metric"]}')
        checkpoint_opt = checkpoint['opt']
        # Dataset partitions that will be generated
        checkpoint_opt.dataset_partitions = ['test']

        # Check whether checkpoint options agree with the current (specified)
        # options
        assert checkpoint_opt.sequence_length == opt.sequence_length, \
            f'The specified sequence length does not match the checkpoint one ({checkpoint_opt.sequence_length})!'
        assert checkpoint_opt.loso_cross_validation == opt.loso_cross_validation, \
            f'The specified type of cross-validation does not match the checkpoint one ({checkpoint_opt.loso_cross_validation})!'
        assert checkpoint_opt.fold_num == opt.fold_num, \
            f'The specified number of folds does not match the checkpoint one ({checkpoint_opt.fold_num})!'

        # Data loaders
        if checkpoint_opt.modality == 'speech' or checkpoint_opt.modality == 'vision':
            if checkpoint_opt.model_type == 'unimodal-base-classifier' \
                    or checkpoint_opt.model_type == 'unimodal-tcn-classifier':
                loaders = get_unimodal_base_dataset_loaders(
                    metadata_labels=metadata_labels, ids=ids,
                    opt=checkpoint_opt)
            else:
                print(f'Data loader is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}')
        elif checkpoint_opt.modality == 'multimodal':
            if checkpoint_opt.model_type == 'multimodal-base-classifier' \
                    or checkpoint_opt.model_type == 'multimodal-tcn-classifier':
                loaders = get_multimodal_base_dataset_loaders(
                    metadata_labels=metadata_labels, ids=ids,
                    opt=checkpoint_opt)
            else:
                print(f'Data loader is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}')
        else:
            print(f'Data loader is not implemented for {checkpoint_opt.modality}')

        # Model
        if checkpoint_opt.modality == 'speech' or checkpoint_opt.modality == 'vision':
            if checkpoint_opt.model_type == 'unimodal-base-classifier':
                model = UnimodalBaseClassifier(opt=checkpoint_opt).cuda()
            elif checkpoint_opt.model_type == 'unimodal-tcn-classifier':
                model = UnimodalTCNClassifier(opt=checkpoint_opt).cuda()
            else:
                print(f'Model is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}')
        elif checkpoint_opt.modality == 'multimodal':
            if checkpoint_opt.model_type == 'multimodal-base-classifier':
                model = MultimodalBaseClassifier(opt=checkpoint_opt).cuda()
            elif checkpoint_opt.model_type == 'multimodal-tcn-classifier':
                model = MultimodalTCNClassifier(opt=checkpoint_opt).cuda()
            else:
                print(f'Model is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}')
        else:
            print(f'Model is not implemented for {checkpoint_opt.modality}')

        model.load_state_dict(checkpoint['model'])
        model.eval()

        # Get test metrics for this fold
        fold_metrics = test_classifier(loaders['test'], model,
                                       checkpoint_opt.labels_names)
        for label_name in fold_metrics.keys():
            if label_name not in metrics:
                metrics[label_name] = defaultdict(list)
            for metric_name in fold_metrics[label_name].keys():
                metrics[label_name][metric_name].append(
                    fold_metrics[label_name][metric_name])

        fold += 1
        print(f' fold time: {ConsoleColors.CC_YELLOW2}{time_diff(fold_start_time)}{ConsoleColors.CC_END}')

    # Calculate metrics over all folds
    for label_name in metrics.keys():
        print()
        for metric_name in metrics[label_name].keys():
            metrics[label_name][metric_name] = {
                'mean': np.mean(metrics[label_name][metric_name]),
                'std': np.std(metrics[label_name][metric_name])
            }
            print(f'- {label_name}\t- {metric_name}:\t {metrics[label_name][metric_name]["mean"]:.4f} '
                  f'+/- {metrics[label_name][metric_name]["std"]:.4f}')
def decode(spike_times, spike_clusters, event_times, event_groups,
           pre_time=0, post_time=0.5, classifier='bayes',
           cross_validation='kfold', num_splits=5, prob_left=None,
           custom_validation=None, n_neurons='all', iterations=1):
    """
    Use decoding to classify groups of trials (e.g. stim left/right).
    Classification is done using the population vector of summed spike counts
    from the specified time window. Cross-validation is achieved using n-fold
    cross validation or leave-one-out cross validation. Decoders can decode
    any number of groups.

    When providing the classifier with an imbalanced dataset (not the same
    number of trials in each group) the chance level will not be 1/groups. In
    that case, to compare the classification performance against chance one
    has to either determine chance level by decoding a shuffled dataset or
    use the 'auroc' metric as readout (this metric is robust against
    imbalanced datasets).

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts
        integers and strings
    pre_time : float
        time (in seconds) preceding the event times
    post_time : float
        time (in seconds) following the event times
    classifier : string
        which decoder to use, options are:
            'bayes'         Naive Bayes
            'forest'        Random forest (with 100 trees)
            'regression'    Logistic regression
            'lda'           Linear Discriminant Analysis
    cross_validation : string
        which cross-validation method to use, options are:
            'none'           No cross-validation
            'kfold'          K-fold cross-validation
            'leave-one-out'  Leave out the trial that is being decoded
            'block'          Leave out the block the to-be-decoded trial is in
            'custom'         Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5
        means that the decoder will be trained on 4/5th of the data and used
        to predict the remaining 1/5th. This process is repeated five times
        so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial
        in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation
        using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)
    Returns
    -------
    results : dict
        dictionary with decoding results
        accuracy : float
            accuracy of the classifier in percentage correct
        f1 : float
            F1 score of the classifier
        auroc : float
            the area under the ROC curve of the classification performance
        confusion_matrix : 2D array
            normalized confusion matrix
        predictions : 2D array with dimensions iterations x trials
            predicted group label for all trials in every iteration
        probabilities : 2D array with dimensions iterations x trials
            classification probability for all trials in every iteration
    """
    # Check input
    assert classifier in ['bayes', 'forest', 'regression', 'lda']
    assert cross_validation in ['none', 'kfold', 'leave-one-out', 'block',
                                'custom']
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(((event_times - pre_time),
                             (event_times + post_time)))
    pop_vector, cluster_ids = _get_spike_counts_in_bins(spike_times,
                                                        spike_clusters, times)
    pop_vector = np.rot90(pop_vector)

    # Initialize classifier
    if classifier == 'forest':
        clf = RandomForestClassifier(n_estimators=100)
    elif classifier == 'bayes':
        clf = GaussianNB()
    elif classifier == 'regression':
        clf = LogisticRegression(solver='liblinear', multi_class='auto')
    elif classifier == 'lda':
        clf = LinearDiscriminantAnalysis()

    # Pre-allocate variables
    acc = np.zeros(iterations)
    f1 = np.zeros(iterations)
    auroc = np.zeros(iterations)
    conf_matrix_norm = np.zeros((np.shape(np.unique(event_groups))[0],
                                 np.shape(np.unique(event_groups))[0],
                                 iterations))
    pred = np.zeros([iterations, pop_vector.shape[0]])
    prob = np.zeros([iterations, pop_vector.shape[0]])

    for i in range(iterations):

        # Pre-allocate variables for this iteration
        y_pred = np.zeros(event_groups.shape)
        y_probs = np.zeros(event_groups.shape)

        # Get neurons to use for this iteration
        if n_neurons == 'all':
            sub_pop_vector = pop_vector
        else:
            use_neurons = np.random.choice(pop_vector.shape[1], n_neurons,
                                           replace=False)
            sub_pop_vector = pop_vector[:, use_neurons]

        if cross_validation == 'none':
            # Fit the model on all the data and predict
            clf.fit(sub_pop_vector, event_groups)
            y_pred = clf.predict(sub_pop_vector)

            # Get the probability of the prediction for ROC analysis
            probs = clf.predict_proba(sub_pop_vector)
            y_probs = probs[:, 1]  # keep positive only
        else:
            # Perform cross-validation
            if cross_validation == 'leave-one-out':
                cv = LeaveOneOut().split(pop_vector)
            elif cross_validation == 'kfold':
                cv = KFold(n_splits=num_splits).split(pop_vector)
            elif cross_validation == 'block':
                block_lengths = [sum(1 for i in g)
                                 for k, g in groupby(prob_left)]
                blocks = np.repeat(np.arange(len(block_lengths)),
                                   block_lengths)
                cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
            elif cross_validation == 'custom':
                cv = custom_validation

            # Loop over the splits into train and test
            for train_index, test_index in cv:

                # Fit the model to the training data
                clf.fit(sub_pop_vector[train_index],
                        event_groups[train_index])

                # Predict the test data
                y_pred[test_index] = clf.predict(sub_pop_vector[test_index])

                # Get the probability of the prediction for ROC analysis
                probs = clf.predict_proba(sub_pop_vector[test_index])
                y_probs[test_index] = probs[:, 1]  # keep positive only

        # Calculate performance metrics and confusion matrix
        acc[i] = accuracy_score(event_groups, y_pred)
        f1[i] = f1_score(event_groups, y_pred)
        auroc[i] = roc_auc_score(event_groups, y_probs)
        conf_matrix = confusion_matrix(event_groups, y_pred)
        conf_matrix_norm[:, :, i] = (conf_matrix
                                     / conf_matrix.sum(axis=1)[:, np.newaxis])

        # Add prediction and probability to matrix
        pred[i, :] = y_pred
        prob[i, :] = y_probs

    # Make integers from arrays when there's only one iteration
    if iterations == 1:
        acc = acc[0]
        f1 = f1[0]
        auroc = auroc[0]

    # Add to results dictionary
    if cross_validation == 'kfold':
        results = dict({'accuracy': acc, 'f1': f1, 'auroc': auroc,
                        'predictions': pred, 'probabilities': prob,
                        'confusion_matrix': conf_matrix_norm,
                        'n_groups': np.shape(np.unique(event_groups))[0],
                        'classifier': classifier,
                        'cross_validation': '%d-fold' % num_splits,
                        'iterations': iterations})
    else:
        results = dict({'accuracy': acc, 'f1': f1, 'auroc': auroc,
                        'predictions': pred, 'probabilities': prob,
                        'confusion_matrix': conf_matrix_norm,
                        'n_groups': np.shape(np.unique(event_groups))[0],
                        'classifier': classifier,
                        'cross_validation': cross_validation,
                        'iterations': iterations})
    return results
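# Hedged usage sketch for decode() (it runs in the context of this module,
# which provides _get_spike_counts_in_bins and the sklearn imports; the toy
# spike data below is illustrative only).
def _demo_decode():
    rng = np.random.RandomState(0)
    spike_times = np.sort(rng.uniform(0, 100, 5000))  # 5000 spikes in 100 s
    spike_clusters = rng.randint(0, 20, 5000)         # from 20 clusters
    event_times = np.arange(1.0, 100.0, 2.0)          # 50 trial onsets
    event_groups = np.tile([0, 1], len(event_times) // 2)  # two conditions
    results = decode(spike_times, spike_clusters, event_times, event_groups,
                     classifier='bayes', cross_validation='kfold',
                     num_splits=5)
    print(results['accuracy'], results['auroc'])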
dataPath = 'AlanWalksWales/'
# Comment out one of the two data names to change portion of data to use
dataName = 'AWW_rest'
# dataName = 'AWW_walk'
nJobs = 12  # Number of cores to use

# Load feature matrices, labels, and groups (denoting which labeled time
# segment each row of the feature matrix comes from)
featuresAll = np.loadtxt(dataPath + dataName + '_all.csv', delimiter=',')
featuresAcc = np.loadtxt(dataPath + dataName + '_acc.csv', delimiter=',')
featuresEda = np.loadtxt(dataPath + dataName + '_eda.csv', delimiter=',')
labels = np.loadtxt(dataPath + dataName + '_label.csv')
groups = np.loadtxt(dataPath + dataName + '_groups.csv')

# Leave-one-group-out cross-validation
cv = LeaveOneGroupOut()

# Parameter tuning by grid search
solver = 'lbfgs'
activation = 'relu'
regParam = 10.0**np.arange(-3, 5)
# Comment out one of the choices below (either 1 or 2 hidden layers)
# 1 hidden layer
hiddenLayerSizes = 2**np.arange(0, 8)
"""
# 2 hidden layers
hidden1, hidden2 = np.meshgrid(2**np.arange(0, 8), 2**np.arange(0, 8))
hiddenLayerSizes = np.reshape(np.stack([hidden1, hidden2]),
                              (2, np.size(hidden1))).T.tolist()
"""
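# Hedged sketch (not from the original script): how these grids would
# typically feed a scikit-learn grid search with the leave-one-group-out
# splitter defined above. MLPClassifier and the lack of a scoring argument
# are assumptions here.
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

paramGrid = {'alpha': regParam,
             'hidden_layer_sizes': [(int(h),) for h in hiddenLayerSizes]}
grid = GridSearchCV(MLPClassifier(solver=solver, activation=activation),
                    paramGrid, cv=cv.split(featuresAll, labels, groups),
                    n_jobs=nJobs)
grid.fit(featuresAll, labels)
print(grid.best_params_)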
def learn(X: (dict, pd.DataFrame), y: (dict, pd.Series), data_folder: str,
          groups: list = None, test_split: float = None, name: str = None):
    '''
    This function trains either a classification or regression random forest
    model. It is able to handle either a singular pandas DataFrame or a
    dictionary of pandas DataFrames.

    If the input is a singular pandas DataFrame, the rows will be split into
    a training and testing dataset using test_split (0 - 1).

    If the input is a dictionary of pandas DataFrames, a leave-one-out method
    will be used to verify the model's accuracy.

    Inputs:
        X: a dictionary of pandas DataFrames or a singular pandas DataFrame
        y: a dictionary of pandas Series or a singular pandas Series
        data_folder: the location of where to save the output
        groups: a list of the trial names
            NOTE: this is only required if the X/y input is a dictionary
        test_split: the decimal percentage to split the training and testing
            datasets
            NOTE: this is only required if the X/y input is not a dictionary
        name: the name of the trial
            NOTE: this is only required if the X/y input is not a dictionary

    Alex Woodall
    Auckland Bioengineering Institute
    08/04/2020
    '''
    if 'force' in data_folder or 'time' in data_folder:
        mode = 'regression'
    elif 'binary' in data_folder:
        mode = 'classification'

    if type(X) is pd.DataFrame:
        # Learning using one trial (or a combination into a DataFrame rather
        # than a dictionary of DataFrames)
        if mode == 'classification':
            # Split into training and testing
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_split)

            # Create classifier and train
            cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)
            cl.fit(X_train, y_train)

            # Predict on classifier, convert to a pandas series, save output
            y_predict = cl.predict(X_test)
            y_predict = pd.Series(y_predict, index=X_test.index)
            y_predict.to_csv("{}y_predict.csv".format(data_folder),
                             index=True, header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder),
                          index=True, header=True)

            # Print score and confusion matrix
            score = roc_auc_score(y_test, y_predict)
            conf_mat = confusion_matrix(y_test, y_predict)
            print("Roc auc = {}\n".format(score))
            print(conf_mat)

        elif mode == 'regression':
            # Split into training and testing
            split_int = int(len(X) * (1 - test_split))
            X_train = X.head(split_int)
            y_train = y.head(split_int)
            X_test = X.tail(len(X) - split_int)
            y_test = y.tail(len(X) - split_int)

            # Create regressor and train
            rg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
            rg.fit(X_train, y_train)

            # Predict
            y_predict = rg.predict(X_test)

            ''' Filter force plate data at 60 Hz '''
            analog_frequency = 1000
            cut_off = 60  # Derie (2017), Robberechts et al (2019)
            order = 2  # Weyand (2017), Robberechts et al (2019)
            b_f, a_f = signal.butter(N=order,
                                     Wn=cut_off / (analog_frequency / 2),
                                     btype='low')
            new_F = signal.filtfilt(b_f, a_f, y_predict)

            ''' Re-zero filtered forces '''
            threshold = 50  # N
            filter_plate = rezero_filter(original_fz=new_F,
                                         threshold=threshold)
            y_predict = filter_plate * new_F

            # Convert output into a pandas series and save
            y_predict = pd.Series(y_predict, index=X_test.index)
            y_predict.to_csv("{}y_predict.csv".format(data_folder),
                             index=True, header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder),
                          index=True, header=True)

            # Calculate R2 score and print
            score = r2_score(y_test, y_predict)
            print("R2 = {}\n".format(score))

            # Plot result
            plt.plot(y_test.tail(1000), 'k', label='True data')
            plt.plot(y_predict.tail(1000), 'r', label='Estimate data')
            plt.legend()
            plt.ylabel('Force (N)')
            plt.xlabel('Time (ms)')
            plt.title('Estimated data for {}'.format(name))

            # Save figure
            score = round(score, 4)
            plt.savefig('{}{}_{}.png'.format(
                data_folder, name, '_'.join(str(score).split('.'))))
            plt.show()

    elif type(X) is dict:
        # Create leave one group out split
        group_num = np.arange(len(groups))
        logo = LeaveOneGroupOut()
        logo.get_n_splits(groups=group_num)

        if mode == 'classification':
            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for classification\n\n")
            f.close()

            roc = []
            # Train on n - 1 groups, test on 1. Repeat for all
            for train_index, test_index in logo.split(X=X, groups=group_num):
                cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))
                for index in train_index:
                    try:
                        X_train = X_train.append(X[groups[index]],
                                                 ignore_index=True)
                        y_train = y_train.append(y[groups[index]],
                                                 ignore_index=True)
                    except NameError:
                        # First iteration: X_train/y_train not yet defined
                        X_train = X[groups[index]]
                        y_train = y[groups[index]]

                cl.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = cl.predict(X_test)
                y_estimate_test = pd.Series(y_estimate_test,
                                            index=X_test.index)

                roc.append(roc_auc_score(y_test, y_estimate_test))
                conf = confusion_matrix(y_test, y_estimate_test)
                np.savetxt("{}y_estimate_conf_{}.txt".format(
                    data_folder, groups[test_index[0]]),
                    conf, delimiter='\t', fmt='%i')

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(
                    groups[test_index[0]], round(roc[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]),
                    index=True, header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open("{}{}_cl.pkl".format(data_folder,
                                              groups[test_index[0]]), "wb")
                pickle.dump(cl, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage roc auc score: {}".format(
                round(statistics.mean(roc), 4)))
            f.close()

        elif mode == 'regression':
            # Allow for different number of estimators depending on task
            if 'force' in data_folder:
                n_estimators = 10
            else:
                n_estimators = 10

            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for regression\n\n")
            f.close()

            r2 = []
            for train_index, test_index in logo.split(X=X, groups=group_num):
                rg = RandomForestRegressor(n_estimators=n_estimators,
                                           n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))
                for index in train_index:
                    try:
                        X_train = X_train.append(X[groups[index]],
                                                 ignore_index=True)
                        y_train = y_train.append(y[groups[index]],
                                                 ignore_index=True)
                    except NameError:
                        # First iteration: X_train/y_train not yet defined
                        X_train = X[groups[index]]
                        y_train = y[groups[index]]

                rg.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = rg.predict(X_test)

                # Round estimate to a whole number
                y_estimate_test = np.around(y_estimate_test)

                # Any negative number = -1
                y_estimate_test[y_estimate_test < 0] = -1
                y_estimate_test = pd.Series(y_estimate_test,
                                            index=X_test.index)

                r2.append(r2_score(y_test, y_estimate_test))

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(
                    groups[test_index[0]], round(r2[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]),
                    index=True, header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open("{}{}_rg.pkl".format(data_folder,
                                              groups[test_index[0]]), "wb")
                pickle.dump(rg, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage R^2 score: {}".format(
                round(statistics.mean(r2), 4)))
            f.close()

    else:
        print("X should be of type dict or pd.DataFrame")
        return

    return
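# Hedged usage sketch (synthetic trials, not from the original source; the
# 'binary' substring in data_folder selects classification mode in learn()
# above, and the output directory is created up front so the result files
# can be written).
def _demo_learn():
    import os
    rng = np.random.RandomState(0)
    trials = {t: pd.DataFrame(rng.randn(50, 3)) for t in ('t1', 't2', 't3')}
    targets = {t: pd.Series(rng.randint(0, 2, 50))
               for t in ('t1', 't2', 't3')}
    os.makedirs('./binary_demo/', exist_ok=True)
    learn(X=trials, y=targets, data_folder='./binary_demo/',
          groups=list(trials.keys()))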
data = pd.read_csv('./data-standard.csv')
features = ['RMSSD', 'SDNN', 'SDANN', 'SDANNi', 'SDSD', 'pNN50',
            'AutoCorrelation']

data = shuffle(data)
X, Y = np.array(data[features]), np.array(data['sleep'])
grp = data['id'].values

logo = LeaveOneGroupOut()
scores = []
for train, test in logo.split(X, Y, grp):
    # Re-create the classifier each fold so no state leaks between folds
    model = RandomForestClassifier(n_estimators=148, criterion='entropy',
                                   max_depth=12, min_samples_split=4,
                                   min_samples_leaf=7)
    x_train, x_test = X[train], X[test]
    y_train, y_test = Y[train], Y[test]
    model.fit(x_train, y_train.ravel())
    scores.append(metrics.accuracy_score(y_test, model.predict(x_test)))
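# Equivalent, more compact form (a sketch): cross_val_score clones the
# estimator for every fold, so this should match the explicit loop above up
# to random-forest seeding (no random_state is fixed in either version).
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    RandomForestClassifier(n_estimators=148, criterion='entropy',
                           max_depth=12, min_samples_split=4,
                           min_samples_leaf=7),
    X, Y, groups=grp, cv=LeaveOneGroupOut(), scoring='accuracy')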
    p_pair_set += [set_dict[key]['pair']]

instance_set_array = np.array(instance_set)

# define pair sets
final_neg_pairs_list = set()
for pair in n_pair_set:
    final_neg_pairs_list.add(tuple(pair))
for pair in p_pair_set:
    final_neg_pairs_list.add(tuple(pair))

################### create cross-validation groups (same order as instances)
instance_groups = stratified_gene_pair_groups(set_dict, instance_keys)

# create data split
logo = LeaveOneGroupOut()
data_split = []
for train_index, test_index in logo.split(instance_set, label_set,
                                          groups=instance_groups):
    data_split += [[train_index, test_index]]

print('##### Building Random Forest predictor for set %d with the '
      'LeaveOnePairOut cross-validation procedure (%d iterations)' %
      (trial + 1, len(data_split)))

################################ create overall sets of iterations for stats
sgp_overall_test_set = []
sgp_overall_test_label = []
sgp_overall_pred_label = []
sgp_overall_prob_label = []
def test_decoder_binary_classification():
    X, y = make_classification(n_samples=200, n_features=125, scale=3.0,
                               n_informative=5, n_classes=2, random_state=42)
    X, mask = to_niimgs(X, [5, 5, 5])

    # check classification with masker object
    model = Decoder(mask=NiftiMasker())
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) > 0.95

    # decoder object use predict_proba for scoring with logistic model
    model = Decoder(estimator='logistic_l2', mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) > 0.95

    # decoder object use prior as strategy (default) for dummy classifier
    model = Decoder(estimator='dummy_classifier', mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) == 0.5

    # decoder object use other strategy for dummy classifier
    param = dict(strategy='stratified')
    dummy_classifier.set_params(**param)
    model = Decoder(estimator=dummy_classifier, mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) >= 0.5

    # Returns model coefficients for dummy estimators as None
    assert model.coef_ is None
    # Dummy output are nothing but the attributes of the dummy estimators
    assert model.dummy_output_ is not None
    assert model.cv_scores_ is not None
    # model attribute n_outputs_ depending on target y ndim
    assert model.n_outputs_ == 1

    # decoder object use other scoring metric for dummy classifier
    model = Decoder(estimator='dummy_classifier', mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert roc_auc_score(y, y_pred) == 0.5
    model = Decoder(estimator='dummy_classifier', mask=mask,
                    scoring='roc_auc')
    model.fit(X, y)
    assert np.mean(model.cv_scores_[0]) >= 0.5

    # Raises a not implemented error with strategy constant
    param = dict(strategy='constant')
    dummy_classifier.set_params(**param)
    model = Decoder(estimator=dummy_classifier, mask=mask)
    pytest.raises(NotImplementedError, model.fit, X, y)

    # check different screening_percentile value
    for screening_percentile in [100, 20, None]:
        model = Decoder(mask=mask, screening_percentile=screening_percentile)
        model.fit(X, y)
        y_pred = model.predict(X)
        assert accuracy_score(y, y_pred) > 0.95

    for clustering_percentile in [100, 99]:
        model = FREMClassifier(estimator='logistic_l2', mask=mask,
                               clustering_percentile=clustering_percentile,
                               screening_percentile=90, cv=5)
        model.fit(X, y)
        y_pred = model.predict(X)
        assert accuracy_score(y, y_pred) > 0.9

    # check cross-validation scheme and fit attribute with groups enabled
    rand_local = np.random.RandomState(42)
    for cv in [KFold(n_splits=5), LeaveOneGroupOut()]:
        model = Decoder(estimator='svc', mask=mask, standardize=True, cv=cv)
        if isinstance(cv, LeaveOneGroupOut):
            groups = rand_local.binomial(2, 0.3, size=len(y))
        else:
            groups = None
        model.fit(X, y, groups=groups)
        # re-predict per cv scheme; the previous y_pred would be stale here
        y_pred = model.predict(X)
        assert accuracy_score(y, y_pred) > 0.9
small_dict = {}
# creating path names for training (X) data
for name_x in filename:
    file_x = os.path.join(inputFolder, name_x)
    x = np.load(file_x)

    # preprocessing the training data
    x_scaled = x / 255

    # We are using a Support Vector Classifier with "rbf" kernel
    clf = svm.SVC(kernel='rbf', gamma=0.01, C=100)

    # using leave-one-group-out method
    logo = LeaveOneGroupOut()
    cv = logo.split(x_scaled, y, groups=group)
    scores = cross_val_score(clf, x_scaled, y, cv=cv,
                             scoring='recall_macro')
    loopMean = scores.mean()
    loopStd = scores.std()

    small_name_x = name_x[:-4]
    small_dict[small_name_x] = (loopMean, loopStd)
    print("finished finding scores with %s data" % name_x)

big_name = name[5:12]
def lda_project(spike_times, spike_clusters, event_times, event_groups,
                pre_time=0, post_time=0.5, cross_validation='kfold',
                num_splits=5, prob_left=None, custom_validation=None):
    """
    Use linear discriminant analysis to project population vectors to the
    line that best separates the two groups. When cross-validation is used,
    the LDA projection is fitted on the training data after which the test
    data is projected to this projection.

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts
        integers and strings
    pre_time : float
        time (in seconds) preceding the event times
    post_time : float
        time (in seconds) following the event times
    cross_validation : string
        which cross-validation method to use, options are:
            'none'           No cross-validation
            'kfold'          K-fold cross-validation
            'leave-one-out'  Leave out the trial that is being decoded
            'block'          Leave out the block the to-be-decoded trial is in
            'custom'         Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5
        means that the decoder will be trained on 4/5th of the data and used
        to predict the remaining 1/5th. This process is repeated five times
        so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial
        in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation
        using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)
    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population vector
        of each trial
    """
    # Check input
    assert cross_validation in ['none', 'kfold', 'leave-one-out', 'block',
                                'custom']
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(((event_times - pre_time),
                             (event_times + post_time)))
    pop_vector, cluster_ids = _get_spike_counts_in_bins(spike_times,
                                                        spike_clusters, times)
    pop_vector = np.rot90(pop_vector)

    # Initialize
    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Find the best LDA projection on all data and transform those data
        lda_projection = lda.fit_transform(pop_vector, event_groups)
    else:
        # Perform cross-validation
        if cross_validation == 'leave-one-out':
            cv = LeaveOneOut().split(pop_vector)
        elif cross_validation == 'kfold':
            cv = KFold(n_splits=num_splits).split(pop_vector)
        elif cross_validation == 'block':
            block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
            blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
            cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
        elif cross_validation == 'custom':
            cv = custom_validation

        # Loop over the splits into train and test
        for train_index, test_index in cv:

            # Find LDA projection on the training data
            lda.fit(pop_vector[train_index],
                    [event_groups[j] for j in train_index])

            # Project the held-out test data to projection
            lda_projection[test_index] = np.rot90(
                lda.transform(pop_vector[test_index]))[0]

    return lda_projection
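# Hedged usage sketch for lda_project() (same assumptions as the decode()
# demo earlier: module-level helpers and illustrative toy data).
def _demo_lda_project():
    rng = np.random.RandomState(1)
    spike_times = np.sort(rng.uniform(0, 100, 5000))
    spike_clusters = rng.randint(0, 20, 5000)
    event_times = np.arange(1.0, 100.0, 2.0)
    event_groups = np.tile([0, 1], len(event_times) // 2)
    projection = lda_project(spike_times, spike_clusters, event_times,
                             event_groups, cross_validation='kfold')
    print(projection.shape)  # one projection value per trial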
def cross_validate(data, lambda_config=(2e0, 2e20, 11), TA=0, name="data_0"): """ Parameters ---------- data : Pandas Dataframe load dataframe using from data_load, using getData function. lambda_config : tuple, optional Lambda range for Ridge regression. The default is (2e0, 2e20, 11). Range from Cross et al 2016 publication. TA : int., optional Test subject nr. The default is 0. name : str., optional Name of txt file with model performance. The default is "data_0". Returns ------- CSV-file with results Plots """ # For reproducibility random_state = 9999 # Train one model per TA data = data[data['TA'] == TA] # Define lambda values for training models lambdas = np.linspace(lambda_config[0], lambda_config[1], lambda_config[2]) # Split into train/testing with leave-one-group-out logo = LeaveOneGroupOut() # Shift EEG 150 ms back EEG_shifted = shift(data[data.columns[:16]].T, lag=-150, freq=64) data = data.iloc[:len(EEG_shifted.T)] data[data.columns[:16]] = EEG_shifted.T # Assign X, y and group variable X = data[data.columns[:16]] y = data['target'] groups = data["trial"] n_outer_groups = len(np.unique(groups)) # Initiate test errors MSEs = [] MSEdummies = [] # For cross correlation scores = [] opt_lambda_list = [] # Parameters for MNE tmin = -.25 tmax = .1 sfreq = 64 ## Leave-trial-out CV ## # Outer fold i = 0 for out_train_idx, out_test_idx in logo.split(X, y, groups): print("Outer fold %i / %i" % (i + 1, n_outer_groups)) X_train = X.iloc[out_train_idx] y_train = y.iloc[out_train_idx] X_test = X.iloc[out_test_idx] y_test = y.iloc[out_test_idx] # Define inner groups, these are n - 1 of n total groups inner_groups = data["trial"].iloc[out_train_idx] n_inner_groups = len(np.unique(inner_groups)) # Initiate errors for inner fold validations vals = np.zeros((n_inner_groups, lambda_config[2])) # Inner fold j = 0 for inn_train_idx, inn_test_idx in logo.split(X_train, y_train, inner_groups): print("\t Inner fold %i / %i" % (j + 1, n_inner_groups)) inn_X_train = X_train.iloc[inn_train_idx] inn_y_train = y_train.iloc[inn_train_idx] inn_X_test = X_train.iloc[inn_test_idx] inn_y_test = y_train.iloc[inn_test_idx] # Validate model with all parameters k = 0 for l in lambdas: # Define model with l parameter model = ReceptiveField(tmin, tmax, sfreq, feature_names=None, estimator=l, scoring="corrcoef") # Fit model to inner fold training data model.fit(np.asarray(inn_X_train), np.asarray(inn_y_train)) # Compute cross correlation for regressional value val = model.score(np.asarray(inn_X_test), np.asarray(inn_y_test)) # Add score to matrix vals[j, k] = val k += 1 j += 1 # Get optimal parameter param_score = np.sum(vals, axis=0) lambda_opt = lambdas[np.argmax(param_score)] print("Optimal lambda = %f" % lambda_opt) # Store optimal lambda parameter opt_lambda_list.append(lambda_opt) # Train optimal model model_opt = ReceptiveField(tmin, tmax, sfreq, feature_names=None, estimator=lambda_opt, scoring="corrcoef") # Fit model to inner fold training data model_opt.fit(np.asarray(X_train), np.asarray(y_train)) # Compute error of optimal model score = model_opt.score(np.asarray(X_test), np.asarray(y_test)) print('Score:') print(score) # Fit dummy model dummy_regr = DummyRegressor(strategy="mean") dummy_regr.fit(np.asarray(X_train), np.asarray(y_train)) # Add error to list scores.append(score) MSE = mean_squared_error(np.asarray(y_test), model_opt.predict(np.asarray(X_test)), squared=True) MSEs.append(MSE) MSEdummy = mean_squared_error(np.asarray(y_test), dummy_regr.predict(np.asarray(X_test)), squared=True) 
MSEdummies.append(MSEdummy) i += 1 ## Training and testing for optimal model ## # Making dataframes for each SNR cond data_0 = data[data['SNR'] == 0] data_1 = data[data['SNR'] == 1] data_2 = data[data['SNR'] == 2] # Shuffle data data_0 = data_0.sample(frac=1, random_state=random_state) data_1 = data_1.sample(frac=1, random_state=random_state) data_2 = data_2.sample(frac=1, random_state=random_state) # Split data 80/20 for training/testing train_0, test_0 = train_test_split(data_0, test_size=0.2, random_state=random_state) train_1, test_1 = train_test_split(data_1, test_size=0.2, random_state=random_state) train_2, test_2 = train_test_split(data_2, test_size=0.2, random_state=random_state) # Combine training dataframes into one data = train_0.append(train_1, ignore_index=True) data = data.append(train_2, ignore_index=True) # Combine testing dataframes into one data_test = test_0.append(test_1, ignore_index=True) data_test = data_test.append(test_2, ignore_index=True) # Mean score across all folds mu_score = np.mean(scores) print("Mean score = %f" % mu_score) best_fold = np.argmax(scores) + 1 print("Best fold = %i" % best_fold) # Optimal model model_optimal = ReceptiveField( tmin, tmax, sfreq, feature_names=None, estimator=opt_lambda_list[np.argmax(scores)], scoring="corrcoef") # Fit optimal model to training data model_optimal.fit(np.asarray(data[data.columns[:16]]), np.asarray(data["target"])) # Dummy classifier dummy_regr = DummyRegressor(strategy="mean") dummy_regr.fit(np.asarray(data[data.columns[:16]]), np.asarray(data["target"])) # Compute cross correlation scores on test data for all three SNR conds score_0 = signal.correlate(model_optimal.predict( np.asarray(test_0[test_0.columns[:16]])), np.asarray(test_0['target']), mode='same') / len(test_0['target']) score_1 = signal.correlate(model_optimal.predict( np.asarray(test_1[test_1.columns[:16]])), np.asarray(test_1['target']), mode='same') / len(test_1['target']) score_2 = signal.correlate(model_optimal.predict( np.asarray(test_2[test_2.columns[:16]])), np.asarray(test_2['target']), mode='same') / len(test_2['target']) # Compute cross correlation scores on test data for all three SNR conds with dummy regressor score_0_dummy = signal.correlate(dummy_regr.predict( np.asarray(test_0[test_0.columns[:16]])), np.asarray(test_0['target']), mode='same') / len(test_0['target']) score_1_dummy = signal.correlate(dummy_regr.predict( np.asarray(test_1[test_1.columns[:16]])), np.asarray(test_1['target']), mode='same') / len(test_1['target']) score_2_dummy = signal.correlate(dummy_regr.predict( np.asarray(test_2[test_2.columns[:16]])), np.asarray(test_2['target']), mode='same') / len(test_2['target']) # Cross correlate with random speech random_corr = signal.correlate( model_optimal.predict(np.asarray(test_1[test_1.columns[:16]])), np.random.uniform(low=min(test_1['target']), high=max(test_1['target']), size=(np.asarray(test_1).shape[0], )), mode='same') / len( np.random.uniform(low=min(test_1['target']), high=max(test_1['target']), size=(np.asarray(test_1).shape[0], ))) ### SHOW RESULTS IN PLOTS ### ## Make line plots ## # Define x-axes x_axis_0 = np.linspace(0, len(test_0['target']), num=len(test_0['target'])) x_axis_1 = np.linspace(0, len(test_1['target']), num=len(test_1['target'])) x_axis_2 = np.linspace(0, len(test_2['target']), num=len(test_2['target'])) x_all = np.linspace(0, len(data_test['target']), num=len(data_test['target'])) ## All SNRs ## # For True plt.plot(x_all, np.asarray(data_test['target']), color='sandybrown', label='True') # 
# For MNE Ridge regression
plt.plot(x_all, model_optimal.predict(np.asarray(data_test[data_test.columns[:16]])),
         color='deepskyblue', label='Predicted')
# For baseline dummy
plt.plot(x_all, dummy_regr.predict(np.asarray(data_test[data_test.columns[:16]])),
         color='rebeccapurple', dashes=[6, 2], label='Baseline (mean)')
plt.grid()
plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · All SNRs')
plt.xlabel('Samples')
plt.ylabel('Speech Envelope')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
# save before show: calling savefig after plt.show() writes an empty figure
plt.savefig(f'Figure All - TA {TA}.png')
plt.show()

## -5 dB SNR ##
plt.figure()  # start a fresh figure so the traces above do not carry over
# For True
plt.plot(x_axis_0, np.asarray(test_0['target']), color='sandybrown', label='True')
# For MNE Ridge regression
plt.plot(x_axis_0, model_optimal.predict(np.asarray(test_0[test_0.columns[:16]])),
         color='deepskyblue', label='Predicted')
# For baseline dummy
plt.plot(x_axis_0, dummy_regr.predict(np.asarray(test_0[test_0.columns[:16]])),
         color='rebeccapurple', dashes=[6, 2], label='Baseline (mean)')
plt.grid()
plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · -5 dB SNR')
plt.xlabel('Samples')
plt.ylabel('Speech Envelope')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig(f'Figure -5 DB - TA {TA}.png')
plt.show()

## 0 dB SNR ##
plt.figure()
# For True
plt.plot(x_axis_1, np.asarray(test_1['target']), color='sandybrown', label='True')
# For MNE Ridge regression
plt.plot(x_axis_1, model_optimal.predict(np.asarray(test_1[test_1.columns[:16]])),
         color='deepskyblue', label='Predicted')
# For baseline dummy
plt.plot(x_axis_1, dummy_regr.predict(np.asarray(test_1[test_1.columns[:16]])),
         color='rebeccapurple', dashes=[6, 2], label='Baseline (mean)')
plt.grid()
plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · 0 dB SNR')
plt.xlabel('Samples')
plt.ylabel('Speech Envelope')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig(f'Figure 0 DB - TA {TA}.png')
plt.show()

## +5 dB SNR ##
plt.figure()
# For True
plt.plot(x_axis_2, np.asarray(test_2['target']), color='sandybrown', label='True')
# For MNE Ridge regression
plt.plot(x_axis_2, model_optimal.predict(np.asarray(test_2[test_2.columns[:16]])),
         color='deepskyblue', label='Predicted')
# For baseline dummy
plt.plot(x_axis_2, dummy_regr.predict(np.asarray(test_2[test_2.columns[:16]])),
         color='rebeccapurple', dashes=[6, 2], label='Baseline (mean)')
plt.grid()
plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · +5 dB SNR')
plt.xlabel('Samples')
plt.ylabel('Speech Envelope')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig(f'Figure +5 DB - TA {TA}.png')
plt.show()

## Bar chart to compare MSE for L2 and baseline ##
plt.figure()
measure = [np.mean(MSEs), np.mean(MSEdummies)]
variance = [np.var(MSEs), np.var(MSEdummies)]
x_labels = ['L2 (MNE)', 'Baseline']
x_pos = [i for i, _ in enumerate(x_labels)]
plt.bar(x_pos, measure, color='sandybrown', yerr=variance)
plt.grid()
plt.xlabel("Model")
plt.ylabel("MSE")
plt.title("MSEs of L-2 and Baseline Model Compared")
plt.xticks(x_pos, x_labels)
plt.savefig(f'MSEs compared - TA {TA}.png')
plt.show()

## Make boxplot of CrossCorrs ##
ticks = ['-5 dB', '0 dB', '+5 dB', 'Random']

# Function to set the colors of the boxplots
def set_box_color(bp, color):
    plt.setp(bp['boxes'], color=color)
    plt.setp(bp['whiskers'], color=color)
    plt.setp(bp['caps'], color=color)
    plt.setp(bp['medians'], color=color)

plt.figure()
bpl = plt.boxplot([score_0, score_1, score_2],
                  positions=np.array(range(3)) * 2.0 - 0.4, sym='', widths=0.6)
bpr = plt.boxplot([score_0_dummy, score_1_dummy, score_2_dummy],
                  positions=np.array(range(3)) * 2.0 + 0.4, sym='', widths=0.6)
bpu = plt.boxplot(random_corr, positions=[6], sym='', widths=0.6)
set_box_color(bpl, 'deepskyblue')
set_box_color(bpr, 'rebeccapurple')
set_box_color(bpu, 'sandybrown')

# Draw temporary purple and blue lines and use them to create a legend
plt.plot([], c='deepskyblue', label='L2 MNE')
plt.plot([], c='rebeccapurple', label='Baseline')
plt.plot([], c='sandybrown', label='Random')
plt.title("Cross-correlation Between L-2-Predicted and True Envelopes in All SNR Levels & Random")
plt.legend()
plt.ylabel("Cross-correlation")
plt.grid()
plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, len(ticks) * 2)
plt.tight_layout()
plt.savefig(f'Boxplot over CrossCorr - TA {TA}.png')
plt.show()

# Make data matrix
data_matrix = np.array([
    'TA:', TA,
    'Optimal Lambda Value:', opt_lambda_list[np.argmax(scores)],
    "Best Score (Pearson's R):", scores[np.argmax(scores)],
    "Mean Score (Pearson's R):", np.mean(scores),
    'Best MSE:', min(MSEs),
    'Mean MSE:', np.mean(MSEs),
    'Best MSE Dummy:', min(MSEdummies),
    'Mean MSE Dummy:', np.mean(MSEdummies),
    'CrossCorr for -5dB SNR:', np.mean(score_0),
    'CrossCorr for 0dB SNR:', np.mean(score_1),
    'CrossCorr for +5dB SNR:', np.mean(score_2),
    'CrossCorr for random:', np.mean(random_corr),
    'Dummy CrossCorr for -5dB SNR:', np.mean(score_0_dummy),
    'Dummy CrossCorr for 0dB SNR:', np.mean(score_1_dummy),
    'Dummy CrossCorr for +5dB SNR:', np.mean(score_2_dummy)
])

# Save as CSV in working directory
np.savetxt(name, data_matrix, delimiter=",", fmt='%s')
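# The nested leave-one-group-out pattern used in cross_validate above, reduced to its
# skeleton. This is a sketch only: a generic sklearn Ridge stands in for the
# ReceptiveField estimator, and the default R^2 score stands in for 'corrcoef';
# X, y and groups are assumed to be numpy arrays of matching length.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneGroupOut

def nested_logo_cv(X, y, groups, lambdas):
    logo = LeaveOneGroupOut()
    fold_scores = []
    for tr, te in logo.split(X, y, groups):
        inner_groups = groups[tr]
        # score every lambda on the inner leave-one-group-out folds
        val = np.zeros((len(np.unique(inner_groups)), len(lambdas)))
        for j, (itr, ite) in enumerate(logo.split(X[tr], y[tr], inner_groups)):
            for k, lam in enumerate(lambdas):
                m = Ridge(alpha=lam).fit(X[tr][itr], y[tr][itr])
                val[j, k] = m.score(X[tr][ite], y[tr][ite])
        # refit the best lambda on the whole outer training fold, score on the test fold
        lam_opt = lambdas[np.argmax(val.sum(axis=0))]
        model = Ridge(alpha=lam_opt).fit(X[tr], y[tr])
        fold_scores.append(model.score(X[te], y[te]))
    return np.mean(fold_scores)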
def Run_Regression_Model(df, reg, cv_num, ALG, df_unknowns, test_df, cv_sets, j, save): from sklearn.model_selection import cross_val_predict from sklearn.metrics import make_scorer from sklearn.metrics import mean_squared_error, r2_score from sklearn.metrics import explained_variance_score # Data from balanced dataframe y = df['Y'] X = df.drop(['Y'], axis=1) # Obtain the predictions using 10 fold cross validation # (uses KFold cv by default): if isinstance(cv_sets, pd.DataFrame): from sklearn.model_selection import LeaveOneGroupOut cv_split = LeaveOneGroupOut() cv_folds = cv_split.split(X, y, cv_sets.iloc[:, j]) cv_pred = cross_val_predict(estimator=reg, X=X, y=y, cv=cv_folds) else: cv_pred = cross_val_predict(estimator=reg, X=X, y=y, cv=cv_num) cv_pred_df = pd.DataFrame(data=cv_pred, index=df.index, columns=['pred']) # Get performance statistics from cross-validation y = y.astype(float) mse = mean_squared_error(y, cv_pred) evs = explained_variance_score(y, cv_pred) r2 = r2_score(y, cv_pred) cor = np.corrcoef(np.array(y), cv_pred) result = [mse, evs, r2, cor[0, 1]] reg.fit(X, y) # Save the model for future persistence print(f'\nSaving model as {save+".joblib"}\n') dump(reg, save+'.joblib') # Apply fit model to unknowns if isinstance(df_unknowns, pd.DataFrame): unk_pred = reg.predict(df_unknowns.drop(['Y'], axis=1)) unk_pred_df = pd.DataFrame(data=unk_pred, index=df_unknowns.index, columns=['pred']) cv_pred_df = cv_pred_df.append(unk_pred_df) if not isinstance(test_df, str): test_y = test_df['Y'] test_pred = reg.predict(test_df.drop(['Y'], axis=1)) test_pred_df = pd.DataFrame(data=test_pred, index=test_df.index, columns=['pred']) cv_pred_df = cv_pred_df.append(test_pred_df) # Get performance stats mse_test = mean_squared_error(test_y, test_pred) evs_test = explained_variance_score(test_y, test_pred) r2_test = r2_score(test_y, test_pred) cor_test = np.corrcoef(np.array(test_y), test_pred) result_test = [mse_test, evs_test, r2_test, cor_test[0, 1]] # Try to extract importance scores try: importances = reg.feature_importances_ except: try: importances = reg.coef_ except: importances = "na" print("Cannot get importance scores") if not isinstance(test_df, str): return result, cv_pred_df, importances, result_test, reg else: return result, cv_pred_df, importances, reg
# temp = np.equal(guess, df['Class'])
# guess_acc = temp.sum() / 13500 * 100
datadf = df[Config['selected_features']]

## original Class order; shuffle it
if Config['debug']:
    data = datadf.sample(10)
else:
    data = datadf.sample(frac=1)

feature1 = data[Config['selected_features']]
label = data['Class']
groups = data['user']
feature = feature1.drop(columns=['Class', 'user'])

gss = LeaveOneGroupOut()
x = gss.split(feature, label, groups=groups)
clf = Config['model']
score = cross_val_score(clf, X=feature, y=label, cv=x)
print('the average of c.v. score is:')
print(score.mean())
print()
print('the standard deviation of c.v. score is:')
print(np.std(score))
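# The generator returned by gss.split(...) above is single-use; an equivalent call that
# lets scikit-learn do the splitting itself (a sketch of the same computation) is:
score_alt = cross_val_score(clf, X=feature, y=label, groups=groups, cv=LeaveOneGroupOut())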
LogisticRegression(C=args.c, class_weight='balanced', penalty='l2', solver='liblinear', verbose=0, max_iter=1000)) params = model['logisticregression'].get_params() print( f"Fitting {params['penalty']} penalized logistic regression with C={params['C']}." ) scores_l2 = cross_validate(model, X, y, cv=LeaveOneGroupOut().split(X, y, seqid), scoring=pr_auc, n_jobs=args.njobs, return_estimator=True, pre_dispatch='n_jobs', verbose=1) mean_score = scores_l2['test_score'].mean() std_score = scores_l2['test_score'].std() print(f'Mean score: {mean_score:.2f} (std {std_score:.2f})') model = make_pipeline( StandardScaler(), LogisticRegression(C=args.c, class_weight='balanced',
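# `pr_auc` above is defined elsewhere in this script; a plausible definition, assuming
# the usual area-under-the-precision-recall-curve intent, is the one below
# (needs_threshold applies to scikit-learn versions that predate response_method):
from sklearn.metrics import average_precision_score, make_scorer
pr_auc = make_scorer(average_precision_score, needs_threshold=True)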
# initialize performance tables acc_scores = ["participant_id", "modality"] for c in clf_tokens: for s in fs_perc: acc_scores.extend([ "ACC_%s_perc%s" % (c, s), "Perm_%s_perc%s" % (c, s), "P_%s_perc%s" % (c, s) ]) facc = os.path.join( vdir, "group_%s_MVPA-classifier-selection-PermACC_unimodal.csv" % task) with open(facc, 'w') as fid: fid.write(','.join(acc_scores) + '\n') fid.close() # do MVPA for each subject CV = LeaveOneGroupOut() # leave-one-run-out cross-validation for i in range(0, n): subj = subjects.index[i] print( "Perform ROI-based MVPA using classifiers: %s with LOROCV for subject: %s ......\n" % (clf_tokens, subj)) # setup individual path sdir = os.path.join(vdir, subj) # subject working folder bdir = os.path.join(sdir, 'betas_afni') # beta estimates pdir = os.path.join(sdir, 'tvrMVPC') # Trial-wise Volume ROI-based MVPC if not os.path.exists(pdir): os.makedirs(pdir) fbet = "%s/%s_LSS_nilearn.nii.gz" % (bdir, subj) for imod in mods: # read labels and betas according to the modality labs_mod = labs_trl.isin( ['WV', 'PV']) if imod == 'visual' else labs_trl.isin(['WA', 'PA'])
assert tokenize(cv) == tokenize(cv) tokens.append(cv) assert len(set(tokens)) == len(tokens) cv = cvs[0] sol = cv.get_n_splits(np_X, np_y, np_groups) assert compute_n_splits(cv, np_X, np_y, np_groups) == sol with assert_dask_compute(True): assert compute_n_splits(cv, da_X, da_y, da_groups) == sol with assert_dask_compute(False): assert compute_n_splits(cv, np_X, da_y, da_groups) == sol @pytest.mark.parametrize('cvs', [(LeaveOneGroupOut(), ), (LeavePGroupsOut(2), LeavePGroupsOut(3))]) def test_leave_group_out(cvs): tokens = [] for cv in cvs: assert tokenize(cv) == tokenize(cv) tokens.append(cv) assert len(set(tokens)) == len(tokens) cv = cvs[0] sol = cv.get_n_splits(np_X, np_y, np_groups) assert compute_n_splits(cv, np_X, np_y, np_groups) == sol with assert_dask_compute(True): assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
def main(args, pipe=False): ''' Checks passed arguments and performs requested actions. ''' if not pipe: parser = argparse.ArgumentParser( description='Classify call segments as positive or negative.') parser.add_argument('-f', '--features', dest='feat_loc', required=True, help='Path to CSV feature file.') parser.add_argument( '-o', '--out', dest='out_loc', required=True, help='Path to where classification summary should be saved.') parser.add_argument('--hmm', dest='hmm_flag', action='store_true', help='Classify with a Hidden Markov Model.') parser.add_argument('--rf', dest='rf_flag', action='store_true', help='Classify with a random forest.') parser.add_argument('--n_components', dest='n_components', help='Number of components for the HMM.') parser.add_argument('--n_mix', dest='n_mix', help='Number of Gaussian mixtures for the HMM.') parser.add_argument( '--n_estimators', dest='n_estimators', help='Number of tree estimators for the random forest.') args = parser.parse_args() if args.hmm_flag or args.rf_flag: # store scores from all runs to calc stats hmm_chunk_scores = [] hmm_overall_scores = [] rf_chunk_scores = [] rf_overall_scores = [] # split data for leave-one-group(call)-out validation data, labels, ids = sep_data_labels(args.feat_loc) logo = LeaveOneGroupOut() curr_split = 1 num_splits = logo.get_n_splits(data, labels, ids) # loop through all cross validation folds for train_index, test_index in logo.split(data, labels, ids): print('Split ' + str(curr_split) + ' out of ' + str(num_splits)) data_train, data_test = data[train_index], data[test_index] labels_train, labels_test = labels[train_index], labels[test_index] # classify with the selected models if args.hmm_flag: if args.n_components: n_components = int(args.n_components) else: n_components = 2 if args.n_mix: n_mix = int(args.n_mix) else: n_mix = 2 hmm_model = HmmMorency(n_components=n_components, n_mix=n_mix) chunk_scores, call_score = train_and_test( hmm_model, data_train, data_test, labels_train, labels_test) hmm_chunk_scores.append(chunk_scores) hmm_overall_scores.append(call_score) if args.rf_flag: if args.n_estimators: n_estimators = int(args.n_estimators) else: n_estimators = 100 rf_model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=10) chunk_scores, call_score = train_and_test( rf_model, data_train, data_test, labels_train, labels_test) rf_chunk_scores.append(chunk_scores) rf_overall_scores.append(call_score) curr_split += 1 # evaluate the scores for all models out_file = os.path.join(args.out_loc, 'results.txt') if args.hmm_flag: score_stats( 'hmm, mix: ' + str(n_mix) + ' states: ' + str(n_components), hmm_chunk_scores, hmm_overall_scores, out_file) if args.rf_flag: score_stats('random forest, estimators: ' + str(n_estimators), rf_chunk_scores, rf_overall_scores, out_file) else: sys.exit( 'Must choose at least one classification method. (--hmm, --rf)')
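# Example invocation of this script (script name and file paths are illustrative),
# matching the argparse flags defined above:
#     python classify.py -f features.csv -o results/ \
#         --hmm --n_components 3 --n_mix 2 --rf --n_estimators 200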
# Load feature matrices, labels, and groups (denoting which labeled time # segment each row of the feature matrix comes from) featuresAll = np.loadtxt(dataPath + dataName + '_all.csv', delimiter=',') featuresAcc = np.loadtxt(dataPath + dataName + '_acc.csv', delimiter=',') featuresEda = np.loadtxt(dataPath + dataName + '_eda.csv', delimiter=',') labels = np.loadtxt(dataPath + dataName + '_label.csv') groups = np.loadtxt(dataPath + dataName + '_groups.csv') # Indicates the subjects that have no MAs, in order to exclude them during grid search includeRowsTrain = np.logical_and( np.logical_and(np.where(groups != 5, True, False), np.where(groups != 17, True, False)), np.where(groups != 18, True, False)) # Leave-one-group-out cross-validation cv = LeaveOneGroupOut() ## Svm # Parameter tuning by grid search regParamC = 10.**np.arange(-3, 5) regParamG = 10.**np.arange(-10, 1) parameters = {'gamma': regParamG, 'C': regParamC} svmgsAll = GridSearchCV(svm.SVC(), parameters, 'roc_auc', n_jobs=nJobs, cv=cv, refit=False, verbose=1) svmgsAll.fit(featuresAll[includeRowsTrain, :], labels[includeRowsTrain],
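# Note: with cv=LeaveOneGroupOut(), GridSearchCV can only generate splits if the group
# labels are forwarded at fit time via fit(..., groups=...). A self-contained sketch
# with synthetic placeholder data (not from the original analysis):
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

_X = np.random.rand(20, 3)
_y = np.tile([0, 1], 10)          # alternating labels so every fold sees both classes
_g = np.repeat(np.arange(4), 5)   # four groups of five samples
_gs = GridSearchCV(svm.SVC(), {'C': [1.0, 10.0]}, scoring='accuracy',
                   cv=LeaveOneGroupOut())
_gs.fit(_X, _y, groups=_g)        # without groups=, LeaveOneGroupOut raises ValueError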
def whole_brain_analysis(model, fmri_runs, design_matrices, subject):
    # - fmri_runs: list of fMRI data runs (1 for each run)
    # - design_matrices: list of design matrices (1 for each run)
    nb_runs = len(fmri_runs)
    nb_voxels = fmri_runs[0].shape[1]
    n_sample = params.n_sample
    scores_cv = np.zeros((nb_runs, nb_voxels))
    distribution_array = np.zeros((nb_runs, n_sample, nb_voxels))

    logo = LeaveOneGroupOut()  # leave one run out!
    columns_index = np.arange(design_matrices[0].shape[1])
    shuffling = []
    cv = 0
    for _ in range(n_sample):
        np.random.shuffle(columns_index)
        # copy() is needed: shuffle works in place, so appending the array itself would
        # store n_sample references to the same (final) permutation
        shuffling.append(columns_index.copy())

    for train, test in tqdm(logo.split(fmri_runs, groups=range(1, nb_runs + 1))):
        # each fmri run is a 2D array: one column per voxel, one row per time point t_i
        fmri_data_train = np.vstack([fmri_runs[i] for i in train])
        predictors_train = np.vstack([design_matrices[i] for i in train])

        if type(model) == sklearn.linear_model.RidgeCV:
            train_runs = [fmri_runs[i] for i in train]
            # list of cumulative length indexes
            nb_samples = np.cumsum([0] + [run.shape[0] for run in train_runs])
            indexes = {'run{}'.format(run + 1): [nb_samples[i], nb_samples[i + 1]]
                       for i, run in enumerate(train)}
            # adequate splitter to cross-validate alpha while taking groups into account
            model.cv = Splitter(indexes_dict=indexes, n_splits=nb_runs - 1)

        print('Fitting model...')
        start = time()
        model_fitted = model.fit(predictors_train, fmri_data_train)
        # pickle.dump(model_fitted, open(join(paths.path2derivatives, 'fMRI/glm-indiv/english', str(model).split('(')[0] + '{}.sav'.format(test[0])), 'wb'))
        with open(os.path.join(paths.path2derivatives, 'fMRI', 'time2fit.txt'), 'w') as f:
            f.write(str(model_fitted))
            f.write('Model fitted in {}.'.format(time() - start))
        print('Model fitted in {}.'.format(time() - start))

        # return the R2_score for each voxel (=list)
        # r2 = get_r2_score(model_fitted, fmri_runs[test[0]], design_matrices[test[0]])
        r2, distribution = sample_r2(model_fitted, design_matrices[test[0]],
                                     fmri_runs[test[0]], shuffling=shuffling,
                                     n_sample=n_sample,
                                     alpha_percentile=params.alpha_percentile)

        # log the results
        # log(subject, voxel='whole brain', alpha=None, r2=r2)
        scores_cv[cv, :] = r2  # r2 is a 1D array: one value per voxel
        distribution_array[cv, :, :] = distribution  # 2D: n_sample values per voxel
        cv += 1

    # result = pd.DataFrame(scores, columns=['voxel #{}'.format(i) for i in range(scores.shape[1])])
    # result.to_csv(join(paths.path2derivatives, 'fMRI/glm-indiv/english', str(model).split('(')[0] + '.csv'))
    return scores_cv, distribution_array  # 2D arrays: (nb_runs_test, nb_voxels)
param_grid=param_dist, cv=cv, n_jobs=-1, verbose=True, scoring='neg_root_mean_squared_error') grid_search.fit(X_train_test, y_train_test, groups=groups_train_test) best_grid = grid_search.best_estimator_ print("Best Grid") print(best_grid) #choosing the reg model from grid search reg_model = grid_search.best_estimator_ ###################### Regression ########################## ############# perform leave-one-out-cross-validation on the training-testing set ############# logo = LeaveOneGroupOut() #initialize vector to keep score for each fold y_train_test_predictions = np.zeros(y_train_test.shape) for train, test in logo.split(X_train_test, y_train_test, groups=groups_train_test): #perform train test split for the current fold X_train, X_test, y_train, y_test = X_train_test[train], X_train_test[ test], y_train_test[train], y_train_test[test] model = reg_model # regression model from the grid search portion model_trained = model.fit(X_train, y_train)
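# The manual fold loop above can be expressed more compactly (a sketch, assuming the
# same sklearn-style reg_model and numpy arrays as in the original code):
from sklearn.model_selection import cross_val_predict
y_train_test_predictions = cross_val_predict(
    reg_model, X_train_test, y_train_test,
    cv=list(LeaveOneGroupOut().split(X_train_test, y_train_test, groups_train_test)))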
def findParametersAndEvaluate(self, data, strategy, label_name, group=None, dataset=None, cv=5): self.strategy = strategy self.results = {} print('-------------------------------') print(' STEP : Finding Parameters & Evaluate Models') print('-------------------------------') self.label_name_check(label_name) #print(self.labelset.columns) # store performance data for each strategy if (strategy == 'train_test_split' or strategy == 'all'): self.train_test = dict() for model in self.models.keys(): self.train_test[model] = None print('===> Evaluation strategy: Train and Test Split ') X_train, X_test, y_train, y_test = train_test_split( data, self.label_set[label_name], train_size=.7, random_state=self.seed) print('===> Parameters find-> Start') for model in self.models.keys(): if model == 'vot': continue if not self.configured: gd = GridSearchCV(self.models[model], self.params[model], cv=cv, scoring='neg_root_mean_squared_error') gd.fit(X_train, y_train) print(' Parameters for ', model, ': ', gd.best_params_) self.models[model] = gd.best_estimator_ print('===> Parameters find-> End') test_performances = dict() print('===> Test data performance[RMSE] ') for model in self.models.keys(): self.models[model].fit(X_train, y_train) test_performances[model] = mean_squared_error( y_test, self.models[model].predict(X_test), squared=False) #print(' Model[',model,']:',test_performances[model]) self.train_test[model] = test_performances[model] print(self.train_test) self.results['train_test'] = self.train_test if (strategy == 'cross_val' or strategy == 'all'): self.cross_val = dict() cross_val = dict() for model in self.models.keys(): self.cross_val[model] = None print('==============================================') print('Evaluation strategy: Cross Validation') print('==============================================') for model in self.models.keys(): if model != 'vot' and not self.configured: print(' ==> Finding params for ', model) gd = GridSearchCV(self.models[model], self.params[model], cv=10, scoring='neg_root_mean_squared_error') gd.fit(data, self.label_set[label_name]) print(' Parameters: ', gd.best_params_) self.models[model] = gd.best_estimator_ cross_val[model] = cross_val_score( self.models[model], data, self.label_set[label_name], scoring='neg_root_mean_squared_error', cv=cv) #print(' Score[',model,']:',cross_val_scores[model]) cross_val_mean = -1 * statistics.mean(cross_val[model]) cross_val_var = statistics.variance(cross_val[model]) self.cross_val[model] = [cross_val_mean, cross_val_var] self.results['cross_val'] = self.cross_val if (strategy == 'leave_one_group_out' or strategy == 'all'): self.leave_group = dict() for model in self.models.keys(): self.leave_group[model] = None print('==============================================') print('Evaluation strategy: Leave one group out') print('==============================================') logo = LeaveOneGroupOut() n_splits = logo.get_n_splits(groups=group) error = dict() for model in self.models.keys(): error[model] = [None] * n_splits k = 0 for train_index, test_index in logo.split( data, self.label_set[label_name], group): #print(test_index) X_train, y_train = data.iloc[train_index], self.label_set[ label_name][train_index] X_test, y_test = data.iloc[test_index], self.label_set[ label_name][test_index] for model in self.models.keys(): if model != 'vot' and not self.configured: print(' ==> Finding params for ', model) gd = GridSearchCV( self.models[model], self.params[model], cv=10, scoring='neg_root_mean_squared_error') gd.fit(X_train, 
y_train) print(' Parameters: ', gd.best_params_) estimator = gd.best_estimator_ self.models[model] = estimator self.models[model].fit(X_train, y_train) error[model][k] = mean_squared_error( y_test, self.models[model].predict(X_test), squared=False) #print(' Model[',model,']:',error[model]) k = k + 1 for model in self.models.keys(): err_mean = statistics.mean(error[model]) err_var = statistics.variance(error[model]) self.leave_group[model] = [err_mean, err_var] self.results['leave_group'] = self.leave_group if (strategy == 'leave_one_dataset_out' or strategy == 'all'): self.leave_dataset = dict() for model in self.models.keys(): self.leave_dataset[model] = None print('==============================================') print('Evaluation strategy: Leave one dataset out') print('==============================================') logo = LeaveOneGroupOut() n_splits = logo.get_n_splits(groups=dataset) error = dict() for model in self.models.keys(): error[model] = [None] * n_splits k = 0 for train_index, test_index in logo.split( data, self.label_set[label_name], dataset): X_train, y_train = data.iloc[train_index], self.label_set[ label_name][train_index] X_test, y_test = data.iloc[test_index], self.label_set[ label_name][test_index] for model in self.models.keys(): if model != 'vot' and not self.configured: print(' ==> Finding params for ', model) gd = GridSearchCV( self.models[model], self.params[model], cv=10, scoring='neg_root_mean_squared_error') gd.fit(X_train, y_train) #print(' Parameters: ',gd.best_params_) estimator = gd.best_estimator_ self.models[model] = estimator self.models[model].fit(X_train, y_train) error[model][k] = mean_squared_error( y_test, self.models[model].predict(X_test), squared=False) #print(' Model[',model,']:',error[model]) k = k + 1 for model in self.models.keys(): err_mean = statistics.mean(error[model]) err_var = statistics.variance(error[model]) self.leave_dataset[model] = [err_mean, err_var] self.results['leave_dataset'] = self.leave_dataset if (strategy == 'sorted_stratified' or strategy == 'all'): self.stratified = dict() for model in self.models.keys(): self.stratified[model] = None # idea from https://scottclowe.com/2016-03-19-stratified-regression-partitions/ print('==============================================') print('Evaluation strategy: Sorted Stratification') print('==============================================') label_df = pd.DataFrame(self.label_set) indices = label_df.sort_values(by=[label_name]).index.tolist() splits = dict() error = dict() for model in self.models.keys(): error[model] = [None] * cv for i in range(cv): splits[i] = list() for i in range(len(indices)): if i % cv == 0: pick = random.sample(range(cv), cv) cur_pick = pick.pop() splits[cur_pick].append(indices[i]) for i in range(cv): test_index = splits[i] train_index = [] for j in range(cv): if j != i: train_index = train_index + splits[j] ########################################## # Code to training model on sorted stratified set X_train, y_train = data.iloc[train_index], self.label_set[ label_name][train_index] X_test, y_test = data.iloc[test_index], self.label_set[ label_name][test_index] for model in self.models.keys(): if model != 'vot' and not self.configured: print(' ==> Finding params for ', model) gd = GridSearchCV( self.models[model], self.params[model], cv=10, scoring='neg_root_mean_squared_error') gd.fit(X_train, y_train) print(' Parameters: ', gd.best_params_) estimator = gd.best_estimator_ self.models[model] = estimator self.models[model].fit(X_train, y_train) error[model][i] 
= mean_squared_error(
                        y_test, self.models[model].predict(X_test), squared=False)
                    # print(' Model[', model, ']:', error[model])
            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])
                self.stratified[model] = [err_mean, err_var]
            ##########################################
            self.results['stratified'] = self.stratified

        # reject unknown strategies without discarding the results gathered above
        # (a plain `else` on the last `if` would wrongly fire for every other valid
        # strategy, such as 'train_test_split' or 'cross_val')
        if not self.results:
            print('Unsupported evaluation strategy')
            return None

        return self.results

    # Preparing dataframe with results for report generation
df['class_label'] = df.apply(lambda row: give_class(row), axis=1)

# boolean masks must be combined with '&'; chaining df[...][...] indexes the already
# filtered frame with a mask built on the full frame and is not equivalent
data_train_x = df[(df.Year != 2018) & (df.Year != 2016)][train_columns]
data_train_y = df[(df.Year != 2018) & (df.Year != 2016)].class_label
data_dev_x = df[df.Year == 2016][train_columns]
data_dev_y = df[df.Year == 2016].class_label
data_all_x = df[df.Year != 2018][train_columns]
data_all_y = df[df.Year != 2018].class_label

# Temporal_cv------------------------------------
from sklearn.model_selection import LeaveOneGroupOut
groups = data_all_x.Year.values.tolist()
temporal_cv = LeaveOneGroupOut()

# logistic regression------------------------------------
from sklearn.linear_model import LogisticRegression
clever_print('logistic regression with effectively no penalization')
# an l2 penalty with a very large C is practically unpenalized
basic_multi_logi = LogisticRegression(max_iter=10000000, penalty='l2',
                                      C=10000000000).fit(data_train_x, data_train_y)
print('accuracy on training')
print(accuracy_score(data_train_y, basic_multi_logi.predict(data_train_x)))
print('accuracy on dev')
print(accuracy_score(data_dev_y, basic_multi_logi.predict(data_dev_x)))
for i, n in enumerate(sorted(names)):
    roi_name = fold + 'mni4060/asymroi_' + smt + '_' + n + '.npz'
    roi = np.load(roi_name)['roi']
    roi = roi[:, motor_label - 1]
    roi_imp = roi[mask_imp]
    roi_imag = roi[mask_imag]
    roi_imp_all = np.vstack((roi_imp_all, roi_imp))
    roi_imag_all = np.vstack((roi_imag_all, roi_imag))
    y_imp_all = np.append(y_imp_all, y_imp)
    y_imag_all = np.append(y_imag_all, y_imag)
    groups = np.append(groups, np.ones(len(y_imp)) * i)

result_cv_tr_imp = []
result_cv_tr_imag = []
pipeline = Pipeline([('scale', scaler), ('svm', svm)])

from sklearn.model_selection import LeaveOneGroupOut
logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(roi_imp_all, y_imp_all, groups):
    # cross-condition decoding: train on the 'imp' condition, test on 'imag'
    X_train, X_test = roi_imp_all[train_index], roi_imag_all[test_index]
    y_train, y_test = y_imp_all[train_index], y_imp_all[test_index]
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    result_cv_tr_imp.append(accuracy_score(y_test, prediction))
    # and the reverse direction: train on 'imag', test on 'imp'
    X_train, X_test = roi_imag_all[train_index], roi_imp_all[test_index]
    y_train, y_test = y_imp_all[train_index], y_imp_all[test_index]
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    result_cv_tr_imag.append(accuracy_score(y_test, prediction))

from scipy.stats import ttest_1samp
# test the fold accuracies against chance level (0.5)
tt, p = ttest_1samp(np.array(result_cv_tr_imag), 0.5)
# osx_data_path = '/Users/elijahc/data/uminn/preprocessed'
# all_data = load_preprocessed(data_path=osx_data_path,file_names=file_names,simple=True)
# merged_data = load_preprocessed(data_path=osx_data_path,file_names=file_names,merge_keys=['pows','stages'],simple=True)

# linux
all_data = load_preprocessed(file_names=file_names, simple=True)
merged_data = load_preprocessed(file_names=file_names,
                                merge_keys=['pows', 'stages'],
                                simple=True)

scaler = StandardScaler()
scaler.fit(merged_data['pows'])

# one group id per recording, repeated for every row of that recording, flattened into
# the 1D per-sample array that scikit-learn splitters expect
groups = np.concatenate([[i] * len(d['pows']) for i, d in enumerate(all_data)])
# LeaveOneGroupOut takes no constructor arguments; groups are passed to split()
logo = LeaveOneGroupOut()

model_params = dict(
    layer_spec=[64],
    # reg_weight=0.01,
    # num_labels=3,
    activ='relu',
)

fit_params = dict(
    # validation_split=0.1,
    epochs=500,
    verbose=0,
)
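# If ten group-wise folds (rather than one fold per recording) were the intent above,
# GroupKFold is the matching scikit-learn splitter; a sketch, assuming at least ten
# distinct groups:
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=10)
# for train_idx, test_idx in gkf.split(merged_data['pows'], groups=groups): ...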
(DataImp_dropNA['Well Name'] == 'SHRIMPLIN')].index DataImp_dropF9 = DataImp_dropNA.drop(dropidx) wells_noPE = DataImp_dropF9['Well Name'].values DataImp = DataImp_dropF9.drop(['Formation', 'Well Name', 'Depth'], axis=1).copy() Ximp = DataImp.loc[:, DataImp.columns != 'PE'].values Yimp = DataImp.loc[:, 'PE'].values from sklearn.preprocessing import RobustScaler scaler = RobustScaler() scaler.fit(Ximp) Ximp_scaled = scaler.transform(Ximp) from sklearn.metrics import mean_squared_error logo = LeaveOneGroupOut() R2list = [] mselist = [] start = time.time() for train, test in logo.split(Ximp_scaled, Yimp, groups=wells_noPE): well_name = wells_noPE[test[0]] # Imputation using linear regression linear_model = LinearRegression() linear_model.fit(Ximp_scaled[train], Yimp[train]) R2 = linear_model.score(Ximp_scaled[test], Yimp[test]) # R2 print("Well name_test : ", well_name) print("R2: %.4f" % R2)
def RF_classifier(X_data,Y_data,options=None): from sklearn.ensemble import RandomForestClassifier #################### # Parse user options #################### params = {} gridsearch = False GS_settings = None randomsearch = False RS_settings = None accuracy = False cv_type = 'logo' scoring = 'f1' if (options is not None): if (("RF_parameters" in options)==True): params = options['RF_parameters'] if (("grid_search" in options)==True): from sklearn.model_selection import GridSearchCV gridsearch = True GS_params = options['grid_search']['parameter_grid'] if (("settings" in options['grid_search'])==True): GS_settings = options['grid_search']['settings'] if (("random_search" in options)==True): from sklearn.model_selection import RandomizedSearchCV from cfd2ml.utilities import convert_param_dist randomsearch = True RS_params, RS_Nmax = convert_param_dist(options['random_search']['parameter_grid']) print('RS_Nmax = ', RS_Nmax) if (("settings" in options['random_search'])==True): RS_settings = options['random_search']['settings'] if(randomsearch==True and gridsearch==True): quit('********** Stopping! grid_search and random_search both set *********') if (("accuracy" in options)==True): accuracy = options['accuracy'] if (accuracy==True): from sklearn.model_selection import cross_validate from sklearn.metrics import precision_recall_curve, auc, f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix from cfd2ml.utilities import print_cm if (("scoring" in options)==True): scoring = options['scoring'] if (("cv_type" in options)==True): cv_type = options['cv_type'] ############## # Prepare data ############## if(cv_type=='logo'): groups = X_data['group'] X_data = X_data.drop(columns='group') # Find feature and target headers X_headers = X_data.columns Y_header = Y_data.name nX = X_headers.size print('\nFeatures:') for i in range(0,nX): print('%d/%d: %s' %(i+1,nX,X_headers[i]) ) print('\nTarget: ', Y_header) ######################## # Prepare other settings ######################## # Setting cross-validation type (either leave-one-group-out or 5-fold) if(cv_type=='logo'): from sklearn.model_selection import LeaveOneGroupOut logo = LeaveOneGroupOut() ngroup = logo.get_n_splits(groups=groups) print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups') elif(cv_type=='kfold'): from sklearn.model_selection import StratifiedKFold print('\nUsing 10-fold cross validation') k_fold = StratifiedKFold(n_splits=10, random_state=42,shuffle=True) cv = k_fold.split(X_data,Y_data) ######################### # Training the classifier ######################### # TODO TODO TODO - improve accuracy by using balanced or weighted random forest # (see https://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf) if(gridsearch==True): # Finding optimal hyperparameters with GridSearchCV print('\n Performing GridSearchCV to find optimal hyperparameters for random forest classifier') clf = RandomForestClassifier(**params,random_state=42) if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups) GS_clf = GridSearchCV(estimator=clf,param_grid=GS_params, cv=cv, scoring=scoring, iid=False, verbose=2, **GS_settings) GS_clf.fit(X_data,Y_data) # Write out results to file scores_df = pd.DataFrame(GS_clf.cv_results_)#.sort_values(by='rank_test_score') scores_df.to_csv('GridSearch_results.csv') # Pich out best results best_params = GS_clf.best_params_ best_score = GS_clf.best_score_ clf = GS_clf.best_estimator_ # (this clf has been fit to all of the X_data,Y_data) print('\nBest hyperparameters 
found:', best_params) print('\nScore with these hyperparameters:', best_score) elif(randomsearch==True): # Finding optimal hyperparameters with RandomSearchCV print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest classifier') clf = RandomForestClassifier(**params,random_state=42) if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups) RS_clf = RandomizedSearchCV(estimator=clf,param_distributions=RS_params, cv=cv, scoring=scoring,iid=False, verbose=2, error_score=np.nan, **RS_settings) RS_clf.fit(X_data,Y_data) # Write out results to file scores_df = pd.DataFrame(RS_clf.cv_results_)#.sort_values(by='rank_test_score') scores_df.to_csv('RandomSearch_results.csv') # Pick out best results best_params = RS_clf.best_params_ best_score = RS_clf.best_score_ clf = RS_clf.best_estimator_ # (this clf has been fit to all of the X_data,Y_data) print('\nBest hyperparameters found:', best_params) print('\nScore with these hyperparameters:', best_score) else: # Train RF classifier with hyperparameters given by user print('\nTraining random forest classifer with given hyperparameters') clf = RandomForestClassifier(**params) clf.fit(X_data,Y_data) # Cross validation accuracy metrics if(accuracy==True): print('\nPerforming cross validation to determine train and test accuracy/error, and precision-recall curves') #TODO - capability to decide on probablity threshold, and predict with chosen threshold # Get generator object depending on cv strategy if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups) elif(cv_type=='kfold'): cv = k_fold.split(X_data,Y_data) # Need to regen "Generator" object fig1, ax1 = plt.subplots() # Init lists y_real = [] y_proba = [] train_f1 = [] test_f1 = [] train_A = [] test_A = [] train_BA = [] test_BA = [] # Loop through CV folds i = 0 for train_index, test_index in cv: X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index] Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index] # Train classifier clf_cv = clf clf_cv.fit(X_train, Y_train) # Predict Y Y_pred_train = clf_cv.predict(X_train) Y_pred_test = clf_cv.predict(X_test ) # F1 scores f1score = f1_score(Y_test , Y_pred_test) train_f1.append(f1_score(Y_train, Y_pred_train) ) test_f1.append(f1score) # Accuracy scores Ascore = accuracy_score(Y_test , Y_pred_test) train_A.append(accuracy_score(Y_train, Y_pred_train) ) test_A.append(Ascore) # Balanced accuracy scores BAscore = balanced_accuracy_score(Y_test , Y_pred_test) train_BA.append(balanced_accuracy_score(Y_train, Y_pred_train) ) test_BA.append(BAscore) # Print validation scores (training scores are stored to print mean later, but not printed for each fold) if(cv_type=='logo'): print('\nTest group = ', groups.iloc[test_index[0]]) elif(cv_type=='kfold'): print('\nFold = ', i) print('-------------------') print('F1 score = %.2f %%' %(f1score*100) ) print('Total error = %.2f %%' %((1.0-Ascore)*100) ) print('Per-class error = %.2f %%' %((1.0-BAscore)*100) ) # Print confusion matrix for this fold print('Confusion matrix:') confuse_mat = confusion_matrix(Y_test, Y_pred_test) print_cm(confuse_mat, ['Off','On']) # Prediction probability based on X_test (used for precision-recall curves) pred_proba = clf_cv.predict_proba(X_test) precision, recall, _ = precision_recall_curve(Y_test, pred_proba[:,1]) lab = 'Fold %d AUC=%.4f' % (i+1, auc(recall, precision)) ax1.step(recall, precision, label=lab) y_real.append(Y_test) y_proba.append(pred_proba[:,1]) i += 1 # Calculate errors from accuracies train_TE = 1.0 - np.array(train_A) 
test_TE = 1.0 - np.array(test_A) train_CAE = 1.0 - np.array(train_BA) test_CAE = 1.0 - np.array(test_BA) # Print performance scores print('\nMean training scores:') print('F1 score = %.2f %%' %(np.mean(train_f1)*100) ) print('Total error = %.2f %%' %(np.mean(train_TE)*100) ) print('Per-class error = %.2f %%' %(np.mean(train_CAE)*100) ) print('\nMean validation scores:') print('F1 score = %.2f %%' %(np.mean(test_f1)*100) ) print('Total error = %.2f %%' %(np.mean(test_TE)*100) ) print('Per-class error = %.2f %%' %(np.mean(test_CAE)*100) ) # Average precision-recall over folds, and plot curves y_real = np.concatenate(y_real) y_proba = np.concatenate(y_proba) precision, recall, _ = precision_recall_curve(y_real, y_proba) lab = 'Overall AUC=%.4f' % (auc(recall, precision)) ax1.step(recall, precision, label=lab, lw=2, color='black') ax1.set_xlabel('Recall') ax1.set_ylabel('Precision') ax1.legend(loc='lower left', fontsize='small') plt.show() return clf
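# Example options dict for RF_classifier, using only keys the function parses above
# (the values are illustrative; X_data must contain a 'group' column when cv_type is
# 'logo'):
options = {
    'RF_parameters': {'n_estimators': 100, 'min_samples_leaf': 2},
    'grid_search': {'parameter_grid': {'max_depth': [4, 8, 16]},
                    'settings': {'n_jobs': 4}},
    'accuracy': True,
    'scoring': 'f1',
    'cv_type': 'logo',
}
# clf = RF_classifier(X_data, Y_data, options=options)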