def getProbsGSThread(nthread, clf, data, label, allAuthors, modeldir, saveModel):
    lolo = LeaveOneLabelOut(label)
    prob_per_author = [[0] * (len(allAuthors) + 3)
                       for i in range(len(allAuthors) + 3)]
    scores = Parallel(n_jobs=nthread, verbose=5)(
        delayed(getProbsTrainTest)(clf, data, label, train, test,
                                   modeldir, saveModel)
        for train, test in lolo)
    #print (scores)
    for train, test in lolo:
        anAuthor = int(label[test[0]])
        #print (anAuthor)
        train_data_label = label[train]
        trainAuthors = list(set(train_data_label))
        test_data_label = label[test]
        nTestDoc = len(test_data_label)
        for j in range(nTestDoc):
            for i in range(len(trainAuthors)):
                prob_per_author[anAuthor][int(trainAuthors[i])] += \
                    scores[anAuthor - 1][j][i]
        for i in range(len(trainAuthors)):
            prob_per_author[anAuthor][int(trainAuthors[i])] /= nTestDoc
    return prob_per_author
def _fit_predict(clf, X, y_true, labels, params):
    y_pred = np.empty_like(y_true)
    clf = clone(clf).set_params(**params)
    for train, test in LeaveOneLabelOut(labels):
        clf.fit(X[train], y_true[train])
        y_pred[test] = clf.predict(X[test])
    return y_pred
def SVC_decode_full(X, Y, n_times, epochs, indexcountlist):
    clf = SVC(C=1, kernel='linear')
    # Define a monte-carlo cross-validation generator (reduce variance):
    #cv = ShuffleSplit(len(X), 10, test_size=0.2, random_state=0)
    cv = LeaveOneLabelOut(labels=np.concatenate(indexcountlist))
    # cv = LabelShuffleSplit(np.concatenate(indexcountlist), test_size=4, random_state=0)
    cm_return = np.ones((len(comb), len(comb))) * (1 / len(comb))
    Xa = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    # Standardize features
    Xa -= Xa.mean(axis=0)
    Xa /= Xa.std(axis=0)
    # Run cross-validation
    # Note : for sklearn the Xt matrix should be 2d (n_samples x n_features)
    preds = np.empty(len(Y))
    for train, test in cv:
        clf.fit(Xa[train], Y[train])
        preds[test] = clf.predict(Xa[test])
    # Normalized confusion matrix
    cm = confusion_matrix(Y, preds)
    cm_normalized = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
    cm_return[:, :] = cm_normalized
    print 'done'
    return cm_return
def instantiate_grid_search_models(data, n_jobs=1, scoring='f1_micro',
                                   sample_label_column='screen_number'):
    """Instantiates each GridSearchCV model

    Args:
        data (pd.DataFrame): data to train models on
        n_jobs (int): how many threads to use
        scoring (str): metric to optimize with grid search
        sample_label_column (str): column to use for LeaveOneLabelOut CV

    Returns:
        list of sklearn GridSearchCV models
    """
    cv_settings = {
        'verbose': 1,
        'n_jobs': n_jobs,
        'scoring': scoring,
        'cv': LeaveOneLabelOut(data[sample_label_column])
    }
    # K Nearest Neighbors
    knn_param_grid = {'knn__n_neighbors': [25, 50, 75]}
    knn_grid_search_cv = GridSearchCV(knn_pipeline(), knn_param_grid,
                                      **cv_settings)
    models = [knn_grid_search_cv]
    return models
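# A minimal usage sketch for the factory above (hypothetical DataFrame and
# column names; `knn_pipeline()` is assumed to be defined elsewhere in this
# module):
# models = instantiate_grid_search_models(df, n_jobs=4)
# for model in models:
#     model.fit(df[feature_columns], df['target'])
#     print(model.best_params_, model.best_score_)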
def do_cv(self, Train, xcols='wvl', ycol=('comp', 'SiO2'), method='PLS'):
    #TODO: get RANSAC working with CV
    #create an iterator for cross validation based on the predefined folds
    cv_iterator = LeaveOneLabelOut(Train[('meta', 'Folds')])
    rmsecv_folds = []
    rmsec = []
    rmsecv = []
    #loop through the grid of parameters, do cross validation for each permutation
    for i in list(range(len(self.paramgrid))):
        print(self.paramgrid[i])
        #self.modelkey = method + ' ' + str(self.paramgrid[i])
        #create the estimator object with the current parameters
        model = regression([method], [self.paramgrid[i]])
        rmsecv_folds_tmp = []  #Create empty list to hold RMSECV for each fold
        #Iterate through each of the folds in the training set
        for train, holdout in cv_iterator:
            #create the name of the column in which results will be stored
            cvcol = ('meta', method + '-CV-' + str(self.paramgrid[i]))  #ycol[-1]+'_cv_'+method+'_param'+str(i)
            cv_train = Train.iloc[train]  #extract the data to be used to create the model
            cv_holdout = Train.iloc[holdout]  #extract the data that will be held out of the model
            model.fit(cv_train[xcols], cv_train[ycol])
            if model.goodfit:
                y_pred_holdout = model.predict(cv_holdout[xcols])
            else:
                y_pred_holdout = cv_holdout[ycol] * np.nan
            Train.set_value(Train.index[holdout], cvcol, y_pred_holdout)
            rmsecv_folds_tmp.append(RMSE(y_pred_holdout, cv_holdout[ycol]))
        rmsecv_folds.append(rmsecv_folds_tmp)
        rmsecv.append(RMSE(Train[ycol], Train[cvcol]))
        model.fit(Train[xcols], Train[ycol])
        if model.goodfit:
            ypred_train = model.predict(Train[xcols])
        else:
            ypred_train = Train[ycol] * np.nan
        calcol = ('meta', method + '-Cal-' + str(self.paramgrid[i]))
        Train[calcol] = ypred_train
        rmsec.append(RMSE(ypred_train, Train[ycol]))
    output = pd.DataFrame(self.paramgrid)
    output['RMSEC'] = rmsec
    output['RMSECV'] = rmsecv
    rmsecv_folds = np.array(rmsecv_folds)
    for i in list(range(len(rmsecv_folds[0, :]))):
        label = 'Fold' + str(i)
        output[label] = rmsecv_folds[:, i]
    return Train, output
def train_test_scale(df, y, users, cols_to_scale, cols_to_combine):
    for train, test in LeaveOneLabelOut(users):
        scaler = StandardScaler().fit(df.iloc[train][cols_to_scale].copy())
        df_new_train = scale_and_combine(df.iloc[train], scaler,
                                         cols_to_scale, cols_to_combine)
        df_new_test = scale_and_combine(df.iloc[test], scaler,
                                        cols_to_scale, cols_to_combine)
        yield df_new_train, df_new_test, y.iloc[train], y.iloc[test]
def leave_one_label_out(self):
    """
    Uses LeaveOneLabelOut() from sklearn's cross_validation module for
    year-by-year cross-validation: creates an object that splits the
    data into training and test sets, holding out one year per fold.
    The year labels are assumed to be a pandas DataFrame or Series.
    """
    lol = LeaveOneLabelOut(self.labels)
    return lol
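# A minimal, self-contained sketch of what the splitter above yields
# (synthetic year labels; an illustration, not part of the original module):
import numpy as np
from sklearn.cross_validation import LeaveOneLabelOut

years = np.array([2010, 2010, 2011, 2011, 2012, 2012])
for train_idx, test_idx in LeaveOneLabelOut(years):
    # each fold holds out every sample from one year
    print(train_idx, test_idx)
# -> [2 3 4 5] [0 1], [0 1 4 5] [2 3], [0 1 2 3] [4 5]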
def main(arglist):
    args = parse_args(arglist)
    if args.subjects is None:
        args.subjects = lyman.determine_subjects()

    for subj in args.subjects:
        print "Running subject", subj
        searchlight_dir = op.join(analysis_dir, "dksort", subj,
                                  "mvpa/searchlight")
        if not op.exists(searchlight_dir):
            os.mkdir(searchlight_dir)

        vol_fname = op.join(searchlight_dir, "dimension_dksort_pfc.nii.gz")
        if "fit" in args.do and (not op.exists(vol_fname) or args.overwrite):
            print " Doing searchlight"
            mask_img, X, y, runs = load_data(subj)
            s = SearchLight(mask_img, radius=10, n_jobs=10,
                            estimator=LogisticRegression(),
                            cv=LeaveOneLabelOut(runs))
            s.fit(X, y)
            out_img = nib.Nifti1Image(s.scores_, s.mask_img.get_affine())
            out_img.to_filename(vol_fname)

        surf_fnames = [op.join(searchlight_dir, "lh.dimension_dksort_pfc.mgz"),
                       op.join(searchlight_dir, "rh.dimension_dksort_pfc.mgz")]
        if "surf" in args.do and (not all(map(op.exists, surf_fnames))
                                  or args.overwrite):
            print " Doing surfreg"
            reg_fname = op.join(analysis_dir, "dksort", subj,
                                "preproc/run_1/func2anat_tkreg.dat")
            for i, hemi in enumerate(["lh", "rh"]):
                cmdline = ["mri_vol2surf",
                           "--mov", vol_fname,
                           "--reg", reg_fname,
                           "--trgsubject", "fsaverage",
                           "--projfrac-avg", "0", "1", ".1",
                           "--surf-fwhm", "5",
                           "--hemi", hemi,
                           "--o", surf_fnames[i]]
                sp.check_output(" ".join(cmdline), shell=True)
def main():
    print('-----------------------------')
    print('| Active Learning Activated |')
    print('-----------------------------')
    X = np.load('X.npy')
    Y = np.load('Y.npy')
    Y = Y.astype(np.int64)
    labels = np.load('LOO.npy')

    # fixes errors with NaN data
    X = preprocessing.Imputer().fit_transform(X)
    print(X.shape, Y.shape)

    # The feature split is not obvious from the column numbers alone; it was
    # chosen intuitively while cross-checking with the feature_importances_
    # attribute to form two roughly equally good subspaces.
    # Features for the first classifier
    clasOneCols = [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 32]
    clasOneData = X[:, clasOneCols]
    # Features for the second classifier
    clasTwoCols = [6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                   29, 30, 31]
    clasTwoData = X[:, clasTwoCols]

    # Training user-specific models
    lolo = LeaveOneLabelOut(labels)
    for dontcare, users in lolo:
        np.random.shuffle(users)
        train = users[0:int(0.5 * len(users))]
        test = users[int(0.5 * len(users)):len(users)]
        # Train two classifiers with different features on half of the data,
        # for comparison
        rfrPers1 = RandomForestClassifier(n_estimators=300, n_jobs=-1)
        rfrPers1.fit(clasOneData[train], Y[train])
        rfrPers2 = RandomForestClassifier(n_estimators=300, n_jobs=-1)
        rfrPers2.fit(clasTwoData[train], Y[train])
        pred1 = rfrPers1.predict(clasOneData[test])
        print(tolAcc(Y[test], pred1))
        pred2 = rfrPers2.predict(clasTwoData[test])
        print(tolAcc(Y[test], pred2))
        pred1 = pred1.astype(np.int64)
        pred2 = pred2.astype(np.int64)
        # How much the two classifiers agree is computed by activeLabeling()
        activeLabeling(Y[test], pred1, pred2)
def _crossValidate(self, y_train, X_train, refit=False):
    # Run the grid search
    print "Cross-validating"  # for", self.numFolds, "folds"
    print "Args", self.classifierArgs
    cv = LeaveOneLabelOut(self._getTrainGroups())
    #cv = StratifiedKFold(y_train, n_folds=self.numFolds, shuffle=True, random_state=1)
    #self.getCV(y_train, self.meta.meta, numFolds=self.numFolds)
    #cv = BalancedIteratorCV(y_train, n_folds=self.numFolds, shuffle=True, random_state=1, examples=[x for x in self.meta.db.query("SELECT * from example WHERE [set] == 'train';")], groupBy="project_code")
    classifier, classifierArgs = self._getClassifier()
    metric = self.metric
    search = ExtendedGridSearchCV(classifier(), classifierArgs, refit=refit,
                                  cv=cv, scoring=metric, verbose=self.verbose,
                                  n_jobs=self.parallel,
                                  pre_dispatch=int(self.preDispatch)
                                  if self.preDispatch.isdigit()
                                  else self.preDispatch)
    search.fit(X_train, y_train)
    print "---------------------- Grid scores on development set --------------------------"
    results = []
    index = 0
    bestExtras = None
    bestScores = None
    for params, mean_score, scores in search.grid_scores_:
        print "Grid:", params
        results.append(self._getResult("train", classifier, cv, params,
                                       None, None, mean_score, scores,
                                       extra={"train_size": None,
                                              "test_size": None}))
        if bestScores == None or float(mean_score) > bestScores[1]:
            bestScores = (params, mean_score, scores)
            if hasattr(search, "extras_"):
                bestExtras = search.extras_[index]
        for fold in range(len(scores)):
            result = self._getResult("train", classifier, cv, params,
                                     scores[fold], fold)
            if hasattr(search, "extras_"):
                for key in search.extras_[index][fold].get("counts", {}).keys():
                    result[key + "_size"] = search.extras_[index][fold]["counts"][key]
            results.append(result)
        if hasattr(search, "extras_") and self.classes and len(self.classes) == 2:
            print ["%0.8f" % x for x in self._validateExtras(search.extras_[index], y_train)], "(eval:auc)"
        print scores, "(" + self.metric + ")"
        print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
        index += 1
    print "---------------------- Best scores on development set --------------------------"
    params, mean_score, scores = bestScores
    print scores
    print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params)
    print "--------------------------------------------------------------------------------"
    # Save the grid search results
    print "Saving results"
    self._insert("result", results)
    self._saveExtras(bestExtras, "train")
    self.db.flush()
    return search
def set_cv(self, cv_dict):
    """ Set the CV algorithm to use in subsequent prediction analyses.

    Args:
        cv_dict: Type of cross_validation to use. A dictionary of
            {'kfolds': 5} or {'loso': subject_id}.

    """
    if type(cv_dict) is dict:
        if cv_dict.keys()[0] is 'kfolds':
            from sklearn.cross_validation import StratifiedKFold
            self.cv = StratifiedKFold(self.Y, n_folds=cv_dict.values()[0])
        elif cv_dict.keys()[0] is 'loso':
            from sklearn.cross_validation import LeaveOneLabelOut
            self.cv = LeaveOneLabelOut(labels=cv_dict.values()[0])
        else:
            raise ValueError("Make sure you specify a dictionary of "
                             "{'kfolds': 5} or {'loso': subject_id}.")
    else:
        raise ValueError("Make sure 'cv_dict' is a dictionary.")
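# A small sketch of the two cv_dict options handled above (synthetic data;
# assumes the old sklearn.cross_validation API, as in the method itself):
import numpy as np
from sklearn.cross_validation import StratifiedKFold, LeaveOneLabelOut

y_demo = np.array([0, 1, 0, 1, 0, 1])
subject_demo = np.array([1, 1, 2, 2, 3, 3])
cv_kfolds = StratifiedKFold(y_demo, n_folds=2)    # {'kfolds': 2}
cv_loso = LeaveOneLabelOut(labels=subject_demo)   # {'loso': subject_id}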
#selector.pvalues_[selector.pvalues_ < 1e-200] = 1e-200
#scores = -np.log10(selector.pvalues_)
#scores /= scores.max()
#print "selected factors:\n"
#support = selector.get_support(indices=True)
#selected = header[selector.get_support(indices=True)]
#for i, item in enumerate(selected):
#    print item, " ", scores[support[i]]
#print X.shape
#X = selector.transform(X)
#print X.shape

# ridge
lol = LeaveOneLabelOut(offers)
lop = LeavePLabelOut(offers, len(offers_set) - 1)
cvset = lop

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True,
                            max_depth=5, min_samples_leaf=5)  #0.675
xrf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)  #0.662
dt = DecisionTreeClassifier()  #0.559
lr = LogisticRegression(C=0.001)  # 0.654
gbr = GradientBoostingRegressor()  # 0.691
#gbr1 = GradientBoostingRegressor(loss="quantile", alpha=0.6)  # 0.542
gbr1 = GradientBoostingRegressor(loss='quantile', alpha=0.5)  #0.544
gbc = GradientBoostingClassifier()  #0.692
def Run_Regression_Model(df, reg, cv_num, ALG, df_unknowns, test_df,
                         cv_sets, j):
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics.scorer import make_scorer
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.metrics import explained_variance_score

    # Data from balanced dataframe
    y = df['Y']
    X = df.drop(['Y'], axis=1)

    # Obtain the predictions using 10 fold cross validation
    # (uses KFold cv by default):
    if isinstance(cv_sets, pd.DataFrame):
        from sklearn.cross_validation import LeaveOneLabelOut
        cv_folds = LeaveOneLabelOut(cv_sets.iloc[:, j])
        cv_pred = cross_val_predict(estimator=reg, X=X, y=y, cv=cv_folds)
    else:
        cv_pred = cross_val_predict(estimator=reg, X=X, y=y, cv=cv_num)
    cv_pred_df = pd.DataFrame(data=cv_pred, index=df.index, columns=['pred'])

    # Get performance statistics from cross-validation
    y = y.astype(float)
    mse = mean_squared_error(y, cv_pred)
    evs = explained_variance_score(y, cv_pred)
    r2 = r2_score(y, cv_pred)
    cor = np.corrcoef(np.array(y), cv_pred)
    result = [mse, evs, r2, cor[0, 1]]

    reg.fit(X, y)

    # Apply fit model to unknowns
    if isinstance(df_unknowns, pd.DataFrame):
        unk_pred = reg.predict(df_unknowns.drop(['Y'], axis=1))
        unk_pred_df = pd.DataFrame(data=unk_pred, index=df_unknowns.index,
                                   columns=['pred'])
        cv_pred_df = cv_pred_df.append(unk_pred_df)

    if not isinstance(test_df, str):
        test_y = test_df['Y']
        test_pred = reg.predict(test_df.drop(['Y'], axis=1))
        test_pred_df = pd.DataFrame(data=test_pred, index=test_df.index,
                                    columns=['pred'])
        cv_pred_df = cv_pred_df.append(test_pred_df)
        # Get performance stats
        mse_test = mean_squared_error(test_y, test_pred)
        evs_test = explained_variance_score(test_y, test_pred)
        r2_test = r2_score(test_y, test_pred)
        cor_test = np.corrcoef(np.array(test_y), test_pred)
        result_test = [mse_test, evs_test, r2_test, cor_test[0, 1]]

    # Try to extract importance scores
    try:
        importances = reg.feature_importances_
    except:
        try:
            importances = reg.coef_
        except:
            importances = "na"
            print("Cannot get importance scores")

    if not isinstance(test_df, str):
        return result, cv_pred_df, importances, result_test
    else:
        return result, cv_pred_df, importances
session_labels = labels["chunks"][resting_state == False]

# The classifier: a support vector classifier
from sklearn.svm import SVC
classifier = SVC(C=1., kernel="linear")

# A classifier to set the chance level
from sklearn.dummy import DummyClassifier
dummy_classifier = DummyClassifier()

# Make a data splitting object for cross validation
from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
cv = LeaveOneLabelOut(session_labels)

mask_names = ['mask_vt', 'mask_face', 'mask_house']

mask_scores = {}
mask_chance_scores = {}

for mask_name in mask_names:
    print "Working on mask %s" % mask_name
    # For decoding, standardizing is often very important
    masker = NiftiMasker(mask_img=data_files[mask_name][0], standardize=True)
    masked_timecourses = masker.fit_transform(
        data_files.func[0])[resting_state == False]

    mask_scores[mask_name] = {}
    mask_chance_scores[mask_name] = {}
    preds_test = None
    # switch to add subjects
    if addSubjectID:
        dataTest = np.c_[dataTest, subjects_test]

    # get predictions
    p = np.zeros((len(ids), 6))
    for k in range(nbags):
        print(("Test Bag #%d" % (k + 1)))
        model = all_models.pop(0)
        p += model.predict_proba(dataTest) / nbags
    np.save('test/test_%s.npy' % fileName, [p])
else:
    auc_tot = []
    p = np.zeros(labels.shape)
    cv = LeaveOneLabelOut(series)
    for fold, (train, test) in enumerate(cv):
        for k in range(nbags):
            print(("Train Bag #%d/%d" % (k + 1, nbags)))
            allsubjects = np.arange(1, 13)
            np.random.shuffle(allsubjects)
            ix_subjects = np.sum([subjects[train] == s
                                  for s in allsubjects[0:bagsize]],
                                 axis=0) != 0
            model = deepcopy(model_base)
            model.mdlNr = k
            if modelName == 'NeuralNet':
                model.fit(dataTrain[train[ix_subjects]],
                          labels[train[ix_subjects]],
                          dataTrain[test], labels[test])
            else:
                model.fit(dataTrain[train[ix_subjects]],
                          labels[train[ix_subjects]])
            p[test] += model.predict_proba(dataTrain[test]) / nbags
        auc = [roc_auc_score(labels[test][:, col], p[test][:, col])
def test_generalization_across_time():
    """Test time generalization decoding
    """
    from sklearn.svm import SVC
    from sklearn.base import is_classifier
    # KernelRidge is used for testing 1) regression analyses 2) n-dimensional
    # predictions.
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import roc_auc_score, mean_squared_error

    epochs = make_epochs()
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    if check_version('sklearn', '0.18'):
        from sklearn.model_selection import (KFold, StratifiedKFold,
                                             ShuffleSplit, LeaveOneGroupOut)
        cv = LeaveOneGroupOut()
        cv_shuffle = ShuffleSplit()
        # XXX we cannot pass any other parameters than X and y to cv.split
        # so we have to build it before hand
        cv_lolo = [(train, test) for train, test in cv.split(
                   y_4classes, y_4classes, y_4classes)]
        # With sklearn >= 0.17, `clf` can be identified as a regressor, and
        # the scoring metrics can therefore be automatically assigned.
        scorer_regress = None
    else:
        from sklearn.cross_validation import (KFold, StratifiedKFold,
                                              ShuffleSplit, LeaveOneLabelOut)
        cv_shuffle = ShuffleSplit(len(epochs))
        cv_lolo = LeaveOneLabelOut(y_4classes)
        # With sklearn < 0.17, `clf` cannot be identified as a regressor, and
        # therefore the scoring metrics cannot be automatically assigned.
        scorer_regress = mean_squared_error

    # Test default running
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
                 "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)
    # test different predict function:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_method='decision_function')
        gat.fit(epochs)
        # With classifier, the default cv is StratifiedKFold
        assert_true(gat.cv_.__class__ == StratifiedKFold)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    gat.predict_method = 'predict_proba'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 2))
    gat.predict_method = 'foo'
    assert_raises(NotImplementedError, gat.predict, epochs)
    gat.predict_method = 'predict'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs, no score>", "%s" % gat)
    gat.score(epochs)
    assert_true(gat.scorer_.__name__ == 'accuracy_score')
    # check clf / predict_method combinations for which the scoring metrics
    # cannot be inferred.
    gat.scorer = None
    gat.predict_method = 'decision_function'
    assert_raises(ValueError, gat.score, epochs)
    # Check specifying y manually
    gat.predict_method = 'predict'
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs,\n scored "
                 "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15>",
                 "%s" % gat.train_times_)
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15 x 15>",
                 "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10
    gat_ = copy.deepcopy(gat)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] ==
                gat.y_true_.shape[0] ==
                len(gat.y_pred_[0][0]) == 14)
    # --- number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # --- length training size
    assert_true(len(gat.train_times_['slices']) == 15 ==
                np.shape(gat.estimators_)[0])
    # --- length testing sizes
    assert_true(len(gat.test_times_['slices']) == 15 ==
                np.shape(gat.scores_)[0])
    assert_true(len(gat.test_times_['slices'][0]) == 15 ==
                np.shape(gat.scores_)[1])

    # Test score_mode
    gat.score_mode = 'foo'
    assert_raises(ValueError, gat.score, epochs)
    gat.score_mode = 'fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15, 5])
    gat.score_mode = 'mean-sample-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.score_mode = 'mean-fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.predict_mode = 'mean-prediction'
    with warnings.catch_warnings(record=True) as w:
        gat.score(epochs)
        assert_true(any("score_mode changed from " in str(ww.message)
                        for ww in w))

    # Test longer time window
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    with warnings.catch_warnings(record=True):  # not vectorizing
        scores = gat.score(epochs)
    assert_true(isinstance(scores, np.ndarray))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)

    # Decim training steps
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # training time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    train_times = dict(start=0.090, stop=0.250)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_lolo,
                                       train_times=train_times)
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal([tim for ttime in gat.test_times_['times']
                        for tim in ttime], gat.train_times_['times'])

    # Test generalization across conditions
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_mode='mean-prediction', cv=2)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    with warnings.catch_warnings(record=True):
        # There are some empty test folds because of n_trials
        gat.predict(epochs[7:])
        gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.fit, epochs)
        gat_.train_times = dict(start=999.)
        assert_raises(ValueError, gat_.fit, epochs)
        # --- impossible slices
        gat_.train_times = dict(step=.000001)
        assert_raises(ValueError, gat_.fit, epochs)
        gat_.train_times = dict(length=.000001)
        assert_raises(ValueError, gat_.fit, epochs)
        gat_.train_times = dict(length=999.)
        assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    with warnings.catch_warnings(record=True):  # no epochs in fold
        assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)
    # --- irregular length training and testing times
    # 2 estimators, the first one is trained on two successive time samples
    # whereas the second one is trained on a single time sample.
    train_times = dict(slices=[[0, 1], [1]])
    # The first estimator is tested once, the second estimator is tested on
    # two successive time samples.
    test_times = dict(slices=[[[0, 1]], [[0], [1]]])
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times=train_times,
                                       test_times=test_times)
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.score(epochs)
    assert_array_equal(np.shape(gat.y_pred_[0]), [1, len(epochs), 1])
    assert_array_equal(np.shape(gat.y_pred_[1]), [2, len(epochs), 1])
    # check cannot automatically infer testing times for adhoc training times
    gat.test_times = None
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(clf=svc,
                                       predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    with use_log_level('error'):
        assert_raises(ValueError, gat.score, epochs2)
    gat.score(epochs)
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that error if cv is not partition
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='cross-validation')
    gat.fit(epochs)
    assert_raises(ValueError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='mean-prediction')
    gat.fit(epochs)
    gat.predict(epochs)

    # Test that gets error if train on one dataset, test on another, and don't
    # specify appropriate cv:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime()
        gat.fit(epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Make CV with some empty train and test folds:
    # --- empty test fold(s) should warn when gat.predict()
    gat._cv_splits[0] = [gat._cv_splits[0][0], np.empty(0)]
    with warnings.catch_warnings(record=True) as w:
        gat.predict(epochs)
        assert_true(len(w) > 0)
        assert_true(any('do not have any test epochs' in str(ww.message)
                        for ww in w))
    # --- empty train fold(s) should raise when gat.fit()
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=[([0], [1]), ([], [0])])
    assert_raises(ValueError, gat.fit, epochs[:2])

    # Check that still works with classifier that output y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    if check_version('sklearn', '0.17'):  # no is_regressor before v0.17
        with warnings.catch_warnings(record=True):  # dep
            gat = GeneralizationAcrossTime(clf=KernelRidge(), cv=2)
        epochs.crop(None, epochs.times[2])
        gat.fit(epochs)
        # With regression the default cv is KFold and not StratifiedKFold
        assert_true(gat.cv_.__class__ == KFold)
        gat.score(epochs)
        # with regression the default scoring metrics is mean squared error
        assert_true(gat.scorer_.__name__ == 'mean_squared_error')

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)

    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear', probability=True)
    reg = KernelRidge()

    def scorer_proba(y_true, y_pred):
        return roc_auc_score(y_true, y_pred[:, 0])

    # We're testing 3 scenarios: default, classifier + predict_proba,
    # regressor
    scorers = [None, scorer_proba, scorer_regress]
    predict_methods = [None, 'predict_proba', None]
    clfs = [svc, svc, reg]
    # Test all combinations
    for clf, predict_method, scorer in zip(clfs, predict_methods, scorers):
        for y in ys:
            for n_class in n_classes:
                for predict_mode in ['cross-validation', 'mean-prediction']:
                    # Cannot use AUC for n_class > 2
                    if (predict_method == 'predict_proba' and n_class != 2):
                        continue

                    y_ = y % n_class

                    with warnings.catch_warnings(record=True):
                        gat = GeneralizationAcrossTime(
                            cv=2, clf=clf, scorer=scorer,
                            predict_mode=predict_mode)
                        gat.fit(epochs, y=y_)
                        gat.score(epochs, y=y_)

                    # Check that scorer is correctly defined manually and
                    # automatically.
                    scorer_name = gat.scorer_.__name__
                    if scorer is None:
                        if is_classifier(clf):
                            assert_equal(scorer_name, 'accuracy_score')
                        else:
                            assert_equal(scorer_name, 'mean_squared_error')
                    else:
                        assert_equal(scorer_name, scorer.__name__)
def randomize_classifier(data, model, n_iter=1000, cv_method="run",
                         random_seed=None, return_dist=False, dv=None):
    """Randomly shuffle class labels to build a null distribution of accuracy.

    Randomization can be distributed over an IPython cluster using the
    ``dv`` argument. Otherwise, it runs in serial.

    Parameters
    ----------
    data : dict
        single-subject dataset dictionary
    model : scikit-learn estimator
        model object to fit
    n_iter : int
        number of permutation iterations
    cv_method : run | sample | cv arg for cross_val_score
        cross validate over runs, over samples (leave-one-out), or otherwise
        something that can be provided to the cv argument for
        sklearn.cross_val_score
    random_seed : int
        seed for random state to obtain stable permutations
    return_dist : bool
        if True, return null distribution
    dv : IPython direct view
        view onto IPython cluster for parallel execution over iterations

    Returns
    -------
    p_vals : n_tp array
        array of one-sided p values for observed classification scores
        against the empirical null distribution
    null_dist : n_iter x n_tp array
        array of null model scores, only if asked for it

    """
    # Import sklearn here to relieve moss dependency on it
    from sklearn.cross_validation import (cross_val_score,
                                          LeaveOneOut, LeaveOneLabelOut)

    if dv is None:
        from six.moves import builtins
        _map = builtins.map
    else:
        _map = dv.map_sync

    # Set up the data properly
    X = data["X"]
    y = data["y"]
    runs = data["runs"]
    if cv_method == "run":
        cv = LeaveOneLabelOut(runs)
    elif cv_method == "sample":
        cv = LeaveOneOut(len(y))
    else:
        cv = cv_method
    if X.ndim < 3:
        X = [X]

    def _perm_decode(model, X, y, cv, perm):
        """Internal func for parallel purposes."""
        y_perm = y[perm]
        perm_acc = cross_val_score(model, X, y_perm, cv=cv).mean()
        return perm_acc

    # Make lists to send into map()
    model_p = [model for i in range(n_iter)]
    y_p = [y for i in range(n_iter)]
    cv_p = [cv for i in range(n_iter)]

    # Permute within run, offsetting each run's permuted indices by the
    # number of samples in all preceding runs (the cumulative offset is
    # needed so indices stay aligned when there are more than two runs)
    rs = np.random.RandomState(random_seed)
    perms = []
    for i in range(n_iter):
        perm_i = []
        offset = 0
        for run in np.unique(runs):
            perm_r = rs.permutation(np.sum(runs == run))
            perm_r += offset
            offset += np.sum(runs == run)
            perm_i.append(perm_r)
        perms.append(np.concatenate(perm_i))

    # Actually do the permutations, possibly in parallel
    null_dist = []
    for X_i in X:
        X_p = [X_i for i in range(n_iter)]
        tr_scores = list(_map(_perm_decode, model_p, X_p, y_p, cv_p, perms))
        null_dist.append(tr_scores)
    null_dist = np.array(null_dist).T

    # Calculate a p value for each TR
    p_vals = []
    for i, dist_i in enumerate(null_dist.T):
        acc_i = cross_val_score(model, X[i], y, cv=cv).mean()
        p_i = 1 - percentile_score(dist_i, acc_i) / 100
        p_vals.append(p_i)
    p_vals = np.array(p_vals)

    if return_dist:
        return p_vals, null_dist
    return p_vals
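# A minimal usage sketch for randomize_classifier (hypothetical data shapes
# following the docstring; the estimator choice is only an illustration):
# import numpy as np
# from sklearn.linear_model import LogisticRegression
# rs = np.random.RandomState(0)
# data = dict(X=rs.randn(24, 5),
#             y=rs.randint(0, 2, 24),
#             runs=np.repeat([0, 1, 2], 8))
# p_vals = randomize_classifier(data, LogisticRegression(),
#                               n_iter=100, cv_method="run")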
    data, labels = prepare_data_train(fname)
    raw.append(data)
    y_raw.append(labels)
    sequence.extend([ser] * len(data))

X = pd.concat(raw)
y = pd.concat(y_raw)
#transform train data in numpy array
X = np.asarray(X.astype(float))
y = np.asarray(y.astype(float))
sequence = np.asarray(sequence)

################ Train classifiers ########################################
cv = LeaveOneLabelOut(sequence)
pred = np.empty((X.shape[0], 6))

for train, test in cv:
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    #apply preprocessing
    X_train = data_preprocess_train(X_train)
    X_test = data_preprocess_test(X_test)
    clfs = Parallel(n_jobs=6)(delayed(fit)(X_train[::subsample, :],
                                           y_train[::subsample, i])
                              for i in range(6))
    preds = Parallel(n_jobs=6)(delayed(predict)(clfs[i], X_test)
                               for i in range(6))
    pred[test, :] = np.concatenate(preds, axis=1)

pred_tot.append(pred)
y_tot.append(y)

# get AUC
def test_generalization_across_time():
    """Test time generalization decoding
    """
    from sklearn.svm import SVC
    from sklearn.linear_model import RANSACRegressor, LinearRegression
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import mean_squared_error
    from sklearn.cross_validation import LeaveOneLabelOut

    epochs = make_epochs()

    # Test default running
    gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
                 "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)
    gat.predict(epochs)
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs, no score>", "%s" % gat)
    gat.score(epochs)
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs,\n scored "
                 "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.047 (s), length: 0.047 (s), n_time_windows: 15>",
                 "%s" % gat.train_times_)
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.047 (s), length: 0.047 (s), n_time_windows: 15 x 15>",
                 "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10
    gat_ = copy.deepcopy(gat)
    assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] ==
                gat.y_true_.shape[0] ==
                len(gat.y_pred_[0][0]) == 14)
    # --- number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # --- length training size
    assert_true(len(gat.train_times_['slices']) == 15 ==
                np.shape(gat.estimators_)[0])
    # --- length testing sizes
    assert_true(len(gat.test_times_['slices']) == 15 ==
                np.shape(gat.scores_)[0])
    assert_true(len(gat.test_times_['slices'][0]) == 15 ==
                np.shape(gat.scores_)[1])

    # Test longer time window
    gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    scores = gat.score(epochs)
    assert_true(isinstance(scores, list))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)

    # Decim training steps
    gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # training time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    gat = GeneralizationAcrossTime(cv=LeaveOneLabelOut(y_4classes),
                                   train_times={'start': 0.090,
                                                'stop': 0.250})
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal([tim for ttime in gat.test_times_['times']
                        for tim in ttime], gat.train_times_['times'])

    # Test generalization across conditions
    gat = GeneralizationAcrossTime(predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    gat.predict(epochs[7:])
    gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(start=999.)
    assert_raises(ValueError, gat_.fit, epochs)
    # --- impossible slices
    gat_.train_times = dict(step=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=999.)
    assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    gat = GeneralizationAcrossTime(clf=svc, predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    assert_raises(ValueError, gat.score, epochs2)
    gat.score(epochs)
    scores = sum(scores, [])  # flatten
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that gets error if train on one dataset, test on another, and don't
    # specify appropriate cv:
    gat = GeneralizationAcrossTime()
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Check that still works with classifier that output y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    gat = GeneralizationAcrossTime(clf=RANSACRegressor(LinearRegression()),
                                   cv=2)
    epochs.crop(None, epochs.times[2])
    gat.fit(epochs)
    gat.predict(epochs)

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)

    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear')

    class SVC_proba(SVC):
        def predict(self, x):
            probas = super(SVC_proba, self).predict_proba(x)
            return probas[:, 0]

    svcp = SVC_proba(C=1, kernel='linear', probability=True)
    clfs = [svc, svcp]
    scorers = [None, mean_squared_error]
    # Test all combinations
    for clf, scorer in zip(clfs, scorers):
        for y in ys:
            for n_class in n_classes:
                y_ = y % n_class
                with warnings.catch_warnings(record=True):
                    gat = GeneralizationAcrossTime(cv=2, clf=clf,
                                                   scorer=scorer)
                    gat.fit(epochs, y=y_)
                    gat.score(epochs, y=y_)
# In[1]:

## Leave one subject out
from sklearn.cross_validation import LeaveOneLabelOut

aBScores = np.array([])
aBCScores = np.array([])
dTScores = np.array([])
logRegScores = np.array([])
logRegCustScores = np.array([])
linRegScores = np.array([])
linRegCustScores = np.array([])
locWeigRegCustScores = np.array([])

subjLabels = data[:, 1]
lolo = LeaveOneLabelOut(subjLabels)
for train, test in lolo:
    features_train = data[train, 4:23]
    labels_train = data[train, 2]  #2
    features_test = data[test, 4:23]
    labels_test = data[test, 2]  #2
    aBScore, aBCScore, dTScore, logRegScore, logRegCustScore, linRegScore, \
        linRegCustScore, locWeigRegCustScore = classify(
            features_train, labels_train, features_test, labels_test)
    aBScores = np.append(aBScores, aBScore)
    aBCScores = np.append(aBCScores, aBCScore)
    dTScores = np.append(dTScores, dTScore)
    logRegScores = np.append(logRegScores, logRegScore)
    logRegCustScores = np.append(logRegCustScores, logRegCustScore)
                                          subject, label_type)

#SVC stuff
from sklearn.svm import SVC
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline

feature_selection = SelectPercentile(f_classif, percentile=percentile)
svc = SVC(kernel='linear')
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

#file prefix
prefix = 'output/reduced_svc_{}_{}_{}'.format(subject, label_type, percentile)

if True:
    from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
    cv = LeaveOneLabelOut(runs)
    cv_score = cross_val_score(anova_svc, fmri_masked, labels, cv=cv,
                               n_jobs=8)
    print("Results for subject {}:".format(subject))
    print(cv_score)

    # write scores to log file
    with open('output/log_svc_reduced.txt', 'a') as fh:
        log_line = ['Anova-SVC', subject, label_type, percentile]
        cv_score = np.mean(cv_score)
        cv_score = "{0:.2f}".format(cv_score)
        log_line.append(cv_score)
        log_line = [str(x) for x in log_line]
        fh.write('\t'.join(log_line) + '\n')

anova_svc.fit(fmri_masked, labels)
training_score = anova_svc.score(fmri_masked, labels)
# Prediction with a decision tree
# 0.624977554157
data_train, data_test, target_train, target_test = \
    train_test_split(housing.data, housing.target, test_size=0.1,
                     random_state=42)
dtr = tree.DecisionTreeRegressor()
dtr.fit(data_train, target_train)
#score = dtr.score(data_test, target_test)

kf = KFold(data_train.shape[0], n_folds=5)
skf = StratifiedKFold(target_train, n_folds=5)
loo = LeaveOneOut(data_train.shape[0])
lpo = LeavePOut(data_train.shape[0], 6)

labels_lolo = [1, 1, 2, 2]
lolo = LeaveOneLabelOut(labels_lolo)
# This strategy splits the samples according to integer class labels
# supplied by a third party. Each split holds out the samples belonging to
# one label as the test set; the rest form the training set.

labels_lopo = [1, 1, 2, 2, 3, 3]
lopo = LeavePLabelOut(labels_lopo, 2)
# This strategy takes the samples of p labels as the test set each time;
# the rest form the training set.

# Note the cv parameter of cross_val_score
cross_score = cross_val_score(dtr, data_train, target_train, cv=skf)
print("Cross-validation:")
print(cross_score)
#print(score)
'''
# This cannot be scored with the classification metrics:
# "continuous is not supported"
pred_test = dtr.predict(data_test)
# namely Anova. We set the number of features to be selected to 500
feature_selection = SelectKBest(f_classif, k=500)

# We have our classifier (SVC), our feature selection (SelectKBest), and now,
# we can plug them together in a *pipeline* that performs the two operations
# successively:
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

### Cross validation ##########################################################
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
cv = LeaveOneLabelOut(session[session < 10])

k_range = [10, 15, 30, 50, 150, 300, 500, 1000, 1500, 3000, 5000]
cv_scores = []
scores_validation = []

for k in k_range:
    feature_selection.k = k
    cv_scores.append(np.mean(
        cross_val_score(anova_svc, X[session < 10], y[session < 10])))
    print("CV score: %.4f" % cv_scores[-1])

    anova_svc.fit(X[session < 10], y[session < 10])
    y_pred = anova_svc.predict(X[session == 10])
    scores_validation.append(np.mean(y_pred == y[session == 10]))
    print("score validation: %.4f" % scores_validation[-1])
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

#TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv = TimeSeriesSplit(n_splits=3, max_train_size=None)
print(tscv)
for train, test in tscv.split(X):
    print("%s %s" % (train, test))

'''
Leave-One-Label-Out - LOLO

LeaveOneLabelOut (LOLO) is a cross-validation scheme which holds out the
samples according to a third-party provided label. This label information
can be used to encode arbitrary domain specific stratifications of the
samples as integers.

Each training set is thus constituted by all the samples except the ones
related to a specific label. For example, in the cases of multiple
experiments, LOLO can be used to create a cross-validation based on the
different experiments: we create a training set using the samples of all
the experiments except one:
'''
from sklearn.cross_validation import LeaveOneLabelOut

X = [[0, 0], [1, 1], [2, 2], [3, 3]]
Y = [0, 1, 0, 1]
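# The snippet above stops before the split itself; a hedged completion in
# the style of the classic sklearn docs (the `labels` values below are an
# assumption for illustration):
labels = [1, 1, 2, 2]
lolo = LeaveOneLabelOut(labels)
for train, test in lolo:
    print("%s %s" % (train, test))
# -> [2 3] [0 1]
#    [0 1] [2 3]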
cv_scores = []
for train, test in cv:
    svc.fit(fmri_masked[train], target[train])
    prediction = svc.predict(fmri_masked[test])
    cv_scores.append(
        np.sum(prediction == target[test]) / float(np.size(target[test])))

cv_scores = cross_val_score(svc, fmri_masked, target, cv=cv)
# The following speeds up the computation
#cv_scores = cross_val_score(svc, fmri_masked, target, cv=cv, n_jobs=-1, verbose=10)

session_label = labels['chunks']
session_label = session_label[condition_mask]
cv = LeaveOneLabelOut(labels=session_label)
cv_scores_one = cross_val_score(svc, fmri_masked, target, cv=cv)
# Score with F1 instead
#cv_scores = cross_val_score(svc, fmri_masked, target, cv=cv, scoring='f1')

# Mean classification accuracy
classification_accuracy = np.mean(cv_scores)
classification_accuracy_one = np.mean(cv_scores_one)

# Cross-validation score of a dummy (chance-level) classifier
null_cv_scores = cross_val_score(DummyClassifier(), fmri_masked, target,
                                 cv=cv)
# Permutation test
null_cv_scores_2 = permutation_test_score(svc, fmri_masked, target, cv=cv)

# Retrieve the SVC discriminating weights
coef_ = svc.coef_
mean_img = image.mean_img(func_filename)
plot_stat_map(weight_img, mean_img, title='SVM weights')

# Saving the results as a Nifti file may also be important
weight_img.to_filename('haxby_face_vs_house.nii')

#############################################################################
# Obtain prediction scores via cross validation
from sklearn.cross_validation import LeaveOneLabelOut

# Define the cross-validation scheme used for validation.
# Here we use a LeaveOneLabelOut cross-validation on the session label
# divided by 2, which corresponds to a leave-two-session-out
cv = LeaveOneLabelOut(session // 2)

# Compute the prediction accuracy for the different folds (i.e. session)
cv_scores = []
for train, test in cv:
    anova_svc.fit(X[train], y[train])
    y_pred = anova_svc.predict(X[test])
    cv_scores.append(np.sum(y_pred == y[test]) / float(np.size(y[test])))

# Return the corresponding mean prediction accuracy
classification_accuracy = np.mean(cv_scores)

# Print the results
print("Classification accuracy: %.4f / Chance level: %f" %
      (classification_accuracy, 1. / n_conditions))
# Classification accuracy: 0.9861 / Chance level: 0.5000
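# Why `session // 2` yields leave-two-sessions-out: integer division maps
# each consecutive pair of session numbers to a single label. A standalone
# sketch (sessions numbered 0..11 is an assumption for illustration):
import numpy as np
sessions_demo = np.arange(12)
print(sessions_demo // 2)  # [0 0 1 1 2 2 3 3 4 4 5 5] -> 6 folds of 2 sessions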
#setting prediction & testing the classifier
svc = SVC(kernel='linear')
print(svc)

# Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 3000.
feature_selection = SelectKBest(f_classif, k=3000)

# We have our classifier (SVC), our feature selection (SelectKBest), and
# now, we can plug them together in a *pipeline* that performs the two
# operations successively:
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

#fit the decoder and predict
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

cv = LeaveOneLabelOut(subs[subs < 1])
k_range = [10, 15, 30, 50, 150, 300, 500, 1000, 1500, 3000, 5000]
cv_scores = []
scores_validation = []

# We are working with a composite estimator: a pipeline of feature selection
# followed by SVC. Thus, to name the parameter we want to tune, we need to
# give the name of the step in the pipeline, followed by the name of the
# parameter, with '__' as a separator. We are going to tune the parameter
# 'k' of the step called 'anova' in the pipeline, so we address it as
# 'anova__k'. Note that GridSearchCV takes an n_jobs argument that can make
# it go much faster.
grid = GridSearchCV(anova_svc, param_grid={'anova__k': k_range}, n_jobs=-1)
nested_cv_scores = cross_val_score(grid, X, y)
classification_accuracy = np.mean(nested_cv_scores)
print("Classification accuracy: %.4f / Chance level: %f" %
      (classification_accuracy, 1. / n_conditions))
def main():
    add_pitch, add_roll, add_filter = False, False, True
    n_samples, step = 200, 200
    load_data = LoadHAR(add_pitch=add_pitch, add_roll=add_roll,
                        add_filter=add_filter, n_samples=n_samples,
                        step=step)

    batch_size = 64

    # Define datasets and load iteratively
    datasets = [load_data.idash, load_data.wisdm1, load_data.uci_mhealth,
                load_data.uci_hapt]
    X, y, name, users = datasets[0]()
    users = ['%s_%02d' % (name, user) for user in users]
    for dataset in datasets[1:]:
        X_tmp, y_tmp, name_tmp, users_tmp = dataset()
        X = np.concatenate((X, X_tmp))
        y = np.concatenate((y, y_tmp))
        for user in users_tmp:
            users.append('%s_%02d' % (name_tmp, user))
        name += '_' + name_tmp
    users = np.array(users)

    print('Users: %d' % len(np.unique(users)))
    print(X.shape)

    n_windows, sequence_length, n_features = X.shape
    y = one_hot(y, n_classes=len(ACTIVITY_MAP))
    n_classes = y.shape[-1]

    # Create a time-string for our cv run
    d = str(datetime.datetime.fromtimestamp(time.time())
            .strftime('%Y%m%d_%H%M%S'))

    cv = LeaveOneLabelOut(users)
    user_idx = 0
    user_names = np.unique(users)
    user = None
    if user is not None:
        train_idx = users != user
        test_idx = users == user
        cv = ((train_idx, test_idx), )

    for train_index, test_index in cv:
        user = user_names[user_idx]
        user_idx += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Scale data using training data
        scaler = StandardScaler().fit(X_train.reshape((-1, n_features)))
        n_windows = X_train.shape[0]
        X_train = scaler.transform(
            X_train.reshape((-1, n_features))).reshape(
            (n_windows, sequence_length, n_features))
        n_windows = X_test.shape[0]
        X_test = scaler.transform(
            X_test.reshape((-1, n_features))).reshape(
            (n_windows, sequence_length, n_features))

        print('Xtrain mean: %f\tstd: %f' % (X_train.mean(), X_train.std()))
        print('Xtest mean: %f\tstd: %f' % (X_test.mean(), X_test.std()))

        train_set = (X_train, y_train)
        test_set = (X_test, y_test)
        valid_set = test_set

        n_train = train_set[0].shape[0]
        n_test = test_set[0].shape[0]

        n_test_batches = 1
        n_valid_batches = None
        batch_size = n_test
        n_train_batches = n_train // batch_size
        print("n_train_batches: %d, n_test_batches: %d"
              % (n_train_batches, n_test_batches))

        model = ResNet(n_in=(sequence_length, n_features),
                       n_filters=[32, 32, 64, 64],
                       pool_sizes=[2, 1, 2, 1],
                       n_hidden=[512],
                       conv_dropout=0.5,
                       dropout=0.5,
                       n_out=n_classes,
                       trans_func=rectify,
                       out_func=softmax,
                       batch_size=batch_size,
                       batch_norm=True)

        if len(cv) > 1:
            # Generate root path and edit
            root_path = model.get_root_path()
            model.root_path = "%s_cv_%s_%s" % (root_path, d, user)
            paths.path_exists(model.root_path)
            rmdir(root_path)

        # Build model
        f_train, f_test, f_validate, train_args, test_args, validate_args = \
            model.build_model(train_set, test_set, None)
        train_args['inputs']['batchsize'] = batch_size
        train_args['inputs']['learningrate'] = 0.001
        train_args['inputs']['beta1'] = 0.9
        train_args['inputs']['beta2'] = 1e-6
        test_args['inputs']['batchsize'] = batch_size
        validate_args['inputs']['batchsize'] = batch_size

        # Define confusion matrix
        cfm = ConfusionMatrix(n_classes=n_classes,
                              class_names=list(ACTIVITY_MAP.values()))
        print(n_classes, len(list(ACTIVITY_MAP.values())))

        def f_custom(model, path):
            mean_evals = model.get_output(X_test).eval()
            t_class = np.argmax(y_test, axis=1)
            y_class = np.argmax(mean_evals, axis=1)
            # cfm.batchAdd(t_class, y_class)
            # print(cfm)
            cm = confusion_matrix(t_class, y_class)
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            plt.clf()
            plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
            plt.colorbar()
            plt.ylabel('True')
            plt.xlabel('Predicted')
            plt.savefig(path)

        train = TrainModel(model=model,
                           anneal_lr=0.75,
                           anneal_lr_freq=100,
                           output_freq=1,
                           pickle_f_custom_freq=100,
                           f_custom_eval=f_custom)
        train.pickle = False
        train.add_initial_training_notes(
            "Standardizing data after adding features"
            "\nUsing striding instead of pooling")
        train.write_to_logger("Dataset: %s" % name)
        train.write_to_logger("LOO user: %s" % user)
        train.write_to_logger("Training samples: %d" % n_train)
        train.write_to_logger("Test samples: %d" % n_test)
        train.write_to_logger("Sequence length: %d" % sequence_length)
        train.write_to_logger("Step: %d" % step)
        train.write_to_logger("Shuffle: %s" % False)
        train.write_to_logger("Add pitch: %s\nAdd roll: %s"
                              % (add_pitch, add_roll))
        train.write_to_logger("Add filter separated signals: %s" % add_filter)
        train.write_to_logger("Transfer function: %s" % model.transf)
        train.write_to_logger("Network Architecture ---------------")
        for layer in get_all_layers(model.model):
            # print(layer.name, ": ", get_output_shape(layer))
            train.write_to_logger(layer.name + ": "
                                  + str(get_output_shape(layer)))

        train.train_model(f_train, train_args,
                          f_test, test_args,
                          f_validate, validate_args,
                          n_train_batches=n_train_batches,
                          n_test_batches=n_test_batches,
                          n_valid_batches=n_valid_batches,
                          n_epochs=500)

        # Reset logging
        handlers = train.logger.handlers[:]
        for handler in handlers:
            handler.close()
            train.logger.removeHandler(handler)
        del train.logger
class CrossValidation:
    def __init__(self, k=3, f=None, solver=None, var=None, iter=200,
                 noise=None, k_type=None, reg=None, weights=None):
        self.f = f
        self.solver = solver
        self.var = var
        self.iter = iter
        self.noise = noise
        self.reg = reg
        self.weights = weights

        self.setup()
        self.setup_kf(k=k, k_type=k_type)

    def save(self):
        pass

    def load(self):
        pass

    def setup(self):
        # load data
        self.A, self.b, self.N, self.block_sizes, self.x_true, self.nz, \
            self.f = util.load_data(self.f)
        self.NT = self.N.T.tocsr()

        # Assumption: Gaussian noise is proportional to link volume
        if self.noise:
            self.b_true = self.b
            delta = np.random.normal(scale=self.b * self.noise)
            self.b = self.b + delta

        self.n = np.size(self.b)
        self.x0 = np.array(util.block_e(self.block_sizes - 1,
                                        self.block_sizes))
        # self.x0 = self.x_true
        logging.debug("Blocks: %s" % self.block_sizes.shape)

        self.options = {
            'max_iter': self.iter,
            'verbose': 1,
            'suff_dec': 0.003,   # FIXME unused
            'corrections': 500,  # FIXME unused
        }
        self.proj = lambda x: simplex_projection(self.block_sizes - 1, x)
        # self.proj = lambda x: pysimplex_projection(self.block_sizes - 1, x)
        self.z0 = np.zeros(self.N.shape[1])

        if self.reg and self.weights == 'travel_time':
            self.D = util.load_weights(
                '%s/%s/travel_times.pkl' % (c.DATA_DIR,
                                            c.ESTIMATION_INFO_DIR),
                self.block_sizes, weight=1)
            self.D2 = self.D * self.D

    def setup_kf(self, k=3, k_type=None):
        self.k_type = k_type
        if self.k_type is None:
            self.kf = KFold(self.n, n_folds=k, indices=True)
            self.k = k
        elif self.k_type == 'taz_ids':
            import pickle
            with open('%s/%s/taz_ids.pkl' % (c.DATA_DIR,
                                             c.ESTIMATION_INFO_DIR)) as f:
                ids = pickle.load(f)
            labels = [int(id) for (ind, id) in ids if ind not in self.nz]
            self.kf = LeaveOneLabelOut(labels=labels)
            self.k = self.kf.n_unique_labels
        elif self.k_type == 'city_ids':
            import pickle
            with open('%s/%s/city_ids.pkl' % (c.DATA_DIR,
                                              c.ESTIMATION_INFO_DIR)) as f:
                ids = pickle.load(f)
            # FIXME caution, cities with no id are all grouped together
            labels = [int(id) if id else 0 for (ind, id) in ids
                      if ind not in self.nz]
            self.kf = LeaveOneLabelOut(labels=labels)
            self.k = self.kf.n_unique_labels
        elif self.k_type == 'street_names':
            import pickle
            with open('%s/%s/street_names.pkl' % (c.DATA_DIR,
                                                  c.ESTIMATION_INFO_DIR)) as f:
                ids = pickle.load(f)
            labels = [id for (ind, id) in ids if ind not in self.nz]
            self.k = k
            unique_labels = list(set(labels))
            nunique_labels = len(unique_labels)
            # Map each unique street name to the indices of its rows in b
            name_to_b_ind = [[ind for ind, name in enumerate(labels)
                              if name == label] for label in unique_labels]
            # k-fold over street names, then expand each fold of names back
            # into flat lists of row indices
            kf = KFold(nunique_labels, n_folds=k, indices=True)
            self.kf = []
            for (train, test) in kf:
                train_temp = [name_to_b_ind[t] for t in train]
                train_temp = [item for sublist in train_temp
                              for item in sublist]
                test_temp = [name_to_b_ind[t] for t in test]
                test_temp = [item for sublist in test_temp
                             for item in sublist]
                self.kf.append((train_temp, test_temp))

        self.iters = [None] * self.k
        self.times = [None] * self.k
        self.states = [None] * self.k

    def init_metrics(self):
        self.train = {}
        self.test = {}
        self.nbins = 6
        # emulating class of link by flow
        counts, bins = np.histogram(self.b, bins=self.nbins)
        self.bins = bins
        self.train_bin = {}
        self.test_bin = {}
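    # Note on self.kf: for k_type in (None, 'taz_ids', 'city_ids') it is a
    # scikit-learn cross-validation iterator, while for 'street_names' it is
    # a plain list of (train, test) index lists built in setup_kf. run() and
    # post_process() below only iterate over (train, test) pairs, so both
    # representations are interchangeable.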
    # Run cross-validation and store intermediate states of each run
    def run(self):
        for i, (train, test) in enumerate(self.kf):
            # Setup
            b_train, A_train = self.b[train], self.A[train, :]
            b_test, A_test = self.b[test], self.A[test, :]
            AT = A_train.T.tocsr()

            target = A_train.dot(self.x0) - b_train
            if self.reg is None:
                f = lambda z: 0.5 * la.norm(
                    A_train.dot(self.N.dot(z)) + target) ** 2
                nabla_f = lambda z: self.NT.dot(
                    AT.dot(A_train.dot(self.N.dot(z)) + target))
            elif self.reg == 'L2' and self.weights:
                f = lambda z: 0.5 * la.norm(
                    A_train.dot(self.N.dot(z)) + target) ** 2 + \
                    0.5 * la.norm(self.D * (self.N.dot(z) + self.x0)) ** 2
                nabla_f = lambda z: self.NT.dot(
                    AT.dot(A_train.dot(self.N.dot(z)) + target)) + \
                    self.NT.dot(self.D2 * (self.N.dot(z) + self.x0))
            elif self.reg == 'L2':
                f = lambda z: 0.5 * la.norm(
                    A_train.dot(self.N.dot(z)) + target) ** 2 + \
                    0.5 * la.norm(self.N.dot(z) + self.x0) ** 2
                nabla_f = lambda z: self.NT.dot(
                    AT.dot(A_train.dot(self.N.dot(z)) + target)) + \
                    self.NT.dot(self.N.dot(z) + self.x0)

            iters, times, states = [], [], []

            def log(iter_, state, duration):
                iters.append(iter_)
                times.append(duration)
                states.append(state)
                start = time.time()
                return start

            # Solve
            logging.debug('[%d] Starting %s solver...' % (i, self.solver))
            if self.solver == 'LBFGS':
                LBFGS.solve(self.z0 + 1, f, nabla_f, solvers.stopping,
                            log=log, proj=self.proj, options=self.options)
            elif self.solver == 'BB':
                BB.solve(self.z0, f, nabla_f, solvers.stopping,
                         log=log, proj=self.proj, options=self.options)
            elif self.solver == 'DORE':
                # setup for DORE
                alpha = 0.99
                lsv = util.lsv_operator(A_train, self.N)
                logging.info("Largest singular value: %s" % lsv)
                A_dore = A_train * alpha / lsv
                target_dore = target * alpha / lsv

                DORE.solve(self.z0,
                           lambda z: A_dore.dot(self.N.dot(z)),
                           lambda b: self.N.T.dot(A_dore.T.dot(b)),
                           target_dore,
                           proj=self.proj,
                           log=log,
                           options=self.options)
                A_dore = None
            logging.debug('[%d] Stopping %s solver... %s' %
                          (i, self.solver, str(np.sum(times))))

            self.iters[i] = iters
            self.times[i] = times
            self.states[i] = states
            AT, A_train, A_test = None, None, None
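    # Note on the DORE branch above: dividing A_train and the target by lsv
    # (the largest singular value of the composed operator) and multiplying
    # by alpha = 0.99 rescales the operator norm to just under one,
    # presumably so the DORE fixed-point iteration behaves as a contraction;
    # this rationale is inferred from the code, not stated in it.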
    # Post process intermediate states of runs
    def post_process(self):
        self.init_metrics()

        self.mean_times = util.mask(self.times).cumsum(axis=0).mean(axis=1)
        # self.mean_times = np.mean(np.array([np.cumsum(self.times[i])
        #                                     for i in range(self.k)]), axis=0)

        def metrics(A, b, X):
            d = X.shape[1]
            diff = A.dot(X) - np.tile(b, (d, 1)).T
            error = 0.5 * np.diag(diff.T.dot(diff))
            RMSE = np.sqrt(error / b.size)
            den = np.sum(b) / np.sqrt(b.size)
            pRMSE = RMSE / den
            plus = A.dot(X) + np.tile(b, (d, 1)).T

            # GEH metric [see https://en.wikipedia.org/wiki/GEH_statistic]
            GEH = np.sqrt(2 * diff ** 2 / plus)
            meanGEH = np.mean(GEH, axis=0)
            maxGEH = np.max(GEH, axis=0)
            GEHunder5 = np.mean(GEH < 5, axis=0)
            GEHunder1 = np.mean(GEH < 1, axis=0)
            GEHunder05 = np.mean(GEH < 0.5, axis=0)
            GEHunder005 = np.mean(GEH < 0.05, axis=0)
            return {
                'error': error,
                'RMSE': RMSE,
                'pRMSE': pRMSE,
                'mean_GEH': meanGEH,
                'max_GEH': maxGEH,
                'GEH_under_5': GEHunder5,
                'GEH_under_1': GEHunder1,
                'GEH_under_0.5': GEHunder05,
                'GEH_under_0.05': GEHunder005,
            }

        def populate(d, m):
            for (k, v) in m.iteritems():
                if k not in d:
                    d[k] = []
                d[k].append(v)
            return d

        for i, (train, test) in enumerate(self.kf):
            d = len(self.states[i])
            b_train, A_train = self.b[train], self.A[train, :]
            b_test, A_test = self.b[test], self.A[test, :]
            self.x_hat = self.N.dot(np.array(self.states[i]).T) + \
                np.tile(self.x0, (d, 1)).T

            # Aggregate error
            m = metrics(A_train, b_train, self.x_hat)
            self.train = populate(self.train, m)
            logging.debug('Train: %8.5e to %8.5e (%8.5e)' %
                          (m['RMSE'][0], m['RMSE'][-1],
                           m['RMSE'][0] - m['RMSE'][-1]))
            m = metrics(A_test, b_test, self.x_hat)
            self.test = populate(self.test, m)
            logging.debug('Test:  %8.5e to %8.5e (%8.5e)' %
                          (m['RMSE'][0], m['RMSE'][-1],
                           m['RMSE'][0] - m['RMSE'][-1]))

            # TODO deprecate
            x_last = self.x_hat[:, -1]
            dist_from_true = np.max(np.abs(x_last - self.x_true))
            start_dist_from_true = np.max(np.abs(self.x_true - self.x0))
            logging.debug('max|x-x_true|: %.2f\nmax|x_init-x_true|: %.2f' %
                          (dist_from_true, start_dist_from_true))

            # Error metric by link class
            inds = np.digitize(b_train, self.bins)
            indts = np.digitize(b_test, self.bins)

            train_bin, test_bin = {}, {}
            for j in range(1, self.nbins + 2):
                ind = inds == j
                indt = indts == j
                if not np.any(indt) or not np.any(ind):
                    # KEYS: module-level list of metric names (defined
                    # elsewhere); pad empty bins with None placeholders
                    for k in KEYS:
                        if k not in train_bin:
                            train_bin[k] = []
                        train_bin[k].append(None)
                    for k in KEYS:
                        if k not in test_bin:
                            test_bin[k] = []
                        test_bin[k].append(None)
                    continue
                b_bin, A_bin = b_train[ind], A_train[ind, :]
                b_bint, A_bint = b_test[indt], A_test[indt, :]

                m = metrics(A_bin, b_bin, self.x_hat)
                train_bin = populate(train_bin, m)
                m = metrics(A_bint, b_bint, self.x_hat)
                test_bin = populate(test_bin, m)
            self.train_bin = populate(self.train_bin, train_bin)
            self.test_bin = populate(self.test_bin, test_bin)

        # Summary metrics
        self.mean_time = np.mean([np.cumsum(self.times[i])[-1]
                                  for i in range(self.k)])
        self.mean_error = np.mean([self.test['error'][i][-1]
                                   for i in range(self.k)])
        self.mean_RMSE = np.mean([self.test['RMSE'][i][-1]
                                  for i in range(self.k)])
        logging.debug('mean time: %8.5e, mean error: %8.5e' %
                      (self.mean_time, self.mean_error))
        print '\n\n'

    def cleanup(self):
        self.A = None
        self.N = None
        self.NT = None
        self.states = None

    # Plot each of the k tests separately
    def plot_all(self, subplot=None, color='k'):
        if subplot:
            plt.subplot(subplot)
        for i in range(self.k):
            times = np.cumsum(self.times[i])
            if i == 0:
                plt.loglog(times, self.test['RMSE'][i], color=color,
                           label='%s-%s (%d iters)' %
                           (self.solver, self.var, self.iters[0][-1]))
            else:
                plt.loglog(times, self.test['RMSE'][i], color=color)
            plt.hold(True)
            plt.loglog(times, self.train['RMSE'][i], color=color, alpha=0.25)
        plt.xlabel('CPU time (seconds)')
        plt.ylabel('%d-fold CV RMSE' % self.k)
        plt.title('CV error')
        plt.legend(shadow=True)

    # Plot summary dot for this solver
    def plot(self, subplot=None, color='k'):
        if subplot:
            plt.subplot(subplot)
        plt.plot(self.mean_time, self.mean_RMSE, marker='o', color=color,
                 label='%s-%s' % (self.solver, self.var))
        plt.xlabel('Average CPU time (seconds)')
        plt.ylabel('%d-fold CV average RMSE' % self.k)
        plt.title('CV Summary')
        plt.legend(shadow=True, loc='best')
    # Plot bar graph of k tests by link volume bin
    def plot_bar_bins(self, subplot=None, color='k', offset=0, time_max=None,
                      metric='RMSE'):
        if subplot:
            plt.subplot(subplot)
        test_metrics = self.test_bin[metric]
        train_metrics = self.train_bin[metric]

        # TODO do this for individual times instead of mean times
        inds = [len(self.times[i]) - 1 for i in range(len(self.times))]
        iters = [self.iters[i][-1] for i in range(len(self.times))]
        if self.mean_time > time_max:
            for i in range(len(self.times)):
                times = np.cumsum(self.times[i])
                for j in range(len(self.times[i])):
                    if times[j] > time_max:
                        inds[i] = j - 1
                        iters[i] = self.iters[i][j - 1]
                        break

        for j in range(self.nbins + 1):
            x = np.array(range(self.nbins + 1))
            try:
                test_metric = [test_metrics[i][j] for i in range(self.k)]
                train_metric = [train_metrics[i][j] for i in range(self.k)]
            except IndexError:
                import ipdb
                ipdb.set_trace()
            if len(test_metric) == 0:
                print 'Skipping %s %s (empty)' % (metric, j)
                continue
            try:
                y1 = np.mean([test_metric[i][inds[i]] for i in range(self.k)
                              if test_metric[i] is not None])
                y2 = np.mean([train_metric[i][inds[i]] for i in range(self.k)
                              if train_metric[i] is not None])
                std1 = np.std([test_metric[i][inds[i]] for i in range(self.k)
                               if test_metric[i] is not None])
                std2 = np.std([train_metric[i][inds[i]] for i in range(self.k)
                               if train_metric[i] is not None])
            except IndexError:
                import ipdb
                ipdb.set_trace()
            if j == 0:
                plt.bar(x[j] - 1 + offset, y1,
                        label='%s-%s (%d iters)' %
                        (self.solver, self.var, np.mean(iters)),
                        width=0.15, color=color, yerr=std1)
            else:
                plt.bar(x[j] - 1 + offset, y1, width=0.15, color=color,
                        yerr=std1)
            plt.hold(True)
            plt.bar(x[j] - 1 + offset + 1. / 6, y2, width=0.15, color=color,
                    yerr=std2, alpha=0.25)
        plt.gca().set_xticklabels(['%8.5e' % x
                                   for x in np.hstack((self.bins, [np.inf]))])
        plt.xlabel('Link flow volume')
        plt.ylabel('%d-fold CV average %s' % (self.k, metric))
        plt.title('CV %s by link volume (%f sec)' % (metric, time_max))
        plt.legend(shadow=True)

    # Plot each of the k tests separately per link volume bin
    # TODO deprecate
    # NOTE: relies on self.test_bin_error / self.train_bin_error, which
    # post_process() no longer populates (it stores self.test_bin /
    # self.train_bin instead)
    def plot_bins(self, subplot=None, color='k', time_max=None):
        if subplot:
            plt.subplot(subplot)
        for i in range(self.k):
            times = np.cumsum(self.times[i])
            for j in range(self.nbins + 1):
                if self.test_bin_error[i][j] is None or \
                        self.train_bin_error[i][j] is None:
                    continue
                if i == 0:
                    plt.loglog(times, self.test_bin_error[i][j], color=color,
                               label='%s-%s %s' %
                               (self.solver, self.var, self.bins[j]))
                else:
                    plt.loglog(times, self.test_bin_error[i][j], color=color)
                plt.hold(True)
                plt.loglog(times, self.train_bin_error[i][j], color=color,
                           alpha=0.25)
        plt.xlabel('CPU time (seconds)')
        plt.ylabel('%d-fold CV error (L2)' % self.k)
        plt.title('CV error by link volume (%d iterations)' % self.iter)
        plt.legend(shadow=True)
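# The GEH statistic computed vectorized in metrics() above, shown here for a
# single (modeled, observed) flow pair: GEH = sqrt(2 * (m - c)**2 / (m + c)).
# A common rule of thumb for hourly traffic counts treats GEH < 5 as a good
# fit, which is what the 'GEH_under_5' fraction measures. This sketch and
# its toy numbers are illustrative, not part of the original module.
import numpy as np

def geh(model_flow, observed_flow):
    # Scalar GEH for one link; inputs are flows in vehicles per hour.
    return np.sqrt(2.0 * (model_flow - observed_flow) ** 2 /
                   (model_flow + observed_flow))

print(geh(1060.0, 1000.0))  # ~1.87, comfortably under the GEH < 5 threshold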