def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
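# The sklearn.cross_validation module used throughout these snippets was
# deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of
# the modern replacement (assuming scikit-learn >= 0.18): LeaveOneOut takes
# no size argument, and split() receives the data instead.
from sklearn.model_selection import LeaveOneOut
import numpy as np

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 1, 2, 2])
for train, test in LeaveOneOut().split(X):
    # train/test are integer index arrays, matching the old "index mode"
    X[train], X[test]
    y[train], y[test]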
def _cross_val_score_loo_r0(lm, X, y):
    """Score a linear model with leave-one-out cross-validation.

    Uses mean_squared_error from sklearn.metrics.

    Returns
    -------
    score_l : list
        The per-fold mean squared error values.
    """
    if len(y.shape) == 1:
        y = np.array([y]).T
    kf = cross_validation.LeaveOneOut(y.shape[0])
    score_l = list()
    for tr, te in kf:
        lm.fit(X[tr, :], y[tr, :])
        yp = lm.predict(X[te, :])
        score_l.append(metrics.mean_squared_error(y[te, :], yp))
    return score_l
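# A minimal usage sketch for _cross_val_score_loo_r0; the LinearRegression
# model and the toy data are assumptions for illustration only.
from sklearn import linear_model

X_demo = np.arange(10, dtype=float).reshape(-1, 1)
y_demo = 2.0 * X_demo.ravel() + 1.0
per_fold_mse = _cross_val_score_loo_r0(linear_model.LinearRegression(),
                                       X_demo, y_demo)
print(np.mean(per_fold_mse))  # average leave-one-out mean squared error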
def LOOCV(self, labels, values, details=False):
    '''Perform leave-one-out cross-validation.

    Takes a subset of the input data (labels and values) to do the
    cross-validation.

    RETURNS: array of length # of samples containing the per-fold
    cross-validation scores; if details is True, also returns
    detailedResults, an array of length # of samples containing the
    predicted classes.
    '''
    num_samples = values.shape[0]
    scores = np.zeros(num_samples)
    detailedResults = np.zeros(num_samples)
    # get training and testing set, train on training set, score on test set
    for train, test in cross_validation.LeaveOneOut(num_samples):
        values_test = values[test]
        label_test = labels[test]
        self.Train(labels[train], values[train], fout=None)
        scores[test] = self.classifier.score(values_test, label_test)
        if details:
            detailedResults[test] = self.Predict(values_test)
    if details:
        return scores, detailedResults
    return scores
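# Hypothetical usage sketch, assuming a wrapper object that exposes the
# Train/Predict methods and classifier attribute this method relies on:
# clf = MyClassifierWrapper()                       # assumption: user-defined
# scores, preds = clf.LOOCV(labels, values, details=True)
# print(scores.mean())                              # LOO accuracy estimate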
def do_LOOCV(all_x, all_y):
    loo = cross_validation.LeaveOneOut(len(all_x))
    tot_cnt = np.zeros(num_cls)
    hit_cnt = np.zeros(num_cls)
    cancer_prob = []
    labels = []
    cnt = 0
    for train_index, test_index in tqdm(loo):
        train_val_x, test_x = all_x[train_index], all_x[test_index]
        train_val_y, test_y = all_y[train_index], all_y[test_index]
        train_val_x, train_val_y = shuffle(train_val_x, train_val_y,
                                           random_state=RANDOM_STATE)
        n_trn = len(train_val_x)
        n_dev = int(n_trn * VAL_RATIO)
        n_trn = n_trn - n_dev
        train_x = train_val_x[0:n_trn]
        train_y = train_val_y[0:n_trn]
        val_x = train_val_x[n_trn:]
        val_y = train_val_y[n_trn:]
        prob = classifier_LOOCV(train_x, train_y, val_x, val_y, test_x,
                                test_y, method_clf=method_clf,
                                verbose=verbose, num_cls=num_cls)
        for i in range(0, num_cls):
            if test_y[0] == i:
                tot_cnt[i] += 1
                if np.argmax(prob) == i:
                    hit_cnt[i] += 1
    return (tot_cnt, hit_cnt)
def LR_training_python(lrf, Y, verboseoutput):
    Y = Y.reshape((len(Y), ))
    loo = cross_validation.LeaveOneOut(len(Y))
    mae2 = 0
    errors2 = []
    for train_idx, test_idx in loo:
        f_train, f_test = lrf[train_idx], lrf[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        if not np.any(np.isnan(f_train)) and np.all(np.isfinite(f_train)):
            r2 = linearRegression(f_train, Y_train)
            y2 = r2.predict(f_test)
            errors2.append(np.abs(Y_test - y2))
            if verboseoutput:
                print Y_test[0], y2[0]
        else:
            print 'nan or infinite'
    mae2 = np.mean(errors2)
    var2 = np.sqrt(np.var(errors2))
    mre2 = mae2 / Y.mean()
    return mae2, var2, mre2
def validation(X, Y):
    repeats = 20
    metric_list = []
    parameters = {'estimator__n_estimators': np.arange(5, 40)}
    for i in np.arange(repeats):
        # split train test
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, Y, test_size=0.2, random_state=i)
        loocv = cross_validation.LeaveOneOut(n=len(X_train))
        kfold = cross_validation.KFold(n=len(X_train), n_folds=6)
        cv = GridSearchCV(clf, param_grid=parameters, n_jobs=-1, cv=loocv)
        cv.fit(X_train, y_train)
        tuned_model = cv.best_estimator_
        model = tuned_model.fit(X_train, y_train)
        y_score = model.predict_proba(X_test)
        roc_auc = get_roc(y_test, y_score, y_train)
        roc_df = pd.DataFrame(roc_auc, index=['auc'])
        roc_df = pd.melt(roc_df, value_name='AUC', var_name='classes')
        roc_df['repeat'] = np.repeat(i, len(roc_df))
        metric_list.append(roc_df)
        print 'Trained %i times' % i
    return pd.concat(metric_list, axis=0)
def GetOptimalBandwidth(self, datalabel, bandlims, numbands):
    '''Optimize the bandwidth using leave-one-out cross-validation.

    Essentially, for a single bandwidth, a PDF is made with all points
    except one, and the unused point is tested against the model. This is
    done many times, and an average error is computed. This is done for
    each bandwidth, and the bandwidth with the lowest average error is
    returned. Example follows that at
    jakevdp.github.io/PythonDataScienceHandbook.

    Args:
        datalabel: string
            describes which datalabel in the dataframe to find the
            bandwidth for
        bandlims: array (length 2)
            limits to search for the optimal bandwidth in
        numbands: int
            number of bandwidths to try within the limits
    '''
    if bandlims[1] < 0 or bandlims[0] < 0:
        print("Bandwidth must be greater than zero")
        return
    bandwidths = np.linspace(bandlims[0], bandlims[1], numbands)
    data = self.df[datalabel]
    if isinstance(self.df[datalabel][0], np.ndarray):
        # flatten array-valued entries into a single 1-D array
        data_arr = []
        for i in range(len(self.df[datalabel])):
            data_arr = data_arr + list(self.df[datalabel][i])
        data = np.array(data_arr)
    if len(data) > 500:
        print("This may take some time depending on your data length.")
        print("numbands > 10 with len(data) > 500 starts to take a bit")
    grid = sgs.GridSearchCV(skn.KernelDensity(kernel='gaussian'),
                            {'bandwidth': bandwidths},
                            cv=cv.LeaveOneOut(len(data)),
                            verbose=1)
    grid.fit(data[:, None])
    thebandwidth = grid.best_params_['bandwidth']
    return thebandwidth
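# A standalone sketch of the same bandwidth search on synthetic data,
# assuming the aliases used above (sgs = sklearn.grid_search,
# skn = sklearn.neighbors, cv = sklearn.cross_validation). GridSearchCV
# scores KernelDensity by its held-out log-likelihood.
import numpy as np
from sklearn import grid_search as sgs
from sklearn import neighbors as skn
from sklearn import cross_validation as cv

data = np.random.normal(size=100)
bandwidths = np.linspace(0.05, 1.0, 20)
grid = sgs.GridSearchCV(skn.KernelDensity(kernel='gaussian'),
                        {'bandwidth': bandwidths},
                        cv=cv.LeaveOneOut(len(data)))
grid.fit(data[:, None])
print(grid.best_params_['bandwidth'])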
def DoLOOCV(all_x, all_y):
    loo = cross_validation.LeaveOneOut(len(all_x))
    acc = []
    for train_index, test_index in loo:
        train_val_x, test_x = all_x[train_index], all_x[test_index]
        train_val_y, test_y = all_y[train_index], all_y[test_index]
        train_val_x, train_val_y = shuffle(train_val_x, train_val_y,
                                           random_state=RANDOM_STATE)
        n_trn = len(train_val_x)
        n_dev = int(n_trn * VAL_RATIO)
        n_trn = n_trn - n_dev
        train_x = train_val_x[0:n_trn]
        train_y = train_val_y[0:n_trn]
        val_x = train_val_x[n_trn:]
        val_y = train_val_y[n_trn:]
        is_correct = ClassifierLoocv(train_x, train_y, val_x, val_y,
                                     test_x, test_y)
        acc.append(is_correct)
    return acc
def test_cross_val_predict():
    boston = load_boston()
    X, y = boston.data, boston.target
    cv = cval.KFold(len(boston.target))
    est = Ridge()

    # Naive loop (should be same as cross_val_predict):
    preds2 = np.zeros_like(y)
    for train, test in cv:
        est.fit(X[train], y[train])
        preds2[test] = est.predict(X[test])

    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_array_almost_equal(preds, preds2)

    preds = cval.cross_val_predict(est, X, y)
    assert_equal(len(preds), len(y))

    cv = cval.LeaveOneOut(len(y))
    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_equal(len(preds), len(y))

    Xsp = X.copy()
    Xsp *= (Xsp > np.median(Xsp))
    Xsp = coo_matrix(Xsp)
    preds = cval.cross_val_predict(est, Xsp, y)
    assert_array_almost_equal(len(preds), len(y))

    preds = cval.cross_val_predict(KMeans(), X)
    assert_equal(len(preds), len(y))

    def bad_cv():
        for i in range(4):
            yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])
    assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv())
def runOneOut():
    num_folds = 2
    num_instances = len(X)
    num_trees = 100
    loocv = cross_validation.LeaveOneOut(n=num_instances)

    model = LogisticRegression()
    #results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv, n_jobs=-1)
    #print("LogisticRegression Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)

    model = DecisionTreeClassifier(max_depth=5)
    #results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv, n_jobs=-1)
    #print("Decision Tree Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)

    #model = GradientBoostingClassifier(learning_rate=0.005, n_estimators=num_trees, random_state=seed, max_depth=5, min_samples_split=1600, min_samples_leaf=50, subsample=0.8)
    #results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv, n_jobs=-1)

    model = XGBClassifier()
    results = cross_validation.cross_val_score(model, X, Y.ravel(), cv=loocv,
                                               n_jobs=-1)
    print("XGBoost Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)
def use_grid_search_svm(traindata_features, traindata_labels, nfolds):
    X = traindata_features[:, 0:2]
    numobs = X.shape[0]
    hyperparam_grid = [{
        'C': np.logspace(-12, 12, 100, base=2),
        'gamma': np.logspace(-12, 12, 100, base=2)
    }]
    y = np.array(traindata_labels)
    # 1. first shuffle the data
    inds = list(range(numobs))
    shuffle(inds)
    X = X[inds, :]
    y = y[inds]
    cv = cross_validation.LeaveOneOut(numobs)
    grid_searcher = grid_search.GridSearchCV(svm.SVC(kernel='rbf'),
                                             hyperparam_grid, cv=cv)
    grid_searcher.fit(X, y)
    print(grid_searcher.best_score_)
    return grid_searcher.best_estimator_
def use_grid_search_LogisticRegression(traindata_features, traindata_labels,
                                       nfolds):
    X = traindata_features
    numobs = X.shape[0]
    hyperparam_grid = [{'C': np.logspace(-12, 12, 10, base=2)}]
    y = np.array(traindata_labels)
    # 1. first shuffle the data
    inds = list(range(numobs))
    shuffle(inds)
    X = X[inds, :]
    y = y[inds]
    cv = cross_validation.LeaveOneOut(numobs)
    grid_searcher = grid_search.GridSearchCV(linear_model.LogisticRegression(),
                                             hyperparam_grid, cv=cv)
    grid_searcher.fit(X, y)
    print(grid_searcher.best_score_)
    return grid_searcher.best_estimator_
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets, cross_validation

data = datasets.load_svmlight_file("a9a")
X = data[0]
print X.shape
y = data[1]

loo = cross_validation.LeaveOneOut(X.shape[0])
errors = []

# Train and find best k
# k = [1, 2, 5, 10, 20]
# for idx, k in enumerate(k):
#     score = 0
#     i = 0
#     for train_index, test_index in loo:
#         i += 1
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
#         clf = KNeighborsClassifier(k)
#         clf.fit(X_train, y_train)
#         score += clf.score(X_test, y_test)
#         if i % 1000 == 0:
#             print "Progress - i = " + str(i) + " k = " + str(k)
#     sc = score / float(X.shape[0])
#     print "Final score for k = " + str(k) + " " + str(sc)
#     errors.append(1 - sc)

clf = KNeighborsClassifier(20)
clf.fit(X, y)
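# The commented-out search above can be written more compactly with
# cross_val_score; a sketch under the same deprecated API (kept commented
# here too, since LOO over all of a9a is very slow):
# for k in [1, 2, 5, 10, 20]:
#     scores = cross_validation.cross_val_score(KNeighborsClassifier(k),
#                                               X, y, cv=loo)
#     print "k = %d, error = %f" % (k, 1 - scores.mean())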
for i in inds:
    t = read('store/tmp/%d_err.txt' % i)
    if t:
        m, s = t.split('elapsed')[0].split(' ')[-1].split(':')
        y.append(float(m) * 60 + float(s))
        f = [float(v) for v in t.split('Features: ')[-1].split('\n')[0].split(' ')]
        p = []
        print(f, y[-1])
        # build pairwise interaction terms (overwritten by the manual
        # feature selection below); inner loops renamed so they do not
        # shadow the outer loop variable i
        for j in range(len(f)):
            for k in range(j):
                p.append(f[j] * f[k])
            p.append(f[j])
        p = [f[0], f[3], f[0] * f[3]]
        x.append(p)

"""loo = cross_validation.LeaveOneOut(len(y))
regr = linear_model.LinearRegression()
scores = cross_validation.cross_val_score(regr, x, y,
                                          scoring='neg_mean_squared_error',
                                          cv=loo)
print(10**((-scores.mean())**.5))"""

model = linear_model.LinearRegression()
model.fit(x, y)
r_sq = model.score(x, y)

loo = cross_validation.LeaveOneOut(len(y))
scores = cross_validation.cross_val_score(model, x, y,
                                          scoring='neg_mean_squared_error',
                                          cv=loo)
print(((-scores.mean())**.5))
print('coefficient of determination:', r_sq)
print('intercept:', model.intercept_)
print('slope:', model.coef_)
B_enabled = True

# IRIS (arff) - load datasets
data, meta = arff.loadarff(open("datasets/iris.arff", "r"))
y_train = np.array(data['class'])
X_train = np.array([list(x) for x in data[meta._attrnames[0:-1]]])
X_train = X_train.toarray() if sps.issparse(X_train) else X_train  # avoid sparse data
class_names = np.unique(y_train)

# IRIS (arff) - cross validation example
clf = WisardClassifier(nobits=16, bleaching=B_enabled, notics=256,
                       mapping='linear', debug=True, default_bleaching=3)
# leave-one-out must iterate over samples, not classes
kf = cross_validation.LeaveOneOut(len(y_train))
predicted = cross_validation.cross_val_score(clf, X_train, y_train, cv=kf,
                                             n_jobs=1)
print("Accuracy Avg: %.2f" % predicted.mean())

# IRIS (libsvm) - load datasets
X_train, y_train = load_svmlight_file(open("datasets/iris.libsvm", "r"))
class_names = np.unique(y_train)
X_train = X_train.toarray() if sps.issparse(X_train) else X_train  # avoid sparse data

# IRIS - cross validation example (with fixed seed)
clf = WisardClassifier(nobits=16, notics=1024,
plot(np.arange(1, kMax), scoreK)
xlabel('k')
ylabel('accuracy')
title('k-NN 5-fold cross validation accuracy vs k')
grid(True)
savefig('q1b.png')
print('q1b done')

#### q1a : leave one out accuracy
# take a random n-size sample from the dataset
n = 10000
sampleIndices = np.random.randint(N, size=n)
X = X[sampleIndices, :]
y = y[sampleIndices]

loo = cross_validation.LeaveOneOut(n)
scoreK = []
for k in range(1, kMax):
    neigh = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validation.cross_val_score(neigh, X, y, cv=loo)
    scoreK.append(np.mean(scores))
print(scoreK)

plot(np.arange(1, kMax), scoreK)
xlabel('k')
ylabel('accuracy')
title('k-NN leave one out accuracy vs k')
grid(True)
savefig('q1a.png')
print('q1a done')
    chistogram.append(chist(imc))
    labels.append(fname[:-len('xx.jpg')])

print('Finished computing features.')
haralick = np.array(haralick)
chistogram = np.array(chistogram)
labels = np.array(labels)
haralick_plus_chist = np.hstack([chistogram, haralick])

clf = Pipeline([('preproc', StandardScaler()),
                ('classifier', LogisticRegression())])

from sklearn import cross_validation
cv = cross_validation.LeaveOneOut(len(images))
scores = cross_validation.cross_val_score(clf, haralick, labels, cv=cv)
print('Accuracy (Leave-one-out) with Logistic Regression [haralick features]: {:.1%}'
      .format(scores.mean()))
scores = cross_validation.cross_val_score(clf, chistogram, labels, cv=cv)
print('Accuracy (Leave-one-out) with Logistic Regression [color histograms]: {:.1%}'
      .format(scores.mean()))
scores = cross_validation.cross_val_score(clf, haralick_plus_chist, labels,
                                          cv=cv)
print(
def RF_Model(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    RFModel = RandomForestRegressor()
    RFModel.fit(Scaled_Input_Data, Output_Data)
    RF_Time = time.time() - T0
    print('The computational time of Random Forest Regression for ', n,
          ' examples is: ', RF_Time)
    MSEs_RF = cross_validation.cross_val_score(RFModel, Scaled_Input_Data,
                                               Output_Data,
                                               cv=cross_validation.LeaveOneOut(n),
                                               scoring="mean_absolute_error")
    MeanMSE_RF = np.mean(list(MSEs_RF))
    print('The average MSE of Random Forest Regression for ', n,
          ' examples is: ', (-1 * MeanMSE_RF))
    return (MeanMSE_RF, RFModel)
words = [k for k, v in filtered_word_dict.iteritems()]
wiki_requests_data = np.zeros((len(binary_wiki_requests), len(words)))
wiki_requests_target = np.zeros(2178)
request_count = 0
for request in binary_wiki_requests:
    for word in request['words']:
        if word in words:
            idx = words.index(word)
            wiki_requests_data[request_count][idx] += 1
    wiki_requests_target[request_count] = request['class']
    request_count += 1

loo = cross_validation.LeaveOneOut(len(binary_wiki_requests))
print len(loo)
print len(binary_wiki_requests)

accuracy = 0.0
count = 0
for train_index, test_index in loo:
    X_train, X_test = wiki_requests_data[train_index], wiki_requests_data[test_index]
    y_train, y_test = wiki_requests_target[train_index], wiki_requests_target[test_index]
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
def setup_indices(self, train_data, test_data):
    self.indices = skl_cross_validation.LeaveOneOut(len(test_data))
def region_CV_fits_and_errors(X, Y, P_X, P_Y, P_Y_dag, err_fun,
                              Omega=None, rel_type=2):
    n_inj = X.shape[1]
    outer_sets = cross_validation.LeaveOneOut(n_inj)
    err_reg = np.zeros((len(outer_sets),))
    err_homog = np.zeros((len(outer_sets),))
    rel_err_reg = np.zeros((len(outer_sets),))
    rel_err_homog = np.zeros((len(outer_sets),))
    GOF_reg = np.zeros((len(outer_sets),))
    GOF_homog = np.zeros((len(outer_sets),))
    for i, (train, test) in enumerate(outer_sets):
        # compare models in outer sets only, same as eventual test errors
        # in the nested cross-validation procedure
        X_train = X[:, train]
        X_test = X[:, test]
        Y_train = Y[:, train]
        Y_test = Y[:, test]
        if Omega is not None:
            Omega_train = Omega[:, train]
            Omega_test = Omega[:, test]
            # W = fit_linear_model_proj(X_train, Y_train, P_Y_dag, P_X,
            #                           Omega_train)
            W = fit_linear_model(P_X.dot(X_train), P_Y.dot(Y_train))
        else:
            W = fit_linear_model(P_X.dot(X_train), P_Y.dot(Y_train))
        Y_pred = W.dot(P_X.dot(X_test))
        Y_pred_homog = P_Y_dag.dot(Y_pred)
        Y_test_reg = P_Y.dot(Y_test)
        resid_reg = Y_pred - Y_test_reg      # regional matrix
        resid_homog = Y_pred_homog - Y_test  # voxel-homogeneous matrix
        err_reg[i] = err_fun(resid_reg)
        if Omega is not None:
            err_homog[i] = err_fun(proj_Omega(resid_homog, Omega_test))
        else:
            err_homog[i] = err_fun(resid_homog)
        if rel_type == 1:
            rel_err_reg[i] = err_reg[i] / err_fun(Y_test_reg)
            rel_err_homog[i] = err_homog[i] / err_fun(Y_test)
            GOF_reg[i] = err_fun(W.dot(P_X.dot(X_train)) - P_Y.dot(Y_train)) / \
                err_fun(P_Y.dot(Y_train))
            GOF_homog[i] = err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train))) - Y_train) / \
                err_fun(Y_train)
        elif rel_type == 2:
            rel_err_reg[i] = \
                2 * err_reg[i] / (err_fun(Y_test_reg) + err_fun(Y_pred))
            GOF_reg[i] = 2 * err_fun(W.dot(P_X.dot(X_train)) - P_Y.dot(Y_train)) / \
                (err_fun(P_Y.dot(Y_train)) + err_fun(W.dot(P_X.dot(X_train))))
            if (Omega is not None) and proj_errors:
                rel_err_homog[i] = 2 * err_homog[i] / \
                    (err_fun(proj_Omega(Y_test, Omega_test)) +
                     err_fun(proj_Omega(Y_pred_homog, Omega_test)))
                GOF_homog[i] = \
                    2 * err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train))) - Y_train) / \
                    (err_fun(proj_Omega(Y_train, Omega_train)) +
                     err_fun(proj_Omega(P_Y_dag.dot(W.dot(P_X.dot(X_train))),
                                        Omega_train)))
            else:
                rel_err_homog[i] = 2 * err_homog[i] / \
                    (err_fun(Y_test) + err_fun(Y_pred_homog))
                GOF_homog[i] = \
                    2 * err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train))) - Y_train) / \
                    (err_fun(Y_train) + err_fun(P_Y_dag.dot(W.dot(P_X.dot(X_train)))))
        # if i == 2:
        #     import pdb
        #     pdb.set_trace()
    return (err_reg, err_homog, rel_err_reg, rel_err_homog,
            GOF_reg, GOF_homog)
def compute_acc_conf(x, y, confounds, verbose=False, balanced=True,
                     loo=False, optimize=True, C=.01):
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    # remove intra matrix mean and var
    #x = ts.normalize_data(x)
    #cv = cross_validation.KFold(len(y), n_folds=10)
    if loo:
        cv = cross_validation.LeaveOneOut(len(y))
    else:
        cv = StratifiedKFold(y=encoder.transform(y), n_folds=10)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    total_test_score = []
    y_pred = []
    #clf_array = []
    bc_all = []
    prec = []
    recall = []
    if len(np.unique(y)) == 1:
        print 'Unique class: 100%', np.sum(encoder.transform(y) == 0) / len(y)
        return (1., 0., len(y))
    for i, (train, test) in enumerate(cv):
        select_x = x.copy()
        #betacluster = bc.BetaCluster(crm.transform(confounds[train, :], select_x[train, :]),
        #                             encoder.transform(y[train]), 100, k_feature=200)
        #bc_all.append(betacluster)
        if balanced:
            clf = SVC(kernel='linear', class_weight='auto', C=C)
        else:
            clf = SVC(kernel='linear', C=C)
        if len(confounds) == 0:
            xtrain = select_x[train, :]
            xtest = select_x[test, :]
        else:
            crm = ConfoundsRm(confounds[train, :], select_x[train, :])
            xtrain = crm.transform(confounds[train, :], select_x[train, :])
            xtest = crm.transform(confounds[test, :], select_x[test, :])
        ytrain = encoder.transform(y[train])
        ytest = encoder.transform(y[test])
        #clf.probability = True
        if optimize:
            clf, score = plib.grid_search(clf, xtrain, ytrain, n_folds=10,
                                          verbose=verbose)
        clf.fit(xtrain, ytrain)
        total_test_score.append(clf.score(xtest, ytest))
        #clf_array.append(clf)
        prec.append(metrics.precision_score(ytest, clf.predict(xtest)))
        recall.append(metrics.recall_score(ytest, clf.predict(xtest)))
        if loo:
            y_pred.append(clf.predict(xtest))
        if verbose:
            print('nSupport: ', clf.n_support_)
            print "Train:", clf.score(xtrain, ytrain)
            print "Test :", clf.score(xtest, ytest)
            print "Prediction :", clf.predict(xtest)
            print "Real Labels:", ytest
            print('Precision:', prec[-1], 'Recall:', recall[-1])
    if loo:
        total_std_test_score = estimate_std(
            metrics.accuracy_score(encoder.transform(y), np.array(y_pred)),
            len(y))
        print('Mean:', np.mean(total_test_score),
              'Std:', total_std_test_score,
              'AvgPrecision:', np.mean(prec), 'AvgRecall:', np.mean(recall))
        return (np.mean(total_test_score), total_std_test_score, len(y))
    else:
        print('Mean:', np.mean(total_test_score),
              'Std:', np.std(total_test_score),
              'AvgPrecision:', np.mean(prec), 'AvgRecall:', np.mean(recall))
        return (np.mean(total_test_score), np.std(total_test_score))
def First_Model_SVR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"C": [1e-2, 1e-1, 1e0, 1e1, 1e2],
                 "gamma": np.logspace(-4, 2, 6)}
    svr_Tuned = GridSearchCV(SVR(kernel='rbf', gamma=0.1, tol=0.005),
                             cv=5, param_grid=Grid_Dict,
                             scoring="mean_absolute_error")
    svr_Tuned.fit(Scaled_Input_Data, Output_Data)
    SVR_MSE = SVR(kernel='rbf', C=svr_Tuned.best_params_['C'],
                  gamma=svr_Tuned.best_params_['gamma'], tol=0.01)
    SVR_Time = time.time() - T0
    print('The computational time of Radial based Support Vector Regression for ',
          n, ' examples is: ', SVR_Time)
    MSEs_SVR = cross_validation.cross_val_score(SVR_MSE, Scaled_Input_Data,
                                                Output_Data,
                                                cv=cross_validation.LeaveOneOut(n),
                                                scoring="mean_absolute_error")
    MeanMSE_SVR = np.mean(list(MSEs_SVR))
    print('The average MSE of Radial based Support Vector Regression for ',
          n, ' examples is: ', (-1 * MeanMSE_SVR))
    return (MeanMSE_SVR, svr_Tuned)
def tenFoldCV_onChicagoCrimeData(features=['corina'], CVmethod='10Fold', P=10,
                                 NUM_ITER=20, SHUFFLE=True):
    """
    Use different years' data to train the NB model
    """
    YEARS = range(2003, 2014)
    Y = []
    C = []
    FL = []
    GL = []
    T = []
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year - 1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        popul = c[1][:, 0].reshape((77, 1))
        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000
        W2 = generate_geographical_SpatialLag_ca()
        f1 = np.dot(W, Yhat)
        f2 = np.dot(W2, Yhat)
        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])
    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate((f, C), axis=1)
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate((f, FL), axis=1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']
    if SHUFFLE:
        f, Y = shuffle(f, Y)
    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p=P)
    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0
    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []
    for train_idx, test_idx in splt:
        cnt += 1
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]
        # write file for invoking NB regression in R
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")
        pd.DataFrame(f_train, columns=columnName).to_csv("f_train.csv",
                                                         sep=",", index=False)
        pd.DataFrame(f_test, columns=columnName).to_csv("f_test.csv",
                                                        sep=",", index=False)
        # NB regression
        nbres = subprocess.check_output(['Rscript', 'nbr_eval_kfold.R']).split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs(Y_test - y1)
        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()
        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append(np.mean(ae))
        sd_mae2.append(np.std(ae))
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append(np.mean(re))
        sd_mre2.append(np.std(re))
        med_mre2 += re.tolist()
        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)
    if CVmethod == 'leaveOneOut':
        print np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
        print np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2)
        return y_gnd, y_lr
    else:
        print np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1), \
            np.mean(mre1), np.mean(sd_mre1), np.median(med_mre1),
        print np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2), \
            np.mean(mre2), np.mean(sd_mre2), np.median(med_mre2)
        return mae1, mae2
# exercise 7.1.2
from pylab import *
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation

# requires data from exercise 4.1.1
from ex4_1_1 import *

# Maximum number of neighbors
L = 40

CV = cross_validation.LeaveOneOut(N)
errors = np.zeros((N, L))
i = 0
for train_index, test_index in CV:
    print('Crossvalidation fold: {0}/{1}'.format(i + 1, N))

    # extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index, :]
    X_test = X[test_index, :]
    y_test = y[test_index, :]

    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    for l in range(1, L + 1):
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, ravel(y_train))
        y_est = knclassifier.predict(X_test)
        errors[i, l - 1] = np.sum(y_est[0] != y_test[0, 0])

    # advance the fold counter
    i += 1
valid_param_vals = []
print "Checking for errors for classifier %s" % (classifier_to_print)
print "Params: %s" % (classifier_varying_params)
for param_val in classifier_varying_params[key]:
    try:
        alt_classifier = base.clone(classifier)
        alt_classifier = alt_classifier.set_params(**{key: param_val})
        alt_classifier.fit(population, training_labels)
        valid_param_vals.append(param_val)
    except ValueError as e:
        info_fh.write("The parameter %s:%s for SVM %s for antigen %s errored: '%s'\n"
                      % (key, param_val, classifier_to_print, antigen_type, e))

if len(valid_param_vals) > 0:
    try:
        print "Running grid for classifier %s" % (classifier_to_print)
        print "Params: %s" % (classifier_varying_params)
        cv = cross_validation.LeaveOneOut(len(training_labels))
        grid = grid_search.GridSearchCV(classifier, {key: valid_param_vals},
                                        cv=cv, refit=False, verbose=3)
        t0 = time.time()
        grid.fit(population, training_labels)
        t1 = time.time()
        time_logging_fh.write('Fitting grid with SVM %s for antigen %s took %f seconds\n'
                              % (classifier_to_print, antigen_type, t1 - t0))
        print "The best parameters for antigen %s, SVM %s are:" % (antigen_type, classifier_to_print)
        print "%s with a score of %f" % (grid.best_params_, grid.best_score_)
        print "Grid scores:"
        for thing in grid.grid_scores_:
            print thing
        info_fh.write("The best parameters for SVM %s for antigen %s are %s with a score of %f\n"
                      % (classifier_to_print, antigen_type, grid.best_params_, grid.best_score_))
        info_fh.write("Grid scores:\n")
        info_fh.write("%s\n" % (grid.grid_scores_))
        plt.figure()
        plt.errorbar(
import pandas
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
num_instances = len(X)
loocv = cross_validation.LeaveOneOut(n=num_instances)
model = LogisticRegression()
results = cross_validation.cross_val_score(model, X, Y, cv=loocv)
print("Accuracy: %.3f%% (%.3f%%)") % (results.mean() * 100.0, results.std() * 100.0)
def benchmark(df):
    predictors = [
        "LinearRegression", "Lasso", "AdaBoostRegressor",
        "RandomForestRegressor", "DecisionTreeRegressor"
    ]
    predictorsMapper = {
        'LinearRegression': linear_model.LinearRegression(),
        'Lasso': linear_model.Lasso(alpha=0.1, max_iter=1000),
        'AdaBoostRegressor': ensemble.AdaBoostRegressor(),
        'RandomForestRegressor': ensemble.RandomForestRegressor(),
        'DecisionTreeRegressor': tree.DecisionTreeRegressor()
    }

    # Separate data by operators
    sumData = df[(df.Operator == 1)]
    sumTarget = sumData.Time
    sumData = sumData.drop(sumData.columns[[1, 3]], axis=1)
    subData = df[(df.Operator == 2)]
    subTarget = subData.Time
    subData = subData.drop(subData.columns[[1, 3]], axis=1)
    mulData = df[(df.Operator == 3)]
    mulTarget = mulData.Time
    mulData = mulData.drop(mulData.columns[[1, 3]], axis=1)
    divData = df[(df.Operator == 4)]
    divTarget = divData.Time
    divData = divData.drop(divData.columns[[1, 3]], axis=1)

    sumLoo = cross_validation.LeaveOneOut(len(sumTarget))
    subLoo = cross_validation.LeaveOneOut(len(subTarget))
    mulLoo = cross_validation.LeaveOneOut(len(mulTarget))
    divLoo = cross_validation.LeaveOneOut(len(divTarget))

    for p in predictors:
        print("Benchmarking " + p + "...")
        scoreTotal = 0
        sumRegr = predictorsMapper.get(p, False)
        subRegr = predictorsMapper.get(p, False)
        mulRegr = predictorsMapper.get(p, False)
        divRegr = predictorsMapper.get(p, False)
        scoreSum = abs(cross_validation.cross_val_score(
            sumRegr, sumData, sumTarget,
            scoring='mean_squared_error', cv=sumLoo).mean())
        scoreSub = abs(cross_validation.cross_val_score(
            subRegr, subData, subTarget,
            scoring='mean_squared_error', cv=subLoo).mean())
        scoreMul = abs(cross_validation.cross_val_score(
            mulRegr, mulData, mulTarget,
            scoring='mean_squared_error', cv=mulLoo).mean())
        scoreDiv = abs(cross_validation.cross_val_score(
            divRegr, divData, divTarget,
            scoring='mean_squared_error', cv=divLoo).mean())
        scoreTotal = scoreSum + scoreSub + scoreMul + scoreDiv
        print("Mean Squared Error (by operator):")
        print("\tSum regressor: " + str(scoreSum))
        print("\tSubtraction regressor: " + str(scoreSub))
        print("\tMultiplication regressor: " + str(scoreMul))
        print("\tDivision regressor: " + str(scoreDiv))
        print("\tTotal: " + str(scoreTotal))
def Second_Model_KRR(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    Grid_Dict = {"alpha": [1e0, 1e-1, 1e-2],
                 "gamma": np.logspace(-2, 1, 3)}
    krr_Tuned = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1),
                             cv=5, param_grid=Grid_Dict,
                             scoring="mean_absolute_error")
    krr_Tuned.fit(Scaled_Input_Data, Output_Data)
    KRR_MSE = KernelRidge(kernel='rbf',
                          alpha=krr_Tuned.best_params_['alpha'],
                          gamma=krr_Tuned.best_params_['gamma'])
    KRR_Time = time.time() - T0
    print('The computational time of Kernel Ridge Regression for ', n,
          ' examples is: ', KRR_Time)
    MSEs_KRR = cross_validation.cross_val_score(KRR_MSE, Scaled_Input_Data,
                                                Output_Data,
                                                cv=cross_validation.LeaveOneOut(n),
                                                scoring="mean_absolute_error")
    MeanMSE_KRR = np.mean(list(MSEs_KRR))
    print('The average MSE of Kernel Ridge Regression for ', n,
          ' examples is: ', (-1 * MeanMSE_KRR))
    return (MeanMSE_KRR, krr_Tuned)
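# A minimal driver sketch for the three helpers above (RF_Model,
# First_Model_SVR, Second_Model_KRR), assuming their imports (time, numpy,
# cross_validation, GridSearchCV, SVR, KernelRidge, RandomForestRegressor)
# are in scope; the synthetic data and scaling step are assumptions.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X_raw = rng.rand(40, 3)
y_demo = X_raw.sum(axis=1) + 0.05 * rng.randn(40)
X_scaled = StandardScaler().fit_transform(X_raw)

for fit_fun in (RF_Model, First_Model_SVR, Second_Model_KRR):
    mean_score, fitted = fit_fun(X_scaled, y_demo)
    # the "mean_absolute_error" scorer returns negated errors, hence -1 *
    print(fit_fun.__name__, -1 * mean_score)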