def svm_regression(self, out_filename, delete=True): self.to_svmlight(out_filename) train_file = out_filename+".train" test_file = out_filename+".val" model_file = train_file+".mod" classified_file = test_file+".class" classified_file_original = test_file+".class_orig" print "Writing output to " + self.base_file+"_svm.txt" with open(self.base_file+"_svm.txt", 'w') as fx: d = 1 for t in [0]: #[0, 1, 1, 1, 1, 1, 2]: print "---" print "SVM Regression..." if t != 1: MSVMLight.learn(train_file, model_file, z='r', t=t) else: MSVMLight.learn(train_file, model_file, z='r', t=t, d=d) d += 1 print "Learnt Model" MSVMLight.classify(test_file, model_file, classified_file) MSVMLight.classify(train_file, model_file, classified_file_original) with open(test_file, 'r') as f: ytrue = f.readlines() ytrue = [float(l.split(' ', 1)[0]) for l in ytrue] with open(classified_file, 'r') as f: yguess = f.readlines() yguess = [float(l.replace('\n', '')) for l in yguess] with open(train_file, 'r') as f: ytrue_orig = f.readlines() ytrue_orig = [float(l.split(' ', 1)[0]) for l in ytrue_orig] with open(classified_file_original, 'r') as f: yguess_orig = f.readlines() yguess_orig = [float(l.replace('\n', '')) for l in yguess_orig] fx.write("---t=%d d=%d\n" % (t, d)) fx.write("R2 (val/test)\n") fx.write("%f %f \n" % (metrics.r2_score(ytrue, yguess), metrics.r2_score(ytrue_orig, yguess_orig))) fx.write("MSE (val/test)\n") fx.write("%f %f \n" % (metrics.mean_square_error(ytrue, yguess), metrics.mean_square_error(ytrue_orig, yguess_orig))) fx.write("---\n")
def test_all_regressors(): x, y = make_friedman2(10000) x_train, y_train, x_test, y_test = test_helpers.split_dataset(x,y) #print y_test[:100] ols = LinearRegression() ols.fit(x_train, y_train) ols_pred = ols.predict(x_test) #print ols_pred[:100] ols_mse = mean_square_error(y_test, ols_pred) for fn in regressors: print fn model = fn(x_train,y_train) print model pred = model.predict(x_test) #print pred[:100] mse = mean_square_error(y_test, pred) print "OLS MSE:", ols_mse, " Current MSE:", mse print "Ratio:", mse / ols_mse assert ols_mse > 1.1*mse
# NOTE(review): duplicate of the test_all_regressors defined just above
# (differs only in whitespace); this copy shadows the earlier one -- keep one.
def test_all_regressors():
    # Synthetic regression benchmark: 10000 friedman2 samples.
    x, y = make_friedman2(10000)
    x_train, y_train, x_test, y_test = test_helpers.split_dataset(x, y)
    #print y_test[:100]
    ols = LinearRegression()
    ols.fit(x_train, y_train)
    ols_pred = ols.predict(x_test)
    #print ols_pred[:100]
    # OLS baseline error on the held-out split.
    # (NOTE(review): scikit-learn's current name is mean_squared_error --
    # `mean_square_error` only existed in very old releases; verify imports.)
    ols_mse = mean_square_error(y_test, ols_pred)
    for fn in regressors:
        print fn
        model = fn(x_train, y_train)
        print model
        pred = model.predict(x_test)
        #print pred[:100]
        mse = mean_square_error(y_test, pred)
        print "OLS MSE:", ols_mse, " Current MSE:", mse
        print "Ratio:", mse / ols_mse
        # Each candidate must beat the OLS baseline by at least 10%.
        assert ols_mse > 1.1 * mse
from sklearn.datasets import load_boston boston = load_boston() from matplotlib import pyplot as plt import scipy as sp from sklearn.metrics import mean_square_error plt.figure(1) plt.hist(boston.target) plt.xlabel('price ($1000s)') plt.ylabel('count') from sklearn.linear_model import LinearRegression clf = LinearRegression() clf.fit(boston.data[::2], boston.target[::2]) predicted = clf.predict(boston.data[1::2]) plt.figure(2) plt.scatter(boston.target[1::2], predicted) plt.plot([0, 50], [0, 50], '--k') plt.axis('tight') plt.xlabel('True price ($1000s)') plt.ylabel('Predicted price ($1000s)') print mean_square_error(boston.target[1::2],predicted) plt.show()
# Hold out 25% of the data for testing.
data_train, data_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

###################################################
# 3 - define the regression model
clf = linear_model.LinearRegression()

###################################################
# 4 - fit the model
clf.fit(data_train, y_train)
clf.coef_

# 5 - predict using the model
# Fix: predict from the held-out *features*; the original passed y_train
# (the training target vector), whose predictions could not line up with
# y_test in the metrics below.
y_predicted = clf.predict(data_test)

# 6 - validate the model
print(metrics.explained_variance_score(y_test, y_predicted))  #ES = SSR/SST
print(metrics.mean_absolute_error(y_test, y_predicted))       #MAE (l1)
# Fix: sklearn names this mean_squared_error.
print(metrics.mean_squared_error(y_test, y_predicted))        #MSE (l2)
print(metrics.r2_score(y_test, y_predicted))                  #R2 = 1-SSE/SST

# 7 - print the result
import matplotlib.pyplot as plt

plt.scatter(data_test, y_test, color='black')
# Fix: the original line had a stray closing parenthesis
# (`plt.plot(data_test, y_predicted), color='blue', linewidth=3)`),
# which is a syntax error.
plt.plot(data_test, y_predicted, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
def TRVP(Ym, Ypred):
    """Return the MSE of (Ym, Ypred) divided again by the sample count."""
    n_samples = len(Ym)
    return mean_square_error(Ym, Ypred) / n_samples


def correcting(m):
    """Inflate each value of m by 1% times its position index."""
    adjusted = []
    for a, v in enumerate(m):
        adjusted.append(v + v * 0.01 * a)
    return adjusted
def EVRP(Ym, Ypred):
    """Error-to-variance ratio: MSE scaled by the squared norm of Ym."""
    scale = norm(Ym) ** 2
    return mean_square_error(Ym, Ypred) / scale

# NOTE(review): the second label reads 'ERVP' while the metric is 'EVRP' --
# possibly a typo, but it is a runtime string so it is left unchanged here.
plot_metrics('EVRP','ERVP',apply_metrics(EVRP))
# 8.1 Univariate ROC-AUC: score each feature alone by fitting a
# single-feature decision tree and measuring AUC on the test split.
roc_values = []
for col in X_train.columns:
    clf = DecisionTreeClassifier()
    clf.fit(X_train[col].to_frame(), y_train)
    scores = clf.predict_proba(X_test[col].to_frame())
    roc_values.append(roc_auc_score(y_test, scores[:, 1]))
# Afterwards, rank the features and keep the highest-scoring ones.

# 8.2 Univariate MSE for regression: same idea with a regression tree
# (lower score = more predictive feature).
mse_values = []
for col in X_train.columns:
    clf = DecisionTreeRegressor()
    clf.fit(X_train[col].to_frame(), y_train)
    preds = clf.predict(X_test[col].to_frame())
    mse_values.append(mean_square_error(y_test, preds))
# Rank it!

################################# B. Wrapper Methods ############################################
# 1. Forward Selection: add one feature at a time recursively
# 2. Backward Selection: remove one feature at a time recursively
# 3. Exhaustive Search: searches across all possible feature combinations
## Procedure
# 1. Search for the subset of features
# 2. Build the Machine Learning Model on the selected feature subset
# 3. Evaluate Model Performance
# 4. Repeat
# Standardise features and targets with the training-split statistics.
std = X_train.std(axis=0)
mean = X_train.mean(axis=0)
X_train = (X_train - mean) / std
X_test = (X_test - mean) / std
std = y_train.std(axis=0)
mean = y_train.mean(axis=0)
y_train = (y_train - mean) / std
y_test = (y_test - mean) / std
# Collect garbage before timing so allocator noise stays out of the numbers.
gc.collect()
print "- benching ElasticNet"
# NOTE(review): `rho` is the pre-0.13 scikit-learn spelling of l1_ratio, and
# `mean_square_error` was later renamed mean_squared_error -- this chunk
# targets an old release; confirm before upgrading sklearn.
clf = ElasticNet(alpha=alpha, rho=0.5, fit_intercept=False)
tstart = time()
clf.fit(X_train, y_train)
# Slot 0 stores the test MSE, slot 1 the fit wall-clock time.
# (MSE is symmetric, so the (pred, true) argument order is harmless.)
elnet_results[i, j, 0] = mean_square_error(clf.predict(X_test), y_test)
elnet_results[i, j, 1] = time() - tstart
gc.collect()
print "- benching SGD"
# Heuristic iteration budget: fewer epochs for larger training sets.
n_iter = np.ceil(10 ** 4.0 / n_train)
clf = SGDRegressor(alpha=alpha, fit_intercept=False, n_iter=n_iter,
                   learning_rate="invscaling", eta0=.01, power_t=0.25)
tstart = time()
clf.fit(X_train, y_train)
sgd_results[i, j, 0] = mean_square_error(clf.predict(X_test), y_test)
sgd_results[i, j, 1] = time() - tstart
# <codecell> permdata = np.load('100iter.npz') hist(permdata['distribution'], 64, color=[0.6,0.6,0.6]) plot([permdata['value'], permdata['value']], [0, 12], color='r', linewidth=2) title('p = %.3f' % max(1./100, (1-permdata['pvalue']))) xlim([390, 1100]) xlabel('Mean square error (lower=better)') savefig("figures/permtest_hist.svg") savefig("figures/permtest_hist.png", dpi=600) # <codecell> msedata = [] for idx, res in enumerate(result_lsas): msedata.append((skm.mean_square_error(res[0], res[1]), skm.mean_square_error(cvres['result'][idx][0], cvres['result'][idx][1]))) # <codecell> print wilcoxon(np.diff(msedata, axis=1).ravel()) boxplot(np.diff(msedata, axis=1)) # <markdowncell> # ##Amygdala responses # <codecell> amygdata = recfromcsv('AmygdalaResponses.csv', names=True)
def regression(self, type='PCA'): self.lm = linear_model.LinearRegression() if type == 'PCA': X = self.Xt Xv = self.Xvt else: X = self.X Xv = self.Xv self.lm.fit(X, self.Y) Ypv = self.lm.predict(Xv) Yp = self.lm.predict(X) print "Writing output to " + self.base_file+"_"+type+".txt" with open(self.base_file+"_"+type+".txt", 'w') as f: if type == 'Linear': f.write("---\n") f.write("Linear Components\n") for i, x in enumerate(self.select_x): f.write("%d\t%s\n" % (i,x)) f.write("---\n") f.write("%s Regression...\n" % type) f.write("R2 Score (Val / Test) \n") f.write("%f %f \n" % (metrics.r2_score(self.Yv, Ypv), metrics.r2_score(self.Y, Yp))) f.write("MSE (Val / Test)") f.write("%f %f \n" % (metrics.mean_square_error(self.Yv, Ypv), metrics.mean_square_error(self.Y, Yp))) f.write("Coefficients\n") f.write("%s\n" % self.lm.coef_) f.write("Intercept\n") f.write("%s\n" % self.lm.intercept_) f.write("---\n") # Do R Linear Regression lm_string = "y ~ x0" data_frame_val = {} data_frame_train = {} for i in range(len(self.select_x)): R.globalenv['x%d' % i] = R.FloatVector(X[:,i].tolist()) data_frame_val['x%d' % i] = R.FloatVector(Xv[:,i].tolist()) #data_frame_train['x%d' % i] = R.FloatVector(X[:,i].tolist()) if i > 0: lm_string += " + x%d" % i R.globalenv['y'] = R.FloatVector(self.Y) data_frame_val['y'] = R.FloatVector(self.Yv) #data_frame_train['y'] = R.FloatVector(self.Y) data_frame_val = R.DataFrame(data_frame_val) #data_frame_train = R.DataFrame(data_frame_train) #R.r.attach(data_frame_train) fit = R.r.lm(lm_string) aic = R.r.AIC(fit) f.write("%s\n" % R.r.summary(fit)) f.write("%s\n" % aic) #R.r.attach(data_frame_val) # Print Test R2 Value predicted = R.r.predict(fit) YpR = [] for p in predicted: YpR.append(p) f.write("Test: %s\n" % metrics.r2_score(self.Y, YpR)) # Print Validation R2 Value for i in range(len(self.select_x)): R.globalenv['x%d' % i] = R.FloatVector(Xv[:,i].tolist()) R.globalenv['y'] = R.FloatVector(self.Yv) predicted = R.r.predict(fit, newdata=data_frame_val) 
YpvR = [] for p in predicted: YpvR.append(p) f.write("Val: %s\n" % metrics.r2_score(self.Yv, YpvR)) # fit2 = R.r.lm('y ~ x1 + x2 + x3') # print R.r.anova(fit, fit2) #print aic[0]
# Simple linear regression of salary on years of experience, with
# train/test visualisation and error metrics.
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
# (Removed a stray bare `LinearRegression(copy_X=True, ...)` expression and
# a bare `y_pred` line -- pasted REPL echoes with no effect in a script.)
y_pred = lr.predict(X_test)

#VISUALIZE THE TRAIN RESULTS
plt.scatter(X_train, y_train, color='blue')
plt.plot(X_train, lr.predict(X_train), color='red')
plt.title('Salary ~ Experience (Train Set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

#VISUALIZE THE TEST RESULTS
plt.scatter(X_test, y_test, color='blue')
# Fix: the fitted line on the test chart must come from the test inputs;
# plotting lr.predict(X_train) against X_test mismatches x/y lengths.
plt.plot(X_test, lr.predict(X_test), color='red')
plt.title('Salary Vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

#CALCULATING THE RESIDUALS
from sklearn import metrics

print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
# Fix: sklearn names this mean_squared_error.
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
# Fix: RMSE is the square root of the MSE, not of the MAE.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
def hold_out(X):
    """Randomly split X into (train, test) lists, ~25% going to test."""
    train, test = [], []
    for record in X:
        # Fix: the original abused a conditional expression for its side
        # effects (`test.append(i) if ... else train.append(i)`); a plain
        # if/else states the intent without relying on discarded values.
        if int(random() * 4) == 0:
            test.append(record)
        else:
            train.append(record)
    return (train, test)


# Parse a space-separated numeric file, dropping the trailing empty field.
read_data = lambda file_name: [
    map(float, line.split(' ')[0:-1])
    for line in open(file_name, 'r').readlines()
]

# A record is only kept when it has 9 fields: 8 features + 1 label.
data = lambda records: [record[0:-1] for record in records if len(record) == 9]
labels = lambda records: [record[-1] for record in records if len(record) == 9]

# Score a fitted model by the MSE of its (parsed) per-sample predictions.
match_rate = lambda classifier, data, labels, Paser: mean_square_error(
    labels, [Paser(classifier.predict(i)) for i in data])


def best_grid(train_data, train_labels, validation_data, validation_labels,
              grid, Classifier, Paser):
    """Return the grid value whose model scores the lowest validation MSE."""
    best = (sys.maxint, 0)  # (best score so far, best grid value)
    for i in grid:
        classifier = Classifier(i)
        classifier.fit(train_data, train_labels)
        matches = match_rate(classifier, validation_data, validation_labels,
                             Paser)
        if matches < best[0]:
            best = (matches, i)
    return best[1]


train, validation = hold_out(read_data('./data/bank8FM.data'))
from sklearn.neighbors import KNeighborsRegressor
from rbf import RBF
# Fix: the sklearn metric is named mean_squared_error; alias it to the
# local name the rest of this module uses.
from sklearn.metrics import mean_squared_error as mean_square_error
from numpy import mean
from random import random
import math
# Fix: best_grid() reads sys.maxint but sys was never imported.
import sys


def hold_out(X):
    """Randomly split X into (train, test) lists, ~25% going to test."""
    train, test = [], []
    for record in X:
        # Fix: plain if/else instead of a side-effecting conditional
        # expression.
        if int(random() * 4) == 0:
            test.append(record)
        else:
            train.append(record)
    return (train, test)


# Parse a space-separated numeric file, dropping the trailing empty field.
read_data = lambda file_name: [map(float, line.split(' ')[0:-1])
                               for line in open(file_name, 'r').readlines()]

# A record is only kept when it has 9 fields: 8 features + 1 label.
data = lambda records: [record[0:-1] for record in records if len(record) == 9]
labels = lambda records: [record[-1] for record in records if len(record) == 9]

# Score a fitted model by the MSE of its (parsed) per-sample predictions.
match_rate = lambda classifier, data, labels, Paser: mean_square_error(
    labels, [Paser(classifier.predict(i)) for i in data])


def best_grid(train_data, train_labels, validation_data, validation_labels,
              grid, Classifier, Paser):
    """Return the grid value whose model scores the lowest validation MSE."""
    best = (sys.maxint, 0)  # (best score so far, best grid value)
    for i in grid:
        classifier = Classifier(i)
        classifier.fit(train_data, train_labels)
        matches = match_rate(classifier, validation_data, validation_labels,
                             Paser)
        if matches < best[0]:
            best = (matches, i)
    return best[1]


train, validation = hold_out(read_data('./data/bank8FM.data'))
train_data, train_labels = data(train), labels(train)
validation_data, validation_labels = data(validation), labels(validation)
test = read_data('./data/bank8FM.test')
test_data, test_labels = data(test), labels(test)