# Fragment: score two pre-built models on a spam train/test split and plot
# with bokeh. NOTE(review): `ols`, `clf`, `train`, `test`, `spamtrain`,
# `spamtest`, `figure` and `bokehShow` are defined earlier, outside this view.
ols.fit(train, spamtrain)
expected = spamtest
predicted = clf.predict(test)
predicted1 = ols.predict(test)
#print(spamtrain)
#print(predicted)
# Held-out scores of both models (sklearn .score convention).
print(clf.score(test, spamtest))
print(ols.score(test, spamtest))
# Create a blank figure with labels
p = figure(plot_width = 600, plot_height = 600, title = 'Example Glyphs', x_axis_label = 'X', y_axis_label = 'Y')
# Add squares glyph — plots clf's X offsets against ols's coefficients.
# NOTE(review): pairing X_offset_ with coef_ looks odd; confirm intent.
p.square(clf.X_offset_, ols.coef_, size = 12, color = 'navy', alpha = 0.6)
bokehShow(p)
# random_state=0)) t1 = time() sc.fit(X_train, y_train) sc_time = time() -t1 computed_coefs = sc.inverse_transform() computed_coefs = np.reshape(computed_coefs, [size, size, size]) score = sc.score(X_test, y_test) ############################################################################### # Compute the results for simple BayesianRidge t1 = time() clf.fit(X_train, y_train) bayes_time = time() - t1 bayes_coefs = clf.coef_ bayes_score = clf.score(X_test, y_test) bayes_coefs = bayes_coefs.reshape((size, size, size)) ############################################################################### # Plot the results pl.close('all') pl.figure() pl.title('Scores of the supervised clustering') pl.subplot(2, 1, 1) pl.plot(np.arange(len(sc.scores_)), sc.scores_) pl.xlabel('score') pl.ylabel('iteration') pl.title('Score of the best parcellation of each iteration') pl.subplot(2, 1, 2)
class IndividualTest:
    """Per-column (univariate) regression experiments.

    Each ``*_reg`` method takes a feature frame ``X``, a target ``Y``, a
    train/test split ratio, the list of columns to test, and an output csv
    path. For every column it fits the model on that single column,
    collects coefficient/intercept/train/test scores into a DataFrame
    (one column of results per feature) and exports it via FileIO.

    Depends on the project helpers Test, FileIO and DrawChart2 and the
    scikit-learn estimators imported at module level.
    """

    def __init__(self):
        self.test = Test()
        self.file_io = FileIO()
        self.lr = LinearRegression(normalize=True)
        self.br = BayesianRidge()
        self.svr_poly = SVR(kernel='poly', C=1e5, degree=2)
        self.svr_rbf = SVR(kernel='rbf', C=5e4, gamma='scale')
        self.svr_sig = SVR(kernel='sigmoid', C=1e3)
        self.sc = StandardScaler()
        self.ms = MinMaxScaler()
        self.chart = DrawChart2()

    def lin_reg(self, X, Y, train_test_ratio, col_list, out_path):
        """Single-feature linear regression for every column in col_list."""
        # Result frame: fixed row labels, one column of results per feature.
        df = pd.DataFrame(
            index=['coefficient', 'intercept', 'train_score', 'test_score'],
            columns=[])
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            # Split into train/test sets with the caller-supplied ratio.
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            # Univariate fit on this column only.
            self.lr.fit(s_X_train, s_Y_train)
            coef = self.lr.coef_
            intercept = self.lr.intercept_
            train_score = self.lr.score(s_X_train, s_Y_train)
            test_score = self.lr.score(s_X_test, s_Y_test)
            df[col] = [coef, intercept, train_score, test_score]
            # Quick visual check: observed (blue) vs. predicted (green).
            lin_pred = self.lr.predict(s_X_test)
            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, lin_pred, 'go-')
            plt.show()
        # Write the collected per-column statistics to csv.
        self.file_io.export_csv_from_pandas(df, out_path)

    def bayesian_reg(self, X, Y, train_test_ratio, col_list, out_path):
        """Single-feature BayesianRidge regression for every column in col_list."""
        df = pd.DataFrame(
            index=['coefficient', 'intercept', 'train_score', 'test_score'],
            columns=[])
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            self.br.fit(s_X_train, s_Y_train)
            coef = self.br.coef_
            intercept = self.br.intercept_
            train_score = self.br.score(s_X_train, s_Y_train)
            test_score = self.br.score(s_X_test, s_Y_test)
            df[col] = [coef, intercept, train_score, test_score]
            # Draw a chart only for the columns of interest.
            if col in [
                    '売上単価', 'コース受諾回数_なし', '数量', '施術時間', '指名回数_あり',
                    '治療送客回数_あり', '治療送客回数_なし'
            ]:
                self.chart.draw(self.br, s_X_test, s_Y_test, col,
                                'score is {}'.format(test_score))
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_rbf_reg(self, X, Y, train_test_ratio, col_list, out_path):
        """Single-feature RBF-kernel SVR for every column in col_list."""
        # NOTE: 'suport_vector' spelling kept as-is — it is a csv row label
        # downstream consumers may already depend on.
        df = pd.DataFrame(index=[
            'coefficient', 'suport_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            self.svr_rbf.fit(s_X_train, s_Y_train)
            # Dual coefficients / support vectors are what SVR exposes
            # instead of a primal coefficient vector.
            coef = self.svr_rbf.dual_coef_
            support_vec = self.svr_rbf.support_vectors_
            intercept = self.svr_rbf.intercept_
            train_score = self.svr_rbf.score(s_X_train, s_Y_train)
            test_score = self.svr_rbf.score(s_X_test, s_Y_test)
            df[col] = [coef, support_vec, intercept, train_score, test_score]
            rbf_pred = self.svr_rbf.predict(s_X_test)
            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
            plt.show()
            if col in ['生年月日']:
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, rbf_pred, 'go-')
                plt.show()
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_poly_reg(self, X, Y, train_test_ratio, col_list, out_path):
        """Single-feature polynomial-kernel SVR for every column in col_list."""
        df = pd.DataFrame(index=[
            'coefficient', 'suport_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            self.svr_poly.fit(s_X_train, s_Y_train)
            coef = self.svr_poly.dual_coef_
            support_vec = self.svr_poly.support_vectors_
            intercept = self.svr_poly.intercept_
            train_score = self.svr_poly.score(s_X_train, s_Y_train)
            test_score = self.svr_poly.score(s_X_test, s_Y_test)
            df[col] = [coef, support_vec, intercept, train_score, test_score]
            # BUG FIX: the prediction was stored in `rbf_pred` while the
            # plot below referenced an undefined `poly_pred` (NameError
            # whenever this branch fired). Name it consistently.
            poly_pred = self.svr_poly.predict(s_X_test)
            if col in ['生年月日']:
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, poly_pred, 'yo-')
                plt.show()
        self.file_io.export_csv_from_pandas(df, out_path)

    def svr_sig_reg(self, X, Y, train_test_ratio, col_list, out_path):
        """Single-feature sigmoid-kernel SVR for every column in col_list."""
        df = pd.DataFrame(index=[
            'coefficient', 'suport_vector', 'intercept', 'train_score',
            'test_score'
        ],
                          columns=[])
        for col in col_list:
            s_X = pd.DataFrame(X[col])
            s_Y = Y
            s_X_train, s_X_test, s_Y_train, s_Y_test = self.test.make_train_test_data(
                s_X, s_Y, train_test_ratio)
            self.svr_sig.fit(s_X_train, s_Y_train)
            coef = self.svr_sig.dual_coef_
            support_vec = self.svr_sig.support_vectors_
            intercept = self.svr_sig.intercept_
            train_score = self.svr_sig.score(s_X_train, s_Y_train)
            test_score = self.svr_sig.score(s_X_test, s_Y_test)
            df[col] = [coef, support_vec, intercept, train_score, test_score]
            sig_pred = self.svr_sig.predict(s_X_test)
            plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
            plt.show()
            if col in ['生年月日', '閲覧ページ総数', '閲覧ページ数/セッション', '滞在時間']:
                plt.plot(s_X_test, s_Y_test, 'bo-', s_X_test, sig_pred, 'go-')
                plt.show()
        # BUG FIX: previously wrote to a global `inifile.get('regression',
        # 'ind_path')`, silently ignoring the `out_path` parameter — made
        # consistent with every sibling method.
        self.file_io.export_csv_from_pandas(df, out_path)
def test3():
    """Train a BayesianRidge model from an uploaded csv and report metrics.

    Reads ``name``, ``target`` and ``test_size`` plus the ``dataset`` file
    from the Flask request; builds the per-run ``<assets>/<name>/plots``
    and ``<assets>/<name>/models`` directories; fits the model; saves
    diagnostic plots and the pickled model; returns (JSON summary, 201).
    """
    name = request.form["name"]
    target = request.form["target"]
    test_size = request.form["test_size"]
    dataset = request.files["dataset"]
    df = pd.read_csv(dataset)

    # Directory layout: <assets>/<name>/{plots,models}
    rootdirectory = name
    parent_dir = "/home/sanfer/Documents/ml-examples-vuejs-flask/web-app/src/assets/"
    path = os.path.join(parent_dir, rootdirectory)
    working = path  # working path
    os.mkdir(path)
    plotdirectory = "plots"
    plot_parent_dir = parent_dir + rootdirectory + '/'
    path = os.path.join(plot_parent_dir, plotdirectory)
    plots_dir = path  # plot path
    os.mkdir(path)
    modeldirectory = "models"
    model_parent_dir = parent_dir + rootdirectory + "/"
    path = os.path.join(model_parent_dir, modeldirectory)
    model_dir = path  # model path
    os.mkdir(path)

    # Pre-processing plot: distribution of the target column.
    snsdist = sns.distplot(df[target])
    snsdist = snsdist.get_figure()
    snsdist.savefig(plots_dir + "/dist.png")
    snsdist.clf()

    # Keep only numeric columns; record their dtypes as feature metadata.
    features = {}
    dataTypes = df.dtypes
    for items in dataTypes.iteritems():
        if (items[1].name != 'float64' and items[1].name != 'int64'):
            df.drop(labels=items[0], axis=1, inplace=True)
        else:
            features.update({items[0]: items[1].name})
    del features[target]
    features = json.dumps(features)

    y = df[target]
    df.drop(labels=target, axis=1, inplace=True)
    # BUG FIX: the original ``df.replace(0, np.NaN).fillna(df.mean(),
    # inplace=True)`` filled a temporary copy and left ``df`` unchanged.
    # Rebind the result so zeros are actually imputed with column means.
    df = df.replace(0, np.NaN).fillna(df.mean())
    X = df[list(df.columns)]

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=101)

    from sklearn.linear_model import BayesianRidge
    lm = BayesianRidge()
    lm.fit(X_train, y_train)
    print("Linear model intercept")
    print(lm.intercept_)
    coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
    print(coeff_df)

    # Diagnostic plots: predicted-vs-actual scatter and residual histogram.
    predictions = lm.predict(X_test)
    plt.scatter(y_test, predictions)
    plt.savefig(plots_dir + "/scatter.png")
    plt.clf()
    sn = sns.distplot((y_test - predictions), bins=50)
    sn = sn.get_figure()
    sn.savefig(plots_dir + "/residual.png")
    sn.clf()

    from sklearn import metrics
    print('MAE:', metrics.mean_absolute_error(y_test, predictions))
    print('MSE:', metrics.mean_squared_error(y_test, predictions))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
    print(test_size)
    print(name)
    print(features)

    # Persist the fitted model.
    pkl_filename = model_dir + "/" + name + ".pkl"
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lm, file)

    # Metrics to return. NOTE(review): r_square is computed on the FULL
    # data set rather than the held-out split — confirm this is intended.
    r_square = lm.score(X, y)
    MAE = metrics.mean_absolute_error(y_test, predictions)
    MSE = metrics.mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)

    # Relative plot/model paths handed back to the front end.
    scatterplotpath = name + "/plots/scatter.png"
    distpath = name + "/plots/dist.png"
    residualpath = name + "/plots/residual.png"
    modelpath = name + "/models/" + name + ".pkl"

    return jsonify({
        "status": "success LinearReg",
        "metrics": {
            "mae": MAE,
            "mse": MSE,
            "rmse": RMSE,
            "r_square": r_square
        },
        "ploturl": {
            "scatterplotpath": scatterplotpath,
            "distpath": distpath,
            "residualpath": residualpath
        },
        "feature_names": features,
        "model_path": modelpath
    }), 201
logistic_model = LogisticRegression() # predicted values are like categories gaussian_nb = GaussianNB() #error rmse - r2 degree = 1 poly_reg_model = make_pipeline(PolynomialFeatures(degree=5),LinearRegression()) linear_model.fit(X_train, y_train) ridge_model.fit(X_train, y_train) lasso_model.fit(X_train, y_train) poly_reg_model.fit(X_train, y_train) logistic_model.fit(X_train, y_train) elastic_model.fit(X_train, y_train) bayesian_model.fit(X_train, y_train) gaussian_nb.fit(X_train, y_train) y_pred = gaussian_nb.predict(X_test) r2_score = linear_model.score(X_test, y_test) print("linear",linear_model.score(X_test, y_test)*100) print("ridge",ridge_model.score(X_test, y_test)*100) print("lasso",lasso_model.score(X_test, y_test)*100) print("polynomial reg model",poly_reg_model.score(X_test, y_test)*100) print("logistic reg",logistic_model.score(X_test, y_test)*100) print("elastic net",elastic_model.score(X_test, y_test)*100) print("bayesian",bayesian_model.score(X_test, y_test)*100) print("gaussian-nb",gaussian_nb.score(X_test, y_test)*100) # print(r2_score*100,'%') print(y_pred) plt.scatter(X, y, s=15) plt.plot(X_test, y_pred, color = 'r') # plt.show()
def prediction_BayesianRidge (X_train, Y_train, X_test, Y_test,normalize):
    """Fit a BayesianRidge regressor and summarise the fit.

    Returns a tuple ``(pred_test, coeff_df, pred_train, result)`` where
    ``result`` is a dict of shapes, intercept, coefficient count, MSEs,
    variance (R^2) score and the strongest coefficient.

    NOTE(review): relies on a module-level ``cf_dict`` (fact-name ->
    description lookup) defined outside this view — confirm availability.
    """
    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object
    lreg = BayesianRidge(normalize=normalize)
    #do a linear regression, except only on the training
    lreg.fit(X_train,Y_train)
    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))
    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]
    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)
    # Show
    #coeff_df
    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )
    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")
    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)
    # The mean square error
    #print("MSE with X_train and Y_train: %.6f" % np.mean((Y_train - pred_train) ** 2))
    #print("MSE with X_test and Y_test: %.6f" %np.mean((Y_test - pred_test) ** 2))
    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))
    # Collect a summary of the run for the caller.
    result={}
    result["method"]="BayesianRidge"
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    # Strongest coefficient and its human-readable description.
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result
#MSE print(metrics.mean_squared_error(t_test, prediction_hu)) #RMSE print(np.sqrt(metrics.mean_squared_error(t_test, prediction_hu))) model_hu.score(s_test, t_test) from sklearn.linear_model import BayesianRidge model_br = BayesianRidge() fit = model_br.fit(X_train, y_train) prediction_br = model_br.predict(X_test) from matplotlib import pyplot as plt plt.plot(t, y_test, 'bs', t, prediction_br, 'g^') plt.xlabel('Samples') plt.ylabel('prediction') plt.title('BeysianRidge regressor') #MAE print(metrics.mean_absolute_error(y_test, prediction_br)) #MSE print(metrics.mean_squared_error(y_test, prediction_br)) #RMSE print(np.sqrt(metrics.mean_squared_error(y_test, prediction_br))) model_br.score(X_test, y_test)
def __bayesian_ridge_regression(self, X_train, X_test, y_train, y_test):
    """Fit a BayesianRidge model on the training split and report its
    test-split score (R^2) on stdout."""
    model = BayesianRidge()
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print('BayesianRidge Accuracy:', accuracy)
y_test = y[ind_split:] # Lasso Regressor reg_1 = Lasso() reg_1.fit(X_train, y_train) print("Lasso Score:", reg_1.score(X_test, y_test)) # Ridge Regressor reg_2 = Ridge() reg_2.fit(X_train, y_train) print("Ridge Score:", reg_2.score(X_test, y_test)) # Bayesian Ridge Regressor reg_3 = BayesianRidge() reg_3.fit(X_train, y_train) print("BayesianRidge Score:", reg_3.score(X_test, y_test)) # ElasticNet Regresor reg_4 = ElasticNet() reg_4.fit(X_train, y_train) print("ElasticNet Score:", reg_4.score(X_test, y_test)) #Let us predict the stock market for the Future 30 days days = 20 data_seed = df['Adj Close'].values[-window_size:][None] input_values = { 'Lasso': data_seed, 'Ridge': data_seed, 'BayesianRidge': data_seed,
# Determining accuracy rf_accuracy = rf.score(x_test, y_test) rf_evs = evs(y_test, rf_yhat) print("Random Forest Training Accuracy:", rf.score(x_train, y_train)) print("Random Forest Testing Accuracy:", rf_accuracy) print("Random Forest Explained Variance Score:", rf_evs) dt_accuracy = dt.score(x_test, y_test) dt_evs = evs(y_test, dt_yhat) print("Decision Tree Training Accuracy:", dt.score(x_train, y_train)) print("Decision Tree Testing Accuracy:", dt_accuracy) print("Decision Tree Explained Variance Score:", dt_evs) lr_accuracy = lr.score(x_test, y_test) lr_evs = evs(y_test, lr_yhat) print("Linear Regression Training Accuracy:", lr.score(x_train, y_train)) print("Linear Regression Testing Accuracy:", lr_accuracy) print("Linear Regression Explained Variance Score:", lr_evs) bayesian_accuracy = bayesian.score(x_test, y_test) bayesian_evs = evs(y_test, bayesian_yhat) print("Bayesian Training Accuracy:", bayesian.score(x_train, y_train)) print("Bayesian Testing Accuracy:", bayesian_accuracy) print("Bayesian Explained Variance Score:", bayesian_evs)
# Fragment: walk predictions three at a time, splitting them into three
# series (every 1st/2nd/3rd value) for plotting. NOTE(review): `j`, the
# xx1/yy* lists, `pred4`, `br`, `clf` and the splits come from earlier,
# outside this view, and the second loop is cut off at the end of it.
while (j < len(X_valid) - 2):
    xx1.append(j)
    for t in range(3):
        if t == 0:
            yy1.append(pred4[j + t])
        elif t == 1:
            yy2.append(pred4[j + t])
        else:
            yy3.append(pred4[j + t])
    j += 3
plt.plot(xx1, yy1)
plt.plot(xx1, yy2)
plt.plot(xx1, yy3)
plt.show()
print('Model score is: ', br.score(X_train, y_train))
print('R2 score for bayesian ridge is: ', br.score(X_valid, y_valid))
print('MSE For validation set for bayesian ridge is: ', mean_squared_error(pred4, y_valid))
# Repeat the same walk for the second model's predictions.
pred5 = clf.predict(X_valid)
xx1 = []
yy1 = []
yy2 = []
yy3 = []
j = 0
while (j < len(X_valid) - 2):
    xx1.append(j)
    for t in range(3):
# Fragment: interactive diamond-price predictor with BayesianRidge.
# NOTE(review): `data` (a diamonds DataFrame with numerically encoded
# 'cut'/'color') is loaded earlier, outside this view.
x = data.loc[:, ['carat', 'cut', 'color', 'depth']].values
y = data.loc[:, 'price'].values
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20)
model = BayesianRidge(compute_score=True)
model.fit(xtrain, ytrain)
yprd = model.predict(xtest)
# Prompt the user for one sample and predict its price.
wt = float(input("Enter the weight\n"))
ct = float(input("Enter the Cut( 0-4)"))
cl = float(input("Enter the color(0-6)"))
dt = float(input("Enter the Depth\n"))
xnew = [[wt, ct, cl, dt]]
ynew = model.predict(xnew)
print("Diamond Price", ynew[0])
# NOTE(review): "sqaured"/"Coefficent" typos are in user-visible output;
# left untouched here. "Accuracy" below is R^2 on the FULL data set, not
# the held-out split — confirm intent.
print("Mean sqaured Error", mean_squared_error(ytest, yprd))
print("Variance Score", r2_score(ytest, yprd))
print("Coefficent", model.coef_)
print("Intercept ", model.intercept_)
print("Accuracy", model.score(x, y) * 100)
'''plt.scatter(ytest,yprd) plt.plot(ytest,yprd) plt.title("Expected Price and Predict Output") plt.xlabel("Excpected Value") plt.ylabel("Predict output") plt.show() '''
# Fragment (Python 2): finish reporting one model, then fit/evaluate
# BayesianRidge and begin ARDRegression. NOTE(review): `ess`, `r2`,
# train/test splits, `xss` and the metric imports come from earlier,
# outside this view. Printed strings are user-facing output and are
# left exactly as written.
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2
print "\n**********测试BayesianRidge类**********"
bayesianRidge = BayesianRidge()
# Fit on the training set.
bayesianRidge.fit(train_X, train_Y.values.ravel())
# Print the model's coefficients and intercept.
print "系数:", bayesianRidge.coef_
print "截距:", bayesianRidge.intercept_
print '训练集R2: ', r2_score(train_Y, bayesianRidge.predict(train_X))
# For linear regression models the fit is usually judged on the test set
# with mean squared error (MSE) or root mean squared error (RMSE).
test_Y_pred = bayesianRidge.predict(test_X)
print "测试集得分:", bayesianRidge.score(test_X, test_Y)
print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "测试集R2:", r2_score(test_Y, test_Y_pred)
tss, rss, ess, r2 = xss(Y, bayesianRidge.predict(X))
print "TSS(Total Sum of Squares): ", tss
print "RSS(Residual Sum of Squares): ", rss
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2
print "\n**********测试ARDRegression类**********"
ardRegression = ARDRegression()
# Fit on the training set.
ardRegression.fit(train_X, train_Y.values.ravel())
# Print the model's coefficients.
def main():
    """Fit a post-hoc model on saved sequence representations and/or
    additional features loaded from HDF5, then write accuracy, plots and
    a coefficient table to the output directory.

    Regression mode (-r) uses BayesianRidge and reports R^2; otherwise a
    LogisticRegression is fit and ROC/PRC curves are written. (Python 2:
    uses ``print >>`` file redirection.)
    """
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        # NOTE(review): argv values are strings; target_i is used below as
        # a column index — confirm an int conversion isn't missing here.
        target_i = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # Fixed seed so --sample draws are reproducible.
    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################
    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:,target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:,target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []
    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:,fi], test_a[:,fi], add_labels[fi]

        # append additional features; add_i marks where they start in the
        # combined feature matrix (used when printing coefficients below).
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x,train_a), axis=1)
            test_x = np.concatenate((test_x,test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]

    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy (R^2 on the test split)
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print >> acc_out, model.score(test_x, test_y)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha':0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y-test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()
    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy (AUC on positive-class probabilities)
        test_preds = model.predict_proba(test_x)[:,1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print >> acc_out, roc_auc_score(test_y, test_preds)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)
        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print >> roc_out, '%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i])
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)
        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
            print >> prc_out, '%f\t%f' % (precision[i], recall[i])
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table for the additional features only
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i+ai]
        else:
            # LogisticRegression coefficients are 2-D: (n_classes, n_features)
            coefi = model.coef_[0,add_i+ai]
        print >> coef_out, add_labels[ai], coefi
    coef_out.close()
# Fragment: select features by correlation thresholds, then fit and score
# a BayesianRidge on the reduced matrices. NOTE(review): FeatureSelector
# is a project class and the X/y splits come from earlier, outside view.
feature_selector = FeatureSelector(feature_corr_min=0.45, feature_x_corr_max=0.5)
feature_selector.fit(X_train, y_train)
X_train = feature_selector.transform(X_train)
X_test = feature_selector.transform(X_test)
# corr = data.corr()
# param_grid = {'C': [4.7, 4.8, 4.9, 5.0], 'gamma': [ 0.000009, 0.000010, 0.000011, 0.000012]}
#print(X_train)
#print(y_train)
# regressor = LinearRegression()
regressor = BayesianRidge()
#regressor.fit(X_train, y_train.squeeze().tolist())
regressor.fit(X_train, y_train)
# Train vs. test R^2, then the surviving feature names and model params.
print('Fit=' + str(regressor.score(X_train, y_train)))
print('Score=' + str(regressor.score(X_test, y_test)))
print(feature_selector.features_)
print(regressor.get_params())
y_predict = regressor.predict(X_test)
plt.plot(y_test, y_predict, 'o')
plt.show()
class BayesianRegressor:
    """Bayesian ridge regression with approximate posterior estimates.

    Wraps scikit-learn's BayesianRidge fitted on (optionally transformed)
    inputs and exposes a Gaussian posterior mean/variance for new points,
    derived from the inverse of the noise-scaled Gram matrix.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.model = BayesianRidge(normalize=True, copy_X=True)
        self.train = self.transform(self.X)

    def transform(self, X):
        """Identity feature map; override in subclasses for basis expansion."""
        return X

    def fit(self):
        """Fit the underlying model and cache the posterior matrix V."""
        self.model.fit(self.train, self.y)
        self.V = self.Vn()

    def score(self):
        """R^2 of the fitted model on its own training data."""
        return self.model.score(self.train, self.y)

    def plot(self, file=None):
        """Scatter the data with the fitted curve; save to *file* or show."""
        import matplotlib.pyplot as plt
        if file is None:
            plt.ion()
        else:
            plt.ioff()
        fig, ax = plt.subplots(nrows=1, ncols=1)
        # Evaluation grid extends 20 units past the largest observed X.
        grid = np.linspace(self.X.min(), self.X.max() + 20, 100)
        ax.scatter(self.X, self.y)
        ax.plot(grid, self.model.predict(self.transform(grid)))
        if file is None:
            fig.show()
        else:
            fig.savefig(file)
            plt.close(fig)

    def Vn(self):
        """Inverse of the noise-scaled Gram matrix, with a pseudo-inverse
        fallback when the matrix is singular."""
        scaled_gram = (1 / self.y.std()**2) * (self.train.T).dot(self.train)
        try:
            return np.linalg.inv(scaled_gram)
        except np.linalg.LinAlgError:
            return np.linalg.pinv(scaled_gram)

    # computes estimator for the posterior variance
    def posterior_variance(self, x):
        features = self.transform(x)[0]
        return ((self.y.std()**2) + (features.T).dot(self.V).dot(features))

    # computes estimator for the posterior mean
    def posterior_mean(self, x):
        features = self.transform(x)[0].reshape(1, -1)
        return self.model.predict(features)[0]

    def posterior_distribution(self, x):
        """Return the tuple (posterior_mean, sqrt(posterior_variance))."""
        mean = self.posterior_mean(x)
        spread = np.sqrt(self.posterior_variance(x))
        return mean, spread

    def posterior_cdf(self, y_query, x):
        """P(Y <= y_query) under the Gaussian posterior at *x*."""
        mean = self.posterior_mean(x)
        spread = np.sqrt(self.posterior_variance(x))
        return norm.cdf(y_query, mean, spread)

    def print_stats(self, t, y_query):
        """Print fit quality and posterior summaries at point *t*."""
        print("R^2:{}".format(self.score()))
        print("posterior std on t = {}: {:.2f} ".format(
            t, np.sqrt(self.posterior_variance(t))))
        print("posterior mean on t = {}: {:.0f}".format(
            t, self.posterior_mean(t)))
        print("goal of {} achieved with probability:{:.2f} ".format(
            y_query, (1 - self.posterior_cdf(y_query, t)) * 100))
def task2(data): df = data dfreg = df.loc[:,['Adj Close','Volume']] dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0 dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0 # Drop missing value dfreg.fillna(value=-99999, inplace=True) # We want to separate 1 percent of the data to forecast forecast_out = int(math.ceil(0.01 * len(dfreg))) # Separating the label here, we want to predict the AdjClose forecast_col = 'Adj Close' dfreg['label'] = dfreg[forecast_col].shift(-forecast_out) X = np.array(dfreg.drop(['label'], 1)) # Scale the X so that everyone can have the same distribution for linear regression X = preprocessing.scale(X) # Finally We want to find Data Series of late X and early X (train) for model generation and evaluation X_lately = X[-forecast_out:] X = X[:-forecast_out] # Separate label and identify it as y y = np.array(dfreg['label']) y = y[:-forecast_out] #Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) ################## ################## ################## # Linear regression clfreg = LinearRegression(n_jobs=-1) # 1 - First save the models to local device in models folder # filename = 'models/clfreg_model.sav' # pickle.dump(clfreg, open(filename, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clfreg = pickle.load(open(filename, 'rb')) clfreg.fit(X_train, y_train) # Quadratic Regression 2 clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge()) #Save model to a pickle # filename1 = 'models/clfpoly2_model.sav' # pickle.dump(clfpoly2, open(filename1, 'wb')) # 2 - load the models from disk onces first instruction is done once. 
# clfpoly2 = pickle.load(open(filename1, 'rb')) clfpoly2.fit(X_train, y_train) # Quadratic Regression 3 clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge()) #Save model to a pickle # filename2 = 'models/clfpoly3_model.sav' # pickle.dump(clfpoly3, open(filename2, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clfpoly3 = pickle.load(open(filename2, 'rb')) clfpoly3.fit(X_train, y_train) # KNN Regression clfknn = KNeighborsRegressor(n_neighbors=2) #Save model to a pickle # filename3 = 'models/clfknn_model.sav' # pickle.dump(clfknn, open(filename3, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clfknn = pickle.load(open(filename3, 'rb')) clfknn.fit(X_train, y_train) # Lasso Regression clflas = Lasso() #Save model to a pickle # filename4 = 'models/clflas_model.sav' # pickle.dump(clflas, open(filename4, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clflas = pickle.load(open(filename4, 'rb')) clflas.fit(X_train, y_train) # Multitask Lasso Regression # clfmtl = MultiTaskLasso(alpha=1.) # clfmtl.fit(X_train, y_train).coef_ # Bayesian Ridge Regression clfbyr = BayesianRidge() clfbyr.fit(X_train, y_train) #Save model to a pickle # filename5 = 'models/clfbyr_model.sav' # pickle.dump(clfbyr, open(filename5, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clfbyr = pickle.load(open(filename5, 'rb')) # Lasso LARS Regression clflar = LassoLars(alpha=.1) clflar.fit(X_train, y_train) #Save model to a pickle # filename6 = 'models/clflar_model.sav' # pickle.dump(clflar, open(filename6, 'wb')) # 2 - load the models from disk onces first instruction is done once. 
# clflar = pickle.load(open(filename6, 'rb')) # Orthogonal Matching Pursuit Regression clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2) clfomp.fit(X_train, y_train) #Save model to a pickle # filename7 = 'models/clfomp_model.sav' # pickle.dump(clfomp, open(filename7, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clfomp = pickle.load(open(filename7, 'rb')) # Automatic Relevance Determination Regression clfard = ARDRegression(compute_score=True) clfard.fit(X_train, y_train) #Save model to a pickle # filename8 = 'models/clfard_model.sav' # pickle.dump(clfard, open(filename8, 'wb')) # 2 - load the models from disk onces first instruction is done once. # clfard = pickle.load(open(filename8, 'rb')) # Logistic Regression # clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True) # coefs_ = [] # for c in cs: # clflgr.set_params(C=c) # clflgr.fit(X_train, y_train) # coefs_.append(clflgr.coef_.ravel().copy()) #SGD Regression clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3) clfsgd.fit(X_train, y_train) #Save model to a pickle # filename9 = 'models/clfsgd_model.sav' # pickle.dump(clfsgd, open(filename9, 'wb')) # 2 - load the models from disk onces first instruction is done once. 
# clfsgd = pickle.load(open(filename9, 'rb'))

##################
##################
##################

# Create confidence (R^2) scores for every fitted model on the test split.
confidencereg = clfreg.score(X_test, y_test)
confidencepoly2 = clfpoly2.score(X_test,y_test)
confidencepoly3 = clfpoly3.score(X_test,y_test)
confidenceknn = clfknn.score(X_test, y_test)
confidencelas = clflas.score(X_test, y_test)
# confidencemtl = clfmtl.score(X_test, y_test)
confidencebyr = clfbyr.score(X_test, y_test)
confidencelar = clflar.score(X_test, y_test)
confidenceomp = clfomp.score(X_test, y_test)
confidenceard = clfard.score(X_test, y_test)
confidencesgd = clfsgd.score(X_test, y_test)

# results
print('The linear regression confidence is:',confidencereg*100)
print('The quadratic regression 2 confidence is:',confidencepoly2*100)
print('The quadratic regression 3 confidence is:',confidencepoly3*100)
print('The knn regression confidence is:',confidenceknn*100)
print('The lasso regression confidence is:',confidencelas*100)
# print('The lasso regression confidence is:',confidencemtl*100)
print('The Bayesian Ridge regression confidence is:',confidencebyr*100)
print('The Lasso LARS regression confidence is:',confidencelar*100)
print('The OMP regression confidence is:',confidenceomp*100)
print('The ARD regression confidence is:',confidenceard*100)
print('The SGD regression confidence is:',confidencesgd*100)

# Create new columns: one forecast vector per model over the X_lately window.
forecast_reg = clfreg.predict(X_lately)
forecast_pol2 = clfpoly2.predict(X_lately)
forecast_pol3 = clfpoly3.predict(X_lately)
forecast_knn = clfknn.predict(X_lately)
forecast_las = clflas.predict(X_lately)
forecast_byr = clfbyr.predict(X_lately)
forecast_lar = clflar.predict(X_lately)
forecast_omp = clfomp.predict(X_lately)
forecast_lar = forecast_lar if True else forecast_lar
forecast_ard = clfard.predict(X_lately)
forecast_sgd = clfsgd.predict(X_lately)

#Process all new columns data
# The first loop appends one new all-NaN daily row per forecast value; the
# later loops reuse those rows, re-anchored 26 rows from the end.
# NOTE(review): chained assignment (dfreg[col].loc[date] = v) is not
# guaranteed to write through on newer pandas; dfreg.loc[date, col] = v is
# the documented safe form — confirm before upgrading pandas.
dfreg['Forecast_reg'] = np.nan
last_date = dfreg.iloc[-1].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_reg:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
    dfreg['Forecast_reg'].loc[next_date] = i

dfreg['Forecast_pol2'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_pol2:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_pol2'].loc[next_date] = i

dfreg['Forecast_pol3'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_pol3:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_pol3'].loc[next_date] = i

dfreg['Forecast_knn'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_knn:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_knn'].loc[next_date] = i

dfreg['Forecast_las'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_las:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_las'].loc[next_date] = i

dfreg['Forecast_byr'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_byr:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_byr'].loc[next_date] = i

dfreg['Forecast_lar'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_lar:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_lar'].loc[next_date] = i

dfreg['Forecast_omp'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_omp:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_omp'].loc[next_date] = i
# Same fill pattern as the previous forecast columns (see notes there about
# pandas chained assignment).
dfreg['Forecast_ard'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_ard:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_ard'].loc[next_date] = i

dfreg['Forecast_sgd'] = np.nan
last_date = dfreg.iloc[-26].name
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
for i in forecast_sgd:
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    dfreg['Forecast_sgd'].loc[next_date] = i

# Return plot-ready data: formatted date labels, the actual series, and one
# list per model forecast column.
# NOTE(review): DataFrame.index.format() is deprecated in recent pandas;
# a strftime map over the index is the forward-compatible equivalent.
return dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')), dfreg['Adj Close'].to_list(), dfreg['Forecast_reg'].to_list(), dfreg['Forecast_pol2'].to_list(), dfreg['Forecast_pol3'].to_list(), dfreg['Forecast_knn'].to_list(), dfreg['Forecast_las'].to_list(), dfreg['Forecast_byr'].to_list(), dfreg['Forecast_lar'].to_list(), dfreg['Forecast_omp'].to_list(), dfreg['Forecast_ard'].to_list(), dfreg['Forecast_sgd'].to_list()
def calc_bayesian_ridge_regression(X_train, X_test, y_train, y_test):
    """Fit a Bayesian ridge regression on the training split and return the
    Spearman-correlation statistics of its test-set predictions.

    Returns whatever calc_spearmanr_from_regressor yields for the fitted
    model on (X_test, y_test).
    """
    reg = BayesianRidge().fit(X_train, y_train)
    # The original computed reg.score(X_train, y_train) here and discarded
    # the result; the side-effect-free dead call was removed.
    return calc_spearmanr_from_regressor(reg, X_test, y_test)
# Drop rows with any missing feature value; keep boolean masks so the odds
# and target frames stay aligned with the retained rows.
na_mask_train = ~X_train.loc[X_train_odds.index].isna().T.any()
X_train_odds_comp = X_train.loc[X_train_odds.index].dropna()
# X_train_odds_comp = X_train_odds_comp.fillna(X_train_odds_comp.mean())
na_mask_val = ~X_val.loc[X_val_odds.index].isna().T.any()
X_val_odds_comp = X_val.loc[X_val_odds.index].dropna()
# X_val_odds_comp = X_val_odds_comp.fillna(X_val_odds_comp.mean())
X_train_odds = X_train_odds[na_mask_train]
X_val_odds = X_val_odds[na_mask_val]
y_train_odds = y_train_odds[na_mask_train]
y_val_odds = y_val_odds[na_mask_val]

# Baseline: Bayesian ridge on the per-row median of the odds columns as a
# single feature; print validation MSE.
lm = BayesianRidge().fit(X_train_odds.median(axis=1).values.reshape(-1,1), y_train_odds)
predictions = lm.predict(X_val_odds.median(axis=1).values.reshape(-1,1))
print(mean_squared_error(y_val_odds, predictions))
# NOTE(review): this R^2 score is computed but neither stored nor printed.
lm.score(X_val_odds.median(axis=1).values.reshape(-1,1), y_val_odds)

# X_train_odds_comp_tot = pd.concat([X_train.loc[X_train_odds.index], X_train_odds], axis=1)
# X_val_odds_comp_tot = pd.concat([X_val.loc[X_val_odds.index], X_val_odds], axis=1)

####### Scale data select features
standardscaler = StandardScaler()
X_trainscaled_odds_comp = standardscaler.fit_transform(X_train_odds_comp[featurestouse])
X_valscaled_odds_comp = standardscaler.transform(X_val_odds_comp[featurestouse])
# standardscaler = StandardScaler()
# X_trainscaled_odds_comp_tot = standardscaler.fit_transform(X_train_odds_comp_tot[featurestouse])
# X_valscaled_odds_comp_tot = standardscaler.transform(X_val_odds_comp_tot[featurestouse])

####### Setup grid search
def do_grid_search(X_train, y_train, X_val, y_val):
    # Stack train and validation rows (definition continues past this chunk).
    X_train_val = np.vstack((X_train, X_val))
# PCA + Orthogonal Matching Pursuit omp = OrthogonalMatchingPursuit() omp.fit(reduced_training_features, training_labels) preds = omp.predict(reduced_testing_features) score = omp.score(reduced_testing_features,testing_labels) print 'PCA + Orthogonal Matching Pursuit Results:' print 'R2 score:', score print 'MAE:', mean_absolute_error(testing_labels,preds) # Bayesian Ridge Regression from sklearn.linear_model import BayesianRidge br = BayesianRidge() br.fit(training_features, training_labels) preds = br.predict(testing_features) score = br.score(testing_features,testing_labels) print 'Bayesian Ridge Regression Results:' print 'R2 score:', score print 'MAE:', mean_absolute_error(testing_labels,preds), '\n' # PCA + Bayesian Ridge Regression br = BayesianRidge() br.fit(reduced_training_features, training_labels) preds = br.predict(reduced_testing_features) score = br.score(reduced_testing_features,testing_labels) print 'PCA + Bayesian Ridge Regression Results:' print 'R2 score:', score print 'MAE:', mean_absolute_error(testing_labels,preds) # Stochastic Gradient Descent Regression from sklearn.linear_model import SGDRegressor
df['Prediction'] = df_close.shift(-forecast_out) # label column with data shifted 30 units up # print(df.tail()) X = np.array(df.drop(['Prediction'], 1)) X = preprocessing.scale(X) X_forecast = X[-forecast_out:] # set X_forecast equal to last 30 X = X[:-forecast_out] # remove last 30 from X y = np.array(df['Prediction']) y = y[:-forecast_out] X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3) # Training clf = BayesianRidge() clf.fit(X_train,y_train) # Testing confidence = clf.score(X_test, y_test) print("confidence: ", confidence) forecast_prediction = clf.predict(X_forecast) print(forecast_prediction)
def _write_forecast(df, col, values, last_date, create_rows=False):
    """Write forecast `values` into df[col] on consecutive daily dates.

    Dates start the day after `last_date`. When `create_rows` is True, an
    all-NaN row is appended for each new date before the value is written.
    """
    next_date = last_date + timedelta(days=1)
    for value in values:
        if create_rows:
            df.loc[next_date] = [np.nan for _ in range(len(df.columns))]
        # df.loc[row, col] replaces the chained df[col].loc[row] = value of
        # the original, which pandas does not guarantee to write through.
        df.loc[next_date, col] = value
        next_date += timedelta(days=1)


def make_predictions(df):
    """Engineer volatility features, fit three regressors against a
    1%-horizon shifted label, and append per-model forecast columns.

    Returns the same DataFrame with 'Forecast_reg', 'Forecast_knn' and
    'forecast_by' populated over the forecast horizon.
    """
    ## Volatility
    # high-to-low percent of close
    df['HL_PCT'] = (df['high'] - df['low']) / df['close'] * 100.0
    # percent change from open to close
    df['PCT_change'] = (df['close'] - df['open']) / df['open'] * 100.0

    # Replace missing values with an outlier sentinel.
    df.fillna(value=-99999, inplace=True)

    # Forecast horizon: 1 percent of the data.
    forecast_out = int(math.ceil(0.01 * len(df)))

    # Label: adjusted close shifted forecast_out rows up.
    forecast_col = 'adjusted_close'
    df['label'] = df[forecast_col].shift(-forecast_out)

    # keyword `columns=` instead of positional axis (removed in pandas 2.0)
    X = np.array(df.drop(columns=['label']))
    # Scale X so all features share one distribution for linear regression.
    X = preprocessing.scale(X)

    X_forecast = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y.
    y = np.array(df['label'])
    y = y[:-forecast_out]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Linear regression
    model = LinearRegression(n_jobs=-1)
    model.fit(X_train, y_train)
    # KNN Regression
    model_knn = KNeighborsRegressor(n_neighbors=2)
    model_knn.fit(X_train, y_train)
    # Bayesian Ridge Regression
    model_by = BayesianRidge()
    model_by.fit(X_train, y_train)

    # Confidence (R^2) scores on the held-out split.
    confidencereg = model.score(X_test, y_test)
    confidence_model_knn = model_knn.score(X_test, y_test)
    confidence_model_by = model_by.score(X_test, y_test)

    reg = confidencereg * 100
    knn = confidence_model_knn * 100
    by = confidence_model_by * 100
    # NOTE: score summary is kept for parity with the original, though it is
    # not returned or printed here.
    score = " Regression {}\n KNN {}\n Bayesian {}\n ".format(reg, knn, by)

    # One forecast vector per model over the horizon.
    forecast_reg = model.predict(X_forecast)
    forecast_knn = model_knn.predict(X_forecast)
    forecast_by = model_by.predict(X_forecast)

    # The first fill appends new daily rows after the last existing date; the
    # other two reuse those rows, anchored 40 rows from the end as before.
    df['Forecast_reg'] = np.nan
    _write_forecast(df, 'Forecast_reg', forecast_reg, df.iloc[-1].name, create_rows=True)

    df['Forecast_knn'] = np.nan
    _write_forecast(df, 'Forecast_knn', forecast_knn, df.iloc[-40].name)

    df['forecast_by'] = np.nan
    _write_forecast(df, 'forecast_by', forecast_by, df.iloc[-40].name)

    return df
#!/usr/bin/env python
"""Fit a Bayesian ridge regressor on the Boston housing data and print R^2."""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

# Shuffle the dataset deterministically, then hold out the final 10% of the
# rows for evaluation.
boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

offset = int(X.shape[0] * 0.9)
X_train, X_test = X[:offset], X[offset:]
Y_train, Y_test = Y[:offset], Y[offset:]

# Fit with per-iteration marginal-likelihood tracking enabled and report the
# coefficient of determination on the held-out split.
regressor = BayesianRidge(compute_score=True).fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
# corr = data.corr() # param_grid = {'C': [4.7, 4.8, 4.9, 5.0], 'gamma': [ 0.000009, 0.000010, 0.000011, 0.000012]} print(X_train) print(y_train) # regressor = LinearRegression() # regressor = SVR(C=5, gamma=0.00001) regressor = BayesianRidge(normalize=True, n_iter=5, tol=0.01, fit_intercept=True) # regressor = ARDRegression(normalize=True, n_iter=5, tol=0.01) # regressor = SGDRegressor() # regressor = MLPRegressor(hidden_layer_sizes=(200, 50, 10)) # regressor = RANSACRegressor(min_samples=80, max_trials=1000) # regressor = Lasso() regressor.fit(X_train, y_train.squeeze().tolist()) print(regressor.score(X_train, y_train.squeeze().tolist())) print(regressor.score(X_test, y_test.squeeze().tolist())) print(regressor.get_params()) y_predict = regressor.predict(X_test) print(y_predict) plt.plot(y_test.squeeze().tolist(), y_predict, 'o') plt.show()
# cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6,
# random_state=0))

# Fit the supervised-clustering estimator and time it.
t1 = time()
sc.fit(X_train, y_train)
sc_time = time() - t1
computed_coefs = sc.inverse_transform()
computed_coefs = np.reshape(computed_coefs, [size, size, size])
score = sc.score(X_test, y_test)

###############################################################################
# Compute the results for simple BayesianRidge
t1 = time()
clf.fit(X_train, y_train)
bayes_time = time() - t1
bayes_coefs = clf.coef_
bayes_score = clf.score(X_test, y_test)
bayes_coefs = bayes_coefs.reshape((size, size, size))

###############################################################################
# Plot the results
pl.close('all')
pl.figure()
pl.title('Scores of the supervised clustering')
pl.subplot(2, 1, 1)
pl.plot(np.arange(len(sc.scores_)), sc.scores_)
# BUG FIX: the axis labels were swapped — the x data is the iteration index
# (np.arange) and the y data is the score.
pl.xlabel('iteration')
pl.ylabel('score')
pl.title('Score of the best parcellation of each iteration')
pl.subplot(2, 1, 2)
pl.plot(np.arange(len(sc.delta_scores_)), sc.delta_scores_)
import os
import sys

# Resolve the data file relative to this script's directory.  os.path.join
# replaces the hand-built Windows path whose "\\\d..." literal relied on an
# invalid escape sequence and broke on non-Windows platforms; the variable
# was also renamed so it no longer shadows the builtin `file`.
full_path = os.path.realpath(__file__)
data_path = os.path.join(os.path.dirname(full_path), "data", "housingSample.csv")

(X, Y, records) = getData(data_path)
X_train, X_test, price_train, price_test = train_test_split(X, Y, test_size=0.1, random_state=42)

model = BayesianRidge()
model.fit(X_train, price_train.ravel())
print(model)

# Summarize the fit of the model
#print(model.intercept_, model.coef_, mse)
print(model.score(X_train, price_train))

# training metrics (the duplicated model.predict(X_train) call was removed)
predPrices = model.predict(X_train)
mse = mean_squared_error(price_train, predPrices)
rs = r2_score(price_train, predPrices)
print("training mse:",mse)
print("training score:",rs)

# testing
testing_pred_price_results = model.predict(X_test)
mse = mean_squared_error(price_test, testing_pred_price_results)
rs = r2_score(price_test, testing_pred_price_results)
print("median_house_value"+" Predicted_median_house_value")
print(np.c_[price_test, testing_pred_price_results])
print("testing mse:", mse)
def main():
    """Pair every filter consensus motif with every other in synthetic
    background sequences, score the sequences with a Basset Torch model, fit
    a BayesianRidge additive model of per-filter effects, and plot the
    residual (interaction) matrix as a heat map.

    Python 2 script: uses `print` statements and integer division.
    """
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs symmetrically about the sequence center
    # (Python 2 "/" keeps these as ints)
    left_i = options.seq_length/2 - options.center_dist - filter_len
    right_i = options.seq_length/2 + options.center_dist

    # background "all N" sequence: uniform 0.25 across the 4 nucleotides
    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores via the external Torch script
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:

        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        # design matrix: one indicator column for the left motif, one for the
        # right, so the fit captures purely additive filter effects
        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:,ti])

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print >> coef_out, '%3d %6.2f' % (i,model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                # interaction = observed score minus additive expectation
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print >> table_out, '%3d %3d %6.3f %6.3f %6.3f' % cols
                si += 1
        table_out.close()

        # clip to the 99.9th percentile of |interaction| for plotting
        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print 'Limiting scores to +-%f' % max_score
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
# get_params is a method: call it (the original printed the bound method
# object instead of the hyperparameter dict).
print(reg_world_deaths.get_params())

#World
# Fit a fresh Bayesian ridge model on days-since-outbreak vs. world deaths.
reg_world_deaths=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None, compute_score=False, copy_X=True,
                               fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
                               normalize=False, tol=0.001, verbose=False)
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
reg_world_deaths_test = reg_world_deaths.predict(xtest_world_deaths)
reg_world_deaths_predict_days = reg_world_deaths.predict(prediction_days)

# MAE/MSE are symmetric in their arguments, but r2_score is not: the true
# values must come first.  The original passed (y_pred, y_true), skewing R^2.
print('MAE:', metrics.mean_absolute_error(ytest_world_deaths, reg_world_deaths_test))
print('MSE:',metrics.mean_squared_error(ytest_world_deaths, reg_world_deaths_test))
print('R2 :',metrics.r2_score(ytest_world_deaths, reg_world_deaths_test))
print('Training score:',reg_world_deaths.score(xtrain_world_deaths,ytrain_world_deaths))
print('Testing score:',reg_world_deaths.score(xtest_world_deaths,ytest_world_deaths))

#Graph for Bayesian Predicted deaths in World
plt.figure(figsize=(12, 8))
plt.plot(days,world_deaths)
plt.plot(prediction_days,reg_world_deaths_predict_days,linestyle='dashed')
plt.title('Predicted Coronavirus deaths Cases Over Time in World', size=30)
plt.xlabel('Days Since 1/22/2020', size=20)
plt.ylabel('No.of Cases(in Crores)', size=20)  # typo fix: "Croces" -> "Crores"
plt.legend(['deaths Cases', 'Bayesian Ridge Predictions'])
plt.xticks(size=15)
plt.show()

# Flatten the (1, n) prediction array back to 1-D for downstream use.
reg_world_deaths_predict_days = reg_world_deaths_predict_days.reshape(1,-1)[0]
# Lasso (ls) evaluation.  The no-op bare attribute expressions
# (ls.intercept_, ls.coef_, ls.__dict__, Bs.coef_, Bs.intercept_,
# enet.alpha) and the discarded duplicate ls.score(...) call were removed;
# they had no effect in a script.
ls.fit(X_trn, y_trn)
scoreOfModel6 = ls.score(X_trn, y_trn)
pred6 = ls.predict(X_tst)
pred6 = pd.DataFrame(pred6)
# NOTE: the braces build a one-element set, so the score prints as {value};
# kept as-is to preserve the script's output format.
print('r2 score:', {r2_score(y_tst, pred6)})

############### Bayesian regression ###############################################
Bs = BayesianRidge()
Bs.fit(X_trn, y_trn)
scoreOfModel7 = Bs.score(X_trn, y_trn)
pred7 = Bs.predict(X_tst)
pred7 = pd.DataFrame(pred7)
print('r2 score BSR:', {r2_score(y_tst, pred7)})

#model Evaluation
############## ElasticNet Regression (L1 + L2 penalized model) ###########
## hyperparameter that determines strength of a Ridge,lasso, elastcNet regression
# we use alpha as our hyperparameter lambda
## l1 for lasso
enet = ElasticNet(alpha=0.005, l1_ratio=0.7)
enet.fit(X_trn, y_trn)
scoreOfModel8 = enet.score(X_trn, y_trn)
pred8 = enet.predict(X_tst)
pred8 = pd.DataFrame(pred8)
def main():
    """Pair the first `num_filters` filter consensus motifs in synthetic
    background sequences, score them with a Basset Torch model, fit a
    BayesianRidge additive model of per-filter effects, and plot the residual
    (interaction) matrix as a heat map.

    Python 2 script: uses `print` statements.
    """
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ""
    if options.cuda:
        cuda_str = "-cuda"

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    # num_filters = len(filter_consensus)
    num_filters = 20  # NOTE(review): hard cap — only the first 20 filters are paired
    filter_len = filter_consensus[0].shape[1]

    # position the motifs symmetrically about the sequence center
    # NOTE(review): "/" yields ints on Python 2 only; on Python 3 these would
    # be floats and break the slicing below — confirm intended interpreter.
    left_i = options.seq_length / 2 - options.center_dist - filter_len
    right_i = options.seq_length / 2 + options.center_dist

    # background "all N" sequence: uniform 0.25 across the 4 nucleotides
    ns_1hot = np.zeros((4, options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    h5f = h5py.File(seqs_file, "w")
    h5f.create_dataset("test_in", data=seqs_1hot)
    h5f.close()

    # predict scores via the external Torch script
    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, "r")
    motif_seq_scores = np.array(hdf5_in["scores"])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:

        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        # design matrix: one indicator column for the left motif, one for the
        # right, so the fit captures purely additive filter effects
        X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi, i] += 1
                X[xi, num_filters + j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:, ti])

        # print filter coefficients
        coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w")
        for i in range(num_filters):
            print >> coef_out, "%3d %6.2f" % (i, model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters, num_filters))
        table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w")

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                # residual: observed score minus the additive expectation
                filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                print >> table_out, "%3d %3d %6.3f %6.3f %6.3f" % cols
                si += 1
        table_out.close()

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
# .as_matrix() was removed in pandas 1.0; .to_numpy() is the documented
# replacement with identical output here.
x_test = test['Open'].to_numpy()
y_test = test['Future'].to_numpy()

# reshape into (row, column) for sklearn
x = x.reshape((len(x), 1))
y = y.reshape((len(y), 1))
x_test = x_test.reshape(len(x_test), 1)
y_test = y_test.reshape(len(y_test), 1)

# fit classifiers: plain OLS vs. Bayesian ridge on the same split
ols = LinearRegression()
ols = ols.fit(x, y)
predict_ols = ols.predict(x_test)
score_ols = ols.score(x_test, y_test)

clf = BayesianRidge(compute_score=True)
clf = clf.fit(x, y)
predict_b = clf.predict(x_test)
score_b = clf.score(x_test, y_test)

print("Accuracy: OLS %lf, Bayes %lf" % (score_ols, score_b))

# plot results
plt.plot(y_test, 'r+', label="actual")
plt.plot(predict_ols, 'bx', label="ols")
plt.plot(predict_b, 'g1', label="bayesian")
plt.legend()
plt.title("Predict DJIA 1 year ahead ( 2016 )")
plt.savefig('OLS_vs_BayesianRegression.png')
plt.show()
                                                    test_size=test_size, random_state=0)

# k = int(0.5 * n_features)
# print("-----------------------------------------------")
# print("Perform chi2 feature selection k=", k)
# print("-----------------------------------------------")
# X_train, X_test = selectFeatures(X_train, X_test, y_train, k)

print("-----------------------------------------------")
print("SVM Classification of training set")
print("-----------------------------------------------")
# NOTE(review): class_weight is computed but never passed to any estimator,
# and the labels below say "svm.SVC" although the model is BayesianRidge.
class_weight = {0:5}
print("Class weight=", class_weight)
clf = BayesianRidge(compute_score=True).fit(X_train, y_train)
print("Test svm.SVC score=", clf.score(X_test, y_test))
print("Train svm.SVC score=", clf.score(X_train, y_train))

print("-----------------------------------------------")
print("Metrics on TEST SET")
print("-----------------------------------------------")
y_pred = clf.predict(X_test)
# NOTE(review): classification_report / confusion_matrix require discrete
# class labels, but BayesianRidge.predict returns continuous values — confirm
# the predictions are rounded/thresholded upstream or these calls will raise.
print(metrics.classification_report(y_test, y_pred, target_names=label_names))
print(metrics.confusion_matrix(y_test, y_pred))

print("-----------------------------------------------")
print("Metrics on TRAIN SET")
print("-----------------------------------------------")
y_predTrain = clf.predict(X_train)
# Walk the target list backwards, deleting NaN targets and the matching
# feature rows so X_test and y_test stay aligned (reverse order keeps the
# remaining indices valid while deleting).
topIndex = len(y_test) - 1
for i in range(topIndex, -1, -1):
    if math.isnan(y_test[i]):
        del y_test[i]
        del X_test[i]

# CHOOSING THE MODEL
model = BayesianRidge()
model2 = SVR()
model.fit(X_train, y_train)
model2.fit(X_train, y_train)

# Predict once per model instead of once per metric line.
pred_br = model.predict(X_test)
print("Bayesian Ridge")
print("R2:" + str(model.score(X_test, y_test)))
print("Mean Squared Error: " + str(mean_squared_error(y_test, pred_br)))
# .loc avoids pandas chained assignment (df[col][index] = v), which is not
# guaranteed to write through.
df.loc[index, 'Bayesian Ridge'] = mean_squared_error(y_test, pred_br)
print("Mean Absolute Error: " + str(mean_absolute_error(y_test, pred_br)))
print("Median Absolute Error: " + str(median_absolute_error(y_test, pred_br)))
print("")

pred_svr = model2.predict(X_test)
print("Support Vector Regression")
print("R2:" + str(model2.score(X_test, y_test)))
print("Mean Squared Error: " + str(mean_squared_error(y_test, pred_svr)))
df.loc[index, 'SVR'] = mean_squared_error(y_test, pred_svr)