def lassoreg(a):
    print("Doing lasso regression")
    clf2 = Lasso(alpha=a)
    clf2.fit(base_X, base_Y)
    print("Score = %f" % clf2.score(base_X, base_Y))
    clf2_pred = clf2.predict(X_test)
    write_to_file("lasso.csv", clf2_pred)
def comparaison_ridge_lasso(X, Y):
    # random.seed() returns None, so the original effectively used an unseeded
    # (fresh) split each call; random_state=None makes that explicit.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=None)
    clf_lasso = Lasso(selection='random', random_state=None)
    clf_ridge = Ridge()
    clf_lasso.fit(X_train, Y_train)
    clf_ridge.fit(X_train, Y_train)
    score_lasso = clf_lasso.score(X_test, Y_test)
    score_ridge = clf_ridge.score(X_test, Y_test)
    print("Lasso score={:3.2f}% \nRidge score={:3.2f}%\n".format(score_lasso * 100, score_ridge * 100))
def test_alpha_opti(X, Y, nb_tests):
    score_lasso = 0
    score_ridge = 0
    score_lasso_opti = 0
    score_ridge_opti = 0
    for i in range(nb_tests):
        # random.seed() returns None; use random_state=None for a fresh split each iteration
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=None)
        clf_lasso = Lasso(selection='random', random_state=None)
        clf_ridge = Ridge()
        clf_lasso.fit(X_train, Y_train)
        clf_ridge.fit(X_train, Y_train)
        score_lasso += clf_lasso.score(X_test, Y_test)
        score_ridge += clf_ridge.score(X_test, Y_test)
        clf_lasso_opti = Lasso(selection='random', random_state=None, alpha=0.1)
        clf_ridge_opti = Ridge(alpha=0.1)
        clf_lasso_opti.fit(X_train, Y_train)
        clf_ridge_opti.fit(X_train, Y_train)
        score_lasso_opti += clf_lasso_opti.score(X_test, Y_test)
        score_ridge_opti += clf_ridge_opti.score(X_test, Y_test)
    print("Lasso (opti - non-opti) : {:3.3f}%".format(100 * (score_lasso_opti - score_lasso) / nb_tests))
    print("Ridge (opti - non-opti) : {:3.3f}%".format(100 * (score_ridge_opti - score_ridge) / nb_tests))
def linearReg():
    sl = Lasso(alpha=0.2)
    sl.fit(features_array, values_array)
    predict_val = sl.predict(features_array)
    print(sl.coef_)
    print(sl.score(features_array, values_array))
    fig = plt.figure()
    ax = plt.subplot(111)
    ax.bar(range(features.shape[1]), sl.coef_)
    plt.show()
def calc_linear_regression(files, data_matrix, target, results):
    lr = Lasso()
    lr.fit(data_matrix, target)
    rss = np.mean((lr.predict(data_matrix) - target) ** 2)
    var = lr.score(data_matrix, target)
    global best
    if rss < best:
        for i in range(len(target)):
            # predict expects a 2D array, so wrap the single row in a list
            print(str(target[i]) + "\t" + str(lr.predict([data_matrix[i]])[0]))
        print(lr.coef_)
        best = rss
    results.append((files, rss, var, lr.coef_))
def test_StackingEstimator_4():
    """Assert that the StackingEstimator worked as expected in scikit-learn pipeline in regression."""
    stack_reg = StackingEstimator(estimator=RandomForestRegressor(random_state=42))
    meta_reg = Lasso(random_state=42)
    sklearn_pipeline = make_pipeline(stack_reg, meta_reg)

    # fit in pipeline
    sklearn_pipeline.fit(training_features_r, training_target_r)

    # fit step by step
    stack_reg.fit(training_features_r, training_target_r)
    X_reg_transformed = stack_reg.transform(training_features_r)
    meta_reg.fit(X_reg_transformed, training_target_r)

    # scoring
    score = meta_reg.score(X_reg_transformed, training_target_r)
    pipeline_score = sklearn_pipeline.score(training_features_r, training_target_r)
    assert np.allclose(score, pipeline_score)

    # test cv score
    cv_score = np.mean(cross_val_score(sklearn_pipeline, training_features_r,
                                       training_target_r, cv=3, scoring='r2'))
    known_cv_score = 0.795877470354
    assert np.allclose(known_cv_score, cv_score)
def _random_search(self, random_iter, x, y):
    # Default values
    alpha = 1.0
    best_score = -float("inf")  # sys.maxint no longer exists in Python 3
    if random_iter > 0:
        sys.stdout.write("Do a random search %d times" % random_iter)
        param_dist = {"alpha": uniform(loc=0.0001, scale=10 - 0.0001)}
        param_list = [{"alpha": alpha}, ]
        param_list.extend(list(ParameterSampler(param_dist,
                                                n_iter=random_iter - 1,
                                                random_state=self._rng)))
        for idx, d in enumerate(param_list):
            # normalize and precompute='auto' were removed from Lasso in recent
            # scikit-learn releases; the remaining arguments spell out the defaults.
            lasso = Lasso(alpha=d["alpha"], fit_intercept=True, copy_X=True,
                          max_iter=1000, tol=0.0001, warm_start=False, positive=False)
            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5, random_state=self._rng)
            lasso.fit(train_x, train_y)
            sc = lasso.score(test_x, test_y)
            # Tiny progress output
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                alpha = d['alpha']
            sys.stdout.write(m)
            sys.stdout.flush()
    sys.stdout.write("Using alpha: %f\n" % alpha)
    return alpha
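A possible alternative to the hand-rolled loop above: scikit-learn's RandomizedSearchCV performs the same random alpha search with proper cross-validation. A minimal sketch, assuming x and y are the same arrays passed to _random_search; the names and n_iter/cv values are illustrative.

# Sketch: the random alpha search via RandomizedSearchCV (assumes x, y exist).
from scipy.stats import uniform
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(
    Lasso(max_iter=1000, tol=0.0001),
    param_distributions={"alpha": uniform(loc=0.0001, scale=10 - 0.0001)},
    n_iter=10,      # number of sampled alphas
    cv=5,           # 5-fold CV instead of a single 50/50 split
    scoring="r2",
    random_state=0,
)
search.fit(x, y)
print("Using alpha: %f" % search.best_params_["alpha"])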
def apply_lasso(X_train, Y_train, alpha=None):
    alphas = [0.1, 0.3, 0.5]
    ALPHA_VALS = {}
    for a in alphas:
        # normalize and precompute='auto' are gone from recent scikit-learn;
        # the remaining keyword arguments spell out the defaults.
        model = Lasso(alpha=a, fit_intercept=True, copy_X=True,
                      max_iter=50000, tol=0.001, warm_start=False, positive=False)
        # sample_weights = [1.0 / float(len(Y)) for x in Y]
        model.fit(X_train, Y_train)  # , sample_weight=sample_weights)
        R2 = model.score(X_train, Y_train)
        L1 = sum(abs(x) for x in model.coef_)
        ALPHA_VALS[a] = [a, R2, L1, [x for x in model.coef_]]
        print("ALPHA: %.2f \t R^2=%7.4f \t L1(THETA)=%.2f \t THETA[1:N]=%s" %
              (a, R2, L1, ", ".join(["%.4f" % x for x in model.coef_])))
    # A = sorted([ALPHA_VALS[x] for x in ALPHA_VALS], key=lambda x: x[1], reverse=True)
    Theta = [float(model.intercept_)]
    Theta.extend([float(x) for x in model.coef_])
    (model, Theta, J, SCORE) = performance_analysis(model, Theta, X_train, Y_train, debug=1)
    return (model, Theta, J, SCORE)
print("baseline MSE: %f" % mean_squared_error(avg_pts, y_test)) std = X_train.std(axis=0) mean = X_train.mean(axis=0) X_train = (X_train - mean) / std X_test = (X_test - mean) / std #=============================================================================== # std = y_train.std(axis=0) # mean = y_train.mean(axis=0) # y_train = (y_train - mean) / std # y_test = (y_test - mean) / std #=============================================================================== linear_estimator = LinearRegression(fit_intercept=True) linear_estimator.fit(X_train, y_train) y_test_est = linear_estimator.predict(X_test) print("linear regression score: %f" % linear_estimator.score(X_test, y_test)) print("linear regression MSE: %f" % mean_squared_error(y_test, y_test_est)) ridge_estimator = Ridge(alpha=1, fit_intercept=True) ridge_estimator.fit(X_train, y_train) y_test_est = ridge_estimator.predict(X_test) print("ridge regression score: %f" % ridge_estimator.score(X_test, y_test)) print("ridge regression MSE: %f" % mean_squared_error(y_test, y_test_est)) lasso_estimator = Lasso(alpha=0.1, fit_intercept=True) lasso_estimator.fit(X_train, y_train) y_test_est = lasso_estimator.predict(X_test) print("lasso regression score: %f" % lasso_estimator.score(X_test, y_test)) print("lasso regression MSE: %f" % mean_squared_error(y_test, y_test_est))
ypred = knn.predict(x_test)
print(knn.score(x_test, y_test))

lr = LinearRegression()
lr.fit(x_train, y_train)
lr.predict(x_test)
print(lr.score(x_test, y_test))

ls = Lasso(alpha=0.1)
ls.fit(x_train, y_train)
ls.predict(x_test)
print(ls.score(x_test, y_test))

dct = DecisionTreeClassifier()
dct.fit(x_train, y_train)
dct.predict(x_test)
print(dct.score(x_test, y_test))

#from pyspark.sql import SQLContext
#from pyspark import SparkContext
#sc = SparkContext("local", "example")
#sql_sc = SQLContext(sc)
#df = pd.read_csv('data.csv')
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test))) ridge01 = Ridge(alpha=0.1).fit(X_train, y_train) print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train))) print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test))) # plt.plot(ridge.coef_, 's', label="Ridge alpha=1") # plt.plot(ridge10.coef_, '^', label="Ridge alpha=10") # plt.plot(ridge01.coef_, 'v', label="Ridge alpha=0.1") # plt.plot(lr.coef_, 'o', label="LinearRegression") # plt.xlabel("Coefficient index") # plt.ylabel("Coefficient magnitude") # plt.hlines(0, 0, len(lr.coef_)) # plt.ylim(-25, 25) # plt.legend() # mglearn.plots.plot_ridge_n_samples() # plt.show() lasso = Lasso().fit(X_train, y_train) print("Training set score: {:.2f}".format(lasso.score(X_train, y_train))) print("Test set score: {:.2f}".format(lasso.score(X_test, y_test))) print("Number of features used: {}".format(np.sum(lasso.coef_ != 0))) # we increase the default setting of "max_iter", # otherwise the model would warn us that we should increase max_iter. print("---------------------------------") lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train) print("Training set score: {:.2f}".format(lasso001.score(X_train, y_train))) print("Test set score: {:.2f}".format(lasso001.score(X_test, y_test))) print("Number of features used: {}".format(np.sum(lasso001.coef_ != 0)))
def CrossValidateForMSEAtDiffLambda(df, IV, DVs):
    avgMSELasso_values = []
    avgMSERidge_values = []
    avgMSELinReg_values = []
    avgR2Lasso_values = []
    avgR2Ridge_values = []
    avgR2LinReg_values = []
    lmda_values = [x * 0.00005 for x in range(1, 1001)]
    #lmda_values = [x * 0.00005 for x in range(1, 11)]
    var_list = deepcopy(DVs)
    var_list.insert(0, IV)
    tempDF = df[var_list].dropna()
    for lmda in lmda_values:
        MSELasso_list = []
        MSERidge_list = []
        MSELinReg_list = []
        R2Lasso_list = []
        R2Ridge_list = []
        R2LinReg_list = []
        for x in range(10):
            y_train, y_test, X_train, X_test = train_test_split(tempDF[IV], tempDF[DVs], test_size=0.2)
            #print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
            #print(train_test_split(tempDF[IV], tempDF[DVs], test_size=0.2))
            model = Lasso(alpha=lmda, normalize=True)
            model2 = Ridge(alpha=lmda, normalize=True)
            model3 = LinearRegression(normalize=True)
            model.fit(X_train, y_train)
            model2.fit(X_train, y_train)
            model3.fit(X_train, y_train)
            #print(model.score(X_test, y_test), model2.score(X_test, y_test))
            MSELasso = np.mean((model.predict(X_test) - y_test) ** 2)
            MSELasso_list.append(MSELasso)
            MSERidge = np.mean((model2.predict(X_test) - y_test) ** 2)
            MSERidge_list.append(MSERidge)
            MSELinReg = np.mean((model3.predict(X_test) - y_test) ** 2)
            MSELinReg_list.append(MSELinReg)
            R2Lasso_list.append(model.score(X_test, y_test))
            R2Ridge_list.append(model2.score(X_test, y_test))
            R2LinReg_list.append(model3.score(X_test, y_test))
            #print(X_test, y_test)
        avgMSELasso_values.append(np.mean(MSELasso_list))
        avgMSERidge_values.append(np.mean(MSERidge_list))
        avgMSELinReg_values.append(np.mean(MSELinReg_list))
        avgR2Lasso_values.append(np.mean(R2Lasso_list))
        avgR2Ridge_values.append(np.mean(R2Ridge_list))
        avgR2LinReg_values.append(np.mean(R2LinReg_list))
    minMSE1 = min(avgMSELasso_values)
    idx = avgMSELasso_values.index(minMSE1)
    minLmda = lmda_values[idx]
    LassoAvgR2 = avgR2Lasso_values[idx]
    minMSE2 = min(avgMSERidge_values)
    idx2 = avgMSERidge_values.index(minMSE2)
    minLmda2 = lmda_values[idx2]
    RidgeAvgR2 = avgR2Ridge_values[idx2]
    LinRegAvgR2 = np.mean(avgR2LinReg_values)
    y_train, y_test, X_train, X_test = train_test_split(tempDF[IV], tempDF[DVs], test_size=0.2)
    model1 = Lasso(alpha=minLmda, normalize=True)
    model1.fit(X_train, y_train)
    model2 = Ridge(alpha=minLmda2, normalize=True)
    model2.fit(X_train, y_train)
    model3 = LinearRegression(normalize=True)
    model3.fit(X_train, y_train)
    print(minLmda, minMSE1, model1.coef_, model1.intercept_, np.mean(avgR2Lasso_values))
    print(minLmda2, minMSE2, model2.coef_, model2.intercept_, np.mean(avgR2Ridge_values))
    print(model3.coef_, model3.intercept_, np.mean(avgR2LinReg_values))
    return (lmda_values, avgMSELasso_values, avgMSERidge_values, avgMSELinReg_values,
            avgR2Lasso_values, avgR2Ridge_values, avgR2LinReg_values)
print("Coefficient of determination R^2 <-- on train set: {}".format( support_regressor.score(X_train, y_train))) print("Coefficient of determination R^2 <-- on test set: {}".format( support_regressor.score(X_test, y_test))) dtr = DecisionTreeRegressor() dtr.fit(X_train, y_train) print("Coefficient of determination R^2 <-- on train set: {}".format( dtr.score(X_train, y_train))) print("Coefficient of determination R^2 <-- on test set: {}".format( dtr.score(X_test, y_test))) indiana_jones = Lasso(alpha=1.0) indiana_jones.fit(X_train, y_train) print("Coefficient of determination R^2 <-- on train set : {}".format( indiana_jones.score(X_train, y_train))) print("Coefficient of determination R^2 <-- on test set: {}".format( indiana_jones.score(X_test, y_test))) etr = ExtraTreesRegressor(n_estimators=300) etr.fit(X_train, y_train) print(etr.feature_importances_) indecis = np.argsort(etr.feature_importances_)[::-1] plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w') plt.title("Feature importances") plt.bar(range(X_train.shape[1]), etr.feature_importances_[indecis], color="r", align="center")
import numpy as np

# Uniformly sample 100 points on the x-axis from 0 to 26
xx = np.linspace(0, 26, 100)
xx = xx.reshape(xx.shape[0], 1)
# Predict the regression line over these 100 points
yy = regressor.predict(xx)

# Fit a degree-4 polynomial regression model on the pizza training samples
poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)

# Evaluate the three regression models on the test set:
# prepare the test data first (moved up so X_test_poly4 exists before scoring)
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]
X_test_poly4 = poly4.transform(X_test)

from sklearn.linear_model import Lasso
lasso_poly4 = Lasso()
lasso_poly4.fit(X_train_poly4, y_train)
print(lasso_poly4.score(X_test_poly4, y_test))
# Print the Lasso model's coefficients
print(lasso_poly4.coef_)

regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)
xx_poly4 = poly4.transform(xx)
yy_poly4 = regressor_poly4.predict(xx_poly4)
# Comparing coefficient magnitudes for ridge regression with different values
# of alpha and linear regression
plt.plot(ridge.coef_, 's', label="Ridge alpha=1")
plt.plot(ridge10.coef_, '^', label="Ridge alpha=10")
plt.plot(ridge01.coef_, 'v', label="Ridge alpha=0.1")
plt.plot(lr.coef_, 'o', label="LinearRegression")
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
plt.hlines(0, 0, len(lr.coef_))
plt.ylim(-25, 25)
plt.legend()

# Lasso Regression ------------------------------------------------------------
from sklearn.linear_model import Lasso

lasso = Lasso().fit(X_train, y_train)
print('Training set score : {}'.format(lasso.score(X_train, y_train)))
print('Test set score : {}'.format(lasso.score(X_test, y_test)))
print('Number of features used : {}'.format(np.sum(lasso.coef_ != 0)))

# we increase the default setting of "max_iter",
# otherwise the model would warn us that we should increase max_iter
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print('Training set score : {}'.format(lasso001.score(X_train, y_train)))
print('Test set score : {}'.format(lasso001.score(X_test, y_test)))
print('Number of features used : {}'.format(np.sum(lasso001.coef_ != 0)))

lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso00001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso00001.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso00001.coef_ != 0)))
from sklearn import metrics

print('Mean Squared Error:', metrics.mean_squared_error(labels_test, labels_pred))

#**********************************************************************************************
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

lm_lasso = Lasso()
lm_ridge = Ridge()
lm_lasso.fit(features_train, labels_train)
lm_ridge.fit(features_train, labels_train)

print("RSquare Value for Lasso Regression TEST data is-")
print(np.round(lm_lasso.score(features_test, labels_test) * 100, 2))
print("RSquare Value for Ridge Regression TEST data is-")
print(np.round(lm_ridge.score(features_test, labels_test) * 100, 2))

predict_test_lasso = lm_lasso.predict(features_test)
predict_test_ridge = lm_ridge.predict(features_test)

print("Lasso Regression Mean Square Error (MSE) for TEST data is")
print(np.round(metrics.mean_squared_error(labels_test, predict_test_lasso), 2))
print("Ridge Regression Mean Square Error (MSE) for TEST data is")
print(np.round(metrics.mean_squared_error(labels_test, predict_test_ridge), 2))

'''
Code Challenges 02: (House Data)
This is kings house society data.
def upload_get_stocks(st):
    # Find one record of data from the mongo database
    # @TODO: YOUR CODE HERE!
    #cr = csv.reader(open("https://query1.finance.yahoo.com/v7/finance/download/" + st + "?period1=1454112000&period2=1611964800&interval=1d&events=history&includeAdjustedClose=true", "rb"))
    #data = pd.read_csv('https://example.com/passkey=wedsmdjsjmdd')
    #df = pd.read_csv("static/data/" + st + ".csv")
    #with open("static/data/" + st + ".csv", "wt") as fp:
    #    writer = csv.writer(fp)
    #    # writer.writerow(["your", "header", "foo"])  # write header
    #    writer.writerows(data)
    #dateval = datetime.date.strtime("%D")
    #print(dateval)
    session = Session(engine)
    stock = session.execute("select * from stocks where symbol='" + st + "'")
    #return render_template("index.html", listings=listings)
    # Return template and data
    if stock.rowcount == 0:
        data = pd.read_csv(
            "https://query1.finance.yahoo.com/v7/finance/download/" + st +
            "?period1=1454112000&period2=1611964800&interval=1d&events=history&includeAdjustedClose=true",
            sep=',')
        data.to_csv("static/data/" + st + ".csv", index=False, header=True)
        print(data)
        session.execute("INSERT INTO stocks VALUES ('" + st + "', '" + st + " Corp')")
        session.execute("commit")
    stocks = session.execute("select * from stocks")
    resdata = [{}]
    responsedata = {'respdata': resdata}
    session.close()
    print('Hello this is test')

    data = pd.read_csv("static/data/" + st + ".csv")
    df = data
    # Drop the null columns where all values are null
    df = df.dropna(axis='columns', how='all')
    # Drop the null rows
    # This is for the MinMax Linear Regression model
    print(df.head())
    df = df.dropna()
    print(df.head())

    y = df["Open"].values.reshape(-1, 1)
    diff = df['Close'] - df["Open"]
    diff_locations = []
    for i in diff:
        if i < 0:
            diff_locations.append(0)
        else:
            diff_locations.append(1)
    df['diff'] = pd.DataFrame(diff_locations)
    X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    print(X)
    print(y)
    print(X.shape, y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    X_minmax = MinMaxScaler().fit(X_train)
    y_minmax = MinMaxScaler().fit(y_train)
    X_train_minmax = X_minmax.transform(X_train)
    X_test_minmax = X_minmax.transform(X_test)
    y_train_minmax = y_minmax.transform(y_train)
    y_test_minmax = y_minmax.transform(y_test)

    model2 = LinearRegression()
    model2.fit(X_train_minmax, y_train_minmax)
    print(f"Testing Data Score: {model2.score(X_test_minmax, y_test_minmax)}")
    minmax_predict = model2.score(X_test_minmax, y_test_minmax)
    print(minmax_predict)

    # This is standard scaler transformation
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    predictions = model.predict(X_test_scaled)
    scallar_MSE = mean_squared_error(y_test_scaled, predictions)
    scallar_r2 = model.score(X_test_scaled, y_test_scaled)

    plt.scatter(model.predict(X_train_scaled),
                model.predict(X_train_scaled) - y_train_scaled,
                c="blue", label="Training Data")
    plt.scatter(model.predict(X_test_scaled),
                model.predict(X_test_scaled) - y_test_scaled,
                c="orange", label="Testing Data")
    #plt.legend()
    plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
    plt.title("Residual Plot")
    #plt.show()
    pwd = os.getcwd()
    print(pwd)
    #p = Path(os.getcwd() + "\static\images")
    plt.savefig("static/images/" + st + ".png")
    f = open("static/images/" + st + ".png")
    plt.close()
    f.close()

    # Lasso model
    ### BEGIN SOLUTION
    lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)
    lasso_predictions = lasso.predict(X_test_scaled)
    lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
    lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)
    ### END SOLUTION
    print(f"Lasso MSE: {lasso_MSE}, R2: {lasso_r2}")

    # Ridge model
    ridgeVal = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)
    ridge_predictions = ridgeVal.predict(X_test_scaled)
    ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
    ridge_r2 = ridgeVal.score(X_test_scaled, y_test_scaled)
    print(f"ridge MSE: {ridge_MSE}, R2: {ridge_r2}")

    # ElasticNet
    elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)
    elasticnet_predictions = elasticnet.predict(X_test_scaled)
    elasticnet_MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
    elasticnet_r2 = elasticnet.score(X_test_scaled, y_test_scaled)
    print(f"elasticnet MSE: {elasticnet_MSE}, R2: {elasticnet_r2}")

    fig1 = plt.figure(figsize=(12, 6))
    axes1 = fig1.add_subplot(1, 2, 1)
    axes2 = fig1.add_subplot(1, 2, 2)
    axes1.set_title("Original Data")
    axes2.set_title("Scaled Data")
    maxx = X_train["High"].max()
    maxy = y_train.max()
    axes1.set_xlim(-maxx + 1, maxx + 1)
    axes1.set_ylim(-maxy + 1, maxy + 1)
    axes2.set_xlim(-2, 2)
    axes2.set_ylim(-2, 2)
    set_axes(axes1)
    set_axes(axes2)
    axes1.scatter(X_train["High"], y_train)
    axes2.scatter(X_train_scaled[:, 0], y_train_scaled[:])
    p = Path(os.getcwd() + "/static/images")
    #q = p / "axes2" + st + ".png"
    #if (q.exists()):
    fig1.savefig("static/images/axes2" + st + ".png")
    f = open("static/images/axes2" + st + ".png")
    plt.close()
    f.close()
    #else:
    #    fig1.savefig("static/images/axes2" + st + ".png")
    #    plt.close()

    return render_template("indexStocks.html", stocks=stocks,
                           responsedata=responsedata, init_page="initpage",
                           sel_stk=st, minmax_predict=minmax_predict,
                           scallar_MSE=scallar_MSE, scallar_r2=scallar_r2,
                           lasso_MSE=lasso_MSE, lasso_r2=lasso_r2,
                           ridge_MSE=ridge_MSE, ridge_r2=ridge_r2,
                           elasticnet_MSE=elasticnet_MSE,
                           elasticnet_r2=elasticnet_r2)
def get_stocks(st):
    # Find one record of data from the mongo database
    # @TODO: YOUR CODE HERE!
    session = Session(engine)
    stocks = session.execute("select * from stocks ")
    #return render_template("index.html", listings=listings)
    # Return template and data
    resdata = [{}]
    responsedata = {'respdata': resdata}
    session.close()
    print('Hello this is test')

    df = pd.read_csv("static/data/" + st + ".csv")
    # Drop the null columns where all values are null
    df = df.dropna(axis='columns', how='all')
    # Drop the null rows
    # This is for the MinMax Linear Regression model
    print(df.head())
    df = df.dropna()
    print(df.head())

    y = df["Open"].values.reshape(-1, 1)
    diff = df['Close'] - df["Open"]
    diff_locations = []
    for i in diff:
        if i < 0:
            diff_locations.append(0)
        else:
            diff_locations.append(1)
    df['diff'] = pd.DataFrame(diff_locations)
    X = df[['High', 'Low', 'Close', 'Volume', 'diff']]
    print(X)
    print(y)
    print(X.shape, y.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    X_minmax = MinMaxScaler().fit(X_train)
    y_minmax = MinMaxScaler().fit(y_train)
    X_train_minmax = X_minmax.transform(X_train)
    X_test_minmax = X_minmax.transform(X_test)
    y_train_minmax = y_minmax.transform(y_train)
    y_test_minmax = y_minmax.transform(y_test)

    model2 = LinearRegression()
    model2.fit(X_train_minmax, y_train_minmax)
    print(f"Testing Data Score: {model2.score(X_test_minmax, y_test_minmax)}")
    minmax_predict = model2.score(X_test_minmax, y_test_minmax)
    print(minmax_predict)

    # This is standard scaler transformation
    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    y_train_scaled = y_scaler.transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)
    predictions = model.predict(X_test_scaled)
    scallar_MSE = mean_squared_error(y_test_scaled, predictions)
    scallar_r2 = model.score(X_test_scaled, y_test_scaled)

    plt.scatter(model.predict(X_train_scaled),
                model.predict(X_train_scaled) - y_train_scaled,
                c="blue", label="Training Data")
    plt.scatter(model.predict(X_test_scaled),
                model.predict(X_test_scaled) - y_test_scaled,
                c="orange", label="Testing Data")
    #plt.legend()
    plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
    plt.title("Residual Plot")
    #plt.show()
    pwd = os.getcwd()
    print(pwd)
    #p = Path(os.getcwd() + "\static\images")
    plt.savefig("static/images/" + st + ".png")
    f = open("static/images/" + st + ".png")
    plt.close()
    f.close()

    # Lasso model
    ### BEGIN SOLUTION
    lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)
    lasso_predictions = lasso.predict(X_test_scaled)
    lasso_MSE = mean_squared_error(y_test_scaled, lasso_predictions)
    lasso_r2 = lasso.score(X_test_scaled, y_test_scaled)
    ### END SOLUTION
    print(f"Lasso MSE: {lasso_MSE}, R2: {lasso_r2}")

    # Ridge model
    ridgeVal = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)
    ridge_predictions = ridgeVal.predict(X_test_scaled)
    ridge_MSE = mean_squared_error(y_test_scaled, ridge_predictions)
    ridge_r2 = ridgeVal.score(X_test_scaled, y_test_scaled)
    print(f"ridge MSE: {ridge_MSE}, R2: {ridge_r2}")

    # ElasticNet
    elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)
    elasticnet_predictions = elasticnet.predict(X_test_scaled)
    elasticnet_MSE = mean_squared_error(y_test_scaled, elasticnet_predictions)
    elasticnet_r2 = elasticnet.score(X_test_scaled, y_test_scaled)
    print(f"elasticnet MSE: {elasticnet_MSE}, R2: {elasticnet_r2}")

    fig1 = plt.figure(figsize=(12, 6))
    axes1 = fig1.add_subplot(1, 2, 1)
    axes2 = fig1.add_subplot(1, 2, 2)
    axes1.set_title("Original Data")
    axes2.set_title("Scaled Data")
    maxx = X_train["High"].max()
    maxy = y_train.max()
    axes1.set_xlim(-maxx + 1, maxx + 1)
    axes1.set_ylim(-maxy + 1, maxy + 1)
    axes2.set_xlim(-2, 2)
    axes2.set_ylim(-2, 2)
    set_axes(axes1)
    set_axes(axes2)
    axes1.scatter(X_train["High"], y_train)
    axes2.scatter(X_train_scaled[:, 0], y_train_scaled[:])
    p = Path(os.getcwd() + "/static/images")
    #q = p / "axes2" + st + ".png"
    #if (q.exists()):
    fig1.savefig("static/images/axes2" + st + ".png")
    f = open("static/images/axes2" + st + ".png")
    plt.close()
    f.close()
    #else:
    #    fig1.savefig("static/images/axes2" + st + ".png")
    #    plt.close()

    return render_template("indexStocks.html", stocks=stocks,
                           responsedata=responsedata, init_page="initpage",
                           sel_stk=st, minmax_predict=minmax_predict,
                           scallar_MSE=scallar_MSE, scallar_r2=scallar_r2,
                           lasso_MSE=lasso_MSE, lasso_r2=lasso_r2,
                           ridge_MSE=ridge_MSE, ridge_r2=ridge_r2,
                           elasticnet_MSE=elasticnet_MSE,
                           elasticnet_r2=elasticnet_r2)
# second estimator -- Lasso
from sklearn.linear_model import Lasso

# find the best alpha for the model
alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]
for a in alphas:
    lasso_model = Lasso(alpha=a, normalize=True).fit(x, y)
    r2 = lasso_model.score(x, y)
    y_pred = lasso_model.predict(x)
    mse = mean_squared_error(y, y_pred)
    rmse = math.sqrt(mse)
    print("Alpha:{0:.4f}, r2:{1:.2f}, MSE:{2:.2f}, RMSE:{3:.2f}"
          .format(a, r2, mse, rmse))

lasso = Lasso(alpha=0.0001, normalize=True).fit(x_train, y_train)
# print the coefficients, sorted from most important to least important predictors
predictors = x_train.columns
coef = pd.Series(lasso.coef_, predictors).sort_values(ascending=False)
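As an alternative to the manual alpha sweep above, scikit-learn's LassoCV picks alpha by cross-validation in a single fit. A minimal sketch, assuming the same x and y training data used above.

# Sketch: choosing alpha with LassoCV instead of a manual sweep (assumes x, y exist).
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1], cv=5)
lasso_cv.fit(x, y)
print("best alpha:", lasso_cv.alpha_)
print("R^2 at best alpha:", lasso_cv.score(x, y))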
# plot graph of the most important features
important_features.plot(kind='bar')
plt.show()

# lasso model
alphas = np.arange(0, 10)
grid = GridSearchCV(estimator=Lasso(), param_grid={'alpha': alphas})
grid.fit(X_train, y_train)
lasso_clf = grid.best_estimator_
# best lambda
lasso_clf

# set best lambda and fit train data
lasso = Lasso()
lasso.set_params(alpha=9.0)
lasso.fit(X_train, y_train)
lasso.score(X_train, y_train)

# get coefficients
lasso.coef_

# predicted values for the test data
predicted_y1 = lasso.predict(xtest)
# score on the test data; the original scored against predicted_y1 itself,
# which trivially gives R^2 = 1 (assumes the true test labels, ytest, are in scope)
lasso.score(xtest, ytest)

# ridge model
alphas = np.arange(0, 10)
grid = GridSearchCV(estimator=Ridge(), param_grid={'alpha': alphas})
grid.fit(X_train, y_train)
ridge_clf = grid.best_estimator_
# best lambda
ridge_clf

# set best lambda and fit train data
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import mglearn

x, y = mglearn.datasets.load_extended_boston()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

lasso = Lasso(alpha=0.01, max_iter=100000).fit(x_train, y_train)
print('\ntrain score:{:.2f}'.format(lasso.score(x_train, y_train)))
print('test score:{:.2f}'.format(lasso.score(x_test, y_test)))
# predict
FS_brain = []
idx = np.triu_indices(20, 1)
for item in cur_paths:
    item_data = np.load(item)
    FS_brain.append(item_data[idx])
FS_brain = np.array(FS_brain)
FS_brain = StandardScaler().fit_transform(FS_brain)

for i_beh in range(5):
    scores = beh_scores[:, i_beh]
    title = beh_titles[i_beh]

    # sklearn.cross_validation was removed; use sklearn.model_selection instead
    from sklearn.model_selection import ShuffleSplit
    from sklearn.linear_model import Lasso

    clf = Lasso(alpha=1.0)
    # StandardScaler expects a 2D array; reshape the 1D score vector
    scores = StandardScaler().fit_transform(scores.reshape(-1, 1)).ravel()
    coefs = []
    r2_list = []
    folder = ShuffleSplit(n_splits=500, test_size=0.1)
    for train, test in folder.split(FS_brain):
        clf.fit(X=FS_brain[train], y=scores[train])
        r2 = clf.score(FS_brain[test], scores[test])
        r2_list.append(r2)
    mean_r2 = np.mean(r2_list)
    print('%s/%s/mean-R2: %.4f' % (cur_ana, title, mean_r2))
mse = mean_squared_error(y_test_final, y_pred)
rmse = np.sqrt(mse)
print('RMSE: {}'.format(rmse))
print('Train Score: {}'.format(ridge_model_100_train_score))
print('Test Score: {}'.format(ridge_model_100_test_score))

# ******************* Lasso Regularization (0.01) ************************
lasso_001 = Lasso(alpha=0.01)
lasso_001.fit(X_train_final, y_train_final)
# predict and cross-validate with the lasso estimator itself
# (the original reused `model`, which still pointed at the earlier estimator)
y_pred = lasso_001.predict(X_test_final)
scores = cross_val_score(lasso_001, X_train_final, y_train_final, cv=5)
print('***************************************Lasso(0.01)***************************************')
print('cross validation score', scores.mean())
lasso_model_001_train_score = lasso_001.score(X_train_final, y_train_final)
lasso_model_001_test_score = lasso_001.score(X_test_final, y_test_final)
mse = mean_squared_error(y_test_final, y_pred)
rmse = np.sqrt(mse)
print('RMSE: {}'.format(rmse))
print('Train Score: {}'.format(lasso_model_001_train_score))
print('Test Score: {}'.format(lasso_model_001_test_score))

# ******************* Lasso (0.001) ************************
lasso_0001 = Lasso(alpha=0.001)
lasso_0001.fit(X_train_final, y_train_final)
y_pred = lasso_0001.predict(X_test_final)
scores = cross_val_score(lasso_0001, X_train_final, y_train_final, cv=5)
print(
    '***************************************Lasso (0.001)***************************************'
train.to_csv('dumified_train.csv')
test.to_csv('dumified_test.csv')
df1.to_csv('df1.csv')

# simple
model = LinearRegression()
train
test
y_train
model.fit(train, y_train)
model.score(train, y_train)
model.predict(test)

# lasso
lasso = Lasso(alpha=0.01, max_iter=1000)
#lasso = Lasso(alpha=0.01, max_iter=10e5)
lasso.fit(train, y_train)
lasso.score(train, y_train)
lasso.predict(test)

model2 = GradientBoostingRegressor()
model2 = model2.fit(train, y_train)
model2.score(train, y_train)
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.legend(loc="best")

# Lasso (L1) Penalty (alpha Regularization Parameter)
# LASSO leads to sparse solutions, driving most coefficients to zero
from sklearn.linear_model import Lasso

lasso_models = {}
training_scores = []
test_scores = []
for alpha in [30, 10, 1, .01]:
    lasso = Lasso(alpha=alpha).fit(X_train, y_train)
    training_scores.append(lasso.score(X_train, y_train))
    test_scores.append(lasso.score(X_test, y_test))
    lasso_models[alpha] = lasso

plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [30, 10, 1, .01])
plt.legend(loc="best")

############################################
# Learning Curve (Analyse Model Complexity)
############################################
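If useful, scikit-learn's validation_curve wraps the alpha sweep above with cross-validation, so each point is a CV mean rather than a single split. A sketch assuming the same X_train, y_train and plt from the surrounding code.

# Sketch: train/validation-score sweep over alpha with CV (assumes X_train, y_train, plt exist).
from sklearn.linear_model import Lasso
from sklearn.model_selection import validation_curve

alphas = [30, 10, 1, .01]
train_scores, valid_scores = validation_curve(
    Lasso(), X_train, y_train, param_name="alpha", param_range=alphas, cv=5)
plt.plot(train_scores.mean(axis=1), label="training scores (CV mean)")
plt.plot(valid_scores.mean(axis=1), label="validation scores (CV mean)")
plt.xticks(range(4), alphas)
plt.legend(loc="best")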
'''
The conclusion of all this is that with little data a linear model works better,
but with a lot of data, forest- and gradient-boosted-tree-based models do better.
'''
X_train, X_test, Y_train, Y_test = train_test_split(mamoDataX, mamoDataY, random_state=2)
model = KNeighborsRegressor(18)
model2 = Lasso()
model.fit(X_train, Y_train)
model2.fit(X_train, Y_train)
print(model.score(X_test, Y_test))
print(model2.score(X_test, Y_test))
PREDICTED = model.predict(X_test)
PREDICTED2 = model2.predict(X_test)
plt.subplot(2, 2, 1)
plt.hist([PREDICTED, Y_test])
plt.subplot(2, 2, 2)
plt.hist([PREDICTED2, Y_test])
plt.show()  # the original was missing the call parentheses
svm_rmse_test

test_rmse = [
    lin_rmse_test, ridge_rmse_test, lasso_rmse_test, elastic_net_rmse_test,
    SGD_rmse_test, tree_rmse_test, forest_rmse_test, xg_rmse_test,
    svm_rmse_test
]
aa1 = pd.DataFrame(test_rmse)

###########################################################################
#################  R^2 Score for train data  #############################
R2_lin_train = lin_reg.score(X_train, y_train)
R2_ridge_train = ridge_reg.score(X_train, y_train)
R2_lasso_train = lasso_reg.score(X_train, y_train)
R2_elastic_net_train = elastic_net_reg.score(X_train, y_train)
R2_SGD_train = SGD_reg.score(X_train, y_train)
R2_tree_train = tree_reg.score(X_train, y_train)
R2_forest_train = forest_reg.score(X_train, y_train)
R2_xg_train = xg_reg.score(X_train, y_train)
R2_svm_train = svm_reg.score(X_train, y_train)

train_r2 = [
    R2_lin_train, R2_ridge_train, R2_lasso_train, R2_elastic_net_train,
    R2_SGD_train, R2_tree_train, R2_forest_train, R2_xg_train, R2_svm_train
]
aa2 = pd.DataFrame(train_r2)
##############################################################################
Elas.fit(X_train, y_train)
# print(sqrt(mean_squared_error(ytrain, Elas.predict(xtrain))))
print(sqrt(mserr(y_test, Elas.predict(X_test))))
print('R2 Value/Coefficient of Determination: {}'.format(
    Elas.score(X_test, y_test)))

# In[34]:

# Lassoreg = Lasso(alpha=0.5, tol=0.1)
# Lassoreg = Lassoreg.fit(X_train, y_train)
# print(Lassoreg.score(X_train, y_train))
# print(Lassoreg.score(X_test, y_test))
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn.metrics import r2_score, mean_squared_error

lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(X_train, y_train)
# lassoreg.predict(X_train)
print(sqrt(mean_squared_error(y_test, lassoreg.predict(X_test))))
print('R2 Value/Coefficient of Determination: {}'.format(
    lassoreg.score(X_test, y_test)))

# In[40]:

test_prediction = pd.DataFrame(Elas.predict(test_x), columns=['SalePrice'])
test_prediction.index.name = 'Id'
test_prediction.to_csv("C:\\Users\\Bhuvan PC\\Downloads\\final_test_pred.csv")
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)

for i in range(0, 26):
    print('Slope' + str(i) + ':' + str(lasso.coef_[i]))
print('Intercept: %.3f' % lasso.intercept_)

plt.scatter(y_train_pred, y_train_pred - y_train, c='steelblue', marker='o',
            edgecolor='white', label='Training data')
plt.scatter(y_test_pred, y_test_pred - y_test, c='limegreen', marker='s',
            edgecolor='white', label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.title('Lasso Regression')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, color='black', lw=2)
plt.xlim([-10, 50])
plt.figure()
plt.show()

print("R^2: {}".format(lasso.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Root Mean Squared Error: {}".format(rmse))

# best alpha for Lasso
scores = []
ran = []
rmse = []
for alpha in range(1, 21):
    lassob = Lasso(alpha=alpha)
    lassob.fit(X_train, y_train)
    y_train_pred = lassob.predict(X_train)
    y_test_pred = lassob.predict(X_test)
    scores.append(lassob.score(X_test, y_test))
    rmse.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))
    ran.append(alpha)
plt.figure()
print(len(Y_test))
model = Lasso()  # instantiate the model
model.fit(X_train, Y_train)  # train the model
# print(X_test)
PREDICTED = model.predict(X_test)  # predict the target for the test data
'''
Very bad results (neither mamoDataX (without zeros), nor mamoData (Tissue = 0 | 1),
nor mamoData (Tissue = 0 - 1) generates a good score); because of that we are going
to apply feature engineering and evaluate other models.
'''
plt.subplot(2, 2, 1)
plt.hist([PREDICTED, Y_test])  # compare the predictions against the Y_test data
print(model.score(X_test, Y_test))  # score of the model (very bad!!)
RESIDUALS = Y_test - PREDICTED  # measure the error
plt.subplot(2, 2, 2)
plt.scatter(Y_test, RESIDUALS)
plt.subplot(2, 2, 3)
# `normed` was removed from matplotlib; `density=True` is the replacement
plt.hist(RESIDUALS, bins=100, density=True, histtype='step')
plt.show()

# correlation map
# sb.heatmap(mamoDataX.corr())
# there is a lot of correlation between the features, which is bad
A = f['trialdata'][:]
A = np.transpose(A)
A = A[:, 1000:2000]  # sample the 1000 time samples prior to stimulus

# Do continuous wavelet transform
scale = np.arange(1, 9)  # frequencies
trialData = []
for i in range(159):
    w = A[i, :]
    coefs, freqs = pywt.cwt(w, scale, 'morl', 0.0005)
    means = np.mean(coefs, axis=1)
    trialData.extend(means)
X.append(trialData)
X = np.asarray(X)  # change X into a numpy ndarray

### Split training and testing data
X_train, X_test, respTimes_train, respTimes_test = train_test_split(
    X, respTimes, test_size=0.3)

### Fit lasso regression
lasso = Lasso()
lasso.fit(X_train, respTimes_train)
train_score = lasso.score(X_train, respTimes_train)
test_score = lasso.score(X_test, respTimes_test)
print("training score:", train_score)
print("test score: ", test_score)
         'o', alpha=.5, zorder=-1, label='samples', color="tab:green")
disp.axes_[0, 0].set_ylim(-3, 3)
disp.axes_[0, 0].set_xlim(-1, 1)
plt.legend()
plt.show()

##############################################################################
# Sample-weight support for Lasso and ElasticNet
# ----------------------------------------------
# The two linear regressors :class:`~sklearn.linear_model.Lasso` and
# :class:`~sklearn.linear_model.ElasticNet` now support sample weights.

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
import numpy as np

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X, y = make_regression(n_samples, n_features, random_state=rng)
sample_weight = rng.rand(n_samples)
X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
    X, y, sample_weight, random_state=rng)
reg = Lasso()
reg.fit(X_train, y_train, sample_weight=sw_train)
print(reg.score(X_test, y_test, sw_test))
""" Runs a linear regression on a random sample of the data. Collects r squared values. """ times = 0 times2 = 0 while times < 100: np.random.shuffle(temp_matrix) set1 = temp_matrix[0:size][:,0:2] set2 = temp_matrix[0:size][:,2] test_set1 = temp_matrix[size:][:,0:2] test_set2 = temp_matrix[size:][:,2] clf = Lasso(alpha = 5) clf.fit(set1, set2) r_sq.append(clf.score(test_set1, test_set2)) clf1 = LinearRegression(fit_intercept = True) clf1.fit(set1, set2) r_sq1.append(clf1.score(test_set1, test_set2)) times = times + 1 """ Outputs the results of the linear regression using spearman's rank and OLS regression. """ #print "Course :" , course, "Spearman: ", stats.spearmanr(course_grade, GPA_list)[0], ", R^2: ", np.asarray(r_sq1).mean() label_course.append(course) #adds course to label_course list for future reference
        feature_vector.append(value)
song_x_train.append(feature_vector)
song_y_train.append(song[2])  # Map to hotttnesss score

lasso_model = Lasso(alpha=0.1)
lasso_model.fit(np.array(song_x_train), np.array(song_y_train))
print(lasso_model.coef_)

song_x_test = []
song_y_test = []
for song in songs_test:
    feature_vector = [0] * len(artist_inputs)
    artist = song[1]['artist']
    feature_vector[artists[artist]] = 1
    feature_dict = song[1]
    for feature, value in feature_dict.items():
        if feature != 'artist' and feature != 'genre':
            feature_vector.append(value)
    song_x_test.append(feature_vector)
    song_y_test.append(song[2])

# predict expects a 2D array, so wrap the single sample in a list
prediction = lasso_model.predict([song_x_test[0]])
print('Prediction: {}'.format(prediction))
print('Actual: {}'.format(song_y_test[0]))

score = lasso_model.score(np.array(song_x_test), np.array(song_y_test))
print('Score: {}'.format(score))
print('Training set score : ', ridge.score(X_train, y_train))  # 0.89
print('Test set score : ', ridge.score(X_test, y_test))  # 0.75

# Adjust alpha -> raising alpha pushes coefficients toward 0 -> find the optimal alpha
ridge10 = Ridge(alpha=10).fit(X_train, y_train)  # when alpha is 10
print('Training set score : ', ridge10.score(X_train, y_train))  # 0.79
print('Test set score : ', ridge10.score(X_test, y_test))  # 0.64

ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)  # when alpha is 0.1
print('Training set score : ', ridge01.score(X_train, y_train))  # 0.93
print('Test set score : ', ridge01.score(X_test, y_test))  # 0.77

####### Lasso Regression #######
from sklearn.linear_model import Lasso

lasso = Lasso().fit(X_train, y_train)
print('----Lasso Regression----')
print('Training set score : ', lasso.score(X_train, y_train))  # 0.29 -> underfitting
print('Test set score : ', lasso.score(X_test, y_test))  # 0.20
print('Number of features used : ', np.sum(lasso.coef_ != 0))  # 4 -> only 4 of the 105 features are used

# Lower alpha (the regularization) to reduce the underfitting
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print('Training set score : ', lasso001.score(X_train, y_train))  # 0.90
print('Test set score : ', lasso001.score(X_test, y_test))  # 0.77
print('Number of features used : ', np.sum(lasso001.coef_ != 0))  # 33

####### ElasticNet #######
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(alpha=0.001, max_iter=10000000).fit(X_train, y_train)
print('train score :', elastic.score(X_train, y_train))
print('test score :', elastic.score(X_test, y_test))
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso  # missing import added

scaler = MinMaxScaler()
crime = pd.read_table('CommViolPredUnnormalizedData.txt', sep=',', na_values='?')
columns_to_keep = [5, 6] + list(range(11, 26)) + list(range(32, 103)) + [145]
crime = crime.iloc[:, columns_to_keep].dropna()
X_crime = crime.iloc[:, range(0, 88)]
y_crime = crime['ViolentCrimesPerPop']

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train_scaled, y_train)

print('lasso regression linear model intercept: {}'.format(linlasso.intercept_))
print('lasso regression linear model coeff: {}'.format(linlasso.coef_))
print('R-Squared Score (training): {:.3f}'.format(linlasso.score(X_train_scaled, y_train)))
print('R-Squared score (test): {:.3f}'.format(linlasso.score(X_test_scaled, y_test)))
# Lasso Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Lasso

# load the diabetes datasets
dataset = datasets.load_diabetes()

# fit a LASSO model to the data
model = Lasso(alpha=0.1)
model.fit(dataset.data, dataset.target)
print(model)

# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted - expected) ** 2)
print(mse)
print(model.score(dataset.data, dataset.target))
if descdim == 1:
    new_features.append(first)
elif descdim == 2:
    new_features.append((first, second))
elif descdim == 3:
    new_features.append((first, second, third))

#plt.scatter(new_features, val_labels)
#plt.show()

val_features = numpy.asarray(new_features)
if descdim == 1:
    val_features = val_features.reshape(-1, 1)

reg = linear_model.LinearRegression(copy_X=True, fit_intercept=True,
                                    n_jobs=None, normalize=False)
reg.fit(val_features, val_labels)

lasso = Lasso(alpha=0.01, max_iter=1000000)  # max_iter must be an integer, not 10e5
lasso.fit(val_features, val_labels)
train_score = lasso.score(val_features, val_labels)
coeff_used = numpy.sum(lasso.coef_ != 0)

print("LASSO training score:", train_score)
print("LASSO number of features used: ", coeff_used)
print("LASSO coeff: ", lasso.coef_)
print("Linear model: ", reg.coef_, " ", reg.intercept_)
def prediction_lasso(X_train, Y_train, X_test, Y_test, alpha, normalize):
    # Print shapes of the training and testing data sets
    #print("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Create our regression object
    lreg = Lasso(alpha=alpha, normalize=normalize)

    # do a linear regression, except only on the training set
    lreg.fit(X_train, Y_train)
    #print("The estimated intercept coefficient is %.2f " % lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))

    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]

    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)

    # Show
    #coeff_df

    # highest correlation between a fact and fraction votes
    #print("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"], "description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]))
    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"], "Fraction Votes", pd.merge(X_test, pd.DataFrame(Y_test), right_index=True, left_index=True), kind="scatter")

    # Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("Fit a model X_train, and calculate MSE with Y_train: %.6f" % np.mean((Y_train - pred_train) ** 2))
    #print("Fit a model X_train, and calculate MSE with X_test and Y_test: %.6f" % np.mean((Y_test - pred_test) ** 2))

    # Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result = {}
    result["method"] = "Lasso %.3f " % alpha
    result["normalize"] = "Y" if normalize else "N"
    result["X_train_shape"] = X_train.shape
    result["Y_train_shape"] = Y_train.shape
    result["X_test_shape"] = X_test.shape
    result["Y_test_shape"] = Y_test.shape
    result["intercept"] = lreg.intercept_
    result["num_coef"] = len(lreg.coef_)
    result["max_fact"] = cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"], "description"]
    result["max_fact_value"] = coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"] = np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"] = np.mean((Y_test - pred_test) ** 2)
    result["variance"] = lreg.score(X_test, Y_test)
    return pred_test, coeff_df, pred_train, result
ridge.fit(X_train, y_train)
ridge_y_pred = ridge.predict(X_test)
print('Ridge regression model performance: ', ridge.score(X_test, y_test))

# cross_val_score
cv_results = cross_val_score(ridge, X, y, cv=5)
print('5 fold Cross validations scores : ', np.around(cv_results, 3).tolist())
y_new_data = ridge.predict(new_data)
print(' Predicted house price on new data: ', y_new_data.item(), '\n\n')

print('Regularised Regression:')
print(' Lasso Regression:')
lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(X_train, y_train)
lasso_y_pred = lasso.predict(X_test)
print('Lasso regression model performance: ', lasso.score(X_test, y_test))

# cross_val_score
cv_results = cross_val_score(lasso, X, y, cv=5)
print('5 fold Cross validations scores : ', np.around(cv_results, 3).tolist())
y_new_data = lasso.predict(new_data)
print(' Predicted house price on new data: ', y_new_data.item(), '\n\n')

print('Regularised Regression:')
print('Lasso Regression for feature selection: PLOT')
names = boston['feature_names']
lasso_feature = Lasso(alpha=0.1)
coef = lasso_feature.fit(X_train, y_train).coef_
_ = plt.plot(range(len(names)), coef)
_ = plt.xticks(range(len(names)), names, rotation=60)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, new_y,
                                                    test_size=0.3, random_state=42)
b = LinearRegression(normalize=False)
b.fit(X_train, y_train)

# Regularized Ridge & Lasso Regressions
ridge_model = Ridge(alpha=0.02)
ridge_model.fit(X_train, y_train)
lasso_model = Lasso(alpha=0.001)
lasso_model.fit(X_train, y_train)

print("Simple Train: ", b.score(X_train, y_train))
print("Simple Test: ", b.score(X_test, y_test))
print('---------------------------------------')
print("Lasso Train: ", lasso_model.score(X_train, y_train))  # Lasso
print("Lasso Test: ", lasso_model.score(X_test, y_test))
print('---------------------------------------')
print("Ridge Train: ", ridge_model.score(X_train, y_train))  # Ridge
print("Ridge Test: ", ridge_model.score(X_test, y_test))

ridge_model = Ridge(alpha=0.2)
ridge_model.fit(X_train, y_train)
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

print("Simple Train: ", b.score(X_train, y_train))
print("Simple Test: ", b.score(X_test, y_test))
print('---------------------------------------')
print("Lasso Train: ", lasso_model.score(X_train, y_train))  # Lasso
print("Lasso Test: ", lasso_model.score(X_test, y_test))
print('---------------------------------------')
#plt.rc('text', usetex=True)  # comment this line out if the LaTeX packages are not installed
a = np.loadtxt("Pdata12_6.txt")  # load the 9-row, 5-column data table
n = a.shape[1] - 1  # total number of independent variables
x = a[:, :n]  # extract the matrix of independent-variable observations
X = sm.add_constant(x)
md = sm.OLS(a[:, n], X).fit()  # build and fit the model
print(md.summary())  # print all model results

aa = zscore(a)  # standardize the data
x = aa[:, :n]
y = aa[:, n]  # extract the independent- and dependent-variable matrices
b = []  # empty list for storing the regression coefficients
kk = np.logspace(-4, 0, 100)  # the different k values to iterate over
for k in kk:
    md = Lasso(alpha=k).fit(x, y)
    b.append(md.coef_)
st = ['s-r', '*-k', 'p-b', '^-y']  # format strings for the plot below
for i in range(n):
    plt.plot(kk, np.array(b)[:, i], st[i])
plt.legend(['$x_1$', '$x_2$', '$x_3$', '$x_4$'], fontsize=15)
plt.show()

md0 = Lasso(0.05).fit(x, y)  # build and fit the model
cs0 = md0.coef_  # regression coefficients b1, b2, b3, b4 for the standardized data
print("All regression coefficients for the standardized data:", cs0)
mu = a.mean(axis=0)
s = a.std(axis=0, ddof=1)  # mean and standard deviation of every variable
params = [mu[-1] - s[-1] * sum(cs0 * mu[:-1] / s[:-1]), s[-1] * cs0 / s[:-1]]
print("Regression coefficients for the original data:", params)
print("Goodness of fit:", md0.score(x, y))
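scikit-learn can compute the same coefficient trace in one call via lasso_path, which is typically faster than refitting Lasso per alpha. A minimal sketch, assuming the standardized x, y and the kk, st arrays from the block above are in scope.

# Sketch: the coefficient trace above via sklearn's lasso_path (assumes x, y, kk, st exist).
from sklearn.linear_model import lasso_path

alphas, coefs, _ = lasso_path(x, y, alphas=kk)  # coefs has shape (n_features, n_alphas)
for i in range(coefs.shape[0]):
    plt.plot(alphas, coefs[i], st[i % len(st)])
plt.show()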
Y1 = Y_train_raw[train]
X2 = X_train_reduced[test]
Y2 = Y_train_raw[test]

## Train Classifiers on fold
rdg_clf = Ridge(alpha=0.5)
rdg_clf.fit(X1, Y1)
lso_clf = Lasso(alpha=0.6257)
lso_clf.fit(X1, Y1)
svr_clf = LinearSVR(C=1e3)
svr_clf.fit(X1, Y1)

## Score Classifiers on fold
rdg_clf_score = rdg_clf.score(X2, Y2)
lso_clf_score = lso_clf.score(X2, Y2)
svr_clf_score = svr_clf.score(X2, Y2)
print("Ridge: ", rdg_clf_score)
print("Lasso: ", lso_clf_score)
print("SVR_RBF: ", svr_clf_score)

## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)  # LinearSVR takes no `gamma` parameter; that belongs to the RBF-kernel SVR
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
np.savetxt("prediction.csv", Y_predicted, fmt="%.5f", delimiter=",")
print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train print "test error: ", np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test # print "train error ratio: " , np.mean(np.divide(np.absolute(data_0am_train_predy-data_0am_train_yy),data_0am_train_yy+0.001)) # print "train error ratio: " , np.absolute(data_0am_train_predy-data_0am_train_yy) # print "test error ratio: ", np.mean(np.divide(np.absolute(data_0am_test_predy-data_0am_test_y),data_0am_train_yy+0.00001)) las = Lasso(max_iter=50000,alpha=0.01) las.fit(data_0am_train_xx,data_0am_train_yy) data_0am_train_predy = las.predict(data_0am_train_xx) lasso_train_predy = las.predict(data_0am_train_xx) data_0am_test_predy = las.predict(data_0am_test_x) lasso_test_predy = las.predict(data_0am_test_x) print "Lasso report" print "train score: ", las.score(data_0am_train_xx,data_0am_train_yy) print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train print "test error: ", np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test svr = SVR(kernel='linear') svr.fit(data_0am_train_xx,data_0am_train_yy) data_0am_train_predy = svr.predict(data_0am_train_xx) svr_train_predy = svr.predict(data_0am_train_xx) data_0am_test_predy = svr.predict(data_0am_test_x) svr_test_predy = svr.predict(data_0am_test_x) print "SVR report" print "train score: ", svr.score(data_0am_train_xx,data_0am_train_yy) print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train print "test error: ", np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test
def train_model():
    start_time = time.time()
    data_inp = data_clean(df)
    pivot = data_inp.pivot(index='goods_code', columns='dis_month', values='sale')

    # rename the columns
    col_name = []
    for i in range(len(pivot.columns)):
        col_name.append('sales_' + str(i))
    pivot.columns = col_name
    pivot.fillna(0, inplace=True)
    sub = pivot.reset_index()

    # the most recent two months serve as the test set, the preceding
    # 21 months as the training set (the original appended these feature
    # names twice, duplicating every column)
    test_features = ['goods_code']
    train_features = ['goods_code']
    for i in range(1, 3):
        test_features.append('sales_' + str(i))
    for i in range(3, 23):
        train_features.append('sales_' + str(i))
    sub.fillna(0, inplace=True)
    sub.drop_duplicates(subset=['goods_code'], keep='first', inplace=True)

    X_train = sub[train_features]
    y_train = sub[['sales_0', 'goods_code']]
    X_test = sub[test_features]
    sales_type = 'sales_'

    # mean feature
    X_train['mean_sale'] = X_train.apply(
        lambda x: np.mean([x[sales_type + str(i)] for i in range(3, 23)]), axis=1)
    X_test['mean_sale'] = X_test.apply(
        lambda x: np.mean([x[sales_type + '1'], x[sales_type + '2']]), axis=1)
    train_mean = pd.Series(X_train['mean_sale'])
    test_mean = pd.Series(X_test['mean_sale'])

    # median feature (the original comment said "mode", but np.median is used)
    X_train['median_sale'] = X_train.apply(
        lambda x: np.median([x[sales_type + str(i)] for i in range(3, 23)]), axis=1)
    X_test['median_sale'] = X_test.apply(
        lambda x: np.median([x[sales_type + '1'], x[sales_type + '2']]), axis=1)

    # standard-deviation feature
    X_train['std_sale'] = X_train.apply(
        lambda x: np.std([x[sales_type + str(i)] for i in range(3, 23)]), axis=1)
    X_test['std_sale'] = X_test.apply(
        lambda x: np.std([x[sales_type + '1'], x[sales_type + '2']]), axis=1)

    train_median = X_train['median_sale']
    test_median = X_test['median_sale']
    train_std = X_train['std_sale']
    test_std = X_test['std_sale']

    X_train = sub[train_features]
    X_test = sub[test_features]
    formas_train = [train_mean, train_median, train_std]
    formas_test = [test_mean, test_median, test_std]
    train_inp = pd.concat(formas_train, axis=1)
    test_inp = pd.concat(formas_test, axis=1)

    # residual feature
    lr_Y = y_train['sales_0']
    lr_train_x = train_inp
    re_train = sm.OLS(lr_Y, lr_train_x).fit()
    train_inp['resid'] = re_train.resid
    lr_Y = y_train['sales_0']
    lr_test_x = test_inp
    re_test = sm.OLS(lr_Y, lr_test_x).fit()
    test_inp['resid'] = re_test.resid

    train_inp = pd.concat([y_train, train_inp], axis=1)
    ts_test_pro, ts_train_pro = split_ts(df)
    ts_train_ = ts_train_pro.reset_index()
    train_inp = pd.merge(train_inp, ts_train_, left_on='goods_code', right_on='id', how='left')
    test_inp = pd.concat([y_train, test_inp], axis=1)
    ts_test_ = ts_test_pro.reset_index()
    test_inp = pd.merge(test_inp, ts_test_, left_on='goods_code', right_on='id', how='left')

    train_inp.drop(['sales_0', 'goods_code'], axis=1, inplace=True)
    test_inp.drop(['sales_0', 'goods_code'], axis=1, inplace=True)
    train_inp.fillna(0, inplace=True)
    train_inp.replace(np.inf, 0, inplace=True)
    test_inp.replace(np.inf, 0, inplace=True)
    test_inp.fillna(0, inplace=True)

    # lasso
    ss = StandardScaler()
    train_inp_s = ss.fit_transform(train_inp)
    test_inp_s = ss.transform(test_inp)

    alpha_ridge = [1e-4, 1e-3, 1e-2, 0.1, 1]
    coeffs = {}
    for alpha in alpha_ridge:
        r = Lasso(alpha=alpha, normalize=True, max_iter=1000000)
        r = r.fit(train_inp_s, y_train['sales_0'])

    grid_search = GridSearchCV(Lasso(normalize=True),
                               scoring='neg_mean_squared_error',
                               param_grid={'alpha': alpha_ridge}, cv=5, n_jobs=-1)
    grid_search.fit(train_inp_s, y_train['sales_0'])

    alpha = alpha_ridge
    rmse = list(np.sqrt(-grid_search.cv_results_['mean_test_score']))
    plt.figure(figsize=(6, 5))
    lasso_cv = pd.Series(rmse, index=alpha)
    lasso_cv.plot(title="Validation - LASSO", logx=True)
    plt.xlabel("alpha")
    plt.ylabel("rmse")
    plt.show()

    # use the cross-validated best alpha (the original took min(alpha),
    # which ignores the grid-search results entirely)
    least_lasso = grid_search.best_params_['alpha']
    lasso = Lasso(alpha=least_lasso, normalize=True)
    model_lasso = lasso.fit(train_inp_s, y_train['sales_0'])

    print("lasso feature.......................")
    lasso_coef = pd.Series(model_lasso.coef_, index=train_inp.columns)
    lasso_coef = lasso_coef[lasso_coef != 0.0000]
    lasso_coef = lasso_coef.astype(float)
    print(".....lasso_coef..............")
    print(lasso_coef.sort_values(ascending=False).head(10))

    print(" R^2, goodness of fit")
    matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
    # take the 5 most negative and 5 most positive coefficients
    imp_coef = pd.concat([lasso_coef.sort_values().head(5),
                          lasso_coef.sort_values().tail(5)])
    imp_coef.plot(kind="barh")
    plt.title("Coefficients in the Lasso Model")
    print(lasso.score(train_inp_s, y_train['sales_0']))
    print(lasso.get_params())
    print('parameter info')
    print(lasso.set_params(fit_intercept=False))

    lasso_preds = model_lasso.predict(test_inp_s)

    # scatter plot of predictions vs. true values
    fig, ax = plt.subplots()
    ax.scatter(y_train['sales_0'], lasso_preds)
    ax.plot([y_train['sales_0'].min(), y_train['sales_0'].max()],
            [y_train['sales_0'].min(), y_train['sales_0'].max()], 'k--', lw=4)
    ax.set_xlabel('y_true')
    ax.set_ylabel('Pred')
    plt.show()

    y_pred = pd.DataFrame(lasso_preds, columns=['y_pred'])
    matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
    preds = pd.DataFrame({"preds": y_pred['y_pred'], "true": y_train['sales_0']})
    preds["residuals"] = preds["true"] - preds["preds"]
    print("description of the predictions.....................")
    preds = preds.astype(float)
    print(preds.head())
    print(preds.describe())
    print(preds.shape)
    preds.plot(x="preds", y="residuals", kind="scatter")
    plt.title("True and residuals")
    plt.show()

    data_out = [y_train['goods_code'], y_train['sales_0'], y_pred]
    result = pd.concat(data_out, axis=1)
    # compute MAPE
    result['mape'] = abs((result['sales_0'] - result['y_pred']) / result['sales_0'] * 100)
    return result, lasso_coef
# In[13]:

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(x, price, test_size=0.2, random_state=0)

# In[14]:

# Lasso with cross-validation
lasso = LassoCV(alphas=np.linspace(0.00001, 1, 100), cv=10)
L = lasso.fit(X_train, y_train)
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
lasso.alpha_
lasso.coef_

Error_Tr = (y_train - L.predict(X_train))
ErroT = (y_test - L.predict(X_test))
Tr_rmse = (np.mean(Error_Tr ** 2)) ** .5
T_rmse = (np.mean(ErroT ** 2)) ** .5
print(Tr_rmse, T_rmse)

# In[8]:
print("level 1 Linear Regression") print("훈련 세트 점수: {:.2f}".format(lr.score(X_train, y_train))) print("테스트 세트 점수: {:.2f}".format(lr.score(X_test, y_test))) #Ridge Model ridge_model = Ridge(alpha=0.01, normalize=True) ridge_model.fit(X_train, y_train) pred_ridge = ridge_model.predict(X_test) print("level 1 Ridge Regression") print("훈련 세트 점수: {:.2f}".format(ridge_model.score(X_train, y_train))) print("테스트 세트 점수: {:.2f}".format(ridge_model.score(X_test, y_test))) #Lasso Model Lasso_model = Lasso(alpha=0.001, normalize=False) Lasso_model.fit(X_train, y_train) pred_Lasso = Lasso_model.predict(X_test) print("level 1 Lasso Regression") print("훈련 세트 점수: {:.2f}".format(Lasso_model.score(X_train, y_train))) print("테스트 세트 점수: {:.2f}".format(Lasso_model.score(X_test, y_test))) #ElasticNet Model model_enet = ElasticNet(alpha=0.01, normalize=False) model_enet.fit(X_train, y_train) pred_test_enet = model_enet.predict(X_test) print("level 1 ElasticNet Regression") print("훈련 세트 점수: {:.2f}".format(model_enet.score(X_train, y_train))) print("테스트 세트 점수: {:.2f}".format(model_enet.score(X_test, y_test))) print('-----------1 단계 끝 --------------') ''' # ============= 2. room_type 변수 제거 ======================== nyc_model_xx= df1.drop(columns=['room_type']) nyc_model_xx, nyc_model_yx = nyc_model_xx.iloc[:,:-1], nyc_model_xx.iloc[:,-1] X_train_x, X_test_x, y_train_x, y_test_x = train_test_split(nyc_model_xx, nyc_model_yx, test_size=0.3,random_state=42)
from sklearn.linear_model import Lasso

lasso = Lasso()

# In[106]:

lasso.fit(X_train, y_train)

# In[107]:

# Score the model on the test set
lasso_score = lasso.score(X_test, y_test)
lasso_score

# In[108]:

# Score the model on the training set
lasso_score = lasso.score(X_train, y_train)
lasso_score

# In[109]:

# Make predictions using the testing set