def lasso_cv_coef(X_train, y_train, plotit=True, summarize=True):
    """Fit a LassoCV model and report its coefficients.

    Parameters
    ----------
    X_train : object exposing ``.columns`` (used to label the coefficients)
        Training features.
    y_train : array-like
        Training target.
    plotit : bool
        When true, draw a horizontal bar chart of the sorted coefficients
        and display it.
    summarize : bool
        When true, print the selected alpha, the training score, how many
        coefficients are non-zero / zero, and the coefficient table.

    Returns
    -------
    tuple
        ``(model, alpha, score, coef, yhat)`` where ``score`` and ``yhat``
        are computed on the training data itself.
    """
    model = LassoCV().fit(X_train, y_train)
    alpha = model.alpha_
    score = model.score(X_train, y_train)
    coef = pd.Series(model.coef_, index=X_train.columns)
    yhat = model.predict(X_train)

    if summarize:
        vars_kept = sum(coef != 0)
        vars_elim = sum(coef == 0)
        # Reuse the values computed above instead of scoring a second time.
        print("Best alpha using built-in LassoCV: %f" % alpha)
        print("Best score using built-in LassoCV: %f" % score)
        print("Lasso picked " + str(vars_kept) +
              " variables and eliminated the other " + str(vars_elim) +
              " variables")
        print(pd.DataFrame(coef))

    if plotit:
        imp_coef = coef.sort_values()
        # NOTE: this changes the process-wide default figure size.
        matplotlib.rcParams['figure.figsize'] = (4.0, 5.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
        # The original called plt.plot() with no arguments, which draws
        # nothing; show() is what actually displays the figure.
        plt.show()

    return model, alpha, score, coef, yhat
def run_lasso_main(X_tr, y_tr, X_vl, y_val, output, main):
    """Fit one LassoCV per target column and save observed-vs-predicted plots.

    For every column ``i`` of ``y_tr`` a ``LassoCV(cv=5, random_state=0)`` is
    fitted on ``X_tr`` and used to predict ``X_vl``; a scatter plot of the
    validation targets against the predictions is written to
    ``<output>/lasso/figures/Lasso_Out<i>.png``.

    Parameters
    ----------
    X_tr, y_tr : training features and 2-D target array (one column per trait).
    X_vl, y_val : validation features and 2-D target array.
    output : str
        Existing base directory; ``lasso/figures`` is created underneath it.
    main : unused by this function; kept for interface compatibility.

    Side effects
    ------------
    Changes the process working directory to ``<output>/lasso/figures`` and
    leaves it there, as the original implementation did.
    """
    # Resolve the base directory once. The original chdir'ed into `output`
    # and then joined `output` again, which failed for relative paths.
    base_dir = os.path.abspath(output)
    os.chdir(base_dir)  # still raises if `output` does not exist

    figures_dir = os.path.join(base_dir, "lasso", "figures")
    if not os.path.isdir(figures_dir):
        os.makedirs(figures_dir)
    os.chdir(figures_dir)

    for i in range(y_tr.shape[1]):
        lasso = LassoCV(cv=5, random_state=0).fit(X_tr, y_tr[:, i])
        y_hat = lasso.predict(X_vl)

        # Correlation between predicted and observed validation targets.
        corr = pearsonr(y_hat, y_val[:, i])

        fig = plt.figure()
        plt.title('Lasso: Observed vs Predicted Y_trait_' + str(i) + 'cor:' + str(corr[0]))
        plt.ylabel('Predicted')
        plt.xlabel('Observed')
        plt.scatter(y_val[:, i], y_hat, marker='o')
        fig.savefig("Lasso_Out" + str(i) + '.png', dpi=300)
        plt.close(fig)  # release the figure so the loop does not accumulate them
def test_cross_val_criterion():
    """Check that the grid-search CV objective matches LassoCV's mean MSE path.

    Relies on module-level ``X``, ``y``, ``alpha_max`` and ``tol``.
    """
    alpha_min = alpha_max / 10
    log_alpha_max, log_alpha_min = np.log(alpha_max), np.log(alpha_min)
    max_iter = 10000
    n_alphas = 10
    folds = KFold(n_splits=5, shuffle=True, random_state=56)

    # Grid search over log(alpha) with the cross-validation criterion.
    inner_estimator = sklearn.linear_model.Lasso(
        fit_intercept=False, max_iter=1000, warm_start=True)
    monitor_grid = Monitor()
    criterion = CrossVal(X, y, Lasso, cv=folds, estimator=inner_estimator)
    algo = Forward()
    grid_search(
        algo, criterion, log_alpha_min, log_alpha_max, monitor_grid,
        max_evals=n_alphas, tol=tol)

    # Reference: scikit-learn's LassoCV on the same folds and alpha grid.
    alpha_grid = np.geomspace(alpha_max, alpha_min, num=n_alphas)
    reg = LassoCV(
        cv=folds, verbose=True, tol=tol, fit_intercept=False,
        alphas=alpha_grid, max_iter=max_iter)
    reg.fit(X, y)
    reg.score(X, y)
    objs_grid_sk = reg.mse_path_.mean(axis=1)

    # These two values should be the same; the difference is evaluated
    # (and discarded) exactly as in the original.
    _ = objs_grid_sk - np.array(monitor_grid.objs)
    assert np.allclose(objs_grid_sk, monitor_grid.objs)
class EPMNF_model(object):
    """LassoCV regression on PMNF-expanded, standardized features.

    Data is loaded with the project helper ``read_data``; each feature value
    is expanded with ``PMNF_exp``. The last column of every data row is
    treated as the regression target.
    """

    def __init__(self, train_path, test_path, pred_path):
        """Store the file paths and configure the LassoCV estimator.

        Parameters
        ----------
        train_path, test_path : paths passed to ``read_data``.
        pred_path : path of the CSV file written by :meth:`train`.
        """
        self.train_path = train_path
        self.test_path = test_path
        self.pred_path = pred_path
        # Explicit alpha grid 0.05, 0.10, ..., 4.95. `n_alphas` is ignored
        # when `alphas` is given and `normalize=False` was the historical
        # default (the keyword is rejected by newer scikit-learn releases),
        # so both were dropped without changing the fitted model.
        self.lasso_model = LassoCV(
            alphas=[float(i) * 0.05 for i in range(1, 100)],
            cv=10,
            max_iter=10000000,
            random_state=0,
        )

    def preprocess_data(self):
        """Load both data sets, expand and standardize the features.

        Returns
        -------
        tuple
            ``(train_data, test_data, X_train_EPMNF, X_test_EPMNF,
            y_train, y_test)``; the first two are the raw arrays, the
            feature matrices are expanded and standardized.
        """
        train_data = np.asarray(read_data(self.train_path))
        test_data = np.asarray(read_data(self.test_path))
        len_train = len(train_data)

        X_train, y_train = train_data[:, :-1], train_data[:, -1]
        X_test, y_test = test_data[:, :-1], test_data[:, -1]

        # Expand every feature value and concatenate the expansions per row.
        X_all = np.append(X_train, X_test, axis=0)
        X_all_EPMNF = np.asarray([
            [term for p in row for term in PMNF_exp(p)]
            for row in X_all
        ])

        # NOTE(review): the scaler is fitted on train and test rows together,
        # so test-set statistics influence the training features. Kept as-is
        # to preserve results; consider fitting on the training rows only.
        X_all_EPMNF = StandardScaler().fit_transform(X_all_EPMNF)

        X_train_EPMNF = X_all_EPMNF[:len_train, :]
        X_test_EPMNF = X_all_EPMNF[len_train:, :]
        print(X_train_EPMNF.shape, X_test_EPMNF.shape)
        return train_data, test_data, X_train_EPMNF, X_test_EPMNF, y_train, y_test

    def train(self):
        """Fit the model, write test predictions to CSV and print a summary.

        Each output row is the original test row with the prediction
        appended as the last field.
        """
        (train_data, test_data, X_train_EPMNF, X_test_EPMNF,
         y_train, y_test) = self.preprocess_data()
        self.lasso_model.fit(X_train_EPMNF, y_train)
        y_pred = self.lasso_model.predict(X_test_EPMNF)

        with open(self.pred_path, "w", newline='') as f:
            csv_writer = csv.writer(f)
            for test_row, prediction in zip(test_data, y_pred):
                csv_writer.writerow(np.append(test_row, prediction))

        print("The alpha is : {}".format(self.lasso_model.alpha_))
        print("The train R^2 is : {}".format(self.lasso_model.score(X_train_EPMNF, y_train)))
        print("The test R^2 is : {}".format(self.lasso_model.score(X_test_EPMNF, y_test)))
        print("number of no-zero coefs is : {}".format(np.count_nonzero(self.lasso_model.coef_)))
def linear_reg_all(df):
    """Fit linear, LassoCV and RidgeCV models and print their scores.

    The data is split with the project helper ``split_data_multimeter``.

    Returns
    -------
    tuple
        ``(ridge_cv, lasso_cv, linear, X_train, X_test, y_train, y_test)``.
    """
    # Split and clean data
    X_train, X_test, y_train, y_test = split_data_multimeter(df)

    # Fit every model on the training set
    linear = LinearRegression()
    lasso_cv = LassoCV(cv=5, random_state=0)
    ridge_cv = RidgeCV(alphas=(0.1, 1.0, 10.0))
    for estimator in (linear, lasso_cv, ridge_cv):
        estimator.fit(X_train, y_train)

    # NOTE(review): the "crossVal" lines print the same plain .score() values
    # as the lines above them; no separate cross-validation is run here.
    report = [
        ('Linear regression score on train set with all parameters: {}',
         linear, X_train, y_train),
        ('Linear regression score on test set with all parameters: {}',
         linear, X_test, y_test),
        ('Linear regression crossVal score on train set with all parameters: {}',
         linear, X_train, y_train),
        ('Linear regression crossVal score on test set with all parameters: {}',
         linear, X_test, y_test),
        ('LassoCV regression score on train set with all parameters: {}',
         lasso_cv, X_train, y_train),
        ('LassoCV regression score on test set with all parameters: {}',
         lasso_cv, X_test, y_test),
        ('LassoCV regression crossVal score on train set with all parameters: {}',
         lasso_cv, X_train, y_train),
        ('LassoCV regression crossVal score on test set with all parameters: {}',
         lasso_cv, X_test, y_test),
        ('RidgeCV regression score on train set with all parameters: {}',
         ridge_cv, X_train, y_train),
        ('RidgeCV regression score on test set with all parameters: {}',
         ridge_cv, X_test, y_test),
        ('RidgeCV regression crossVal score on train set with all parameters: {}',
         ridge_cv, X_train, y_train),
        ('RidgeCV regression crossVal score on test set with all parameters: {}',
         ridge_cv, X_test, y_test),
    ]
    for template, estimator, features, target in report:
        print(template.format(estimator.score(features, target)))

    return ridge_cv, lasso_cv, linear, X_train, X_test, y_train, y_test
def linear_reg_all(df, drop_list, dummies, thresh=1):
    """Fit linear, LassoCV and RidgeCV models on standardized features.

    The data is split with the project helper ``split_data_multimeter``;
    a ``StandardScaler`` fitted on the training features is applied to both
    splits before fitting and scoring.

    Returns
    -------
    tuple
        ``(ridge_cv, lasso_cv, linear, X_train, X_test, y_train, y_test)``.
        Note that the returned ``X_train`` is the *scaled* array while the
        returned ``X_test`` is the *unscaled* split.
    """
    # Split and clean data
    X_train, X_test, y_train, y_test = split_data_multimeter(
        df, drop_list, dummies, thresh)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit every model on the (scaled) training set
    linear = LinearRegression()
    lasso_cv = LassoCV(cv=5, random_state=0)
    ridge_cv = RidgeCV(alphas=(0.1, 1.0, 10.0))
    for estimator in (linear, lasso_cv, ridge_cv):
        estimator.fit(X_train, y_train)

    # Variance inflation factors are computed on the unscaled test features.
    print("Variance Inflation Factors")
    vif = vifs(X_test)
    print(vif)
    print('\n')
    print(list(zip(vif, list(X_test.columns))))

    report = [
        ('Linear regression score on train set with all parameters: {}',
         linear, X_train, y_train),
        ('Linear regression score on test set with all parameters: {}',
         linear, X_test_scaled, y_test),
        ('LassoCV regression score on train set with all parameters: {}',
         lasso_cv, X_train, y_train),
        ('LassoCV regression score on test set with all parameters: {}',
         lasso_cv, X_test_scaled, y_test),
        ('RidgeCV regression score on train set with all parameters: {}',
         ridge_cv, X_train, y_train),
        ('RidgeCV regression score on test set with all parameters: {}',
         ridge_cv, X_test_scaled, y_test),
    ]
    for template, estimator, features, target in report:
        print(template.format(estimator.score(features, target)))

    return ridge_cv, lasso_cv, linear, X_train, X_test, y_train, y_test
def lasso(A, y, positive=True):
    """Fit a cross-validated Lasso on standardized data and un-scale the result.

    The first column of ``A`` is skipped; the remaining columns and ``y`` are
    standardized, a ``LassoCV`` without intercept is fitted, and the
    coefficients are mapped back to the original scale.

    Parameters
    ----------
    A : 2-D ndarray
        Design matrix; column 0 is not used as a feature.
        # presumably a constant/intercept column -- confirm with the caller
    y : 1-D ndarray
        Target values.
    positive : bool
        Forwarded to ``LassoCV(positive=...)``.

    Returns
    -------
    tuple
        ``(params, df)`` where ``params`` is the intercept followed by the
        un-scaled coefficients and ``df`` is the number of non-zero
        coefficients.
    """
    features = A[:, 1:]
    A_scaler = StandardScaler().fit(features)
    y_scaler = StandardScaler().fit(y.reshape(-1, 1))
    A_new = A_scaler.transform(features)
    y_new = y_scaler.transform(y.reshape(-1, 1)).reshape(-1)

    # `normalize=False` was the historical default and the keyword is
    # rejected by newer scikit-learn releases, so it is no longer passed.
    clf = LassoCV(cv=5, n_jobs=8, fit_intercept=False,
                  positive=positive).fit(A_new, y_new)
    score = clf.score(A_new, y_new)
    df = np.count_nonzero(clf.coef_)

    logging.info("[LASSO] # iter: %d, alpha: %e, # of terms: %d, score: %f",
                 clf.n_iter_, clf.alpha_, df, score)
    # Lazy %-formatting: the arrays are only rendered when DEBUG is enabled.
    logging.debug("[LASSO] alphas:")
    logging.debug("%s", clf.alphas_)
    logging.debug("[LASSO] MSE path:")
    logging.debug("%s", clf.mse_path_)

    # Undo the standardization: coef_orig = coef_std * sigma_y / sigma_x.
    nonzero = abs(clf.coef_) > 0.0
    coef = np.zeros_like(clf.coef_)
    coef[nonzero] = (y_scaler.scale_ / A_scaler.scale_[nonzero]) * clf.coef_[nonzero]
    intercept = y_scaler.mean_ - np.dot(A_scaler.mean_, coef)
    return np.append(intercept, coef), df
def test_lasso_cv():
    """LassoCV picks the expected alpha and agrees with LassoLarsCV."""
    X, y, X_test, y_test = build_dataset()
    max_iter = 150
    shared_kwargs = dict(n_alphas=10, eps=1e-3, max_iter=max_iter, cv=3)

    # Same expected alpha with and without a precomputed Gram matrix.
    plain = LassoCV(**shared_kwargs).fit(X, y)
    assert_almost_equal(plain.alpha_, 0.056, 2)

    clf = LassoCV(precompute=True, **shared_kwargs)
    clf.fit(X, y)
    assert_almost_equal(clf.alpha_, 0.056, 2)

    # Check that the lars and the coordinate descent implementation
    # select a similar alpha
    lars = LassoLarsCV(normalize=False, max_iter=30, cv=3).fit(X, y)
    # for this we check that they don't fall in the grid of
    # clf.alphas further than 1
    ascending_alphas = clf.alphas_[::-1]
    lars_slot = np.searchsorted(ascending_alphas, lars.alpha_)
    cd_slot = np.searchsorted(ascending_alphas, clf.alpha_)
    assert np.abs(lars_slot - cd_slot) <= 1

    # check that they also give a similar MSE
    mse_lars = interpolate.interp1d(lars.cv_alphas_, lars.mse_path_.T)
    lars_mse_at_grid_point = mse_lars(clf.alphas_[5]).mean()
    cd_mse_at_grid_point = clf.mse_path_[5].mean()
    np.testing.assert_approx_equal(lars_mse_at_grid_point,
                                   cd_mse_at_grid_point,
                                   significant=2)

    # test set
    assert clf.score(X_test, y_test) > 0.99
def run(self,trainingDasaset,plotting):
    """Train a LassoCV on ``int_rate`` and either evaluate it or export predictions.

    Parameters
    ----------
    trainingDasaset : DataFrame containing an ``int_rate`` target column.
    plotting : when equal to ``True`` the data is split 80/20 and the
        held-out score is printed; otherwise the model is fitted on all rows
        and predictions for two fixed CSV files are written to disk.

    Returns
    -------
    The held-out score in evaluation mode, ``0`` in export mode.
    """
    dataset = trainingDasaset
    accuracy = 0
    y = dataset['int_rate']
    X = dataset.drop(columns=['int_rate',])
    lassoreg = LassoCV(cv=5, random_state=42)

    # `== True` is kept on purpose so that truthy non-bool arguments keep
    # selecting the same branch as before.
    if plotting == True:  # noqa: E712
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=1)
        lassoreg.fit(X_train, y_train)
        print("###################################LassoRegression#############################")
        accuracy = lassoreg.score(X_test, y_test)
        print("score:"+str(accuracy))
    else:
        lassoreg.fit(X, y)
        # (input CSV, output CSV) pairs, processed in the original order.
        exports = (
            ("./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/SiameseTrainingData.csv",
             "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/LassoCVRegressionPredictions.csv"),
            ("./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/OverallTestingData.csv",
             "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/LassoCVRegressionPredictionsTestData.csv"),
        )
        for source_csv, target_csv in exports:
            testData = pd.read_csv(source_csv)
            predictions = lassoreg.predict(testData)
            np.savetxt(target_csv, predictions, delimiter=",")
    return accuracy
def train_and_test(self, g, m, t, approx, I=100, delta=0.025, skip_variance=False):
    """Compute FastSK kernels, fit a LassoCV on them and return the test score.

    The train and test kernel matrices are stored on ``self.Xtrain`` and
    ``self.Xtest`` (reshaped to 2-D); ``t`` is also used as ``n_jobs`` for
    the regression.

    Returns
    -------
    The value of ``model.score`` on ``(self.Xtest, self.Ytest)``.
    """
    kernel_options = dict(
        g=g,
        m=m,
        t=t,
        approx=approx,
        max_iters=I,
        delta=delta,
        skip_variance=skip_variance,
    )
    kernel = FastSK(**kernel_options)
    kernel.compute_kernel(self.train_seq, self.test_seq)

    test_kernel = kernel.get_test_kernel()
    self.Xtest = np.array(test_kernel).reshape(len(test_kernel), -1)
    train_kernel = kernel.get_train_kernel()
    self.Xtrain = np.array(train_kernel).reshape(len(train_kernel), -1)

    # Can replace Lasso with alternative regression approaches such as SVR
    model = LassoCV(cv=5, n_jobs=t, random_state=293)
    model.fit(self.Xtrain, self.Ytrain)
    return model.score(self.Xtest, self.Ytest)
def fit_grn_row(args):
    """Fit a LassoCV for one row and return its coefficient vector.

    Parameters
    ----------
    args : tuple
        ``(i, x, y, eps, max_iter, verbose)`` packed into a single argument
        (the shape used by ``map``-style callers). ``i`` is only used in the
        verbose message.

    Returns
    -------
    ndarray
        ``model.coef_`` of the fitted ``LassoCV``.
    """
    # Tuple parameters in the signature and the print statement are
    # Python 2-only syntax; unpacking here keeps the same call shape and
    # works on both Python 2 and 3.
    i, x, y, eps, max_iter, verbose = args
    model = LassoCV(eps=eps, max_iter=max_iter).fit(x, y)
    if verbose:
        nnz = (~np.isclose(model.coef_, 0)).sum()
        fields = ('row:', i, 'nnz:', nnz, 'score:', model.score(x, y),
                  'reg param', model.alpha_)
        # Space-joined str() values reproduce the old print-statement output.
        print(' '.join(str(field) for field in fields))
    return model.coef_
def do_LASSO(cv=10):
    """
    Do LASSO on the data-set (module-level ``X`` and ``y``).

    Features and target are standardized, a ``LassoCV`` is fitted on a 60%
    training split, and the predictions for the remaining rows are mapped
    back to the original target scale before the error is reported.

    Params ::
        cv: int: folds of cross-validation to do. Default 10
    Returns ::
        None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.60, random_state=23)

    x_scaler = StandardScaler()
    X_std_train = x_scaler.fit_transform(X_train)
    X_std_test = x_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_std_train = y_scaler.fit_transform(y_train)
    y_std_test = y_scaler.transform(y_test)  # computed but not used below
    y_sigma = y_scaler.scale_

    lasso = LassoCV(cv=cv)
    lasso.fit(X_std_train, y_std_train)

    # Undo the target standardization, one prediction at a time.
    y_predict = []
    for standardized in lasso.predict(X_std_test):
        y_predict.append((standardized * y_sigma) + y_scaler.mean_)

    mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
    print('Mean Absolute Error: ', mae)
    print('R2 of training data: ', lasso.score(X_std_train, y_std_train))
    plot_parity(x=y_test, y=y_predict,
                xlabel='True E/Z Ratio',
                ylabel='Predicted E/Z Ratio')
def getSortedTopKfeatures(train_features, train_label):
    """Return the features a LassoCV keeps, ordered by coefficient value.

    Parameters
    ----------
    train_features : object exposing ``.columns`` (used as feature labels).
    train_label : array-like target.

    Returns
    -------
    list
        Labels of every feature with a non-zero coefficient, sorted by the
        signed coefficient in descending order (so negative coefficients
        come last). Despite the name, no top-k cut-off is applied.
    """
    reg = LassoCV()
    reg.fit(train_features, train_label)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" %
          reg.score(train_features, train_label))

    coef = pd.Series(reg.coef_, index=train_features.columns)
    print("Lasso picked " + str(sum(coef != 0)) +
          " variables and eliminated the other " + str(sum(coef == 0)) +
          " variables")
    print(len(coef))

    # sorted() is stable, so features with equal coefficients keep their
    # original column order.
    ranked = sorted(coef.to_dict().items(), key=lambda kv: kv[1], reverse=True)
    return [label for label, weight in ranked if weight != 0]
def LASSO_cv(problem, **kwargs):
    r"""Select the entries of ``problem.data`` that a LassoCV keeps.

    Parameters
    ----------
    problem : object
        Must expose ``data`` (a sequence of mappings, each with
        ``['data']['values']``) and ``goal['data']['values']`` (the target).
        Each entry of ``problem.data`` becomes one feature column.
    kwargs['LASSO_reg_coefs'] : passed to ``LassoCV(alphas=...)``; the
        penalty multipliers tried during cross-validation.
    kwargs['coef_tolerance'] : nonnegative float; entries whose absolute
        coefficient is not larger than this are discarded.

    Returns
    -------
    output : tuple
        ``(optimum, maximum)`` -- the retained entries of ``problem.data``
        and the model's score on the data it was fitted on.
    """
    rows = [datum['data']['values'] for datum in problem.data]
    features = numpy.array(rows).T

    lasso = LassoCV(alphas=kwargs['LASSO_reg_coefs'])
    lasso.fit(features, problem.goal['data']['values'])

    optimum = []
    for index, weight in enumerate(lasso.coef_):
        if abs(weight) > kwargs['coef_tolerance']:
            optimum.append(problem.data[index])

    maximum = lasso.score(features, problem.goal['data']['values'])
    return (optimum, maximum)
def lasso_regression(X_train, y_train, X_test, y_test, plot):
    """
    Perform a lasso regression with built-in CV, plot the feature
    importance (when ``plot`` is true), render the prediction-error and
    residual visualizers, and return train/test metrics as a dict.
    """
    # Fit the lasso regression
    reg = LassoCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X_train, y_train))

    coef = pd.Series(reg.coef_, index=X_train.columns)
    n_kept = sum(coef != 0)
    n_dropped = sum(coef == 0)
    print(
        "Lasso picked "
        + str(n_kept)
        + " variables and eliminated the other "
        + str(n_dropped)
        + " variables"
    )

    # Feature importance = coefficients in ascending order
    imp_coef = coef.sort_values()
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Lasso Model")
        plt.show()

    # Prediction-error plot first, residuals plot second; each visualizer is
    # fitted on the training data, scored on the test data and then rendered.
    for visualizer_cls in (PredictionError, ResidualsPlot):
        visualizer = visualizer_cls(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.show()

    # Metrics on the held-out data (plus the training R squared)
    y_pred = reg.predict(X_test)
    metrics = {
        "name": "Lasso Regression",
        "R squared": reg.score(X_test, y_test),
        "RMSE": rmse(y_test, y_pred),
        "R squared training": reg.score(X_train, y_train),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
    return metrics
def feature_lasso(self):
    """Fit a LassoCV on ``self.x`` / ``self.y`` and print coefficients and R2."""
    model = LassoCV().fit(self.x, self.y)
    coefficients = pd.Series(model.coef_, index=self.x.columns)

    for line in ("Beta weights/co-efficients (L1 regularisation)",
                 "-----------------------------------------",
                 coefficients,
                 '\n'):
        print(line)

    r2 = model.score(self.x, self.y)
    print('R2 score is {}'.format(r2))
def Divergence_Plots_For_Single_Instrument(instrument_name, flag):
    """Plot Lasso coefficients for one instrument's indicators and save the figure.

    Reads ``<instrument_name>.csv``, drops columns and rows containing
    nulls, fits a ``LassoCV`` of ``"DF Avg Rank"`` on the indicator columns
    and saves a horizontal bar chart of the sorted coefficients.

    Parameters
    ----------
    instrument_name : str
        Base name of the CSV file (without extension); also used in the
        plot title and the output file name.
    flag : when equal to ``True`` the figure is written to the
        ``man_select_inst`` directory, otherwise to ``rule_select_inst``.
    """
    # Local import so the block stays self-contained.
    import os

    data = pd.read_csv(instrument_name + ".csv")

    # Work on a copy with all null-containing columns, then rows, removed.
    df_copy = data.copy()
    df_copy = df_copy.dropna(axis=1)
    df_copy = df_copy.dropna()
    print(len(df_copy))

    X = df_copy[[
        "CCI", "RSI", "MACD", "WPCTR", "pdi", "mdi", "adx",
        "Divergence Factor 1", "Divergence Factor 2", "Divergence Factor 3",
        "Divergence Factor 4"
    ]]
    y = df_copy["DF Avg Rank"]
    print(len(X), len(y))

    # Embedded method
    reg = LassoCV()
    reg.fit(X, y)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X, y))
    coef = pd.Series(reg.coef_, index=X.columns)
    print("Lasso picked " + str(sum(coef != 0)) +
          " variables and eliminated the other " + str(sum(coef == 0)) +
          " variables")

    fig, ax = plt.subplots(1)
    sns.set()
    imp_coef = coef.sort_values()
    matplotlib.rcParams['figure.figsize'] = (10, 14)
    matplotlib.rcParams['ytick.labelsize'] = 3
    matplotlib.rcParams['axes.labelsize'] = 3
    matplotlib.rcParams['legend.fontsize'] = 1
    plt.rc('axes', titlesize=12)
    plt.yticks(fontsize=9.5)
    my_colors = list(
        islice(cycle(['orange', 'b', 'r', 'g', 'y', 'k', 'm']), None,
               len(df_copy)))
    ax = imp_coef.plot(kind="barh", stacked=True, color=my_colors,
                       width=0.91, align='edge')
    ax.yaxis.label.set_size(3)
    title = f"Feature importance of {instrument_name} using the Lasso Model"
    plt.title(title)

    fig_name = instrument_name + "_EmbaddedMethod" + ".png"
    # `== True` is kept so non-bool truthy flags select the same directory
    # as before.
    out_dir = "man_select_inst" if flag == True else "rule_select_inst"  # noqa: E712
    # os.path.join instead of a hard-coded "\\": same path on Windows, and a
    # real sub-directory path (not a file name containing a backslash)
    # everywhere else.
    plt.savefig(os.path.join(out_dir, fig_name))
def lasso_reg(x, y):
    """Fit a 20-fold LassoCV over a log-spaced alpha grid and return its coefficients.

    Prints the score on the fitting data and the selected alpha.

    Returns
    -------
    ndarray
        ``coef_`` of the fitted model.
    """
    alpha = np.logspace(-2, 10, num=50)
    lassocv = LassoCV(alphas=alpha, cv=20)
    lassocv.fit(x, y)
    lassocv_score = lassocv.score(x, y)
    lassocv_alpha = lassocv.alpha_
    print('Lasso R square', lassocv_score)
    # Fixed: this line used to print the score a second time.
    print('Lasso Alpha', lassocv_alpha)
    return lassocv.coef_
def featureImportanceLasso(X,y):
    """Fit a LassoCV and draw a bar chart of its sorted coefficients.

    Prints the selected alpha and the score on the fitting data. The chart
    is drawn on the current matplotlib axes but not shown or saved here.
    """
    model = LassoCV().fit(X, y)
    print("Best alpha using built-in LassoCV: %f" % model.alpha_)
    print("Best score using built-in LassoCV: %f" %model.score(X,y))

    weights = pd.Series(model.coef_, index = X.columns)
    weights.sort_values().plot(kind = "barh")
    plt.title("Feature importance usando Lasso Model")
def k_fold(x_train, y_train):
    """Score a LassoCV on each of 3 outer folds and return the list of scores.

    The same estimator object is refitted on every fold's training part and
    scored on that fold's held-out part.
    """
    alpha_grid = np.logspace(-4, -0.5, 30)
    model = LassoCV(random_state=0, alphas=alpha_grid, max_iter=10000)
    splitter = KFold(3)
    return [
        model.fit(x_train[fit_idx], y_train[fit_idx])
             .score(x_train[eval_idx], y_train[eval_idx])
        for fit_idx, eval_idx in splitter.split(x_train, y_train)
    ]
def lassoReg(X, y, names):
    """Fit a LassoCV and print alpha, score and how many coefficients survive.

    ``names`` is accepted but not used by this function.
    """
    model = LassoCV().fit(X, y)
    print("Best alpha using built-in LassoCV: %f" % model.alpha_)
    print("Best score using built-in LassoCV: %f" % model.score(X, y))

    n_kept = np.sum(model.coef_ != 0)
    n_dropped = np.sum(model.coef_ == 0)
    print("Lasso picked " + str(n_kept) +
          " variables and eliminated the other " + str(n_dropped) +
          " variables")
def get_regression(data: pd.DataFrame) -> typing.Tuple[LassoCV, float]:
    """Fit a 5-fold LassoCV predicting the last column from all the others.

    10% of the rows are held out (fixed seed) for scoring.

    Returns
    -------
    tuple
        ``(fitted_model, score_on_held_out_rows)``.
    """
    feature_columns = data.columns[:-1]
    target_column = data.columns[-1]
    x_train, x_test, y_train, y_test = train_test_split(
        data[feature_columns],
        data[target_column],
        test_size=0.1,
        random_state=42)

    reg = LassoCV(cv=5, random_state=42)
    reg.fit(x_train, y_train)
    held_out_score = reg.score(x_test, y_test)
    return reg, held_out_score
def lasso(x, y):
    """Fit a normalized LassoCV and show a bar chart of its sorted coefficients.

    Prints the selected alpha and the score on the fitting data.
    """
    # NOTE(review): the `normalize` keyword is rejected by newer scikit-learn
    # releases -- confirm the version this runs against.
    sv = LassoCV(normalize=True).fit(x, y)
    print("Mejor alpha usando LassoCV: %f" % sv.alpha_)
    print("Mejor valor usando LassoCV: %f" % sv.score(x, y))

    ordered_weights = pd.Series(sv.coef_, index=x.columns).sort_values()
    matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
    ordered_weights.plot(kind="barh")
    plt.title("Modelo Lasso para selección de variables")
    plt.show()
def linear_reg_single_meter(X_train, X_test, y_train, y_test):
    """Fit linear, LassoCV and RidgeCV models and print diagnostics.

    Prints the variance inflation factors and column names of ``X_test``
    followed by each model's train and test score.

    Returns
    -------
    tuple
        ``(ridge_cv, lasso_cv, linear)``.
    """
    # Fit every model on the training set
    linear = LinearRegression()
    lasso_cv = LassoCV(cv=5, random_state=0)
    ridge_cv = RidgeCV(alphas=(0.1, 1.0, 10.0))
    for estimator in (linear, lasso_cv, ridge_cv):
        estimator.fit(X_train, y_train)

    print("Variance Inflation Factors")
    print(vifs(X_test))
    print('\n')
    print('Features')
    print('\n')
    print(list(X_test.columns))

    report = [
        ('Linear regression score on train set with all parameters: {}',
         linear, X_train, y_train),
        ('Linear regression score on test set with all parameters: {}',
         linear, X_test, y_test),
        ('LassoCV regression score on train set with all parameters: {}',
         lasso_cv, X_train, y_train),
        ('LassoCV regression score on test set with all parameters: {}',
         lasso_cv, X_test, y_test),
        ('RidgeCV regression score on train set with all parameters: {}',
         ridge_cv, X_train, y_train),
        ('RidgeCV regression score on test set with all parameters: {}',
         ridge_cv, X_test, y_test),
    ]
    for template, estimator, features, target in report:
        print(template.format(estimator.score(features, target)))

    return ridge_cv, lasso_cv, linear
def lasso_with_cv():
    """Refit a LassoCV on each of 5 outer folds and print alpha and score.

    Uses the module-level ``alphas``, ``all_training_data`` and
    ``train_labels``.
    """
    model = LassoCV(alphas=alphas)
    outer_folds = KFold(5).split(all_training_data, train_labels)
    for fold_no, (fit_idx, eval_idx) in enumerate(outer_folds):
        model.fit(all_training_data[fit_idx], train_labels[fit_idx])
        held_out_score = model.score(all_training_data[eval_idx],
                                     train_labels[eval_idx])
        print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
              format(fold_no, model.alpha_, held_out_score))
    print()
def run():
    """Fit a 5-fold LassoCV on the prepared data and report test metrics.

    Reads the module-level ``headless_run`` flag. When it is false the
    metrics are printed and a residual plot is shown (the function then
    returns ``None``); when it is true the metrics are returned.

    Returns
    -------
    list or None
        ``[variance_score, MSEscore, MAEscore, R2score]`` in headless mode.
        ``MAEscore`` is the *median* absolute error.
    """
    # Data preprocessing
    train = DataPrep.prep_data(headless_run)
    target = train.SalePrice
    train = train.drop(columns='SalePrice')
    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.25, random_state=0)

    # L1 regularization; the alpha grid is left to LassoCV's default.
    clf = LassoCV(cv=5)
    clf = clf.fit(X_train, y_train)

    # Metrics -- predict once and reuse.
    test_pred = clf.predict(X_test)
    variance_score = clf.score(X_test, y_test)
    MSEscore = mean_squared_error(y_test, test_pred)
    MAEscore = median_absolute_error(y_test, test_pred)
    # Fixed: r2_score expects (y_true, y_pred); the arguments were swapped,
    # and R^2 is not symmetric in its arguments.
    R2score = r2_score(y_test, test_pred)

    if not headless_run:
        print('Variance score: {}'.format(variance_score))
        print('MSE score: {}'.format(MSEscore))
        print('MAE score: {}'.format(MAEscore))
        print('R2 score: {}'.format(R2score))

        # Plotting residuals
        train_pred = clf.predict(X_train)
        plt.scatter(train_pred, train_pred - y_train,
                    color="green", s=10, label='Train data')
        plt.scatter(test_pred, test_pred - y_test,
                    color="blue", s=10, label='Test data')
        plt.hlines(y=0, xmin=10, xmax=14, linewidth=2)
        plt.legend(loc='upper right')
        plt.title("Residual errors")
        plt.show()
    else:
        return [variance_score,MSEscore,MAEscore,R2score]
def Embedded_Method(self, x, y, plot_matr = 'yes'):
    """Fit a LassoCV and return its coefficients sorted in ascending order.

    Prints the selected alpha, the score on the fitting data and how many
    coefficients are non-zero / zero. When ``plot_matr`` equals ``'yes'`` a
    horizontal bar chart of the sorted coefficients is shown.
    """
    model = LassoCV().fit(x, y)
    print("Best alpha using built-in LassoCV: %f" % model.alpha_)
    print("Best score using built-in LassoCV: %f" %model.score(x,y))

    coef = pd.Series(model.coef_, index = x.columns)
    n_kept = sum(coef != 0)
    n_dropped = sum(coef == 0)
    print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_dropped) + " variables")

    imp_coef = coef.sort_values()
    if plot_matr != 'yes':
        return imp_coef

    matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
    imp_coef.plot(kind = "barh")
    plt.title("Feature importance using Lasso Model")
    plt.show()
    return imp_coef
def makeLassoCVPrediction(cv=3):
    """Fit a LassoCV on the module-level data and plot its MSE path.

    Uses the module-level ``x``, ``y``, ``x_t`` and ``name``; the
    predictions for ``x_t`` are stored in the module-level ``y_t_pred``.

    Parameters
    ----------
    cv : number of cross-validation folds passed to ``LassoCV``.

    Returns
    -------
    tuple
        ``(prefix, lasso)`` -- the ``"<name>_LassoCV_FULL"`` label and the
        fitted estimator.
    """
    global y_t_pred
    # Fixed: the banner used to print a hard-coded 3 instead of `cv`.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Prediction with cv = %s" % cv)
    prefix = "%s_LassoCV_FULL" % (name)
    lasso = LassoCV(cv=cv)
    y_t_pred = lasso.fit(x, y).predict(x_t)
    r = lasso.score(x, y)

    # Per-fold MSE against -log10(alpha)
    m_log_alphas = -np.log10(lasso.alphas_)
    plt.plot(m_log_alphas, lasso.mse_path_, ':')
    plt.show()

    print("score r = %s" % r)
    print("Intercept: %s" % lasso.intercept_)
    return prefix, lasso
def cross_validation(self):
    """k-fold CV procedure to find the best (minimize deviance) complexity
    parameter of a lasso regression among a custom grid of points.

    Returns the fitted model's score on ``(self.Xtest, self.ytest)``.
    """
    # Open question from the original author: need to preprocess the Xtrain
    # data for each time? Checked the data and the means and standard
    # deviations are quite consistently 0 and 1.
    n_grid_points = 100
    alpha_grid = np.logspace(0, -7, n_grid_points)

    # 5-fold CV
    model = LassoCV(cv=5, n_jobs=-1, alphas=alpha_grid, fit_intercept=False)
    model.fit(self.Xtrain, self.ytrain)
    return model.score(self.Xtest, self.ytest)
def scale_test_and_train_Lasso(X, y):
    """
    Run a lasso regression (LassoCV) on standardized features.

    The data is split into test (20%), then train/validation (75%/25% of
    the remainder). The validation R^2 and root mean squared error are
    printed and the fitted coefficients are returned.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=.25, random_state=3)

    # Standardize with statistics from the training split only.
    scale = StandardScaler()
    X_train_scale = scale.fit_transform(X_train.values)
    X_test_scale = scale.transform(X_test.values)  # not used further below
    X_val_scale = scale.transform(X_val.values)

    lasso = LassoCV()
    lasso.fit(X_train_scale, y_train)
    lasso.score(X_val_scale, y_val)  # result discarded, as in the original
    y_pred = lasso.predict(X_val_scale)

    print(f'Lasso Regression val R^2: {lasso.score(X_val_scale, y_val):.3f}')
    print(
        f'Lasso Regression val RME: {sqrt(mean_squared_error(y_val,y_pred)):.3f}'
    )
    return lasso.coef_
def Lasso_model(train_linear, test_linear):
    """Tune alpha with LassoCV, refit a Lasso and predict ``test_linear``.

    ``LassoCV`` is fitted on all rows of ``train_linear`` to choose alpha; a
    ``Lasso`` with that alpha is then fitted on an 80% split, evaluated with
    the project helper ``evaluate``, plotted against the actual prices and
    used to predict ``test_linear``. The chosen alpha is stored with the
    project helper ``write_pkl``.

    Returns
    -------
    ``np.expm1`` of the Lasso predictions for ``test_linear``.
    """
    features = train_linear.drop(columns=['SalePrice'])
    target = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0)
    # presumably SalePrice is stored as log1p(price) -- expm1 maps it back
    real_train_tar = np.expm1(target)

    # --- Lasso model ---
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(features, target)
    lassocv_score = lassocv.score(features, target)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)

    # Time only the refit with the selected alpha.
    start = time.time()
    lasso = Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end = time.time()

    mean_squared_error(y_test, lasso.predict(x_test))  # result discarded
    coef_lasso = pd.Series(
        lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))

    # Predicted vs. actual price on all training rows, with the y = x line.
    y_lasso_predict = lasso.predict(features)
    diagonal = np.arange(700000)
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(diagonal, diagonal, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predict Sle Price')

    test_prediction_lasso = np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
# Script fragment (Python 2 print statements): report the already-fitted
# forest model, then fit and report the Lasso model, then start on the
# elastic-net model. All estimators, X and y are defined earlier in the script.
print coef_path_forest_cv.feature_importances_
forest_prediction = coef_path_forest_cv.predict(X)
forest_score = coef_path_forest_cv.score(X,y)
print "Forest_score:%.3g" % forest_score
# 5-fold cross-validated scores, computed with 2 parallel jobs.
forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, y, n_jobs=2, cv=5)
print forest_cv_score
print "########LASSO######"
coef_path_lasso_cv.fit(X,y)
# NOTE(review): get_params is not called here, so this prints the bound
# method object rather than the parameter dict.
print coef_path_lasso_cv.get_params
print "alphas:"
print coef_path_lasso_cv.alphas_
print "coef_:"
print coef_path_lasso_cv.coef_
lasso_prediction = coef_path_lasso_cv.predict(X)
# Score on the same data the model was fitted on.
lasso_score = coef_path_lasso_cv.score(X,y)
print "Lasso_score:%.3g" % lasso_score
#print "Lasso precision:%.3g" % precision_score(y, lasso_predict)
#print "Lasso_confusion matrix:"
#print confusion_matrix(y, lasso_prediction)
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=5)
print lasso_cv_score
# 2-D histogram of true values (x axis) against Lasso predictions (y axis).
plt.figure()
plt.hist2d(y, lasso_prediction)
plt.ylabel("Predicted Values")
plt.xlabel("Truth Values")
plt.title("Lasso Linear Regression")
plt.savefig("figures/lasso_predicted_truth.png")
print "#######ELASTIC#####"
coef_path_elastic_cv.fit(X,y)
# NOTE(review): as above, this prints the bound method, not its result.
print coef_path_elastic_cv.get_params
# Script fragment (Python 2 print statements): report metrics for a first
# Lasso fit (`lasso1`, defined earlier in the script), then cross-validate
# the penalty and plot the RMSE path.
y_pred_lasso1.describe()
print lasso1
print 'Lasso R^2 score:'
print r2_score(y_test, y_pred_lasso1) #0.2604
print 'Lasso Mean Squared Error:'
print mean_squared_error(y_test, y_pred_lasso1) #24232
print 'Lasso Root Mean Squared Log Error:'
print rmsle(y_test, y_pred_lasso1) #6.089
# Cross-validate the LASSO-penalized linear regression
lasso2 = LassoCV(cv = 15) # cv specifies the number of cross-validation folds
lasso2_fit = lasso2.fit(X_train, y_train)
# NOTE(review): despite its name, this holds the single score returned by
# .score() on the training data, not a path of values.
lasso2_path = lasso2.score(X_train, y_train)
# Fold-averaged RMSE for every alpha on the grid, against -log(alpha).
plt.plot(-np.log(lasso2_fit.alphas_), np.sqrt(lasso2_fit.mse_path_).mean(axis = 1))
plt.ylabel('RMSE (avg. across folds)')
plt.xlabel(r'\$-\\log(\\lambda)\$')
# Indicate the lasso parameter that minimizes the average MSE across folds
plt.axvline(-np.log(lasso2_fit.alpha_), color = 'red')
# Reuse the cross-validated alpha for a plain Lasso.
alpha = lasso2_fit.alpha_
lasso3 = Lasso(alpha = alpha)
# Script fragment: finish the score plot started earlier in the script, then
# check how stable LassoCV's alpha choice is across outer folds.
plt.axhline(np.max(scores), linestyle='--', color='.5')
plt.xlim([alphas[0], alphas[-1]])

# #############################################################################
# Bonus: how much can you trust the selection of alpha?

# To answer this question we use the LassoCV object that sets its alpha
# parameter automatically from the data by internal cross-validation (i.e. it
# performs cross-validation on the training data it receives).
# We use external cross-validation to see how much the automatically obtained
# alphas differ across different cross-validation folds.
lasso_cv = LassoCV(alphas=alphas, random_state=0)
k_fold = KFold(3)

print("Answer to the bonus question:", "how much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")
# Refit on each outer training part; report the chosen alpha and the score
# on the matching held-out part.
for k, (train, test) in enumerate(k_fold.split(X, y)):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
print()
print("Answer: Not very much since we obtained different alphas for different")
print("subsets of the data and moreover, the scores for these alphas differ")
print("quite substantially.")

# plt.show()
# Project helper used in place of plt.show().
pltshow(plt)
# Script fragment: repeat a random 75/25 split N times, let LassoCV choose an
# alpha on each, keep the alpha from the split with the best test score and
# refit a plain Lasso with it. X, y, n and predictor_var come from earlier in
# the script.
p = 180
K = 10 # K-fold CV
y = y.reshape(n)
alphas = np.exp(np.linspace(np.log(0.01),np.log(10),100)) # Using log-scale
# NOTE(review): in the visible code `alphas` is only used for its length;
# LassoCV below builds its own grid via n_alphas.
N = len(alphas) # Number of lasso parameters
scores = np.zeros(N)
alpha = np.zeros(N)
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
for i in range(N):
    # No random_state is passed, so every iteration uses a different split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    clf = LassoCV(n_alphas = 100, cv = K)
    clf = clf.fit(X_train,y_train)
    scores[i] = clf.score(X_test,y_test)
    alpha[i] = clf.alpha_
scores = np.asarray(scores)
# Alpha belonging to the split with the highest held-out score.
max_score_index = np.argmax(scores)
best_alpha = alpha[max_score_index]
print(best_alpha)
# Final fit on a fresh random split with the selected alpha.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
clf = Lasso(alpha=best_alpha)
#clf = LassoCV(n_alphas = 100, cv = K, precompute='auto', n_jobs=2, normalize='True')
clf = clf.fit(X_train,y_train)
# `scores` is rebound here from the per-split array to a single score.
scores = clf.score(X_test,y_test)
print(predictor_var[0])
print("clf.coef_",clf.coef_)
def _evaluate_model(model, test_features, test_labels,
                    train_features, train_labels):
    """Print error metrics for a fitted regressor and return its accuracy.

    Accuracy is ``100 - MAPE`` computed on the test data.  The train and
    validation MSE are computed from the data passed in, not from variables
    of the calling scope.
    """
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print("MSE for train data is: %f"
          % mean_squared_error(train_labels, model.predict(train_features)))
    print("MSE for validation data is: %f"
          % mean_squared_error(test_labels, predictions))
    return accuracy


def _plot_actual_vs_predicted(actual_prices, predicted_prices,
                              line_max=700000):
    """Scatter predicted against actual prices on a new figure, with y = x."""
    # A fresh figure per plot keeps the models' plots from overlaying.
    plt.figure()
    identity = np.arange(line_max)
    plt.scatter(actual_prices, predicted_prices)
    plt.plot(identity, identity, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predict Sle Price')


def _plot_top_importances(feature_names, importances, top_n=20):
    """Draw a horizontal bar chart of the ``top_n`` largest importances."""
    ranking = pd.DataFrame(
        {'features': feature_names, 'imp': importances}
    ).sort_values('imp', ascending=False)
    top = ranking.iloc[:top_n]
    plt.figure()
    plt.barh(top.features, top.imp)
    plt.xlabel('Feature Importance')


def Model(train_linear, test_linear):
    """Fit Lasso, Ridge, random-forest and XGBoost regressors on SalePrice.

    For each model the function tunes hyper-parameters, refits on an 80/20
    train split (``random_state=0``), prints hold-out metrics and fit time,
    and plots actual vs. predicted prices (plus feature importances for the
    tree models), each on its own figure.

    Parameters
    ----------
    train_linear : pandas.DataFrame
        Training data containing a ``SalePrice`` column; every other column
        is used as a feature.  ``np.expm1`` is applied to targets and
        predictions before plotting/returning prices, so ``SalePrice`` is
        presumably log1p-transformed -- TODO confirm with the caller.
    test_linear : pandas.DataFrame
        Feature rows to predict; passed straight to each model's
        ``predict``.

    Returns
    -------
    tuple
        ``(test_prediction_lasso, test_prediction_ridge, test_prediction_rf,
        test_prediction_xgb, y_lasso_predict, y_ridge_predict, y_rf_predict,
        y_xgb_predict)``.  The first four are ``np.expm1`` of the predictions
        for ``test_linear``; the last four are the raw (un-exponentiated)
        predictions for the full training features.
    """
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(
        train_linear_fea, train_linear_tar, test_size=0.2, random_state=0)
    real_train_tar = np.expm1(train_linear_tar)

    # ------------------------------------------------------------------
    # Lasso model
    # ------------------------------------------------------------------
    # NOTE(review): alpha is selected on the *full* training data (hold-out
    # rows included) and without normalisation, then reused below with
    # normalize=True -- confirm this is intended.
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)

    start = time.time()
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and
    # removed in 1.2; this call needs an older scikit-learn.
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    _evaluate_model(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_lasso_predict = lasso.predict(train_linear_fea)
    _plot_actual_vs_predicted(real_train_tar, np.expm1(y_lasso_predict))
    test_prediction_lasso = np.expm1(lasso.predict(test_linear))

    # ------------------------------------------------------------------
    # Ridge model
    # ------------------------------------------------------------------
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)

    start = time.time()
    # NOTE(review): same `normalize` caveat as for Lasso above.
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    _evaluate_model(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_ridge_predict = ridge.predict(train_linear_fea)
    _plot_actual_vs_predicted(real_train_tar, np.expm1(y_ridge_predict))
    test_prediction_ridge = np.expm1(ridge.predict(test_linear))

    # ------------------------------------------------------------------
    # Random Forest
    # ------------------------------------------------------------------
    X_train = train_linear_fea
    Y_train = train_linear_tar
    # The tree models reuse the split made above: it is deterministic
    # (random_state=0), so a second identical train_test_split call would
    # return the same rows.
    x_train_rf, x_test_rf = x_train, x_test
    y_train_rf, y_test_rf = y_train, y_test

    random_grid = {
        'n_estimators': [int(x) for x in
                         np.linspace(start=100, stop=2000, num=20)],
        # NOTE(review): 'auto' was removed for RandomForestRegressor in
        # scikit-learn 1.3.
        'max_features': ['auto', 'sqrt'],
        'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }
    rf = RandomForestRegressor()
    # Random search: 100 sampled combinations, 3-fold CV, all cores.
    # NOTE(review): its result is not read afterwards; the grid below is
    # hard-coded.  Kept so the function's workload/output log is unchanged.
    rf_random = RandomizedSearchCV(
        estimator=rf, param_distributions=random_grid, n_iter=100, cv=3,
        verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, Y_train)

    # Exhaustive search over a narrowed range of settings.
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110, 120, 130],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2, 3, 4],
        'min_samples_split': [2, 4, 6, 8, 10, 12],
        'n_estimators': [600, 700, 800, 900, 1000],
    }
    rf = RandomForestRegressor()
    grid_search = GridSearchCV(
        estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, Y_train)
    best_random = grid_search.best_estimator_

    # Refit the winning configuration on the train split only, so the
    # hold-out evaluation below is on unseen rows.
    start = time.time()
    best_random.fit(x_train_rf, y_train_rf)
    end = time.time()
    _evaluate_model(best_random, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_rf_predict = best_random.predict(train_linear_fea)
    _plot_actual_vs_predicted(real_train_tar, np.expm1(y_rf_predict))
    _plot_top_importances(train_linear_fea.columns,
                          best_random.feature_importances_)
    test_prediction_rf = np.expm1(best_random.predict(test_linear))

    # ------------------------------------------------------------------
    # Xgboost
    # ------------------------------------------------------------------
    random_grid = {
        'learning_rate': [round(float(x), 2) for x in
                          np.linspace(start=.1, stop=.2, num=11)],
        # Maximum nodes in each tree
        'max_depth': [int(x) for x in np.linspace(1, 10, num=10)],
        # Minimum for sum of weights for observations in a node
        'min_child_weight': list(range(1, 13)),
        'subsample': [0.3, 0.4, 0.5, 0.6, 0.7],
        'n_estimators': [int(x) for x in
                         np.linspace(start=100, stop=2000, num=20)],
    }
    model = xgb.XGBRegressor()
    xgb_random = RandomizedSearchCV(
        estimator=model, param_distributions=random_grid, n_iter=1000, cv=5,
        verbose=2, random_state=42, n_jobs=-1)
    xgb_random.fit(X_train, Y_train)
    # Parameter set recorded in the original source for reference:
    # {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4,
    #  'n_estimators': 900, 'subsample': 0.5}
    model_xgb = XGBRegressor(**xgb_random.best_params_)

    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    _evaluate_model(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_xgb_predict = model_xgb.predict(train_linear_fea)
    _plot_actual_vs_predicted(real_train_tar, np.expm1(y_xgb_predict))
    _plot_top_importances(train_linear_fea.columns,
                          model_xgb.feature_importances_)
    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))

    return (test_prediction_lasso, test_prediction_ridge, test_prediction_rf,
            test_prediction_xgb, y_lasso_predict, y_ridge_predict,
            y_rf_predict, y_xgb_predict)