def lasso_cv_rmse(X, y, l_alphas):
    lasso_rmse = []
    # initialize model
    lasso = Lasso()
    # iterate through Lasso alphas
    for la in l_alphas:
        # set the current alpha on the model
        lasso.set_params(alpha=la)
        # keep track of fold RMSE
        lasso_scores = cross_val_score(lasso, X, y, scoring="neg_mean_squared_error")
        lasso_rmse.append(np.sqrt(np.mean(-1 * lasso_scores)))
    # Compare against a multiple linear regression without regularisation
    linreg = LinearRegression()
    linreg_scores = cross_val_score(linreg, X, y, scoring="neg_mean_squared_error")
    linreg_rmse = np.sqrt(np.mean(-1 * linreg_scores))
    return lasso_rmse, linreg_rmse
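# Hedged usage sketch for lasso_cv_rmse above: the synthetic data, alpha grid,
# and plotting calls are illustrative assumptions, not part of the original.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       noise=10.0, random_state=0)
alphas = np.logspace(-3, 1, 30)
lasso_rmse, linreg_rmse = lasso_cv_rmse(X, y, alphas)

plt.plot(alphas, lasso_rmse, label='Lasso CV RMSE')
plt.axhline(linreg_rmse, color='r', linestyle='--', label='OLS CV RMSE')
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('RMSE')
plt.legend()
plt.show()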
def fit_lasso(self, split_frac=0.5, cv=10):
    # Split into training and test sets
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        self.X, self.y, test_size=split_frac, random_state=1)
    # Lasso CV
    lasso = Lasso(max_iter=10000, normalize=False, fit_intercept=False)
    lassocv = LassoCV(alphas=None, cv=cv, max_iter=10000, normalize=False, fit_intercept=False)
    lassocv.fit(X_train, y_train)
    lasso.set_params(alpha=lassocv.alpha_)
    lasso.fit(X_train, y_train)
    pred_train = np.squeeze(lasso.predict(X_train))
    pred = np.squeeze(lasso.predict(X_test))
    train_corr = np.corrcoef(y_train, pred_train)[0, 1]
    mse = mean_squared_error(y_test, pred)
    test_corr = np.corrcoef(y_test, pred)[0, 1]
    # Count non-zero coefficients; the comparison must sit outside np.abs
    # (the original np.abs(lasso.coef_ > 1e-10) took abs of booleans)
    nnz_coef = len(lasso.coef_[np.abs(lasso.coef_) > 1e-10])
    return lassocv.alpha_, train_corr, test_corr, lasso.coef_, nnz_coef, mse
def Lasso_regression(X_train, y_train, X_test, params):
    # Grid search for Lasso; given the small number of hyperparameters,
    # 'GridSearch' and 'RandomGridSearch' amount to the same thing.
    # NB: `hyperparameters` is expected to be defined at module level.
    if hyperparameters == 'RandomGridSearch' or hyperparameters == 'GridSearch':
        # Grid search with cross-validation (3 folds)
        alphas = np.arange(1, 800, 50)
        param_grid = {'alpha': alphas}
        # Define the model to train
        estimator = Lasso()
        # Train the model over the specified parameter grid
        optimizer = GridSearchCV(estimator, param_grid, iid='deprecated', cv=3,
                                 scoring='neg_mean_absolute_error')
        optimizer.fit(X_train, y_train)
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'Custom':
        estimator = Lasso()
        # Set the requested parameters
        estimator.set_params(**params)
        # Cross-validation check
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(estimator=estimator, X=X_train, y=y_train,
                                           cv=fold, scoring='neg_mean_absolute_error')
        # Refit the model on all of the training data
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)
    return predicted, validation_score
def est_coefs(i):
    cross_val_par = 5
    lasso = Lasso(fit_intercept=False, max_iter=10000, normalize=True)
    lassocv = LassoCV(alphas=None, fit_intercept=False, cv=cross_val_par,
                      max_iter=100000, normalize=True)
    y = Y[:, i]
    y = y.reshape(-1, 1)
    X_train = lagged_ret[train_index, :]
    X_test = lagged_ret[test_index, :]
    y_train = y[train_index, :].ravel()
    y_test = y[test_index, :]
    lassocv.fit(X_train, y_train)
    # Pick the CV-optimal alpha, then refit a plain Lasso with it
    lasso.set_params(alpha=lassocv.alpha_)
    lasso.fit(X_train, y_train.ravel())
    coefs = lasso.coef_
    return coefs
def get_estimator(self):
    estimator = self.kwargs.get("estimator", self.ESTIMATOR)
    if estimator == "Lasso":
        model = Lasso()
    elif estimator == "Ridge":
        model = Ridge()
    elif estimator == "Linear":
        model = LinearRegression()
    elif estimator == "GBM":
        model = GradientBoostingRegressor()
    elif estimator == "RandomForest":
        model = RandomForestRegressor()
        self.model_params = {
            'max_features': ['auto', 'sqrt'],
            'n_estimators': range(60, 220, 40)}
    else:
        model = Lasso()
    estimator_params = self.kwargs.get("estimator_params", {})
    self.mlflow_log_param("estimator", estimator)
    model.set_params(**estimator_params)
    print(colored(model.__class__.__name__, "red"))
    return model
def _compare_with_lasso(self, lasso_X, lasso_y, wlasso_X, wlasso_y, sample_weight,
                        alpha_range=[0.01], params={}):
    for alpha in alpha_range:
        lasso = Lasso(alpha=alpha)
        lasso.set_params(**params)
        lasso.fit(lasso_X, lasso_y)
        wlasso = WeightedLasso(alpha=alpha)
        wlasso.set_params(**params)
        wlasso.fit(wlasso_X, wlasso_y, sample_weight=sample_weight)
        # Check results are similar with tolerance 1e-6
        if np.ndim(lasso_y) > 1:
            for i in range(lasso_y.shape[1]):
                np.testing.assert_allclose(lasso.coef_[i], wlasso.coef_[i])
                if lasso.get_params()["fit_intercept"]:
                    self.assertAlmostEqual(lasso.intercept_[i], wlasso.intercept_[i])
        else:
            np.testing.assert_allclose(lasso.coef_, wlasso.coef_)
            self.assertAlmostEqual(lasso.intercept_, wlasso.intercept_)
def vendor_lasso(df):
    '''
    Creating a subsetted version of the full dataset ('new_train') and running
    lasso regression on data with a OneHotEncoded vendor column
    '''
    new_train = df[['Order.Month', 'Order.Day', 'Order.Year', 'Vendor Number',
                    'Pack', 'Bottle Volume (L)', 'Volume Sold (Liters)', 'Profit']].copy()
    sale_data = new_train.loc[:, new_train.columns != 'Profit']
    sale_target = new_train['Profit']
    X_train, X_test, y_train, y_test = train_test_split(sale_data, sale_target,
                                                        test_size=0.2, random_state=3)
    new_train_cols = X_train[['Order.Day', 'Order.Year', 'Order.Month',
                              'Bottle Volume (L)', 'Pack', 'Volume Sold (Liters)']]
    new_train_nomcols = X_train[['Vendor Number']]
    enc = OneHotEncoder(drop='first', sparse=False)
    encodecols = enc.fit_transform(new_train_nomcols)
    feature_names = enc.get_feature_names(['Vendor Number'])
    X_train = pd.concat([
        new_train_cols.reset_index(drop=True),
        pd.DataFrame(encodecols, columns=feature_names).astype(int).reset_index(drop=True)
    ], axis=1)
    # Fit a Lasso per alpha and track how the slopes shrink
    lasso = Lasso()
    coefs = []
    alphaRange = np.linspace(1e-3, 20, 20)
    for alpha in alphaRange:
        lasso.set_params(alpha=alpha, normalize=True)
        lasso.fit(X_train, y_train)
        coefs.append(lasso.coef_)
    coefs = pd.DataFrame(np.array(coefs), columns=X_train.columns)
    for name in coefs.columns:
        plt.plot(alphaRange, coefs[name])
    plt.xlabel('Alpha')
    plt.ylabel('Coefficients')
    plt.title('Change of Lasso Slopes Varying Alpha')
def trylasso5times(array_alphas, cv=5, max_iter=1e6, normalize=True):
    # Use the arguments instead of re-hard-coding them in the body;
    # note: relies on X_train and y_train from the enclosing scope.
    lassocv = LassoCV(alphas=array_alphas, cv=cv, max_iter=max_iter, normalize=normalize)
    lassocv.fit(X_train, y_train)
    lasso = Lasso()
    lasso.set_params(alpha=lassocv.alpha_)
    lasso.fit(X_train, y_train)
    return lasso, lassocv
def LinearModelLasso(X_train, y_train, X_test, y_test):
    alphas = 10**np.linspace(10, -2, 100) * 0.5
    lasso_cv = LassoCV(alphas=alphas, random_state=0, max_iter=100000, cv=10)
    lasso_cv.fit(X_train, y_train)
    print("Value of lasso tuning parameter", lasso_cv.alpha_)
    lasso = Lasso()
    lasso.set_params(alpha=lasso_cv.alpha_)
    lasso.fit(X_train, y_train)
    print_evaluation_metrics(lasso, "Lasso Model", X_train, y_train, X_test, y_test)
    print("\n")
    print("--- Feature Selection -----")
    coef = pd.Series(lasso.coef_, index=X_train.columns)
    print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +
          str(sum(coef == 0)) + " variables")
    for e in sorted(list(zip(list(X_train), lasso.coef_)), key=lambda e: -abs(e[1])):
        if e[1] != 0:
            print("\t{}, {:.3f}".format(e[0], e[1]))
def lasso(x, y):
    lassocv = LassoCV(alphas=None, cv=10, max_iter=10000, normalize=True)
    lassocv.fit(x, y)
    # The alpha is taken from CV, so no fixed alpha at construction
    lassoreg = Lasso(normalize=True, max_iter=1e5)
    lassoreg.set_params(alpha=lassocv.alpha_)
    lassoreg.fit(x, y)
    r2 = round(lassoreg.score(x, y), 2)
    return r2, lassoreg.coef_
def lasso_coefs(X, Y, alphas):
    coefs = []
    lasso_reg = Lasso()
    for a in alphas:
        lasso_reg.set_params(alpha=a)  # set the hyperparameter alpha
        lasso_reg.fit(X, Y)
        coefs.append(lasso_reg.coef_)
    return coefs
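# Hedged usage sketch for lasso_coefs above: the data and plotting calls are
# illustrative assumptions, not part of the original snippet.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso

X, Y = make_regression(n_samples=100, n_features=10, n_informative=3,
                       noise=5.0, random_state=0)
alphas = np.logspace(-2, 2, 50)
coefs = lasso_coefs(X, Y, alphas)

# Regularization path: each coefficient shrinks to zero as alpha grows
plt.plot(alphas, np.asarray(coefs))
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('coefficients')
plt.show()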
def DoLasso(alphas, currentCountry, otherCountries):
    # Construct without a fixed alpha (the original passed the whole alpha
    # array to the constructor); alpha is set per iteration below
    lasso = Lasso(normalize=True)
    coefs = []
    for a in alphas:
        lasso.set_params(alpha=a)
        lasso.fit(otherCountries.transpose(), currentCountry)
        coefs.append(lasso.coef_)
    return coefs
def lasso_regression(X_train, y_train, X_test, params, use_cv: bool = True):
    # If there are not enough points for cross-validation
    if use_cv is False:
        if params is None:
            model = Lasso()
        else:
            model = Lasso(**params)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        # Calculate the score on the training data
        train_predicted = model.predict(X_train)
        validation_score = mean_absolute_error(
            np.ravel(y_train), np.ravel(train_predicted))
        return predicted, validation_score

    # Grid search for Lasso; due to the small number of hyperparameters,
    # 'GridSearch' and 'RandomGridSearch' are the same.
    # NB: `hyperparameters` is expected to be defined at module level.
    if hyperparameters == 'RandomGridSearch' or hyperparameters == 'GridSearch':
        # Search the grid with cross-validation (the number of folds is 3)
        alphas = np.arange(1, 800, 50)
        param_grid = {'alpha': alphas}
        # Set the model to train
        estimator = Lasso()
        # Train the model over the specified parameter grid
        optimizer = GridSearchCV(estimator, param_grid, iid='deprecated', cv=3,
                                 scoring='neg_mean_absolute_error')
        optimizer.fit(X_train, y_train)
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'Custom':
        estimator = Lasso()
        # Set the requested parameters
        estimator.set_params(**params)
        # Cross-validation check
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(
            estimator=estimator, X=X_train, y=y_train,
            cv=fold, scoring='neg_mean_absolute_error')
        # Train the model on all of the training data
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)
    return predicted, validation_score
class LassoModel(Model):
    def create_model(self):
        self.lasso = Lasso()

    def fit(self, train_x, train_y):
        self.lasso_pip = make_pipeline(RobustScaler(), self.lasso)
        self.lasso_pip.fit(train_x, train_y)

    def set_config(self, config):
        self.lasso.set_params(**config)

    def predict(self, test_x):
        return self.lasso_pip.predict(test_x)
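# Hedged usage sketch for LassoModel above: assumes the Model base class simply
# expects these hooks and a no-argument constructor; the data and call order
# are illustrative assumptions.
import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=10, noise=5.0, random_state=0)
model = LassoModel()
model.create_model()
model.set_config({'alpha': 0.1, 'max_iter': 10000})
model.fit(X, y)
print(model.predict(X[:5]))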
def lasso_prediction(x, y):
    # normalize must be a bool, not the string "True"
    lasso = Lasso(normalize=True, tol=0.01, max_iter=500000)
    w, q = x.shape
    coefs = []
    preds = []
    i = 0
    while i < q:
        start = timer()
        print("\n------------------------------------\n")
        print("Fitting for country no.: %s" % i)
        alpha = 100
        lasso.set_params(alpha=alpha)
        population_vector = x.iloc[:, i]
        x_i = x.drop([i], axis=1)
        y_i = y.drop([i], axis=1)
        lasso.fit(x_i, population_vector)
        # Increase alpha until at most 5 countries carry non-zero weight
        while np.count_nonzero(lasso.coef_) > 5:
            alpha = alpha + 500
            lasso.set_params(alpha=alpha)
            lasso.fit(x_i, population_vector)
        prediction = lasso.predict(y_i)
        coefs.append(lasso.coef_)
        preds.append(prediction)
        end_timer = timer() - start
        print("Alpha value: " + str(alpha))
        print("Fitting time for country no. %s: %s seconds." % (i, round(end_timer, 4)))
        print("Number of countries used for fitting: " + str(np.count_nonzero(lasso.coef_)))
        i = i + 1
    np.savetxt("population_parameters.csv", coefs, delimiter=",")
    np.savetxt("population_prediction.csv", preds, delimiter=",")
    return coefs
def lasso_prediction(x, y):
    # normalize must be a bool, not the string "True"
    lasso = Lasso(normalize=True, tol=0.001, max_iter=5000000)
    w, q = x.shape
    coefs = []
    preds = []
    i = 0
    alphas = 10**(np.linspace(0.01, 8, 1000))
    while i < q:
        print("\n------------------------------------\n")
        print("Fitting for country no.: %s" % (i + 1))
        # Baseline: a huge alpha shrinks every coefficient to zero
        lasso.set_params(alpha=1000000000)
        population_vector = x.iloc[:, i]
        y_true = y.iloc[:, i]
        lasso.fit(x, population_vector)
        prediction = lasso.predict(y)
        error = mse(y_true, prediction)
        # Initialize the "best" trackers so they exist even if no alpha improves
        # (the original could hit a NameError on coefs_best)
        coefs_best, pred_best = lasso.coef_, prediction
        alpha_value, country_number = 1000000000, np.count_nonzero(lasso.coef_)
        for a in alphas:
            lasso.set_params(alpha=a)
            lasso.fit(x, population_vector)
            prediction = lasso.predict(y)
            mean_error = mse(y_true, prediction)
            if (mean_error < error) and 0 < np.count_nonzero(lasso.coef_) <= 5:
                coefs_best = lasso.coef_
                pred_best = prediction
                error = mean_error
                alpha_value = a
                country_number = np.count_nonzero(lasso.coef_)
        coefs.append(coefs_best)
        preds.append(pred_best)
        print("Alpha value: " + str(alpha_value))
        print("Number of countries used for fitting: " + str(country_number))
        i = i + 1
    return coefs, preds
def Lasso_Reg(Phi_train, Y_train, Phi_test, Y_test, alphas):
    reg = Lasso()
    coefs = []
    train_MSE = []
    test_MSE = []
    for a in alphas:
        reg.set_params(alpha=a)
        reg.fit(Phi_train, Y_train)
        coefs.append(reg.coef_)
        train_pred = reg.predict(Phi_train)
        train_MSE.append(mean_squared_error(Y_train, train_pred))
        test_pred = reg.predict(Phi_test)
        test_MSE.append(mean_squared_error(Y_test, test_pred))
    return {'coefs': coefs, 'train_MSE': train_MSE, 'test_MSE': test_MSE}
def lasso_regress(X, y, a=None, b=None):
    """
    This function returns the regression of input data by the LASSO method.

    Parameters
    ----------
    X: an array or array-like of predictors. It should be scaled by StandardScaler.
    y: an array or array-like target. It should have dimensions compatible with X.
    a, b: an array or array-like, optional.
        Another data set, such as a = X_test, b = y_test.

    Returns
    -------
    coefs_LASSO: list. Coefficients from LASSO for the different lambdas.
    lambdas_LASSO: list. The lambdas used in this LASSO sweep.
    error1_LASSO: list. MSE of predictions on the first input set (X, y).
    error2_LASSO: list. MSE of predictions on the second input set (a, b);
        None if a and b are not given.
    modelLASSO: the Lasso() model object.
    """
    # LASSO vs lambda
    coefs_LASSO = []
    error1_LASSO = []
    error2_LASSO = []
    # Tuning parameter (lambda)
    lambdas_LASSO = np.logspace(-4, 8, 200)
    modelLASSO = Lasso(max_iter=1e5)
    # Loop over lambda values to determine the best by MSE
    for l in lambdas_LASSO:
        modelLASSO.set_params(alpha=l)
        modelLASSO.fit(X, y)
        coefs_LASSO.append(modelLASSO.coef_)
        error1_LASSO.append(mean_squared_error(y, modelLASSO.predict(X)))
        # Test for None explicitly; the original called .any() on
        # possibly-None inputs, which raises before the check completes
        if a is not None and b is not None:
            error2_LASSO.append(mean_squared_error(b, modelLASSO.predict(a)))
        else:
            error2_LASSO = None
    return coefs_LASSO, lambdas_LASSO, error1_LASSO, error2_LASSO, modelLASSO
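# Hedged usage sketch for lasso_regress above: the split and data are
# illustrative assumptions; picking the lambda with the lowest held-out MSE
# is one natural way to consume the returned lists.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=150, n_features=8, noise=3.0, random_state=1)
X = StandardScaler().fit_transform(X)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
coefs, lambdas, train_mse, test_mse, model = lasso_regress(X_tr, y_tr, a=X_te, b=y_te)
print('best lambda:', lambdas[int(np.argmin(test_mse))])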
def get_estimator(self):
    estimator = self.kwargs.get("estimator", self.ESTIMATOR)
    if estimator == "Lasso":
        model = Lasso()
    elif estimator == "Ridge":
        model = Ridge()
    elif estimator == "Linear":
        model = LinearRegression()
    elif estimator == "GBM":
        model = GradientBoostingRegressor()
    else:
        model = Lasso()
    estimator_params = self.kwargs.get("estimator_params", {})
    model.set_params(**estimator_params)
    print(model.__class__.__name__)
    return model
def binarySearch(self, X, y, select_num=50):
    model = Lasso()
    betaM = np.zeros([X.shape[1]])
    # max(abs(crossprod(xx, yy / sqrt(sum(yy ^ 2)) / sqrt(n))))
    iteration = 0
    min_lambda = 1e-15
    max_lambda = 1e15
    minFactor = 0.9
    maxFactor = 1.1
    stuckCount = 1
    previousC = -1
    patience = 20
    while min_lambda < max_lambda and iteration < 50:
        iteration += 1
        # Bisect the regularizer in log space
        lmbd = np.exp((np.log(min_lambda) + np.log(max_lambda)) / 2.0)
        model.set_params(alpha=lmbd)
        model.fit(X, y)
        beta = model.coef_
        # We choose regularizers based on the number of non-zeros they report
        c = len(np.where(np.abs(beta) > 0)[0])
        if c < select_num * minFactor:  # regularizer too strong
            max_lambda = lmbd
            betaM = beta
        elif c > select_num * maxFactor:  # regularizer too weak
            min_lambda = lmbd
            betaM = beta
        else:
            betaM = beta
            break
        if c == previousC:
            stuckCount += 1
        else:
            previousC = c
            stuckCount = 1
        if stuckCount > patience:
            # Ran out of patience
            break
    return betaM
def build_models(x, y):
    """
    Build Lasso regression models across a range of alphas
    """
    # Alpha values uniformly spaced between 0 and 0.5
    alpha_range = np.linspace(0, 0.5, 200)
    model = Lasso(normalize=True)
    coefficients = []
    # Fit a model for each alpha value
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        # Track the coefficients for the plot
        coefficients.append(model.coef_)
    # Plot coefficient weight decay vs alpha value
    coeff_path(alpha_range, coefficients)
def Lasso_model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    real_train_tar = np.expm1(train_linear_tar)
    x_train, x_test, y_train, y_test = train_test_split(
        train_linear_fea, train_linear_tar, test_size=0.2, random_state=0)
    # Lasso model
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)
    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    print(mean_squared_error(y_test, lasso.predict(x_test)))
    coef_lasso = pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_lasso = np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha,
              '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
class LassoRegressor(LinearRegressor):
    """
    Linear Regression via Lasso Regression (OLS Regression w/ L1 penalty)
    -------------------------------------------------------------------------------
    This class is a wrapper for scikit-learn's "sklearn.linear_model.Lasso" class.
    """

    def __init__(self, std=True, max_iter=10000):
        self.regressor_type = "Lasso"
        self.sklearn_Lasso = Lasso(fit_intercept=std, normalize=std, max_iter=max_iter)
        super().__init__(std)
        self.std = False  # let scikit-learn handle preprocessing

    # regress X on Y
    def compute_coeff(self):
        self.sklearn_Lasso.set_params(alpha=self.alpha)
        self.sklearn_Lasso.fit(self.X, self.Y)
        self.coeff = self.sklearn_Lasso.coef_
def lasso_reg(alphas, x, y):
    # normalize must be a bool, not the string "True"
    lasso = Lasso(normalize=True, tol=0.001, max_iter=50000)
    w, q = x.shape
    f = 1
    error = []
    for a in alphas:
        coefs = []
        preds = []
        lasso.set_params(alpha=a)
        i = 0
        while i < q:
            population_vector = x.iloc[:, i]
            train1 = x.drop([i], axis=1)
            lasso.fit(train1, population_vector)
            test1 = y.drop([i], axis=1)
            prediction = lasso.predict(test1)
            coefs.append(lasso.coef_)
            preds.append(prediction)
            i = i + 1
        name_coef = "coef_no_%s.csv" % f
        name_preds = "pred_no_%s.csv" % f
        np.savetxt(name_coef, coefs, delimiter=",")
        np.savetxt(name_preds, preds, delimiter=",")
        y_true = y.values
        result = mse(y_true, np.transpose(preds))
        error.append(result)
        f = f + 1
    return lasso, preds, error
def fit(x, y):
    a = 50000000
    coefs = []
    model = Lasso(tol=0.02, max_iter=500000)
    x = x.T
    w, q = x.shape
    i = 0
    while i < q:
        t = x.iloc[:, i]
        # Fit on all columns except the target column itself; the original
        # `x[x != t]` masked values element-wise instead of dropping the column
        x_i = x.drop(x.columns[i], axis=1)
        model.set_params(alpha=a)
        model.fit(x_i, t)
        coefs.append(model.coef_)
        i = i + 1
    return coefs
def best_parameter(x, y, method, alphas):
    Xm = x.to_numpy()  # .as_matrix() was removed from pandas
    ym = y.to_numpy()
    if method == "lasso":
        model = Lasso(fit_intercept=True)
    elif method == "ridge":
        model = Ridge(fit_intercept=True)
    # Materialize the folds so they can be reused for every alpha
    # (sklearn.cross_validation was removed; use sklearn.model_selection.KFold)
    k_fold = list(KFold(n_splits=10).split(Xm))
    best_cv_mse = float("inf")
    for a in alphas:
        model.set_params(alpha=a)
        mse_list_k10 = [
            MSE(model.fit(Xm[train], ym[train]).predict(Xm[val]), ym[val])
            for train, val in k_fold]
        if np.mean(mse_list_k10) < best_cv_mse:
            best_cv_mse = np.mean(mse_list_k10)
            best_alpha = a
    print(method, "BEST PARAMETER=%f, MSE(CV)=%f" % (best_alpha, best_cv_mse))
def get_lasso(X, Y):
    # Running lasso with alpha 0 (equivalent to plain MLR)
    lasso = Lasso()
    lasso.set_params(alpha=0, normalize=True)
    lasso.fit(X, Y)
    # Grid search
    alphas_lasso = np.linspace(0, 10, 50)
    tuned_parameters_r = [{'alpha': alphas_lasso}]
    n_folds = 5
    cv = KFold(n_splits=n_folds, shuffle=True)
    tune_lasso = GridSearchCV(lasso, tuned_parameters_r, cv=cv, refit=True,
                              return_train_score=True, scoring='neg_mean_squared_error')
    tune_lasso.fit(X, Y)
    print(tune_lasso.best_params_)
    print(np.max(tune_lasso.cv_results_['mean_test_score']))
    print(np.min(tune_lasso.cv_results_['mean_test_score']))
    lasso_best = tune_lasso.best_estimator_
    lasso_best.fit(X, Y)
    print(lasso_best.score(X, Y))
    suffix = str(datetime.datetime.now())
    model_filename = 'lasso' + suffix + '.sav'
    pickle.dump(lasso_best, open(model_filename, 'wb'))
    csv_filename = 'lasso' + suffix + '.csv'  # was 'ridge ', a copy-paste slip
    raw_test, test_IDs = load_test()
    predict = lasso_best.predict(raw_test)
    predict = np.exp(predict)
    predict = pd.DataFrame(predict)
    predict = pd.concat([test_IDs, predict], axis=1)
    predict.columns = ['Id', 'SalePrice']
    predict.to_csv(csv_filename, index=False)
def lr(self):
    self.alphas = np.logspace(self.model_params_obj.lasso_params_obj.minAlpha,
                              self.model_params_obj.lasso_params_obj.maxAlpha,
                              num=self.model_params_obj.lasso_params_obj.lasso_n_alphas,
                              base=np.e)
    print('alphas', self.alphas)
    lasso = Lasso(max_iter=self.model_params_obj.lasso_params_obj.lasso_max_iter,
                  normalize=False, fit_intercept=True)
    # Initialize scores for each alpha
    self.scores = np.empty_like(self.alphas)
    self.weights = []
    for i, a in enumerate(self.alphas):
        lasso.set_params(alpha=a)
        lasso.fit(self.X_train, self.y_train)
        self.weights.append(lasso.coef_)
        # Record the training score for each alpha
        self.scores[i] = lasso.score(self.X_train, self.y_train)
    print('r-2', self.scores)
    tuned_parameters = [{'alpha': self.alphas}]
    n_folds = 6
    gridsearch_lasso_cv = GridSearchCV(
        lasso, tuned_parameters,
        scoring=['neg_mean_squared_error', 'explained_variance', 'r2'],
        cv=n_folds, refit='neg_mean_squared_error')
    gridsearch_lasso_cv.fit(self.X_train, self.y_train)
    # Note: despite the "rmse" key, these are mean negative MSE values
    self.rmse["lr"] = gridsearch_lasso_cv.cv_results_['mean_test_neg_mean_squared_error'].tolist()
    self.r2["lr"] = gridsearch_lasso_cv.cv_results_['mean_test_r2'].tolist()
    self.expvarscore["lr"] = gridsearch_lasso_cv.cv_results_['mean_test_explained_variance'].tolist()
    self.plotsobj.weights_vs_alphas(self.weights, self.alphas, self.features)
    self.compute_results(gridsearch_lasso_cv, "Lasso Regression", 'lr')
    plot_lasso_permutation_importance([gridsearch_lasso_cv, self.X_train, self.y_train],
                                      self.plotsobj, self.features)
    return {'results': [self.plotsobj.results['lr_train'], self.plotsobj.results['lr_test']],
            'rmse': self.rmse['lr'], 'r2': self.r2['lr'], 'expvar': self.expvarscore['lr']}
def set_params(self, C=None):
    if self.l1:
        Lasso.set_params(self, alpha=C)
    else:
        # Ridge's alpha maps to 1/(2C) under an SVM-style C parameterization
        Ridge.set_params(self, alpha=1.0 / (2.0 * C))
    return self
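# The set_params variant above only makes sense on a class that inherits from
# Lasso (and borrows Ridge's parameterization when l1 is False). A minimal,
# purely hypothetical host class showing the unbound-call pattern; the class
# name, constructor, and method name are illustrative assumptions:
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, Ridge

class L1L2Regressor(Lasso):
    def __init__(self, l1=True, alpha=1.0):
        super().__init__(alpha=alpha)
        self.l1 = l1

    def set_params_from_C(self, C=None):
        # Explicit unbound calls pick the parent's parameterization
        if self.l1:
            Lasso.set_params(self, alpha=C)
        else:
            Ridge.set_params(self, alpha=1.0 / (2.0 * C))
        return self

X, y = make_regression(n_samples=50, n_features=5, random_state=0)
reg = L1L2Regressor(l1=True).set_params_from_C(C=0.5)
reg.fit(X, y)
print(reg.alpha)  # 0.5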
def Model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(
        train_linear_fea, train_linear_tar, test_size=0.2, random_state=0)

    # Reports on the enclosing train/validation split regardless of arguments
    def evaluate(model, test_features, test_labels, train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy

    real_train_tar = np.expm1(train_linear_tar)

    # Lasso model
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)
    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    print(mean_squared_error(y_test, lasso.predict(x_test)))
    coef_lasso = pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_lasso = np.expm1(lasso.predict(test_linear))

    # Ridge model
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    start = time.time()
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    print(mean_squared_error(y_test, ridge.predict(x_test)))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_ridge = np.expm1(ridge.predict(test_linear))

    # Random Forest
    X_train = train_linear_fea
    Y_train = train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=0)
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf = RandomForestRegressor()
    # Random search of parameters, using 3-fold cross-validation,
    # searching across 100 different combinations, on all available cores
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, Y_train)
    print(rf_random.best_params_)
    # Random search narrowed down the range for each hyperparameter. Knowing where
    # to concentrate the search, we can specify every combination of settings to try.
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110, 120, 130],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2, 3, 4],
        'min_samples_split': [2, 4, 6, 8, 10, 12],
        'n_estimators': [600, 700, 800, 900, 1000]
    }
    # Create a base model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, Y_train)
    print(grid_search.best_params_)
    best_random = grid_search.best_estimator_
    start = time.time()
    best_random.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(best_random, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_rf_predict = best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features': train_linear_fea.columns,
                                  'imp': best_random.feature_importances_}
                                 ).sort_values('imp', ascending=False)
    importance_top20_rf = importance_rf.iloc[:20, ]
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    test_prediction_rf = np.expm1(best_random.predict(test_linear))

    # Xgboost
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights needed in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}
    # Make a RandomizedSearchCV object with the model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    print(xgb_random.best_params_)
    # Previously found best params, kept for reference:
    # best_params_ = {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4,
    #                 'n_estimators': 900, 'subsample': 0.5}
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}
                                  ).sort_values('imp', ascending=False)
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))
    return (test_prediction_lasso, test_prediction_ridge, test_prediction_rf,
            test_prediction_xgb, y_lasso_predict, y_ridge_predict,
            y_rf_predict, y_xgb_predict)
df = df.drop("train", axis=1)
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled["lpsa"] = df["lpsa"]
X = df_scaled.iloc[:, :-1]  # .ix was removed from pandas; positional slice
N = X.shape[0]
X.insert(X.shape[1], "intercept", np.ones(N))
y = df_scaled["lpsa"]
names_regressors = ["Lcavol", "Lweight", "Age", "Lbph",
                    "Svi", "Lcp", "Gleason", "Pgg45"]
X = X.drop("intercept", axis=1)
# istrain is assumed to be defined earlier (boolean train mask)
Xtrain = X[istrain]
ytrain = y[istrain]
alphas_ = np.logspace(1, -2, base=10)
Xm = Xtrain.to_numpy()  # .as_matrix() was removed from pandas
ym = ytrain.to_numpy()
# Materialize the folds so they can be reused for every alpha
# (sklearn.cross_validation was removed; use sklearn.model_selection.KFold)
k_fold = list(KFold(n_splits=10).split(Xm))
best_cv_mse = float("inf")
model = Lasso(fit_intercept=True)
for a in alphas_:
    model.set_params(alpha=a)
    mse_list_k10 = [MSE(model.fit(Xm[train], ym[train]).predict(Xm[val]), ym[val])
                    for train, val in k_fold]
    if np.mean(mse_list_k10) < best_cv_mse:
        best_cv_mse = np.mean(mse_list_k10)
        best_alpha = a
print("LASSO BEST PARAMETER=%f, MSE(CV)=%f" % (best_alpha, best_cv_mse))
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('Alpha')
plt.ylabel('Weights')
plt.title('Regularization Path RIDGE')
plt.axis('tight')
plt.legend(loc=2)
plt.show()

# Part B
clf = Lasso(fit_intercept=True)
alphas_2 = np.logspace(1, -2, base=10)
coefs = []
for a in alphas_2:
    clf.set_params(alpha=a)
    clf.fit(Xtrain, ytrain)
    coefs.append(clf.coef_)
ax = plt.gca()
for y_arr, label in zip(np.squeeze(coefs).T, names_regressors):
    plt.plot(alphas_2, y_arr, label=label)
plt.legend()
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('Alpha')
plt.ylabel('Weights')
plt.title('Regularization Path LASSO')
plt.axis('tight')
plt.legend(loc=2)
plt.show()