def objective(**params):
    """Return the negated mean 3-fold cross-validated R^2 of an ElasticNet.

    The estimator is configured from ``params`` (any valid ``ElasticNet``
    keyword) and scored on the module-level ``x`` / ``y`` data. Negative
    fold scores are clamped to 0 before averaging, so the returned value
    lies in ``[-1, 0]`` and can be handed directly to a minimizer.
    """
    # `normalize=False` used to be passed here. It was the default wherever the
    # keyword existed and the keyword was removed in scikit-learn 1.2, so it is
    # dropped to keep the function working on current releases.
    reg = ElasticNet(max_iter=1000)
    reg.set_params(**params)
    cval = cross_val_score(reg, x, y, scoring='r2', cv=3)
    # A fold that does worse than the constant predictor counts as zero.
    cval[cval < 0] = 0
    return -cval.mean()
def fun_en_fs(x, *args):
    """Score an ElasticNet configuration (with optional feature mask) by K-fold CV.

    Parameters
    ----------
    x : sequence of float
        Decision vector: ``x[0]`` is ``alpha``, ``x[1]`` is ``l1_ratio`` and
        ``x[2] < 0.5`` switches ``positive`` on. When ``len(x) > 3`` the
        entries ``x[2:]`` are thresholded at 0.5 to select feature columns;
        otherwise every column of ``X`` is used.
    *args
        Exactly ``(X, y, flag, n_splits, random_seed)``.

    Returns
    -------
    float or dict
        With ``flag == 'eval'`` the cross-validated RMSE (``1e12`` when the
        evaluation failed). Otherwise the estimator is refit on the selected
        columns and a dict with predictions, parameters and error metrics is
        returned.
    """
    X, y, flag, n_splits, random_seed = args
    _, n_var = X.shape
    # TODO: max_iter changed
    clf = ElasticNet(random_state=random_seed, max_iter=10000)
    p = {'alpha': x[0], 'l1_ratio': x[1], 'positive': x[2] < 0.5}
    clf.set_params(**p)

    if len(x) <= 3:
        mask = np.array([1] * n_var)
    else:
        # NOTE(review): the slice starts at x[2], which is also the `positive`
        # switch above; commented-out history used x[4::]. Confirm the offset.
        mask = np.array([1 if k > 0.5 else 0 for k in x[2::]])
    # np.where returns a 1-tuple; X[:, ft] therefore gains an axis that the
    # .squeeze() calls below remove again.
    ft = np.where(mask > 0.5)

    try:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=int(random_seed))
        y_p = cross_val_predict(clf, X[:, ft].squeeze(), y, cv=cv, n_jobs=1)
        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        # NOTE(review): scikit-learn's r2_score signature is (y_true, y_pred);
        # the arguments look swapped here. Left as-is pending confirmation.
        r4 = -r2_score(y_p, y)
    except Exception:
        # Infeasible configuration: penalise every metric so the non-'eval'
        # branch below can still build its result instead of hitting a
        # NameError on r2/r3/r4.
        y_p = [None]
        r = r2 = r3 = r4 = 1e12

    if flag == 'eval':
        return r

    clf.fit(X[:, ft].squeeze(), y)
    return {
        'Y_TRUE': y,
        'Y_PRED': y_p,
        'EST_PARAMS': p,
        'PARAMS': x,
        'EST_NAME': 'EN',
        'ESTIMATOR': clf,
        'ACTIVE_VAR': ft,
        'DATA': X,
        'SEED': random_seed,
        'ERROR_TRAIN': {
            'RMSE': r,
            'MAPE': r2,
            'RRMSE': r3,
            'R2_SCORE': r4,
        },
    }
def elasticnet_coefs(X, Y, alphas):
    """Return the ElasticNet coefficient vector for each alpha in ``alphas``.

    A single estimator is reused; for every alpha it is reconfigured with
    ``l1_ratio=0.05``, refit on ``(X, Y)`` and its ``coef_`` is collected.
    The result is a list in the same order as ``alphas``.
    """
    model = ElasticNet()

    def _coef_for(alpha):
        # Reconfigure, refit, and hand back the freshly estimated coefficients.
        model.set_params(alpha=alpha, l1_ratio=0.05)
        model.fit(X, Y)
        return model.coef_

    return [_coef_for(alpha) for alpha in alphas]
class boroReg:
    """ElasticNet regressor fitted on a row subset of already-transformed data.

    ``pipe_X`` / ``pipe_y`` are stored and only used at prediction time:
    ``pipe_X.transform`` is applied to new inputs and
    ``pipe_y.inverse_transform`` followed by ``np.exp`` to the predictions.
    """

    def __init__(self, X, y, idx, pipe_X, pipe_y):
        """Keep rows ``idx`` of ``X`` and ``y`` and remember the two pipelines."""
        # shift to fix 1 indexing using np broadcasting
        self.X = X[idx, :]
        self.y = y[idx, :]
        self._gridSearch = None
        self.pipeline_X = pipe_X
        self.pipeline_y = pipe_y
        self._searchSpace = None
        self._params = None
        self.lm = ElasticNet()

    def __imputeVals(self, in_df):
        """Delegate to the module-level ``imputeVals`` helper."""
        return imputeVals(in_df)

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        """Run a negative-MSE grid search over ``params`` on the stored subset."""
        self._searchSpace = params
        search = GridSearchCV(
            self.lm,
            params,
            cv=cv,
            scoring="neg_mean_squared_error",
            n_jobs=njobs,
            verbose=verbose,
        )
        self._gridSearch = search
        search.fit(self.X, self.y)

    def getBestParams(self):
        """Return the best grid-search parameters; ValueError if none was run."""
        if self._gridSearch is None:
            raise ValueError()
        return self._gridSearch.best_params_

    def getBestScore(self):
        """Return the best grid-search score; ValueError if none was run."""
        if self._gridSearch is None:
            raise ValueError()
        return self._gridSearch.best_score_

    def fitModel(self, params):
        """Configure the estimator with ``params`` and fit it on the stored subset."""
        self._params = params
        self.lm.set_params(**params)
        self.lm.fit(self.X, self.y)

    def __invert(self, y):
        """Undo the target pipeline, then exponentiate."""
        restored = self.pipeline_y.inverse_transform(y)
        return np.exp(restored)

    def getTrainScore(self):
        """Return the estimator's ``score`` on the stored training subset."""
        return self.lm.score(self.X, self.y)

    def predict(self, test_X):
        """Impute and transform ``test_X``, predict, and map back to the original scale."""
        imputed = self.__imputeVals(test_X)
        piped_X = self.pipeline_X.transform(imputed)
        return self.__invert(self.lm.predict(piped_X))
class ElasticNetModel(Model):
    """``Model`` subclass that wraps a scikit-learn ``ElasticNet`` estimator.

    ``create_model`` must be called before any other method, since it is the
    only place where ``self.elastic_net`` is assigned.
    """

    def create_model(self):
        """Instantiate a fresh ElasticNet with library defaults."""
        self.elastic_net = ElasticNet()

    def set_config(self, config):
        """Apply the keyword mapping ``config`` as estimator hyper-parameters."""
        self.elastic_net.set_params(**config)

    def fit(self, train_x, train_y):
        """Fit the wrapped estimator on the training data."""
        self.elastic_net.fit(train_x, train_y)

    def predict(self, test_x):
        """Return the wrapped estimator's predictions for ``test_x``."""
        predictions = self.elastic_net.predict(test_x)
        return predictions
def main():
    """Grid-search an ElasticNet on the first data file, then roll through the rest.

    For every subsequent gzip CSV the warm-started model is refit on the
    previous file, scored in-sample and on the current file (predictions are
    floored at 0 before scoring), and the current file becomes the next
    training set. The model is pickled to ``lasso_model_4.pkl`` at the end.
    """

    def _load(path):
        # Read one gzip CSV, zero-fill missing values, split features / target.
        frame = pandas.read_csv(path, compression='gzip')
        frame = frame.fillna(0)
        return frame[LIST_FEATURE_COLUMN_NAME].values, frame[TARGET_COLUMN_NAME].values

    def _floored_score(estimator, features, expected):
        # Negative predictions are replaced by 0 before scoring.
        raw = estimator.predict(features)
        return bimbo_score_func(numpy.where(raw < 0, 0, raw), expected)

    list_file_path = sorted(glob.glob(os.path.join(DATA_DIR, 'train_all_join_4/*gz')))
    data, target = _load(list_file_path[0])

    model = ElasticNet(random_state=0,
                       alpha=0.001,
                       # warm_start=True,
                       max_iter=1000)
    params = {'alpha': [0.0001, 0.001, 0.01], 'l1_ratio': [0.2, 0.5, 0.7]}
    cv = GridSearchCV(model, params, scoring=bimbo_scoring, n_jobs=1,
                      refit=False, verbose=10)
    cv.fit(data, target)
    # The best parameters are only logged; `model` keeps its constructor values.
    logger.info('best_params: %s', cv.best_params_)

    model.set_params(warm_start=True)
    for i, path in enumerate(list_file_path[1:], start=1):
        logger.info('%s: %s', i, path)
        test_data, test_target = _load(path)

        model.fit(data, target)
        logger.info('INSAMPLE score: %s', _floored_score(model, data, target))
        logger.info('score: %s', _floored_score(model, test_data, test_target))

        # model.set_params(n_estimators=n_estimators)
        # The file just evaluated becomes the training data of the next round.
        data, target = test_data, test_target

    with open('lasso_model_4.pkl', 'wb') as f:
        pickle.dump(model, f, -1)
def test_enet_toy():
    # Test ElasticNet for various parameters of alpha and l1_ratio.
    # Actually, the parameters alpha = 0 should not be allowed. However,
    # we test it as a border case.
    # ElasticNet is tested with and without precomputed Gram matrix
    X = np.array([[-1.], [0.], [1.]])
    Y = [-1, 0, 1]  # just a straight line
    T = [[2.], [3.], [4.]]  # test sample

    def _fit_and_check(estimator, expected_coef, expected_pred, **tolerance):
        # Fit on the toy line, then compare coefficients, predictions on T
        # and the dual gap against the expected values.
        estimator.fit(X, Y)
        pred = estimator.predict(T)
        assert_array_almost_equal(estimator.coef_, expected_coef, **tolerance)
        assert_array_almost_equal(pred, expected_pred, **tolerance)
        assert_almost_equal(estimator.dual_gap_, 0)

    # this should be the same as lasso
    _fit_and_check(ElasticNet(alpha=1e-8, l1_ratio=1.0), [1], [2, 3, 4])

    coef_03 = [0.50819]
    pred_03 = [1.0163, 1.5245, 2.0327]
    clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=100, precompute=False)
    _fit_and_check(clf, coef_03, pred_03, decimal=3)

    # with Gram: first computed by the estimator, then supplied explicitly
    for gram in (True, np.dot(X.T, X)):
        clf.set_params(max_iter=100, precompute=gram)
        _fit_and_check(clf, coef_03, pred_03, decimal=3)

    _fit_and_check(ElasticNet(alpha=0.5, l1_ratio=0.5),
                   [0.45454], [0.9090, 1.3636, 1.8181], decimal=3)
def init_model(model="", parameters=None):
    """Build the estimator named by ``model`` and apply the matching parameters.

    Parameters
    ----------
    model : str
        ``"sgd_regressor"``, ``"ridge"`` or ``"neural_network"``; any other
        value (including ``"elastic_net"`` and the empty default) yields an
        ``ElasticNet``.
    parameters : dict, optional
        Candidate hyper-parameters. Keys the chosen estimator does not
        expose through ``get_params()`` are ignored. For
        ``"neural_network"`` the dict is forwarded to ``neural_network``
        unchanged.

    Returns
    -------
    The configured estimator, or whatever ``neural_network`` returns.
    """
    # A None default replaces the former shared mutable `{}` default.
    parameters = {} if parameters is None else parameters

    if model == 'neural_network':
        return neural_network(parameters)

    if model == "sgd_regressor":
        regressor = SGDRegressor()
    elif model == "ridge":
        regressor = Ridge()
    else:
        # "elastic_net" and every unrecognised name share this default.
        regressor = ElasticNet()

    # Keep only the keys this estimator actually understands.
    supported = regressor.get_params().keys() & parameters.keys()
    regressor.set_params(**{name: parameters[name] for name in supported})
    return regressor
def test_warm_start_convergence():
    """A warm-started refit on identical data must converge in a single pass."""
    X, y, _, _ = build_dataset()
    model = ElasticNet(alpha=1e-3, tol=1e-3)
    model.fit(X, y)
    n_iter_reference = model.n_iter_

    # This dataset is not trivial enough for the model to converge in one pass.
    assert n_iter_reference > 2

    # Check that n_iter_ is invariant to multiple calls to fit
    # when warm_start=False, all else being equal.
    model.fit(X, y)
    n_iter_cold_start = model.n_iter_
    assert n_iter_cold_start == n_iter_reference

    # Fit the same model again, using a warm start: the optimizer just performs
    # a single pass before checking that it has already converged
    model.set_params(warm_start=True)
    model.fit(X, y)
    n_iter_warm_start = model.n_iter_
    assert n_iter_warm_start == 1
def roi_loop(X, y, params):
    """Grid over ``alpha`` x ``l1_ratio`` and write train/test R^2 to a CSV.

    Parameters
    ----------
    X, y
        Feature matrix and target, split once 80/20 with ``random_state=0``.
    params : dict
        Must provide ``'alpha'`` and ``'l1_ratio'`` (iterables to sweep),
        ``'max_iter'`` (forwarded to ElasticNet), ``'y'`` (stored verbatim in
        the ``edge`` column) and ``'name'`` (output CSV path).
    """
    alpha = params['alpha']
    l1_ratio = params['l1_ratio']
    results = pd.DataFrame(
        columns=['alpha', 'l1_ratio', 'edge', 'r2_test', 'r2_train'])

    # The split uses a fixed seed, so it is identical for every grid point;
    # computing it once avoids re-splitting in the inner loop.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    row = 0
    for a in alpha:
        for ratio in l1_ratio:
            # NOTE(review): `normalize` was removed from ElasticNet in
            # scikit-learn 1.2; this call needs an older release or an
            # explicit scaling step. Left unchanged because no drop-in
            # replacement reproduces the same scaling.
            elastic = ElasticNet(normalize=True, max_iter=params['max_iter'])
            elastic.set_params(**{'alpha': a, 'l1_ratio': ratio})
            print('PARAMS alpha: {}, l1_ratio: {}'.format(a, ratio))
            elastic.fit(X_train, y_train)
            r2_test = r2_score(y_test, elastic.predict(X_test))
            r2_train = r2_score(y_train, elastic.predict(X_train))
            results.loc[row] = [a, ratio, params['y'], r2_test, r2_train]
            row += 1
    results.to_csv(params['name'])
class modelEN(base_model):
    '''
    ElasticNet Regressor model with best parameters
    '''

    def __init__(self, target_range):
        """Configure the ElasticNet and the bookkeeping attributes.

        ``target_range`` is stored unchanged on the instance; this class
        itself never reads it.
        """
        self.name = 'ElasticNet'
        self.postclip = True
        self.target_range = target_range
        self.fit_cols = None  # Better to use all features
        self.trained = False
        #self.alpha, self.l1_ratio = 0.404, 1.0 #CV search, but worsens RMSE...
        self.alpha, self.l1_ratio = 1.0, 0.5
        self.max_iter = 1000
        # 'normalize' is intentionally absent: it was False (the default) and
        # the parameter was removed in scikit-learn 1.2, where set_params
        # rejects unknown keys.
        self.en_params = {
            'max_iter': self.max_iter,
            'random_state': 0,  # Changing this could generate ensembling options
            'alpha': self.alpha,
            'l1_ratio': self.l1_ratio,
            'tol': 0.0001,
            'fit_intercept': True,
            'positive': False,
            'precompute': False,
            'selection': 'cyclic',
            'copy_X': True,
            'warm_start': False,
        }
        self.model = ElasticNet()
        self.model = self.model.set_params(**self.en_params)

    def _raw_fit(self, X, y):
        """Fit on ``X.values`` and mark the model as trained."""
        self.model = self.model.fit(X.values, y)
        self.trained = True

    def _raw_predict(self, X):
        """Return predictions for ``X.values``."""
        return self.model.predict(X.values)
100*percentnulldf[percentnulldf>0].sort_values(ascending=False) # In[ ]: elasticnet = ElasticNet(alpha=0.1, l1_ratio=0.5, normalize=True) N_alpha = 100 N_rho = 10 alphaRange = np.logspace(-10, 2, N_alpha) rhoRange = np.linspace(0.1,1, N_rho) # we avoid very small rho by starting at 0.1 scores = np.zeros((N_rho, N_alpha)) prices = pd.Series(clean_train.SalePrice) for alphaIdx, alpha in enumerate(alphaRange): for rhoIdx, rho in enumerate(rhoRange): elasticnet.set_params(alpha=alpha, l1_ratio=rho) elasticnet.fit(df_fin_features, prices) scores[rhoIdx, alphaIdx] = elasticnet.score(df_fin_features, prices) # In[ ]: from sklearn.linear_model import Lasso from sklearn.model_selection import GridSearchCV # In[ ]: from sklearn.linear_model import Lasso
def elastic_net_regression(X_train, X_test, y_train, y_test, outputs=False, plots=False):
    """Fit an Elastic Net with a cross-validated lambda and report train/test MSE.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Training and test features / targets.
    outputs : bool
        When true, print the coefficients, the selected lambda and the
        MSE / R2 on both sets.
    plots : bool
        When true, show the coefficient paths and the training MSE against
        ``log(lambda)``, with the selected lambda marked.

    Returns
    -------
    tuple
        ``(train_set_mse, test_set_mse)`` of the final model.
    """
    # storing the values of the coefficients and the corresponding value of
    # mse for each value of lambda
    elastic_net = ElasticNet(max_iter=MAX_ITER)
    coefficients = []
    mse_training = []
    for lambda_value in LAMBDA_VALUES:
        elastic_net.set_params(alpha=lambda_value)
        elastic_net.fit(X_train, y_train)
        # storing the values for each value of lambda for the plots
        coefficients.append(elastic_net.coef_)
        mse_training.append(
            mean_squared_error(y_train, elastic_net.predict(X_train)))

    # define the model evaluation method
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

    # using cross validation to determine best value for lambda
    cv_elastic_net = ElasticNetCV(alphas=LAMBDA_VALUES, cv=cv)
    cv_elastic_net.fit(X_train, y_train)

    # extracting the optimal value of lambda corresponding to the min(MSE)
    optimal_lambda = cv_elastic_net.alpha_
    optimal_l1_ratio = cv_elastic_net.l1_ratio_

    # fitting the elastic net model with the optimal value of lambda found
    # with cross validation
    elastic_net_model = ElasticNet(alpha=optimal_lambda,
                                   l1_ratio=optimal_l1_ratio).fit(X_train, y_train)

    # predicting y values of the training set
    y_train_predicted = elastic_net_model.predict(X_train)
    train_set_mse = mean_squared_error(y_train, y_train_predicted)
    train_set_score = elastic_net_model.score(X_train, y_train)

    # predicting y values of the test set
    y_test_predicted = elastic_net_model.predict(X_test)
    test_set_mse = mean_squared_error(y_test, y_test_predicted)
    test_set_score = elastic_net_model.score(X_test, y_test)

    if outputs:
        # output and analysis
        print('Elastic Net regression coefficients:',
              np.round(elastic_net_model.coef_, 4))
        print('Optimal lambda:', round(optimal_lambda, 4))
        # Labels used to read "Training test" / "Test test".
        print('Training set: MSE:', round(train_set_mse, 4),
              ', R2:', round(train_set_score, 4))
        print('Test set: MSE:', round(test_set_mse, 4),
              ', R2:', round(test_set_score, 4))

    if plots:
        # plot that visualizes the coefficients getting shrunk
        ax = plt.gca()
        ax.plot(np.log(LAMBDA_VALUES), coefficients)
        plt.vlines(x=np.log(optimal_lambda), ymin=np.min(coefficients),
                   ymax=np.max(coefficients), linestyles='dashed', color='black')
        plt.axis('tight')
        plt.xlabel('log(λ)')
        plt.ylabel('Coefficients')
        plt.title('Elastic Net parameters shrinkage')
        plt.show()

        # plot for optimal value of lambda obtained with cross validation
        # (the curve itself is the training MSE collected in the sweep above)
        plt.plot(np.log(LAMBDA_VALUES), mse_training)
        plt.vlines(x=np.log(optimal_lambda), ymin=np.min(mse_training),
                   ymax=np.max(mse_training), linestyles='dashed', color='black')
        plt.axis('tight')
        plt.xlabel('log(λ)')
        plt.ylabel('MSE')
        plt.title("Elastic Net optimal value of λ using cross-validation")
        plt.show()

    return train_set_mse, test_set_mse
# Columns 0-7 of `df` are the features, column 8 is the target.
X = df.iloc[:, 0:8].values.tolist()
y = df.iloc[:, 8].values.tolist()

# Split train and test data
X_train, X_test = X[:n_samples_train], X[n_samples_train:]
y_train, y_test = y[:n_samples_train], y[n_samples_train:]

###############################################################################
# Compute train and test errors
alphas = np.logspace(-5, 1, 60)
enet = ElasticNet(l1_ratio=1)
# Despite the names, both lists hold `enet.score` values (higher is better),
# which is why the optimum below is taken with argmax.
train_errors = list()
test_errors = list()
for alpha in alphas:
    enet.set_params(alpha=alpha)
    enet.fit(X_train, y_train)
    train_errors.append(enet.score(X_train, y_train))
    test_errors.append(enet.score(X_test, y_test))

i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)

# Estimate the coef_ on full data with optimal regularization parameter
enet.set_params(alpha=alpha_optim)
# Refit before persisting: without this the dumped estimator would still carry
# the coefficients of the last alpha tried in the sweep above.
enet.fit(X, y)
joblib.dump(enet, "train_model.m")

###############################################################################
# Plot results functions
class linReg:
    """ElasticNet on log(SalePrice) with a column-wise preprocessing pipeline.

    Features go through a ``ColumnTransformer`` built by ``__make_pipe``;
    the log target goes through a ``PowerTransformer``. ``predict`` undoes
    both target transformations before returning.
    """

    def __init__(self, in_df):
        """Impute a copy of ``in_df`` and split it into features and log target."""
        df = self.__imputeVals(in_df.copy())
        self.X = df.drop(columns=["SalePrice"]).copy()
        # Column vector of log prices; the y pipeline expects 2-D input.
        self.y = np.log(df.SalePrice.values.reshape(-1, 1))
        self._gridSearch = None
        self.pipeline_X = self.__make_pipe()
        #self.pipeline_y = StandardScaler()
        self.pipeline_y = PowerTransformer()
        self._searchSpace = None
        self._params = None
        self.lm = ElasticNet()

    def __imputeVals(self, in_df):
        """Delegate to the module-level ``imputeVals`` helper."""
        return imputeVals(in_df)

    def __make_pipe(self):
        """Build the feature ``ColumnTransformer`` from the module-level column lists."""
        nonePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value="None"),
            OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            PowerTransformer())
        regressionPipeline = ColumnTransformer([
            ("setNone", nonePipeline, fillNone),
            ("setZero", zeroPipeline, fillZeroCat),
            ("transformed", scalePipeline, fillZeroCont),
            ("dictImputed",
             make_pipeline(dictImputer(imputeDict), OneHotEncoder(drop="first")),
             list(imputeDict.keys())),
            ("bool", "passthrough", imputeBool),
            ("categoricalInts", "passthrough", cat_to_int),
            ("dropped", "drop", dropList),
        ], remainder="drop")
        return regressionPipeline

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        """Fit both pipelines on the training data and grid-search ``params``."""
        self._searchSpace = params
        #self._params = None
        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._gridSearch = GridSearchCV(
            self.lm, params, cv=cv,
            scoring="neg_mean_squared_error",
            n_jobs=njobs, verbose=verbose)
        self._gridSearch.fit(piped_X, piped_y)

    def getBestParams(self):
        """Return the best grid-search parameters.

        Raises ``ValueError`` when ``gridSearch`` has not been run.
        """
        if self._gridSearch is None:
            raise ValueError("gridSearch() must be run before getBestParams()")
        return self._gridSearch.best_params_

    def getBestScore(self):
        """Return the best grid-search score.

        Raises ``ValueError`` when ``gridSearch`` has not been run.
        """
        if self._gridSearch is None:
            raise ValueError("gridSearch() must be run before getBestScore()")
        return self._gridSearch.best_score_

    def fitModel(self, params):
        """Refit both pipelines, then fit the estimator configured with ``params``."""
        piped_X = self.pipeline_X.fit_transform(self.X)
        piped_y = self.pipeline_y.fit_transform(self.y)
        self._params = params
        self.lm.set_params(**params)
        self.lm.fit(piped_X, piped_y)

    def __invert(self, y):
        """Undo the target pipeline, then the log transform."""
        return np.exp(self.pipeline_y.inverse_transform(y))

    def getTrainScore(self):
        """Return the estimator's ``score`` on the transformed training data."""
        piped_X = self.pipeline_X.transform(self.X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.lm.score(piped_X, piped_y)

    # Root Mean Square Log Error
    def getRMSLE(self):
        """Return the training RMSE between log prices and log-scale predictions.

        ``self.y`` already holds log prices and the predictions are mapped
        back only through the target pipeline (not ``exp``), so both sides
        are on the log scale.
        """
        piped_X = self.pipeline_X.transform(self.X)
        preds = self.lm.predict(piped_X).reshape(-1,1)
        preds = self.pipeline_y.inverse_transform(preds)
        # The square root was missing, so the method returned the mean
        # squared log error rather than its root.
        return mean_squared_error(self.y, preds) ** 0.5

    def predict(self, test_X):
        """Impute and transform ``test_X``, predict, and return prices on the original scale."""
        piped_X = self.pipeline_X.transform(self.__imputeVals(test_X))
        preds = self.lm.predict(piped_X).reshape(-1,1)
        return self.__invert(preds)
class Victor():
    """ElasticNet on standardized, PCA-projected features with a Box-Cox target.

    Hyper-parameters are tuned with hyperopt. The fitted scaler, PCA,
    estimator and bookkeeping values are persisted under ``self.bcupath``
    and reloaded on construction when a backup is available.
    """

    def __init__(self, bcupath='backup', cv=3, max_evals=20):
        """Initialize data"""
        subfolder = basedir.split('/')[:-1]
        self.bcupath = os.path.join('/', *subfolder, bcupath)
        #self.load_model()
        try:
            self.load_model()
        except Exception:
            # No usable backup: start from an untrained default state. This
            # was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.
            self.scaler = StandardScaler()
            self.pca = PCA()
            self.estimator = ElasticNet(random_state=0)
            self.rmsecv = 1e20
            self.trained = False
            self.cv = cv
            self.columns = ['wl_{}'.format(x) for x in range(950, 1530 + 1, 2)]
            self.max_evals = max_evals
            self.lmbda = 0

    def load_model(self):
        """Load the trained models"""
        # NOTE: pickle/joblib loading executes code embedded in the files;
        # bcupath must only ever point at trusted backups.
        with open(os.path.join(self.bcupath, 'misc-estimator.pkl'), 'rb') as handle:
            saved = pickle.load(handle)
        self.rmsecv = saved['rmsecv']
        self.trained = saved['trained']
        self.cv = saved['cv']
        self.columns = saved['columns']
        self.lmbda = saved['lmbda']
        self.max_evals = saved['max_evals']
        self.scaler = joblib.load(os.path.join(self.bcupath, 'scaler.pkl'))
        self.pca = joblib.load(os.path.join(self.bcupath, 'pca.pkl'))
        self.estimator = joblib.load(
            os.path.join(self.bcupath, 'estimator.pkl'))
        #print('loaded :', self.scaler.mean_)

    def save_model(self):
        """Save the trained model"""
        tosave = {
            'rmsecv': self.rmsecv,
            'trained': self.trained,
            'cv': self.cv,
            'columns': self.columns,
            'lmbda': self.lmbda,
            'max_evals': self.max_evals,
        }
        with open(os.path.join(self.bcupath, 'misc-estimator.pkl'), 'wb') as handle:
            pickle.dump(tosave, handle, protocol=pickle.HIGHEST_PROTOCOL)
        #print('saved : ', self.scaler.mean_)
        joblib.dump(self.scaler, os.path.join(self.bcupath, 'scaler.pkl'))
        joblib.dump(self.pca, os.path.join(self.bcupath, 'pca.pkl'))
        joblib.dump(self.estimator, os.path.join(self.bcupath, 'estimator.pkl'))

    def fit(self, dataset):
        """fit the model

        ``dataset`` must provide the columns in ``self.columns`` and a
        ``'target'`` column. Tunes ``alpha`` / ``l1_ratio`` with hyperopt,
        refits scaler, PCA and estimator on all rows, stores the
        cross-validated RMSE in ``self.rmsecv`` and saves everything.
        """
        X = dataset[self.columns]
        y = dataset['target']
        ybc, self.lmbda = stats.boxcox(y)

        # The search-time projection does not depend on the hyper-parameters
        # and is deterministic (PCA seeded with 0), so it is computed once
        # instead of once per objective evaluation.
        search_X = PCA(random_state=0).fit_transform(
            StandardScaler().fit_transform(X))

        ## HyperOpt features
        def objective(params):
            # Cross-validated MSE on the original target scale.
            hyperparams = {
                'alpha': params['alpha'],
                'l1_ratio': params['l1_ratio'],
                'random_state': 0,
            }
            elnet = ElasticNet(**hyperparams)
            preds = cross_val_predict(elnet, search_X, ybc, cv=self.cv, n_jobs=-2)
            score = mean_squared_error(inv_boxcox(preds, self.lmbda), y)
            return score

        space = {
            'alpha': hp.loguniform('alpha', -10, 2),
            'l1_ratio': hp.loguniform('l1_ratio', -20, 0),
        }
        best = fmin(fn=objective, space=space, algo=tpe.suggest,
                    max_evals=self.max_evals)

        ## Hyperopt best results and training
        params = {
            'alpha': best['alpha'],
            'l1_ratio': best['l1_ratio'],
            'random_state': 0,
        }
        print(params)
        self.estimator.set_params(**params)
        Xscaled = self.scaler.fit_transform(X)
        Xpca = self.pca.fit_transform(Xscaled)
        self.estimator.fit(Xpca, ybc)

        ## Performance measurement
        self.trained = True
        preds = cross_val_predict(self.estimator, Xpca, ybc, cv=self.cv, n_jobs=-2)
        self.rmsecv = mean_squared_error(inv_boxcox(preds, self.lmbda), y)**.5

        ## save the model
        self.save_model()

    def predict(self, dataset):
        """Predict from X"""
        X = dataset[self.columns]
        Xscaled = self.scaler.transform(X)
        Xpca = self.pca.transform(Xscaled)
        predsbc = self.estimator.predict(Xpca)
        # Map the Box-Cox-scale predictions back to the original target scale.
        return inv_boxcox(predsbc, self.lmbda)
# Lasso: robust-scaled pipeline using the alpha selected by the earlier
# grid search `gs_lasso`.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=gs_lasso.best_params_['alpha'], random_state=1))
lasso.fit(x_train, y_train)
# Both figures below are computed on the training data.
score_lasso = lasso.score(x_train, y_train)
rmse_lasso = np.sqrt(mean_squared_error(y_train, lasso.predict(x_train)))
# NOTE(review): expm1 presumably undoes a log1p applied to the target — confirm.
lasso_pred = np.expm1(lasso.predict(x_test))

# ElasticNet
# Unlike the Lasso above, this estimator is not wrapped in a scaling pipeline.
ElNet = ElasticNet()
para_ElNet = {
    "alpha": np.logspace(-3.8, -3.3, 10),
    "l1_ratio": np.linspace(0.7, 0.9, 10)
}
ElNet.set_params(max_iter=5000)
gs_ElNet = GridSearchCV(ElNet, para_ElNet, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
gs_ElNet.fit(x_train, y_train)
gs_ElNet.best_params_ #{'alpha': 0.0003880510732210184, 'l1_ratio': 0.9}

# Refit the standalone estimator with the selected hyper-parameters.
ElNet.set_params(alpha=gs_ElNet.best_params_['alpha'], l1_ratio=gs_ElNet.best_params_['l1_ratio'])
ElNet.fit(x_train, y_train)
# Training-set score and RMSE, as for the Lasso.
score_ElNet = ElNet.score(x_train, y_train)
rmse_ElNet = np.sqrt(mean_squared_error(y_train, ElNet.predict(x_train)))
ElNet_pred = np.expm1(ElNet.predict(x_test))
# Per-fold (dotted) and fold-averaged (solid) log10 MSE against -log10(alpha).
# NOTE(review): `model` presumably is a fitted ElasticNetCV-style estimator
# (it exposes mse_path_ and alpha_) — confirm where it is created.
plt.plot(m_log_alphas, np.log10(model.mse_path_), ':')
plt.plot(m_log_alphas, np.log10(model.mse_path_.mean(axis=-1)), 'k', label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',label='alpha: CV estimate')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.title('Mean square error on each fold: coordinate descent')
plt.axis('tight')
plt.show()

# Coefficient path: refit a non-negative ElasticNet for every alpha.
# NOTE(review): `normalize` was removed from ElasticNet in scikit-learn 1.2, and
# the inputs are additionally passed through scale() below — confirm intent.
elastic = ElasticNet(max_iter=10000, normalize=True, positive=True)
coefs = []
for a in alphas:
    elastic.set_params(alpha=a)
    elastic.fit(scale(X_train), y_train)
    coefs.append(elastic.coef_)

ax = plt.gca()
# NOTE(review): the x values are alphas*2 while the vertical marker below uses
# model.alpha_ unscaled — confirm the factor of 2 is intended.
ax.plot(alphas*2, coefs)
ax.set_xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.axvline(model.alpha_, linestyle='--', color='k',label='alpha: CV estimate')
plt.title('Optimal Alpha Parameters')
plt.show()

# NOTE(review): `en` is defined outside this fragment and the keyword is
# `alphas` (plural) with a scalar value — verify against en's type.
en.set_params(alphas=model.alpha_)
en.fit(X_train, y_train)
# Bare expression: the test MSE is only shown when run interactively.
mean_squared_error(y_test, en.predict(X_test))
# Per-fold (dotted) and fold-averaged (solid) MSE against -log10(alpha).
m_log_alphas = -np.log10(model.alphas_)
plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha: CV estimate')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.show()

# Coefficient path over `alphas`.
# NOTE(review): set_params / fit / coef_ are accessed on the name `ElasticNet`
# itself. Unless that name was rebound to an estimator instance earlier in the
# file, these calls fail — confirm. `coefs` must also already exist.
for a in alphas:
    ElasticNet.set_params(alpha=a)
    ElasticNet.fit(scale(X_train), y_train)
    coefs.append(ElasticNet.coef_)

# Not referenced again within this fragment.
label_list=['Na Library', 'Cl Library', 'Water Library', 'Fe Library', 'Cu Library']
ax = plt.gca()
lineObjects = ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
plt.axvline(en.alpha_, linestyle='--', color='k',label='alpha: CV estimate')
plt.locator_params(axis='y', nbins=10)
""" """
plt.xlabel('Alpha')
# model.add(Dropout(0.2)) # model.add(LSTM(input_shape=(None,50),units=50,return_sequences=True)) # model.add(Dropout(0.2)) # model.add(LSTM(input_shape=(None,50),units=50,return_sequences=True)) # model.add(Dropout(0.2)) # model.add(LSTM(50,return_sequences=False)) # model.add(Dropout(0.2)) # model.add(Dense(units=1)) # model.add(Activation("linear")) # start = time.time() # model.compile(loss="mse", optimizer="rmsprop") # print("Time : ", time.time() - start) # #print(model.layers) # model.fit(X_train,y_train,batch_size=50,epochs=10,validation_split=0.05) model = ElasticNet(l1_ratio=0.5, normalize=True, max_iter=15000) model.set_params(alpha=0.001) model.fit(X_train, y_train) y_test = model.predict(X_test) prediction_result = y_test #prediction_result = [] # for i in range(len(y_test)): # prediction_result.append(y_test[i][0]) speed_id = [x for x in range(len(y_test))] #print(type(y_test)) result = pd.DataFrame({'id': speed_id, 'speed': prediction_result}) result.to_csv('submission.csv', index=False) print(y_test)
class ElasticReg(customRegressor):
    """ElasticNet on log(SalePrice) with imputation and a feature pipeline.

    The feature pipeline (column transformer followed by ``RobustScaler``)
    and the target ``StandardScaler`` are both fitted once in ``__init__``.
    """

    def __init__(self, in_df, zoning, utilities, frontage, qualPow):
        """Build the imputer, impute a copy of ``in_df`` and fit both pipelines."""
        super(ElasticReg, self).__init__()
        from lm_features import impute_shell
        ## Because we're currying in python now
        self._imputeVals = impute_shell(frontage=frontage, zoning=zoning,
                                        utilities=utilities, qualPow=qualPow)
        tempDF = self._imputeVals(in_df.copy())
        self.X = tempDF.drop(columns=["SalePrice"]).copy()
        # Column vector of log prices.
        self.y = np.log(tempDF.SalePrice.values.reshape(-1, 1))
        self.pipeline_X = self._make_pipe()
        self.pipeline_X.fit(self.X)
        self.pipeline_y = StandardScaler()
        self.pipeline_y.fit(self.y)

    def _rmOutliers(self, x, y):
        """Return the rows of ``x`` where ``y`` is NOT strictly between 4000 and 5E5.

        NOTE(review): every caller passes ``self.y``, which holds log prices,
        so the thresholds look like they were meant for raw prices; the mask
        is also applied to ``x`` only, never to ``y``. Confirm the intent
        before relying on this filter.
        """
        outliers = ((y > 4000) & (y < 5E5))
        out = x[~(outliers)]
        return out

    def _make_pipe(self):
        """Build the feature pipeline from the column lists in ``lm_features``."""
        import lm_features as f
        nonePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value="None"),
            OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            PowerTransformer())
        regressionPipeline = ColumnTransformer(
            [("setNone", nonePipeline, f.fillNone),
             ("setZero", zeroPipeline, f.fillZeroCat),
             ("transformed", scalePipeline, f.fillZeroCont),
             ("dictImputed", make_pipeline(
                 self.dictImputer(f.imputeDict), OneHotEncoder(drop="first")),
              list(f.imputeDict.keys())),
             ("bool", "passthrough", f.imputeBool),
             ("categoricalInts", "passthrough", f.cat_to_int),
             ("dropped", "drop", f.dropList)],
            remainder="drop")
        return make_pipeline(regressionPipeline, RobustScaler())

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        """Grid-search a fresh ElasticNet over ``params`` with negative-MSE scoring."""
        self._searchSpace = params
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        # y is transformed in full; it is not passed through _rmOutliers.
        piped_y = self.pipeline_y.transform(self.y)
        self._gridSearchObject = GridSearchCV(ElasticNet(), params, cv=cv,
                                              scoring="neg_mean_squared_error",
                                              n_jobs=njobs, verbose=verbose)
        self._gridSearchObject.fit(piped_X, piped_y)

    def fitModel(self, params):
        """Create a new ElasticNet, configure it with ``params`` and fit it."""
        self.model = ElasticNet()
        self._params = params
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        self.model.set_params(**params)
        self.model.fit(piped_X, piped_y)

    def getTrainRsquared(self):
        """Return ``self.model.score`` on the transformed training data."""
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.model.score(piped_X, piped_y)
print(u_test, std_test)
print(u_val, std_val)

# best alphas and l1
# For each entry of best_params also derive alpha * l1_ratio (a_lasso) and
# alpha * (1 - l1_ratio) (b_ridge).
best_alphas = []
best_l1s = []
a_lasso = []
b_ridge = []
for p in best_params:
    best_alphas.append(p['alpha'])
    best_l1s.append(p['l1_ratio'])
    a_lasso.append(p['alpha'] * p['l1_ratio'])
    b_ridge.append(p['alpha'] - p['alpha'] * p['l1_ratio'])

# make two submissions to compare
# model 1: alpha = 0.09102981779915217, l1 = 0.035448275862068966
# equivalent to a_lasso = 0.003226850093018222, b_ridge = 0.08780296770613395
# model 2: alpha = 0.005428675439323859, l1 = 0.8277586206896551,
# equivalent to a_lasso = 0.004493632893826525, b_ridge = 0.0009350425454973344
# The third entry (index 2) of the collected parameters is the one used here.
enet.set_params(alpha=best_alphas[2], l1_ratio=best_l1s[2])
enet.fit(X, y)

# Rank features by absolute coefficient, largest first.
# NOTE(review): assumes the columns of Xd_train line up with the columns of X
# that enet was fitted on — confirm.
cols = Xd_train.columns
coefs = sorted(list(zip(cols, enet.coef_)), key=lambda t: abs(t[1]), reverse=True)
coefs = pd.DataFrame(coefs, columns=['Feature', 'Coef'])
# Number of features with a non-zero coefficient.
print(len(coefs[np.abs(coefs['Coef']) > 0]))

submissiondata = make_prediction(enet, scaler)
submissiondata.to_csv("yq_submission8_enet2.csv", index=False)