def test_predict_without_cv(self):
    """predict() without an explicit lambda must raise when n_splits=0."""
    features, target = self.inputs[0]
    model = ElasticNet(n_splits=0, random_state=340561).fit(features, target)

    # No value is passed for lambda here, so the model should refuse to
    # make a prediction.
    with self.assertRaises(ValueError):
        model.predict(features)
def test_lambda_clip_warning(self):
    """Predicting at a lambda outside lambda_path_ emits a RuntimeWarning."""
    features, target = self.inputs[0]
    model = ElasticNet(n_splits=0, random_state=1729).fit(features, target)

    # lambda_path_ is in decreasing order: the first entry is the largest
    # lambda on the path and the last entry is the smallest.
    with self.assertWarns(RuntimeWarning):
        above_path = model.lambda_path_[0] + 1
        model.predict(features, lamb=above_path)

    with self.assertWarns(RuntimeWarning):
        below_path = model.lambda_path_[-1] - 1
        model.predict(features, lamb=below_path)
def test_one_row_predict(self):
    """Predicting on a single row returns exactly one value."""
    model = ElasticNet(random_state=42)
    for features, target in self.inputs:
        model.fit(features, target)
        # Reshape the first observation into a (1, n_features) matrix.
        first_row = features[0].reshape((1, -1))
        prediction = model.predict(first_row)
        assert prediction.shape == (1,)
def test_one_row_predict_with_lambda(self):
    """One row predicted at two lambdas gives a 2D result of shape (1, 2)."""
    model = ElasticNet(random_state=42)
    for features, target in self.inputs:
        model.fit(features, target)
        # Reshape the first observation into a (1, n_features) matrix.
        first_row = features[0].reshape((1, -1))
        prediction = model.predict(first_row, lamb=[20, 10])
        assert prediction.shape == (1, 2)
def glmnet_box():
    """Fit alpha=0 (ridge) models to the Box-Cox targets and plot residuals.

    One estimator is fitted first to the transformed latitude and then
    refitted to the transformed longitude.  For each target the r2 score on
    the fitting data is printed and a residual plot is produced from the
    back-transformed predictions.
    """
    ridge = ElasticNet(n_splits=20, scoring='r2', alpha=0)

    # --- latitude ---
    ridge.fit(music_features, box_latitude_label)
    r2_latitude = ridge.score(music_features, box_latitude_label)
    print('GLMNET ridge lattitude r2 {}'.format(r2_latitude))
    # NOTE(review): 90 is presumably the offset applied before the Box-Cox
    # transform of latitude -- confirm against inverse_box_cox.
    fitted_latitude = inverse_box_cox(
        ridge.predict(music_features), lambda_lat, 90)
    plot_predictions(
        fitted_latitude,
        latitude_label,
        'ridge_latitude_residual.png',
        'residual vs fitted latitude for Ridge')

    # --- longitude (same estimator object, refitted) ---
    ridge.fit(music_features, box_longitude_label)
    r2_longitude = ridge.score(music_features, box_longitude_label)
    print('GLMNET ridge longitude r2 {}'.format(r2_longitude))
    # NOTE(review): 180 is presumably the matching offset for longitude.
    fitted_longitude = inverse_box_cox(
        ridge.predict(music_features), lambda_lon, 180)
    plot_predictions(
        fitted_longitude,
        longitude_label,
        'ridge_longitude_residual.png',
        'residual vs fitted longitude for Ridge regression')
def glmnet_lasso():
    """Fit alpha=1 (lasso) models to the Box-Cox targets and plot residuals.

    One estimator is fitted first to the transformed latitude and then
    refitted to the transformed longitude.  For each target the r2 score on
    the fitting data is printed and a residual plot is produced from the
    back-transformed predictions.
    """
    lasso = ElasticNet(n_splits=20, scoring='r2', alpha=1)

    # --- latitude ---
    lasso.fit(music_features, box_latitude_label)
    r2_latitude = lasso.score(music_features, box_latitude_label)
    print('GLMNET lasso latitude r2 {}'.format(r2_latitude))
    # NOTE(review): 90 is presumably the offset applied before the Box-Cox
    # transform of latitude -- confirm against inverse_box_cox.
    fitted_latitude = inverse_box_cox(
        lasso.predict(music_features), lambda_lat, 90)
    plot_predictions(
        fitted_latitude,
        latitude_label,
        'lasso_latitude_residual.png',
        'residual vs fitted latitude for lasso regression')

    # --- longitude (same estimator object, refitted) ---
    lasso.fit(music_features, box_longitude_label)
    r2_longitude = lasso.score(music_features, box_longitude_label)
    print('GLMNET lasso longitude r2 {}'.format(r2_longitude))
    # NOTE(review): 180 is presumably the matching offset for longitude.
    fitted_longitude = inverse_box_cox(
        lasso.predict(music_features), lambda_lon, 180)
    plot_predictions(
        fitted_longitude,
        longitude_label,
        'lasso_longitude_residual.png',
        'residual vs fitted longitude for lasso regression')
def test_with_single_var(self):
    """A model with a single predictor column fits an exact linear target."""
    features = np.random.rand(500, 1)
    # The target is a noiseless multiple of the only feature.
    target = (1.3 * features).ravel()

    model = ElasticNet(random_state=449065).fit(features, target)
    self.check_r2_score(target, model.predict(features), 0.90)
def test_coef_interpolation(self):
    """Predictions at an off-path lambda differ from both path neighbours."""
    features, target = self.inputs[0]
    model = ElasticNet(n_splits=0, random_state=1729).fit(features, target)

    # Take two adjacent lambdas from the computed path and their midpoint,
    # which is a value not equal to one on the computed path.
    first_lamb, second_lamb = model.lambda_path_[1], model.lambda_path_[2]
    between_lamb = (first_lamb + second_lamb) / 2.0

    at_first = model.predict(features, lamb=first_lamb)
    at_second = model.predict(features, lamb=second_lamb)
    at_between = model.predict(features, lamb=between_lamb)

    for on_path_pred in (at_first, at_second):
        self.assertFalse(np.allclose(on_path_pred, at_between))
def l1_l2_regression(alpha):
    """Fit elastic-net models with the given alpha and plot residuals.

    One estimator is fitted first to the Box-Cox transformed latitude and
    then refitted to the transformed longitude.  For each target the r2
    score on the fitting data is printed and a residual plot, whose file
    name and title embed ``alpha``, is produced from the back-transformed
    predictions.

    Parameters
    ----------
    alpha : value forwarded to ``ElasticNet(alpha=...)`` and formatted into
        the printed messages, plot file names and plot titles.
    """
    enet = ElasticNet(n_splits=20, scoring='r2', alpha=alpha)

    # --- latitude ---
    enet.fit(music_features, box_latitude_label)
    r2_latitude = enet.score(music_features, box_latitude_label)
    print('GLMNET L1 L2 alpha {} latitude r2 {}'.format(alpha, r2_latitude))
    # NOTE(review): 90 is presumably the offset applied before the Box-Cox
    # transform of latitude -- confirm against inverse_box_cox.
    fitted_latitude = inverse_box_cox(
        enet.predict(music_features), lambda_lat, 90)
    latitude_file = 'l1_l2_latitude_residual_{}.png'.format(alpha)
    latitude_title = (
        'residual vs fitted latitude for l1_l2 \n regression alpha {}'.format(
            alpha))
    plot_predictions(
        fitted_latitude, latitude_label, latitude_file, latitude_title)

    # --- longitude (same estimator object, refitted) ---
    enet.fit(music_features, box_longitude_label)
    r2_longitude = enet.score(music_features, box_longitude_label)
    print('GLMNET L1 L2 alpha {} longitude r2 {}'.format(alpha, r2_longitude))
    # NOTE(review): 180 is presumably the matching offset for longitude.
    fitted_longitude = inverse_box_cox(
        enet.predict(music_features), lambda_lon, 180)
    longitude_file = 'l1_l2_longitude_residual_{}.png'.format(alpha)
    longitude_title = (
        'residual vs fitted longitude for l1_l2 \n regression alpha {}'.format(
            alpha))
    plot_predictions(
        fitted_longitude, longitude_label, longitude_file, longitude_title)
def test_with_defaults(self):
    """Default-constructed model passes basic checks on every input."""
    model = ElasticNet(random_state=2821)
    for features, target in self.inputs:
        model = model.fit(features, target)
        sanity_check_regression(model, features)

        # The index of lambda_best must not exceed the index of lambda_max.
        best_inx, max_inx = model.lambda_best_inx_, model.lambda_max_inx_
        self.assertTrue(best_inx <= max_inx)

        # Predicting along the whole path yields one column per lambda.
        path_predictions = model.predict(features, lamb=model.lambda_path_)
        self.assertEqual(
            path_predictions.shape[-1], model.lambda_path_.size)
def test_edge_cases(self):
    '''Edge cases in model specification.

    Covers lambdas large enough to zero every coefficient, a mix of
    zero-variance and ordinary predictors, and a design matrix in which
    every predictor has zero variance (which must raise ValueError).
    '''
    X = np.random.random(size=(50, 10))
    w = np.random.random(size=(10, ))
    y = np.dot(X, w)

    # Edge cases:
    # A single lambda, or several lambdas, so big that all estimated
    # coefficients are set to zero. This used to break the predict method,
    # so the test only requires that predict() completes.
    for lambdas in ([10**5], [10**5, 2 * 10**5]):
        enet = ElasticNet(alpha=1)
        enet.fit(X, y, lambdas=lambdas)
        enet.predict(X)

    # Edge case:
    # Some predictors have zero variance. This used to break lambda max.
    X = np.random.random(size=(50, 10))
    X[:, 2] = 0
    X[:, 8] = 1
    y = np.dot(X, w)
    enet = ElasticNet(alpha=.1)
    enet.fit(X, y)
    ol = enet.out_lambdas
    # Extrapolate one step back along the path using the ratio of the second
    # and third returned lambdas.
    # NOTE(review): presumably ol[0] is a placeholder from the Fortran code,
    # which is why it is not compared directly -- confirm.
    max_lambda_from_fortran = ol[1] * (ol[1] / ol[2])
    max_lambda_from_python = enet._max_lambda(X, y)
    self.assertAlmostEqual(max_lambda_from_fortran,
                           max_lambda_from_python, 4)

    # Edge case:
    # All predictors have zero variance. This is an error in specification.
    # Setup stays outside the assertRaises block so that only a ValueError
    # raised by fit() itself can satisfy the assertion. y is deliberately
    # the response computed from the previous X.
    X = np.ones(shape=(50, 10))
    enet = ElasticNet(alpha=.1)
    with self.assertRaises(ValueError):
        enet.fit(X, y)
def test_edge_cases(self):
    '''Edge cases in model specification.

    Covers lambdas large enough to zero every coefficient, a mix of
    zero-variance and ordinary predictors, and a design matrix in which
    every predictor has zero variance (which must raise ValueError).
    '''
    X = np.random.random(size=(50, 10))
    w = np.random.random(size=(10,))
    y = np.dot(X, w)

    # Edge case
    # A single lambda is so big that it sets all estimated coefficients
    # to zero. This used to break the predict method, so the test only
    # requires that predict() completes.
    enet = ElasticNet(alpha=1)
    enet.fit(X, y, lambdas=[10**5])
    enet.predict(X)

    # Edge case
    # Multiple lambdas are so big as to set all estimated coefficients
    # to zero. This used to break the predict method.
    enet = ElasticNet(alpha=1)
    enet.fit(X, y, lambdas=[10**5, 2 * 10**5])
    enet.predict(X)

    # Edge case:
    # Some predictors have zero variance. This used to break lambda max.
    X = np.random.random(size=(50, 10))
    X[:, 2] = 0
    X[:, 8] = 1
    y = np.dot(X, w)
    enet = ElasticNet(alpha=.1)
    enet.fit(X, y)
    ol = enet.out_lambdas
    # Extrapolate one step back along the path using the ratio of the second
    # and third returned lambdas.
    # NOTE(review): presumably ol[0] is a placeholder from the Fortran code,
    # which is why it is not compared directly -- confirm.
    max_lambda_from_fortran = ol[1] * (ol[1] / ol[2])
    max_lambda_from_python = enet._max_lambda(X, y)
    self.assertAlmostEqual(
        max_lambda_from_fortran, max_lambda_from_python, 4
    )

    # Edge case.
    # All predictors have zero variance. This is an error in specification.
    # Setup stays outside the assertRaises block so that only a ValueError
    # raised by fit() itself can satisfy the assertion. y is deliberately
    # the response computed from the previous X.
    X = np.ones(shape=(50, 10))
    enet = ElasticNet(alpha=.1)
    with self.assertRaises(ValueError):
        enet.fit(X, y)
def test_unregularized_with_weights(self):
    '''Check the unregularized (lambda=0) fit when sample weights are given.

    The response is an exact linear function of the predictors, so for
    every alpha, and for the dense and sparse design matrices alike, the
    weighted fit should reproduce the response and the generating
    coefficients to within the tolerances used below.
    '''
    dense = np.random.random(size=(5000, 10))
    sparse = csc_matrix(dense)
    true_coefs = np.random.random(size=(10,))
    response = np.dot(dense, true_coefs)
    weights = np.random.uniform(size=(5000,))

    # Every alpha is paired with the dense matrix first, then the sparse.
    cases = [(alpha, design)
             for alpha in [0, .5, 1]
             for design in (dense, sparse)]
    for alpha, design in cases:
        model = ElasticNet(alpha=alpha)
        model.fit(design, response, lambdas=[0], weights=weights)

        preds_match = np.allclose(
            model.predict(design).ravel(), response, atol=.01)
        self.assertTrue(preds_match)

        coefs_match = np.allclose(
            model._coefficients.ravel(), true_coefs, atol=.02)
        self.assertTrue(coefs_match)
def test_unregularized_with_weights(self):
    '''Weighted lambda=0 fits must recover an exact linear relationship.

    Runs over alpha in {0, .5, 1} and over a dense matrix and its
    csc_matrix copy; predictions are compared to the response with
    atol=.01 and coefficients to the generating weights with atol=.02.
    '''
    n_rows, n_cols = 5000, 10
    X_dense = np.random.random(size=(n_rows, n_cols))
    X_sparse = csc_matrix(X_dense)
    beta = np.random.random(size=(n_cols, ))
    target = np.dot(X_dense, beta)
    sample_weights = np.random.uniform(size=(n_rows, ))

    for alpha in [0, .5, 1]:
        for design in (X_dense, X_sparse):
            fitted = ElasticNet(alpha=alpha)
            fitted.fit(design, target, lambdas=[0], weights=sample_weights)

            # Predictions should reproduce the noiseless response.
            self.assertTrue(
                np.allclose(fitted.predict(design).ravel(), target,
                            atol=.01))
            # Estimated coefficients should match the generating ones.
            self.assertTrue(
                np.allclose(fitted._coefficients.ravel(), beta, atol=.02))
def test_unregularized_models(self):
    '''Check the unregularized (lambda=0) fit on dense and sparse matrices.

    The response is an exact linear function of the predictors, so for
    every alpha the fitted model should reproduce the response and the
    fitted parameters should equal the true coefficients, to within the
    tolerances used below.
    '''
    dense = np.random.random(size=(5000, 10))
    sparse = csc_matrix(dense)
    true_coefs = np.random.random(size=(10,))
    response = np.dot(dense, true_coefs)

    # Every alpha is paired with the dense matrix first, then the sparse.
    cases = [(alpha, design)
             for alpha in [0, .5, 1]
             for design in (dense, sparse)]
    for alpha, design in cases:
        model = ElasticNet(alpha=alpha)
        model.fit(design, response, lambdas=[0])

        preds_match = np.allclose(
            model.predict(design).ravel(), response, atol=.01)
        self.assertTrue(preds_match)

        coefs_match = np.allclose(
            model._coefficients.ravel(), true_coefs, atol=.02)
        self.assertTrue(coefs_match)
def test_unregularized_models(self):
    '''lambda=0 fits must recover an exact linear relationship.

    Runs over alpha in {0, .5, 1} and over a dense matrix and its
    csc_matrix copy; predictions are compared to the response with
    atol=.01 and coefficients to the generating weights with atol=.02.
    '''
    n_rows, n_cols = 5000, 10
    X_dense = np.random.random(size=(n_rows, n_cols))
    X_sparse = csc_matrix(X_dense)
    beta = np.random.random(size=(n_cols, ))
    target = np.dot(X_dense, beta)

    for alpha in [0, .5, 1]:
        for design in (X_dense, X_sparse):
            fitted = ElasticNet(alpha=alpha)
            fitted.fit(design, target, lambdas=[0])

            # Predictions should reproduce the noiseless response.
            self.assertTrue(
                np.allclose(fitted.predict(design).ravel(), target,
                            atol=.01))
            # Estimated coefficients should match the generating ones.
            self.assertTrue(
                np.allclose(fitted._coefficients.ravel(), beta, atol=.02))
# (ii) Partially missing features (start with least missing one) cn_X_full_num = ['age_days'] cn_X_full_cat = list(np.setdiff1d(cn_X_full,cn_X_full_num)) OHE = OneHotEncoder(handle_unknown='ignore') scaler = StandardScaler() transformer = ColumnTransformer([('cat_cols', OHE, list(Xtrain.columns.isin(cn_X_full_cat))), ('num_cols', scaler, list(Xtrain.columns.isin(cn_X_full_num)))]) enc_X = transformer.fit(Xtrain.iloc[idx_train]) for cn in cn_partial: print('cn: %s' % cn) y_train, y_test = df_X[cn].iloc[idx_train].values, df_X[cn].iloc[idx_test].values mdl_lasso = ElasticNet(alpha=1, n_lambda=50, n_splits=5, random_state=1,verbose=False,n_jobs=5) mdl_lasso.fit(X=enc_X.transform(Xtrain.iloc[idx_train]), y=y_train) y_pred = mdl_lasso.predict(enc_X.transform(Xtrain.iloc[idx_test])) r2_pred = r2_score(y_test, y_pred) print('R2-score: %0.3f' % r2_pred) dat_Xmap[cn+'2'] = np.where(dat_Xmap[cn].isnull(),mdl_lasso.predict(enc_X.transform(Xtarget)),dat_Xmap[cn]) # Assign dat_Xmap = dat_Xmap.assign(height=lambda x: np.where(x.height.isnull(), x.height2, x.height), weight=lambda x: np.where(x.weight.isnull(), x.weight2, x.weight)) dat_Xmap.drop(columns = ['height2', 'weight2'], inplace=True) # (iii) Impute the "fully" missing features cn_impute_new = list(np.setdiff1d(cn_impute,['workrvu','ethnicity_hispanic'])) cn_X_full_new = list(cn_X_full) + ['height','weight','workrvu'] # Make sure columns line up for preprocessor Xtarget_new = dat_Xmap[cn_X_full].copy()
# Tail of a call opened above this excerpt: marker styling, with error bars
# taken from the ridge model's cv_standard_error_.
linestyle='None', marker='o', markersize=5,
yerr=ridge_reg.cv_standard_error_, ecolor='lightgrey', capsize=4)
# Draw a dashed vertical line at log(lambda_best_) and at log(lambda_max_),
# each with a centred label placed at 95% of the current upper y-limit.
for ref, txt in zip([ridge_reg.lambda_best_, ridge_reg.lambda_max_],
                    ['Lambda best', 'Lambda max']):
    plt.axvline(x=np.log(ref), linestyle='dashed', color='lightgrey')
    plt.text(np.log(ref), .95 * plt.gca().get_ylim()[1], txt, ha='center')
plt.xlabel('log(Lambda)')
plt.ylabel('Mean-Squared Error')
# Test-set error of the ridge model, predicting at lambda_max_.
y_pred = ridge_reg.predict(X_test, lamb=ridge_reg.lambda_max_)
# NOTE(review): if this is sklearn's mean_squared_error, its signature is
# (y_true, y_pred); the arguments are swapped here, which does not change
# the value of a squared error -- confirm.
ridge_err = mean_squared_error(y_pred, y_test)
################# Lasso Regression #####################
# alpha=1 model evaluated over the same user-supplied lambda grid.
lasso_reg = ElasticNet(alpha=1, scoring="mean_squared_error", lambda_path=grid)
lasso_reg.fit(X_train, y_train)
# Bare attribute expressions: they have no effect when run as a script
# (presumably left over from interactive inspection).
lasso_reg.lambda_best_
lasso_reg.lambda_max_
plt.figure(figsize=(10, 7))
# This call continues past the end of the excerpt.
plt.errorbar(np.log(lasso_reg.lambda_path_),
from sklearn.datasets import make_regression

# Demo: fit an elastic net to synthetic regression data, print the model and
# compare the first few targets with the matching predictions.
display_bar = '-' * 70

X, y = make_regression(
    n_samples=5000,
    n_features=100,
    n_informative=30,
    effective_rank=40,
    noise=.1,
)

# Single-argument print(...) behaves the same under Python 2 (as a print
# statement with a parenthesised expression) and Python 3 (as a call).
print(display_bar)
print("Fit an elastic net on some fake data")
print(display_bar)

enet = ElasticNet(alpha=.025)
enet.fit(X, y)
print(enet)

print(display_bar)
print("Predictions vs. actuals for the last elastic net model:")
print(display_bar)

preds = enet.predict(X)
print(y[:10])
# preds is indexed as a 2D array; show the first ten rows of its last column.
# NOTE(review): presumably one column per lambda on the fitted path -- confirm.
print(preds[:10, -1])

enet.plot_paths()
def get_glmnet_sig(sig_df, ret_sr, look_back=12, sample_decay=1.0, num_sig_vec=[5], alpha=0.5, signs_vec=None):
    """Build rolling elastic-net combined signals, one per rebalance date.

    For every rebalance date after the first ``look_back`` dates, a model is
    fitted on the rows of the preceding ``look_back`` dates (with weights
    that decay exponentially with the age of the date), and the rows of the
    current date are scored once per entry of ``num_sig_vec``.

    Parameters
    ----------
    sig_df : DataFrame of signals; its index holds the rebalance date of
        each row.  It is copied and not modified.
    ret_sr : Series of returns, assigned as column ``'y'`` (aligned on the
        index of ``sig_df``).
    look_back : number of past rebalance dates in each training window.
    sample_decay : decay rate; a row whose date is ``k`` dates older than
        the newest training date gets weight ``exp(-sample_decay * k)``.
    num_sig_vec : iterable of target signal counts passed to ``get_lambda``.
        The list default is only read, never mutated.
    alpha : forwarded to ``ElasticNet(alpha=...)``.
    signs_vec : forwarded unchanged to ``model.fit``.

    Returns
    -------
    (comb_sig_df, sel_sig_names_vec)
        ``comb_sig_df`` stacks, over all scored dates, one column per entry
        of ``num_sig_vec`` (named ``str(num_sig)``) holding predictions
        scaled to unit sum of absolute values.  ``sel_sig_names_vec`` holds,
        per scored date, a list with one array of selected signal names per
        entry of ``num_sig_vec``, padded with ``'NA'`` up to ``num_sig``.
    """
    data = sig_df.copy()
    rebalance_dates = data.index.unique().sort_values()
    data['y'] = ret_sr

    per_date_frames = []
    sel_sig_names_vec = []
    print('inside')
    for ind in range(look_back, rebalance_dates.shape[0]):
        r_d = rebalance_dates[ind]
        print(r_d)

        # Training window: the look_back dates immediately before r_d.
        train_start_date = rebalance_dates[ind - look_back]
        train_end_date = rebalance_dates[ind - 1]
        in_window = (data.index >= train_start_date) & (data.index <= train_end_date)
        train_data = data[in_window]
        test_data = data[data.index == r_d]

        train_x = train_data.drop(['y'], axis=1)
        train_y = train_data['y']
        test_x = test_data.drop(['y'], axis=1)

        # Weight each training row by the age of its own date (0 = newest).
        # Deriving the weights from the row dates gives every row of a date
        # the same weight and yields exactly one weight per training row,
        # whatever the number of rows per date.
        window_dates = rebalance_dates[ind - look_back:ind]
        period_pos = window_dates.get_indexer(train_x.index)
        sample_weights = np.exp(-sample_decay * (look_back - 1 - period_pos))

        model = ElasticNet(alpha=alpha, fit_intercept=True, n_lambda=1000, tol=1e-8)
        model.fit(train_x, train_y, sample_weight=sample_weights, signs_vec=signs_vec)

        this_comb_sig_df = pd.DataFrame()
        sel_sig_names = []
        for num_sig in num_sig_vec:
            # NOTE(review): get_lambda presumably returns the penalty value
            # and the column positions of the selected signals -- confirm.
            s, w_ind = get_lambda(model, num_sig)
            this_sel_sig = test_x.columns[w_ind].values
            if this_sel_sig.shape[0] < num_sig:
                # Pad so every entry has exactly num_sig names.
                this_sel_sig = np.append(this_sel_sig, ['NA'] * (num_sig - this_sel_sig.shape[0]))

            this_sig = pd.Series(model.predict(test_x, s), index=test_x.index)
            # Scale to unit gross exposure (sum of absolute values == 1).
            this_sig = this_sig / np.sum(np.abs(this_sig))
            this_comb_sig_df[str(num_sig)] = this_sig
            sel_sig_names.append(this_sel_sig)

        sel_sig_names_vec.append(sel_sig_names)
        this_comb_sig_df.index = test_x.index
        per_date_frames.append(this_comb_sig_df)

    # Concatenate once at the end; DataFrame.append was removed in pandas 2.0
    # and copied the accumulated frame on every iteration.
    comb_sig_df = pd.concat(per_date_frames) if per_date_frames else pd.DataFrame()
    return comb_sig_df, sel_sig_names_vec
def test_cv_scoring(self):
    """Every scoring method in self.scoring yields an acceptable fit."""
    features, target = self.inputs[0]
    for method in self.scoring:
        model = ElasticNet(scoring=method, random_state=1729)
        model = model.fit(features, target)
        predictions = model.predict(features)
        self.check_r2_score(target, predictions, 0.90, scoring=method)
def test_alphas(self):
    """Every alpha in self.alphas yields an acceptable fit."""
    features, target = self.inputs[0]
    for alpha in self.alphas:
        model = ElasticNet(alpha=alpha, random_state=2465)
        model = model.fit(features, target)
        predictions = model.predict(features)
        self.check_r2_score(target, predictions, 0.90, alpha=alpha)
from sklearn.datasets import make_regression

# Demo: fit an elastic net to synthetic regression data, print the model and
# compare the first few targets with the matching predictions.
display_bar = '-'*70

X, y = make_regression(
    n_samples = 5000,
    n_features = 100,
    n_informative = 30,
    effective_rank = 40,
    noise = .1,
)

# Single-argument print(...) behaves the same under Python 2 (as a print
# statement with a parenthesised expression) and Python 3 (as a call).
print(display_bar)
print("Fit an elastic net on some fake data")
print(display_bar)

enet = ElasticNet(alpha=.025)
enet.fit(X, y)
print(enet)

print(display_bar)
print("Predictions vs. actuals for the last elastic net model:")
print(display_bar)

preds = enet.predict(X)
print(y[:10])
# preds is indexed as a 2D array; show the first ten rows of its last column.
# NOTE(review): presumably one column per lambda on the fitted path -- confirm.
print(preds[:10, -1])

enet.plot_paths()