import numpy as np
import pandas as pd
from sklearn.linear_model import PoissonRegressor

import util  # local helper module providing util.scatter


def main(lr, train_path, eval_path, save_path, save_img):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        save_img: Path to save the scatter plot of true vs. predicted counts.
    """
    # Load training set
    train = pd.read_csv(train_path)
    x_train = train[['x_1', 'x_2', 'x_3', 'x_4']]
    y_train = train[['y']].values.ravel()

    # Fit the Poisson GLM; scikit-learn's lbfgs solver performs the
    # optimization here, so the lr argument goes unused.
    glm = PoissonRegressor(tol=1e-5, max_iter=10000000)
    glm.fit(x_train, y_train)

    # Load the evaluation set and predict expected counts
    valid = pd.read_csv(eval_path)
    x_eval = valid[['x_1', 'x_2', 'x_3', 'x_4']]
    y_eval = valid[['y']].values.ravel()
    predictions = glm.predict(x_eval)

    np.savetxt(save_path, predictions)
    util.scatter(y_eval, predictions, save_img)
    print(glm.coef_)
    print(glm.score(x_eval, y_eval))
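# The docstring above promises gradient ascent, while the body delegates the
# fit to scikit-learn's lbfgs-based PoissonRegressor. For reference, a minimal
# sketch of the hand-rolled fit the docstring describes, assuming the
# canonical log link; poisson_grad_ascent is a hypothetical helper, not part
# of the original code.
import numpy as np


def poisson_grad_ascent(x, y, lr, max_iter=10000000, tol=1e-5):
    """Sketch: fit Poisson regression by batch gradient ascent.

    With the log link, the log-likelihood gradient is
    x.T @ (y - exp(x @ theta)), so each step moves theta uphill by lr times
    that gradient until the update's L1 norm falls below tol.
    """
    theta = np.zeros(x.shape[1])
    for _ in range(max_iter):
        step = lr * x.T @ (y - np.exp(x @ theta))
        theta += step
        if np.linalg.norm(step, 1) < tol:
            break
    return theta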
import matplotlib.pyplot as plt
from sklearn.linear_model import PoissonRegressor


def sk_poisson_regression(X_train, X_test, y_train, y_test):
    # Unpenalized Poisson GLM without an intercept
    glm = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=300)
    glm.fit(X_train, y_train)
    print('score: ', glm.score(X_test, y_test))

    # Plot observed vs. predicted values on the held-out split
    y_hat = glm.predict(X_test)
    plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
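# A possible call site for the function above, assuming the seaborn tips
# dataset that the axis labels suggest (an assumption; the original does not
# show how the splits were built).
import seaborn as sns
from sklearn.model_selection import train_test_split

tips = sns.load_dataset('tips')
X = tips[['total_bill']].to_numpy()
y = tips['tip'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sk_poisson_regression(X_train, X_test, y_train, y_test)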
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
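# Note that the two scores printed above are not directly comparable:
# PoissonRegressor.score reports D^2 (the fraction of Poisson deviance
# explained), whereas HistGradientBoostingRegressor.score reports R^2.
# A sketch of a like-for-like comparison via mean Poisson deviance, which
# requires strictly positive predictions (hence the clip on the GBDT output):
from sklearn.metrics import mean_poisson_deviance

print(mean_poisson_deviance(y_test, glm.predict(X_test)))
print(mean_poisson_deviance(y_test, np.clip(gbdt.predict(X_test), 1e-8, None)))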
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model._linear_loss import LinearModelLoss


def test_warm_start(solver, fit_intercept, global_random_seed):
    n_samples, n_features = 100, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        bias=fit_intercept * 1.0,
        noise=1.0,
        random_state=global_random_seed,
    )
    y = np.abs(y)  # Poisson requires non-negative targets.
    alpha = 1
    params = {
        # "solver": solver,  # only lbfgs available
        "fit_intercept": fit_intercept,
        "tol": 1e-10,
    }

    glm1 = PoissonRegressor(warm_start=False, max_iter=1000, alpha=alpha, **params)
    glm1.fit(X, y)

    glm2 = PoissonRegressor(warm_start=True, max_iter=1, alpha=alpha, **params)
    # We intentionally set max_iter=1 so that the solver raises a
    # ConvergenceWarning.
    with pytest.warns(ConvergenceWarning):
        glm2.fit(X, y)

    linear_loss = LinearModelLoss(
        base_loss=glm1._get_loss(),
        fit_intercept=fit_intercept,
    )
    sw = np.full_like(y, fill_value=1 / n_samples)
    objective_glm1 = linear_loss.loss(
        coef=np.r_[glm1.coef_, glm1.intercept_] if fit_intercept else glm1.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    objective_glm2 = linear_loss.loss(
        coef=np.r_[glm2.coef_, glm2.intercept_] if fit_intercept else glm2.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    assert objective_glm1 < objective_glm2

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=2e-4)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5)
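# Outside the test, the warm-start pattern exercised above looks roughly like
# this sketch (per the documented scikit-learn behavior: with warm_start=True,
# fit reuses the previous coef_ and intercept_ as the solver's starting point):
glm = PoissonRegressor(warm_start=True, max_iter=5, alpha=1.0)
glm.fit(X, y)               # first fit; may stop before convergence
glm.set_params(max_iter=1000)
glm.fit(X, y)               # resumes optimization from the previous solution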
def modeling_compare(X, y):
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import (ElasticNet, Lasso, LinearRegression,
                                      PoissonRegressor, Ridge, RidgeCV,
                                      SGDRegressor)
    from sklearn.model_selection import RepeatedKFold
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import (HistGradientBoostingRegressor,
                                  RandomForestRegressor, VotingRegressor)
    from sklearn.neural_network import MLPRegressor

    models_lab = [
        'Linear Regression', 'Ridge', 'Ridge with tuning hyperparameters',
        'Elastic Net', 'Random Forest', 'Poisson Regression',
        'Gradient Boosting regression', 'Lasso',
        'Stochastic Gradient Descent', 'Neural Network', 'Voting Regression'
    ]

    reg1 = LinearRegression().fit(X, y)
    reg2 = Ridge().fit(X, y)
    # Ridge with alpha tuned over a grid; RidgeCV requires strictly
    # positive alphas, so the grid starts at 0.01.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    reg3 = RidgeCV(alphas=np.arange(0.01, 1, 0.01), cv=cv,
                   scoring='neg_mean_absolute_error').fit(X, y)
    reg4 = ElasticNet().fit(X, y)
    reg5 = RandomForestRegressor().fit(X, y)
    reg6 = PoissonRegressor().fit(X, y)
    reg7 = HistGradientBoostingRegressor(loss='poisson',
                                         learning_rate=.01).fit(X, y)
    reg8 = Lasso().fit(X, y)
    reg9 = SGDRegressor(loss='squared_error', penalty='l2').fit(X, y)
    # Neural network regressor (a classifier cannot fit a continuous target)
    reg10 = MLPRegressor(solver='lbfgs', alpha=1e-5,
                         hidden_layer_sizes=(17, 10), random_state=1).fit(X, y)

    # VotingRegressor without NN
    ereg = VotingRegressor(estimators=[
        ('lr', reg1), ('rd', reg2), ('rs', reg3), ('en', reg4), ('rf', reg5),
        ('pr', reg6), ('gb', reg7), ('ls', reg8), ('gd', reg9)
    ]).fit(X, y)

    models_obj = [reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
                  reg10, ereg]
    score = [m.score(X, y) for m in models_obj]

    score_df = pd.DataFrame()
    score_df['models_lab'] = models_lab
    score_df['models_obj'] = models_obj
    score_df['score'] = score
    return score_df
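# Every score above is computed on the training data itself, so flexible
# models such as the random forest will look optimistic; cross_val_score
# would give a fairer ranking. A possible call site, assuming X and y are
# already numeric arrays:
score_df = modeling_compare(X, y)
print(score_df.sort_values('score', ascending=False)[['models_lab', 'score']])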