def test_sklearn_poisson_regression(nps_app_inst: ArrayApplication):
    def dsqr(dev_func, y, _y_pred):
        dev = dev_func(y, _y_pred)
        y_mean = nps_app_inst.mean(y)
        dev_null = dev_func(y, y_mean)
        return 1 - dev / dev_null

    from sklearn.linear_model import PoissonRegressor as SKPoissonRegressor

    coef = np.array([0.2, -0.1])
    real_X = np.array([[0, 1, 2, 3, 4]]).T
    real_y = np.exp(np.dot(real_X, coef[0]) + coef[1]).reshape(-1)
    X = nps_app_inst.array(real_X, block_shape=real_X.shape)
    y = nps_app_inst.array(real_y, block_shape=real_y.shape)
    param_set = [
        {"tol": 1e-4, "max_iter": 100},
    ]
    for kwargs in param_set:
        lr_model: PoissonRegression = PoissonRegression(**kwargs)
        lr_model.fit(X, y)
        y_pred = lr_model.predict(X).get()
        print("D^2", dsqr(lr_model.deviance, y, y_pred).get())

        sk_lr_model = SKPoissonRegressor(**kwargs)
        sk_lr_model.fit(real_X, real_y)
        sk_y_pred = sk_lr_model.predict(real_X)
        print("D^2", dsqr(lr_model.deviance, y, sk_y_pred).get())
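# A minimal, self-contained sketch (not part of the test above) of the same D^2
# quantity, D^2 = 1 - dev(y, y_pred) / dev(y, y_mean), computed with plain
# NumPy/scikit-learn via sklearn.metrics.mean_poisson_deviance. The data and the
# names X_demo/y_demo/glm_demo are illustrative only. PoissonRegressor.score
# reports the same fraction of Poisson deviance explained, so the two printed
# values should agree.
import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_poisson_deviance

rng = np.random.RandomState(0)
X_demo = rng.uniform(0, 4, size=(200, 1))
y_demo = rng.poisson(lam=np.exp(0.2 * X_demo[:, 0] - 0.1))

glm_demo = PoissonRegressor(alpha=0, max_iter=300).fit(X_demo, y_demo)
pred = glm_demo.predict(X_demo)
dev = mean_poisson_deviance(y_demo, pred)
dev_null = mean_poisson_deviance(y_demo, np.full_like(pred, y_demo.mean()))
print(1 - dev / dev_null, glm_demo.score(X_demo, y_demo))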
def PoissonReg(X_train, X_test, y_train, y_test):
    # Fit one PoissonRegressor per target column.
    y_train1 = y_train[:, 0]
    y_train2 = y_train[:, 1]
    reg1 = PoissonRegressor()
    reg1.fit(X_train, y_train1)
    reg2 = PoissonRegressor()
    reg2.fit(X_train, y_train2)

    # Test-set metrics.
    y_pred1 = reg1.predict(X=X_test)
    y_pred2 = reg2.predict(X=X_test)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    # Training-set metrics.
    y_pred1 = reg1.predict(X=X_train)
    y_pred2 = reg2.predict(X=X_train)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="PoissonReg", reg=[reg1, reg2], metrics=metrics,
            val_metrics=val_metrics)
def main(lr, train_path, eval_path, save_path, save_img):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        save_img: Path to save the scatter plot of true vs. predicted values.
    """
    # Load training set
    train = pd.read_csv(train_path)
    x_train, y_train = train[['x_1', 'x_2', 'x_3', 'x_4']], train[['y']].values.ravel()

    glm = PoissonRegressor(tol=1e-5, max_iter=10000000)
    glm.fit(x_train, y_train)

    # Load evaluation set, predict, and save predictions
    valid = pd.read_csv(eval_path)
    x_eval, y_eval = valid[['x_1', 'x_2', 'x_3', 'x_4']], valid[['y']].values.ravel()
    predictions = glm.predict(x_eval)
    np.savetxt(save_path, predictions)

    util.scatter(y_eval, predictions, save_img)
    print(glm.coef_)
    print(glm.score(x_eval, y_eval))
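# The docstring above frames the problem as gradient ascent on the Poisson
# log-likelihood, while the body delegates to scikit-learn. A minimal hand-rolled
# sketch of that update rule, grad = x^T (y - exp(x theta)), is given below.
# It assumes a log link and that an intercept column has already been appended
# to x; fit_poisson_gradient_ascent and its defaults are illustrative only.
import numpy as np

def fit_poisson_gradient_ascent(x, y, lr=1e-5, max_iter=10000, tol=1e-5):
    """Maximize the Poisson log-likelihood by gradient ascent."""
    theta = np.zeros(x.shape[1])
    for _ in range(max_iter):
        grad = x.T @ (y - np.exp(x @ theta))  # score of the Poisson log-likelihood
        step = lr * grad
        theta += step
        if np.linalg.norm(step, 1) < tol:  # stop once updates become negligible
            break
    return theta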
def PoissonRegGS(X_train, X_test, y_train, y_test):
    y_train1 = y_train[:, 0]
    y_train2 = y_train[:, 1]
    reg1 = PoissonRegressor()
    reg2 = PoissonRegressor()
    grid_values = {'alpha': list(range(1, 3))}

    grid_reg1 = GridSearchCV(
        reg1, param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2', n_jobs=-1, cv=2, verbose=100)
    grid_reg1.fit(X_train, y_train1)
    reg1 = grid_reg1.best_estimator_
    reg1.fit(X_train, y_train1)

    grid_reg2 = GridSearchCV(
        reg2, param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2', n_jobs=-1, cv=2, verbose=100)
    grid_reg2.fit(X_train, y_train2)
    reg2 = grid_reg2.best_estimator_
    reg2.fit(X_train, y_train2)

    y_pred1 = reg1.predict(X=X_test)
    y_pred2 = reg2.predict(X=X_test)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred1 = reg1.predict(X=X_train)
    y_pred2 = reg2.predict(X=X_train)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params1: dict = grid_reg1.best_params_
    best_params2: dict = grid_reg2.best_params_
    best_params = {}
    for key in best_params1.keys():
        best_params[key] = [best_params1[key], best_params2[key]]

    saveBestParams(nameOfModel="PoissonRegGS", best_params=best_params)
    logSave(nameOfModel="PoissonRegGS", reg=[reg1, reg2], metrics=metrics,
            val_metrics=val_metrics)
def get_trained_model(X, y):
    # Split data into test verification set and training set
    X_Train, X_Test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=1)
    print('got here')
    print('Training:\n')

    # Switching to Poisson from Linear brought RMSE down from 2614.92 to 2281.12
    mlModel = PoissonRegressor()  # create model object; switched from LinearRegression
    mlModel.fit(X_Train, y_train.values.ravel())  # train model object
    return mlModel
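# A small helper sketching how the RMSE figure quoted in the comment above could
# be reproduced on a held-out split. Illustrative only: report_rmse and its
# arguments are not part of the original code, which keeps the split internal.
import numpy as np
from sklearn.metrics import mean_squared_error

def report_rmse(model, X_test, y_test):
    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print('RMSE:', rmse)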
def sk_poisson_regression(X_train, X_test, y_train, y_test):
    glm = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=300)
    glm.fit(X_train, y_train)
    print('score: ', glm.score(X_test, y_test))

    # Plot test-set observations against the model's predictions.
    y_hat = glm.predict(X_test)
    fig = plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
def test_poisson_glmnet():
    """Compare Poisson regression with L2 regularization and LogLink to glmnet"""
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])
    glm = PoissonRegressor(
        alpha=1,
        fit_intercept=True,
        tol=1e-7,
        max_iter=300,
    )
    glm.fit(X, y)
    assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5)
    assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5)
def poisson_regression(self, df, split=0.7):
    # Random boolean mask with the requested train fraction.
    split = np.random.rand(len(df)) < split
    df = df[self.select_cols]
    df = pd.get_dummies(df, columns=self.dummy_cols, drop_first=False)
    y_train, x_train, y_test, x_test = self.get_split(df, split)

    model = PoissonRegressor()
    result = model.fit(x_train, y_train)
    x_train.to_csv('x_train.csv')

    result_dict = {
        'model': result,
        'score': result.score(x_train, y_train),
        'intercept': result.intercept_,
        'parameters': {
            x_train.columns[j]: result.coef_[j]
            for j in range(len(result.coef_))
        }
    }
    return result_dict
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
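# Note: PoissonRegressor.score reports D^2 (fraction of Poisson deviance explained),
# while HistGradientBoostingRegressor.score reports R^2 regardless of its loss, so
# the two printed numbers are not directly comparable. A sketch of an
# apples-to-apples comparison on the same held-out split, using the mean Poisson
# deviance for both models (the clip is only a defensive guard, since the metric
# requires strictly positive predictions):
from sklearn.metrics import mean_poisson_deviance

for name, model in [("glm", glm), ("gbdt", gbdt)]:
    pred = np.clip(model.predict(X_test), 1e-12, None)
    print(name, "mean Poisson deviance:", mean_poisson_deviance(y_test, pred))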
def test_warm_start(solver, fit_intercept, global_random_seed):
    n_samples, n_features = 100, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        bias=fit_intercept * 1.0,
        noise=1.0,
        random_state=global_random_seed,
    )
    y = np.abs(y)  # Poisson requires non-negative targets.
    alpha = 1
    params = {
        # "solver": solver,  # only lbfgs available
        "fit_intercept": fit_intercept,
        "tol": 1e-10,
    }

    glm1 = PoissonRegressor(warm_start=False, max_iter=1000, alpha=alpha, **params)
    glm1.fit(X, y)

    glm2 = PoissonRegressor(warm_start=True, max_iter=1, alpha=alpha, **params)
    # We intentionally set max_iter=1 so that the solver raises a
    # ConvergenceWarning.
    with pytest.warns(ConvergenceWarning):
        glm2.fit(X, y)

    linear_loss = LinearModelLoss(
        base_loss=glm1._get_loss(),
        fit_intercept=fit_intercept,
    )
    sw = np.full_like(y, fill_value=1 / n_samples)
    objective_glm1 = linear_loss.loss(
        coef=np.r_[glm1.coef_, glm1.intercept_] if fit_intercept else glm1.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    objective_glm2 = linear_loss.loss(
        coef=np.r_[glm2.coef_, glm2.intercept_] if fit_intercept else glm2.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    assert objective_glm1 < objective_glm2

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=2e-4)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5)
#
# The number of claims (``ClaimNb``) is a positive integer (0 included).
# Thus, this target can be modelled by a Poisson distribution.
# It is then assumed to be the number of discrete events occurring with a
# constant rate in a given time interval (``Exposure``, in units of years).
# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a
# (scaled) Poisson distribution, and use ``Exposure`` as ``sample_weight``.

df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

# The parameters of the model are estimated by minimizing the Poisson deviance
# on the training set via a quasi-Newton solver: L-BFGS. Some of the features
# are collinear, so we use a weak penalization to avoid numerical issues.
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"])

scores = score_estimator(
    glm_freq,
    X_train,
    X_test,
    df_train,
    df_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)

# %%
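# A self-contained toy version of the frequency/exposure setup described above,
# on synthetic data (X_toy, exposure, glm_toy are illustrative names): the target
# is a rate ClaimNb / Exposure and Exposure enters as sample_weight, so short
# policies contribute less to the Poisson deviance being minimized.
import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
n = 500
exposure = rng.uniform(0.1, 1.0, size=n)                        # policy duration in years
X_toy = rng.normal(size=(n, 3))
claims = rng.poisson(lam=exposure * np.exp(0.3 * X_toy[:, 0]))  # counts scale with exposure
frequency = claims / exposure                                   # claims per year

glm_toy = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_toy.fit(X_toy, frequency, sample_weight=exposure)
print(glm_toy.coef_)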
def poissonregressor(self, X_train, X_test, y_train, y_test):
    # Fit on the provided training split and return test-set predictions.
    regressor = PoissonRegressor()
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)
def test_poisson(self):
    # to do
    n = 100
    p = 20
    k = 3
    family = "poisson"
    rho = 0.5
    sigma = 1
    M = 1
    np.random.seed(3)
    data = gen_data(n, p, family=family, k=k, rho=rho, sigma=sigma)
    data2 = gen_data_splicing(family=family, n=n, p=p, k=k, rho=rho, M=M)
    support_size = range(0, 20)

    model = abessPoisson(path_type="seq", support_size=support_size,
                         ic_type='ebic', is_screening=True, screening_size=20,
                         K_max=10, epsilon=10, powell_path=2,
                         s_min=1, s_max=p, lambda_min=0.01, lambda_max=100,
                         is_cv=True, K=5, exchange_num=2,
                         tau=0.1 * np.log(n * p) / n,
                         primary_model_fit_max_iter=10,
                         primary_model_fit_epsilon=1e-6,
                         early_stop=False, approximate_Newton=True,
                         ic_coef=1., thread=5, sparse_matrix=True)
    group = np.linspace(1, p, p)
    model.fit(data.x, data.y, group=group)

    model2 = abessPoisson(path_type="seq", support_size=support_size,
                          ic_type='ebic', is_screening=True, screening_size=20,
                          K_max=10, epsilon=10, powell_path=2,
                          s_min=1, s_max=p, lambda_min=0.01, lambda_max=100,
                          is_cv=True, K=5, exchange_num=2,
                          tau=0.1 * np.log(n * p) / n,
                          primary_model_fit_max_iter=80,
                          primary_model_fit_epsilon=1e-6,
                          early_stop=False, approximate_Newton=False,
                          ic_coef=1., thread=5)
    group = np.linspace(1, p, p)
    model2.fit(data.x, data.y, group=group)
    model2.predict(data.x)

    nonzero_true = np.nonzero(data.coef_)[0]
    nonzero_fit = np.nonzero(model2.coef_)[0]
    print(nonzero_true)
    print(nonzero_fit)
    assert (nonzero_true == nonzero_fit).all()

    if sys.version_info[1] >= 6:
        new_x = data.x[:, nonzero_fit]
        reg = PoissonRegressor(alpha=0, tol=1e-6, max_iter=200)
        reg.fit(new_x, data.y)
        print(model2.coef_[nonzero_fit])
        print(reg.coef_)
        assert model2.coef_[nonzero_fit] == approx(reg.coef_, rel=1e-2, abs=1e-2)
# Alpha = 100
regr_l2_100 = linear_model.Ridge(alpha=100)
scores_length_l2_100_reg = cross_val_score(regr_l2_100, X_train_std, y_train,
                                           cv=5, scoring='r2')
regr_l2_100.fit(X_train_std, y_train)
# print(scores_length_l2_100_reg)
# The mean score and the standard deviation are hence given by:
print("%0.2f (with L2 alpha = 100) accuracy with a standard deviation of %0.2f"
      % (scores_length_l2_100_reg.mean(), scores_length_l2_100_reg.std()))
# print(patient)

# Commented out IPython magic to ensure Python compatibility.
# Modeling with Poisson Regressor
import sklearn
from sklearn.linear_model import PoissonRegressor

regr = PoissonRegressor(alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001,
                        warm_start=False, verbose=0)
regr.fit(X_train_std, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test_std)

from sklearn.metrics import r2_score, mean_squared_error
print(r2_score(y_test, y_pred))

# The coefficients
# print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))