def test_sklearn_poisson_regression(nps_app_inst: ArrayApplication):
    """Fit the project PoissonRegression and scikit-learn's PoissonRegressor
    on the same five-point data set and print a D^2 value for each.

    The test only prints; it makes no assertions.
    """
    from sklearn.linear_model import PoissonRegressor as SKPoissonRegressor

    def dsqr(dev_func, y_true, y_hat):
        # D^2 = 1 - deviance(predictions) / deviance(mean of y).
        model_dev = dev_func(y_true, y_hat)
        null_dev = dev_func(y_true, nps_app_inst.mean(y_true))
        return 1 - model_dev / null_dev

    # Targets are exp(0.2 * x - 0.1) for x = 0..4.
    coef = np.array([0.2, -0.1])
    real_X = np.array([[0, 1, 2, 3, 4]]).T
    linear_part = np.dot(real_X, coef[0]) + coef[1]
    real_y = np.exp(linear_part).reshape(-1)

    # Each array is stored as a single block.
    X = nps_app_inst.array(real_X, block_shape=real_X.shape)
    y = nps_app_inst.array(real_y, block_shape=real_y.shape)

    for kwargs in [{"tol": 1e-4, "max_iter": 100}]:
        lr_model = PoissonRegression(**kwargs)
        lr_model.fit(X, y)
        y_pred = lr_model.predict(X).get()
        print("D^2", dsqr(lr_model.deviance, y, y_pred).get())

        # Same hyper-parameters, scikit-learn implementation, scored with the
        # project model's deviance function.
        sk_lr_model = SKPoissonRegressor(**kwargs)
        sk_lr_model.fit(real_X, real_y)
        sk_y_pred = sk_lr_model.predict(real_X)
        print("D^2", dsqr(lr_model.deviance, y, sk_y_pred).get())
def test_poisson_regression_family(regression_data):
    """Check that ``PoissonRegressor.family`` is ``"poisson"`` and read-only."""
    # Make sure the family attribute is read-only to prevent searching over it
    # e.g. in a grid search
    est = PoissonRegressor()
    # The comparison used to be a bare expression whose result was discarded,
    # so it verified nothing; it has to be asserted.
    assert est.family == "poisson"

    msg = "PoissonRegressor.family must be 'poisson'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0
def get_trained_model(X, y):
    """Train a PoissonRegressor on a 75 % split of ``(X, y)`` and return it.

    The held-out 25 % is produced by the split but not used here.
    ``y`` must expose ``.values`` after splitting (e.g. a pandas object).
    """
    # Split data into test verification set and training set
    parts = train_test_split(X, y, test_size=0.25, random_state=1)
    X_Train, y_train = parts[0], parts[2]
    print('got here')
    print('Training:\n')

    # Switching to Poisson from Linear brought RMSE down from 2614.92 to 2281.12
    mlModel = PoissonRegressor()
    flat_target = y_train.values.ravel()  # estimator expects a 1-D target
    mlModel.fit(X_Train, flat_target)
    return mlModel
def _fit_sklearn(self, dm, binned, alpha, cells=None, retvar=False, noncovwarn=True):
    """
    Fit a GLM using scikit-learn implementation of PoissonRegressor. Uses a regularization
    strength parameter alpha, which is the strength of ridge regularization term. When alpha
    is set to 0, this *should* in theory be the same as _fit_minimize, but in practice it is
    not and seems to exhibit some regularization still.

    Parameters
    ----------
    dm : numpy.ndarray
        Design matrix, in which rows are observations and columns are regressor values. Should
        NOT contain a bias column for the intercept. Scikit-learn handles that.
    binned : numpy.ndarray
        Array of observed spike counts which we seek to predict, one column per unit in
        self.clu_ids. Must have the same number of rows as dm.
    alpha : float
        Regularization strength, applied as multiplicative constant on ridge regularization.
    cells : list
        List of cells which should be fit. If None is passed, will default to fitting all cells
        in clu_ids
    retvar : bool
        Whether or not to compute variances on the fitted weights. When False, the variances
        entry of each cell is an all-NaN placeholder array.
    noncovwarn : bool
        Whether or not to emit a single warning listing the cells whose fit raised any
        warning.

    Returns
    -------
    coefs : pandas.Series
        Fitted coefficient array for each cell.
    intercepts : pandas.Series
        Fitted intercept for each cell.
    variances : pandas.Series
        Variance entries for each cell with the intercept entry dropped (see retvar).
    """
    if cells is None:
        cells = self.clu_ids.flatten()
    coefs = pd.Series(index=cells, name='coefficients', dtype=object)
    intercepts = pd.Series(index=cells, name='intercepts')
    variances = pd.Series(index=cells, name='variances', dtype=object)

    # The design matrix with a leading column of ones is the same for every
    # cell and is only needed for the Hessian, so build it once and only when
    # variances were requested. np.pad already returns a new array.
    biasdm = None
    if retvar:
        biasdm = np.pad(dm, ((0, 0), (1, 0)), 'constant', constant_values=1)

    nonconverged = []
    for cell in tqdm(cells, 'Fitting units:', leave=False):
        cell_idx = np.argwhere(self.clu_ids == cell)[0, 0]
        cellbinned = binned[:, cell_idx]
        # Any warning raised during the fit marks the cell as non-converged.
        with catch_warnings(record=True) as w:
            fitobj = PoissonRegressor(alpha=alpha, max_iter=300).fit(dm, cellbinned)
        if len(w) != 0:
            nonconverged.append(cell)
        # Weight vector ordered as [intercept, coefficients...].
        wts = np.concatenate([[fitobj.intercept_], fitobj.coef_], axis=0)
        if retvar:
            # Diagonal of the inverse of dd_neglog evaluated at the fitted weights.
            wvar = np.diag(np.linalg.inv(dd_neglog(wts, biasdm, cellbinned)))
        else:
            wvar = np.ones((wts.shape[0], wts.shape[0])) * np.nan
        coefs.at[cell] = fitobj.coef_
        variances.at[cell] = wvar[1:]  # drop the intercept entry
        intercepts.at[cell] = fitobj.intercept_

    if noncovwarn and len(nonconverged) != 0:
        warn(f'Fitting did not converge for some units: {nonconverged}')

    return coefs, intercepts, variances
def __init__(self, correct_glm_bounds=True, recursive_forecast=False):
    """Store the options and build the three regression pipelines.

    Parameters
    ----------
    correct_glm_bounds : bool
        Stored on the instance as-is.
    recursive_forecast : bool
        Stored on the instance as-is.
    """

    def build_pipeline(degree, step_name, estimator):
        # Polynomial expansion -> standard scaling -> regressor.
        return Pipeline([
            ('poly', PolynomialFeatures(degree, include_bias=False)),
            ('scale', StandardScaler()),
            (step_name, estimator),
        ])

    # optional parameters
    self.correct_glm_bounds = correct_glm_bounds
    self.recursive_forecast = recursive_forecast

    # pipelines for the models.
    # Scaling for Poisson and Gamma Regression models, they use L2 regularization penalty
    self.pipe_lin_reg_ar = build_pipeline(1, 'reg_lin', LinearRegression())
    self.pipe_reg_pois = build_pipeline(
        2, 'reg_pois', PoissonRegressor(alpha=0, max_iter=5000))
    self.pipe_reg_gamm = build_pipeline(
        2, 'reg_gamm', GammaRegressor(alpha=0, max_iter=5000))

    # initial data values for checking estimators fit ?
    self.x = self.y = None
    self.x_ar = self.y_ar = None

    # dictionary for results.
    self.results = {}
def Score(self):
    """Fit seven regressors on the training data and print their scores.

    Each regressor is wrapped in a pipeline of ``self.pre_process``,
    ``StandardScaler`` and ``PCA``, fitted on ``self.X_train`` /
    ``self.y_train`` and scored on that same training data. Two arrays are
    printed, rounded to two decimals: the root mean squared errors and the
    R^2 values, in the order the models are listed below.
    """
    candidates = [
        RandomForestRegressor(max_depth=15, random_state=0),
        LinearRegression(fit_intercept=True),
        Ridge(alpha=5),
        Lasso(alpha=10),
        SVR(C=2.5, epsilon=0.5),
        GradientBoostingRegressor(random_state=0),
        PoissonRegressor(),
    ]

    rmse_scores = []
    r2_scores = []
    for estimator in candidates:
        model_pipeline = Pipeline(steps=[
            ('pre_processing', self.pre_process),
            ('scaler', StandardScaler()),
            ('reduce_dim', PCA()),
            ('model', estimator),
        ])
        model_pipeline.fit(self.X_train, self.y_train)
        # Predict once and reuse the result for both metrics instead of
        # running the whole pipeline twice on the same data.
        train_pred = model_pipeline.predict(self.X_train)
        # Square root of the MSE, i.e. the RMSE.
        rmse_scores.append(mean_squared_error(self.y_train, train_pred) ** 0.5)
        r2_scores.append(r2_score(self.y_train, train_pred))

    print(np.round(rmse_scores, 2))
    print(np.round(r2_scores, 2))
def get_model(self):
    """Build the one-hot + Poisson regression pipeline.

    The pipeline is stored on ``self.model`` and also returned.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'one_hot'`` and ``'clf'``.
    """
    # Sparse output is the encoder's default. The explicit ``sparse=True``
    # keyword was dropped because recent scikit-learn releases renamed it to
    # ``sparse_output`` and no longer accept the old spelling.
    one_hot = OneHotEncoder(handle_unknown="ignore")
    poisson = PoissonRegressor(
        max_iter=1000,
        alpha=0.2,
    )
    pipe = Pipeline([
        ('one_hot', one_hot),
        ('clf', poisson),
    ])
    # A GridSearchCV over an empty parameter grid used to be constructed here
    # and then discarded; the plain pipeline has always been what is returned.
    self.model = pipe
    return pipe
def poisson_regression(self, df, split=0.7):
    """Fit a PoissonRegressor on a random subset of ``df`` and summarise it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only ``self.select_cols`` are used and ``self.dummy_cols``
        are one-hot encoded.
    split : float
        Threshold applied to uniform random draws to build the boolean row
        mask handed to ``self.get_split``.

    Returns
    -------
    dict
        Keys ``'model'``, ``'score'`` (on the training data), ``'intercept'``
        and ``'parameters'`` (training column name -> coefficient).

    Side effect: writes the training features to ``x_train.csv``.
    """
    row_mask = np.random.rand(len(df)) < split
    encoded = pd.get_dummies(
        df[self.select_cols], columns=self.dummy_cols, drop_first=False)
    y_train, x_train, y_test, x_test = self.get_split(encoded, row_mask)

    result = PoissonRegressor().fit(x_train, y_train)
    x_train.to_csv('x_train.csv')

    train_score = result.score(x_train, y_train)
    named_coefs = dict(zip(x_train.columns, result.coef_))
    return {
        'model': result,
        'score': train_score,
        'intercept': result.intercept_,
        'parameters': named_coefs,
    }
def regression(transformed, train_data_index_list, test_data_index_list,
               combined_data, dataset_name, data_path, regression_type):
    """Fit a regressor on the 'bug' target, then report and save its metrics.

    ``regression_type`` selects the estimator: ``'poisson'``, ``'linear'`` or
    anything else for Lasso. Rows are selected by whether the index of
    ``transformed`` appears in the train / test index lists; the same boolean
    masks are applied to ``combined_data`` to pick the targets. FPA and CLC
    are computed on the test predictions, written out with ``write_to_file``
    and printed together with the test MSE.
    """
    in_train = transformed.index.isin(train_data_index_list)
    in_test = transformed.index.isin(test_data_index_list)

    X_train1 = np.array(transformed[in_train])
    X_test1 = np.array(transformed[in_test])
    Y_train1 = combined_data[in_train]['bug']
    Y_test1 = combined_data[in_test]['bug']

    # Pick the estimator and the label used in the output names in one place.
    if regression_type == 'poisson':
        label, estimator_cls = 'poisson', PoissonRegressor
    elif regression_type == 'linear':
        label, estimator_cls = 'linear', LinearRegression
    else:
        label, estimator_cls = 'lasso', Lasso

    reg = estimator_cls().fit(X_train1, Y_train1)
    predictions = reg.predict(X_test1)

    FPA_result = str(FPA(predictions))
    CLC_result = str(CLC(predictions))

    path_to_save = '../../BTP_results/ml_results/' + label + '_' + dataset_name
    write_to_file(label + '_' + data_path, FPA_result, CLC_result, path_to_save)

    print("FPA metric value obtained is: " + FPA_result)
    print("CLC metric value obtained is: " + CLC_result)
    print("MSE is: " + str(mean_squared_error(Y_test1, predictions)))
    print("success!!")
def main(lr, train_path, eval_path, save_path, save_img):
    """Fit a Poisson regression on the training CSV and evaluate it.

    The model is scikit-learn's ``PoissonRegressor``. Predictions for the
    evaluation set are written to ``save_path``, a scatter of true versus
    predicted values is produced through ``util.scatter``, and the fitted
    coefficients and the evaluation score are printed.

    Args:
        lr: Learning rate. Not used by this implementation.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
        save_img: Passed through to ``util.scatter``.
    """
    feature_cols = ['x_1', 'x_2', 'x_3', 'x_4']

    def load_xy(csv_path):
        # Features as a DataFrame, target flattened to 1-D.
        frame = pd.read_csv(csv_path)
        return frame[feature_cols], frame[['y']].values.ravel()

    # Load training set
    x_train, y_train = load_xy(train_path)
    glm = PoissonRegressor(tol=1e-5, max_iter=10000000)
    glm.fit(x_train, y_train)

    x_eval, y_eval = load_xy(eval_path)
    predictions = glm.predict(x_eval)

    np.savetxt(save_path, predictions)
    util.scatter(y_eval, predictions, save_img)
    print(glm.coef_)
    print(glm.score(x_eval, y_eval))
def analyze(data, neuron, args=None, confs=None):
    """Fit a model on all features, then greedily try dropping features.

    A baseline model is fitted on every column of ``data`` and plotted. Then,
    for each one- and two-dimensional feature name, a fresh model is fitted
    without that feature; whenever the new model compares greater than the
    best one so far, the reduced column subset and bins become the new
    baseline for the following iterations.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; its columns are the candidate features.
    neuron
        Spike data passed to ``transform_spikes``, ``Model`` and ``plot_model``.
    args : dict, optional
        Must contain ``'bins'``. Defaults to ``DEFAULT_ARGS``.
    confs : optional
        Defaults to ``DEFAULT_CONFS``; not otherwise used in this function.
    """
    if args is None:
        args = DEFAULT_ARGS
    if confs is None:
        confs = DEFAULT_CONFS

    firing_rates = transform_spikes(neuron, filter_width=50)
    data, neuron, firing_rates = remove_nans(data, neuron, firing_rates)

    # The factory used to be assigned twice in a row, first with
    # PoissonRegressor() and then with LinearRegression(); only the second
    # assignment ever took effect, so only that one is kept.
    def create_model():
        return Model(LinearRegression(), spikes=neuron, n_folds=10)

    def get_avg_ll(m):
        # Mean likelihood over the cross-validation folds.
        return np.mean([x.results.likelihood for x in m.CVfolds])

    best_model = create_model()
    subset = data.columns.to_list()  # starting with all columns
    data_ = transform_data(data[subset], args['bins'])
    best_model(data_, firing_rates)
    plot_model(data_, neuron, firing_rates, best_model, subset)

    bins = args['bins']
    # naive estimation of SHAP values
    # a real calculation will require 2^k (k=no. features) models, and to avg
    # them with different weights (N choose k)
    features_to_remove = get_one_dim_feature_names() + get_two_dim_feature_names()
    for feature_to_remove in features_to_remove:
        if isinstance(feature_to_remove, str):  # 1D feature
            feature_to_remove = [feature_to_remove]
        model = create_model()
        new_subset = [col for col in subset if col not in feature_to_remove]
        print(new_subset, feature_to_remove)
        new_bins = bins.copy()
        for name in feature_to_remove:
            new_bins.pop(name)
        data_ = transform_data(data[new_subset], new_bins)
        print(data_.shape)
        model(data_, firing_rates)
        if model > best_model:
            subset = new_subset
            best_model = model
            bins = new_bins
        print(feature_to_remove, get_avg_ll(model))

    # NOTE(review): this plots the model and data of the LAST iteration
    # together with the BEST subset; plotting best_model with its own data may
    # have been intended — confirm before changing. Left as in the original.
    plot_model(data_, neuron, firing_rates, model, subset)
def test_poisson_glmnet():
    """Compare Poisson regression with L2 regularization and LogLink to glmnet"""
    # Reference values were produced in R with:
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    expected_intercept = -0.12889386979
    expected_coef = [0.29019207995, 0.03741173122]

    # Columns are the features "a" and "b" from the R data frame.
    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])

    glm = PoissonRegressor(alpha=1, fit_intercept=True, tol=1e-7, max_iter=300)
    glm.fit(X, y)

    assert_allclose(glm.intercept_, expected_intercept, rtol=1e-5)
    assert_allclose(glm.coef_, expected_coef, rtol=1e-5)
def _fit(self, dm, binned, cells=None, noncovwarn=False):
    """
    Fit a GLM using scikit-learn implementation of PoissonRegressor. The ridge
    regularization strength is taken from self.alpha and the intercept option
    from self.fit_intercept.

    Parameters
    ----------
    dm : numpy.ndarray
        Design matrix, in which rows are observations and columns are regressor values. Should
        NOT contain a bias column for the intercept. Scikit-learn handles that.
    binned : numpy.ndarray
        Array of observed spike counts which we seek to predict, one column per cell. Must
        have the same number of rows as dm.
    cells : list or numpy.ndarray
        List of cells labels for columns in binned. Will default to all cells in model if None
        is passed. Must be of the same length as columns in binned. By default None.
    noncovwarn : bool
        Whether or not to emit a single warning listing the cells whose fit raised any
        warning. By default False.

    Returns
    -------
    coefs : pandas.Series
        Fitted coefficient array for each cell.
    intercepts : pandas.Series
        Fitted intercept for each cell, or 0 when self.fit_intercept is false.

    Raises
    ------
    ValueError
        If the number of cells does not match the number of columns in binned.
    """
    if cells is None:
        cells = self.clu_ids.flatten()
    # The shape check and the element-wise comparison below need an array;
    # converting here lets a plain list be passed, as documented.
    cells = np.asarray(cells)
    if cells.shape[0] != binned.shape[1]:
        raise ValueError('Length of cells does not match shape of binned')

    coefs = pd.Series(index=cells, name='coefficients', dtype=object)
    intercepts = pd.Series(index=cells, name='intercepts')

    nonconverged = []
    for cell in tqdm(cells, 'Fitting units:', leave=False):
        cell_idx = np.argwhere(cells == cell)[0, 0]
        cellbinned = binned[:, cell_idx]
        # Any warning raised during the fit marks the cell as non-converged.
        with catch_warnings(record=True) as w:
            fitobj = PoissonRegressor(
                alpha=self.alpha,
                max_iter=300,
                fit_intercept=self.fit_intercept).fit(dm, cellbinned)
        if len(w) != 0:
            nonconverged.append(cell)
        coefs.at[cell] = fitobj.coef_
        if self.fit_intercept:
            intercepts.at[cell] = fitobj.intercept_
        else:
            intercepts.at[cell] = 0

    if noncovwarn and len(nonconverged) != 0:
        warn(f'Fitting did not converge for some units: {nonconverged}')

    return coefs, intercepts
def get_regressors_generalized(nmodels='all'):
    """
    Returns one or all of Generalized linear regressors

    Parameters
    ----------
    nmodels : 'all' or int
        ``'all'`` returns every regressor; otherwise the 1-based position of
        the wanted regressor: 1 = PoissonRegressor, 2 = TweedieRegressor,
        3 = GammaRegressor.

    Returns
    -------
    list
        Unfitted estimator instances (a one-element list for a single model).

    Raises
    ------
    ValueError
        If ``nmodels`` is neither ``'all'`` nor a valid position.
    """
    # Order defines the numbering used by ``nmodels``.
    factories = (PoissonRegressor, TweedieRegressor, GammaRegressor)

    if isinstance(nmodels, str) and nmodels == 'all':
        return [factory() for factory in factories]

    # Previously this branch returned the *string* 'lr<N>' instead of the
    # estimator it was meant to name.
    try:
        position = int(nmodels)
    except (TypeError, ValueError):
        raise ValueError(
            f"nmodels must be 'all' or an integer between 1 and "
            f"{len(factories)}, got {nmodels!r}") from None
    if not 1 <= position <= len(factories):
        raise ValueError(
            f"nmodels must be 'all' or an integer between 1 and "
            f"{len(factories)}, got {nmodels!r}")
    return [factories[position - 1]()]
def sk_poisson_regression(X_train, X_test, y_train, y_test):
    """Fit an unpenalised, intercept-free PoissonRegressor, print its test
    score and plot the data together with the model's predictions.
    """
    glm = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=300)
    glm.fit(X_train, y_train)
    test_score = glm.score(X_test, y_test)
    print('score: ', test_score)

    # NOTE(review): ``X`` and ``y`` are not parameters of this function; they
    # are looked up in the enclosing (module) scope — confirm that is intended.
    y_hat = glm.predict(X)

    plt.figure(figsize=(6.0, 6.0))
    plt.plot(X, y, 'o')                  # observations
    plt.plot(X, y_hat, '*', color='r')   # model predictions
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
def test_warm_start(solver, fit_intercept, global_random_seed):
    """Check that a warm-started fit converges to the cold-start solution.

    A model limited to a single iteration must raise a ConvergenceWarning and
    have a larger objective than the fully fitted model; after raising
    ``max_iter`` and refitting with ``warm_start=True`` it must agree with the
    fully fitted model on coefficients and score.
    """
    n_samples, n_features = 100, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        bias=fit_intercept * 1.0,
        noise=1.0,
        random_state=global_random_seed,
    )
    y = np.abs(y)  # Poisson requires non-negative targets.
    alpha = 1
    # "solver" is not forwarded: only lbfgs is available.
    params = {"fit_intercept": fit_intercept, "tol": 1e-10}

    glm1 = PoissonRegressor(warm_start=False, max_iter=1000, alpha=alpha, **params)
    glm1.fit(X, y)

    glm2 = PoissonRegressor(warm_start=True, max_iter=1, alpha=alpha, **params)
    # As we intentionally set max_iter=1 such that the solver should raise a
    # ConvergenceWarning.
    with pytest.warns(ConvergenceWarning):
        glm2.fit(X, y)

    linear_loss = LinearModelLoss(
        base_loss=glm1._get_loss(),
        fit_intercept=fit_intercept,
    )
    sw = np.full_like(y, fill_value=1 / n_samples)

    def objective(glm):
        # Penalised loss of a fitted model; the intercept goes last in the
        # stacked coefficient vector when one is fitted.
        if fit_intercept:
            packed = np.r_[glm.coef_, glm.intercept_]
        else:
            packed = glm.coef_
        return linear_loss.loss(
            coef=packed,
            X=X,
            y=y,
            sample_weight=sw,
            l2_reg_strength=alpha,
        )

    objective_glm1 = objective(glm1)
    objective_glm2 = objective(glm2)
    assert objective_glm1 < objective_glm2

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=2e-4)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5)
@pytest.fixture(scope="module") def regression_data(): X, y = make_regression(n_samples=107, n_features=10, n_informative=80, noise=0.5, random_state=2) return X, y @pytest.fixture( params=itertools.product( ["long", "wide"], [ BinomialRegressor(), PoissonRegressor(), GammaRegressor(), # TweedieRegressor(power=3.0), # too difficult # TweedieRegressor(power=0, link="log"), # too difficult TweedieRegressor(power=1.5), ], ), ids=lambda param: f"{param[0]}-{param[1]}", ) def glm_dataset(global_random_seed, request): """Dataset with GLM solutions, well conditioned X. This is inspired by ols_ridge_dataset in test_ridge.py. The construction is based on the SVD decomposition of X = U S V'.
# the power attribute is properly updated power = 2.0 est = TweedieRegressor(power=power) assert isinstance(est.family, TweedieDistribution) assert est.family.power == power assert est.power == power new_power = 0 new_family = TweedieDistribution(power=new_power) est.family = new_family assert isinstance(est.family, TweedieDistribution) assert est.family.power == new_power assert est.power == new_power msg = "TweedieRegressor.family must be of type TweedieDistribution!" with pytest.raises(TypeError, match=msg): est.family = None @pytest.mark.parametrize( "estimator, value", [ (PoissonRegressor(), True), (GammaRegressor(), True), (TweedieRegressor(power=1.5), True), (TweedieRegressor(power=0), False), ], ) def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value
# Candidate input columns; 'holiday' and 'atemp' are removed right below.
features = [
    'year',
    'month',
    'workingday',
    'hour',
    'holiday',
    'weather',
    'atemp',
    'humidity',
    'windspeed',
    'season',
]
for f in ['holiday', 'atemp']:
    features.remove(f)

# Standardise the selected columns and drop every other column.
# (Plain passthrough and Normalizer(norm='l2') were earlier alternatives.)
linear_model_preprocessor = ColumnTransformer(
    [('passthrough_numeric', StandardScaler(), features)],
    remainder='drop',
)

# Unpenalised Poisson GLM (Ridge was an earlier alternative).
poisson_regressor = PoissonRegressor(alpha=0)

model = Pipeline([
    ('preprocessor', linear_model_preprocessor),
    ('regressor', poisson_regressor),
])

print('fitting model...')
model.fit(train_df, train_df['count'])

print('evaluating model over train...')
error = evaluate(model, train_df)
print('error', error)

print('evaluating model over test...')
error = evaluate(model, validation_df)
print('error', error)

# Inspect the fitted regressor.
print('params:', poisson_regressor.coef_)
print('intercept:', poisson_regressor.intercept_)
print(dir(poisson_regressor))
#!/usr/bin/env python
"""Compare PoissonRegressor with a Poisson-loss gradient boosting model on
synthetic count data and print the test score of each."""
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

# ``fit`` returns the estimator itself, so construction and fitting chain.
glm = PoissonRegressor().fit(X_train, y_train)
gbdt = HistGradientBoostingRegressor(
    loss='poisson', learning_rate=.01).fit(X_train, y_train)

print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
# Frequency model -- Poisson distribution
# ---------------------------------------
#
# The number of claims (``ClaimNb``) is a non-negative integer, so it can be
# modelled by a Poisson distribution: the number of discrete events occurring
# at a constant rate in a given time interval (``Exposure``, in units of
# years). Here we model the frequency ``y = ClaimNb / Exposure``, which is
# still a (scaled) Poisson distribution, and use ``Exposure`` as
# `sample_weight`.

df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

# The model parameters are estimated by minimizing the Poisson deviance on
# the training set with a quasi-Newton solver (l-BFGS). Some of the features
# are collinear, so a weak penalization is used to avoid numerical issues.
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400).fit(
    X_train,
    df_train["Frequency"],
    sample_weight=df_train["Exposure"],
)

scores = score_estimator(
    glm_freq,
    X_train,
    X_test,
    df_train,
    df_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)
def PoissonRegGS(X_train, X_test, y_train, y_test):
    """Grid-search one PoissonRegressor per target column and log the results.

    ``y_train`` / ``y_test`` are 2-D arrays whose first two columns are the
    two targets. For each target, ``alpha`` is searched over ``[1, 2]`` with
    2-fold cross-validation and the estimator refitted on the best R^2 is
    kept. Metrics on the test and training data are printed, the best
    parameters are saved and both models are logged.
    """
    grid_values = {'alpha': list(range(1, 3))}
    scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

    searches = []
    for column in (0, 1):
        search = GridSearchCV(
            PoissonRegressor(),
            param_grid=grid_values,
            scoring=scoring,
            refit='r2',
            n_jobs=-1,
            cv=2,
            verbose=100)
        search.fit(X_train, y_train[:, column])
        searches.append(search)
    grid_reg1, grid_reg2 = searches

    # Each target keeps the estimator from ITS OWN search. The second model
    # used to be taken from the first search, which made it the same object as
    # the first model, so refitting it silently overwrote the first model.
    # With ``refit='r2'`` the best estimator is already refitted on the whole
    # training set, so no further ``fit`` call is needed.
    reg1 = grid_reg1.best_estimator_
    reg2 = grid_reg2.best_estimator_

    def predict_both(features):
        # Predictions of the two models as the columns of one 2-D array.
        return np.hstack((
            reg1.predict(X=features).reshape(-1, 1),
            reg2.predict(X=features).reshape(-1, 1),
        ))

    y_pred = predict_both(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred = predict_both(X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params1: dict = grid_reg1.best_params_
    best_params2: dict = grid_reg2.best_params_
    # One [target-1 value, target-2 value] pair per searched parameter.
    best_params = {
        key: [value, best_params2[key]] for key, value in best_params1.items()
    }
    saveBestParams(nameOfModel="PoissonRegGS", best_params=best_params)
    logSave(nameOfModel="PoissonRegGS", reg=[reg1, reg2],
            metrics=metrics, val_metrics=val_metrics)
def PoissonReg(X_train, X_test, y_train, y_test):
    """Fit one default PoissonRegressor per target column and log the results.

    ``y_train`` / ``y_test`` are 2-D arrays whose first two columns are the
    two targets. Metrics on the test and training data are printed and both
    models are logged under the name "PoissonReg".
    """
    target_a = y_train[:, 0]
    target_b = y_train[:, 1]

    # ``fit`` returns the estimator itself.
    reg1 = PoissonRegressor().fit(X_train, target_a)
    reg2 = PoissonRegressor().fit(X_train, target_b)

    def predict_both(features):
        # Predictions of the two models as the columns of one 2-D array.
        first = reg1.predict(X=features).reshape(-1, 1)
        second = reg2.predict(X=features).reshape(-1, 1)
        return np.hstack((first, second))

    # Held-out data: print first, then collect the metrics.
    y_pred = predict_both(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    # Training data: collect the metrics first, then print.
    y_pred = predict_both(X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="PoissonReg", reg=[reg1, reg2],
            metrics=metrics, val_metrics=val_metrics)
def test_poisson(self):
    """Fit abessPoisson on simulated data and check the recovered model.

    Two models are fitted on the same data: one with the approximate Newton
    method and sparse-matrix input, one with the exact method. The support
    recovered by the second model must equal the true support, and its
    non-zero coefficients must match an unpenalised scikit-learn
    PoissonRegressor fitted on the selected columns.
    """
    n = 100
    p = 20
    k = 3
    family = "poisson"
    rho = 0.5
    sigma = 1
    M = 1
    np.random.seed(3)
    data = gen_data(n, p, family=family, k=k, rho=rho, sigma=sigma)
    # The result is not used below; the call is kept so that it still runs as
    # part of this test.
    gen_data_splicing(family=family, n=n, p=p, k=k, rho=rho, M=M)
    support_size = range(0, 20)
    group = np.linspace(1, p, p)

    # Settings shared by both models.
    common = dict(
        path_type="seq",
        support_size=support_size,
        ic_type='ebic',
        is_screening=True,
        screening_size=20,
        K_max=10,
        epsilon=10,
        powell_path=2,
        s_min=1,
        s_max=p,
        lambda_min=0.01,
        lambda_max=100,
        is_cv=True,
        K=5,
        exchange_num=2,
        tau=0.1 * np.log(n * p) / n,
        primary_model_fit_epsilon=1e-6,
        early_stop=False,
        ic_coef=1.,
        thread=5,
    )

    model = abessPoisson(
        primary_model_fit_max_iter=10,
        approximate_Newton=True,
        sparse_matrix=True,
        **common)
    model.fit(data.x, data.y, group=group)

    model2 = abessPoisson(
        primary_model_fit_max_iter=80,
        approximate_Newton=False,
        **common)
    model2.fit(data.x, data.y, group=group)
    model2.predict(data.x)

    nonzero_true = np.nonzero(data.coef_)[0]
    nonzero_fit = np.nonzero(model2.coef_)[0]
    print(nonzero_true)
    print(nonzero_fit)
    assert (nonzero_true == nonzero_fit).all()

    # Compare the full version tuple: checking only the minor number would
    # give the wrong answer on any interpreter whose major version is not 3.
    if sys.version_info >= (3, 6):
        new_x = data.x[:, nonzero_fit]
        reg = PoissonRegressor(alpha=0, tol=1e-6, max_iter=200)
        reg.fit(new_x, data.y)
        print(model2.coef_[nonzero_fit])
        print(reg.coef_)
        assert model2.coef_[nonzero_fit] == approx(reg.coef_, rel=1e-2, abs=1e-2)
def arid_countreg(data_frame, response, con_features=[], cat_features=[],
                  model="additive", alpha=1):
    """
    Function that performs a count regression on a numerical discrete
    response, using both an sklearn and a statsmodels model (prediction and
    inference). The function returns both fitted models and prints the
    summary of the statsmodels fit.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The input dataframe to analyze.
    response : str
        A column name of the response variable. Because the function
        models count data, the column must be of integer type.
    con_features : list
        A list of the continuous explanatory variables to be used in the
        analysis. Default is an empty list, meaning to use all the numerical
        columns in the data frame (other than the response).
    cat_features : list
        A list of the categorical explanatory variables to be used in the
        analysis. Default is an empty list, meaning to use all the category /
        object columns in the data frame (other than the response).
    model : str
        Model type. Either "additive" or "interactive"
    alpha : float
        Constant that controls regularization strength in the predictive
        model

    Returns
    -------
    sklearn.pipeline.Pipeline
        A fitted sklearn pipeline ending in a PoissonRegressor
    statsmodels results object
        The fitted statsmodels Poisson GLM

    Raises
    ------
    AssertionError
        If any of the inputs fails validation.

    Examples
    --------
    >>> from aridanalysis import aridanalysis
    >>> aridanalysis.arid_countreg(df, "income",
    ...                            con_features=["feat1", "feat5"],
    ...                            model="additive")
    """
    assert isinstance(con_features, list), "ERROR: INVALID LIST INTPUT PASSED"
    assert isinstance(cat_features, list), "ERROR: INVALID LIST INTPUT PASSED"

    # The data frame and response are validated BEFORE the frame is used to
    # derive default feature lists; otherwise a bad frame or response fails
    # inside pandas instead of with the intended message.
    assert isinstance(data_frame, pd.DataFrame), errors.INVALID_DATAFRAME
    assert not data_frame.empty, errors.EMPTY_DATAFRAME
    assert response in data_frame.columns.tolist(), errors.RESPONSE_NOT_FOUND

    # Deal with the features column. The defaults are never mutated: the
    # names are only ever rebound to new lists.
    if len(con_features) == 0:
        con_features = (data_frame.drop(
            columns=[response]).select_dtypes("number").columns.tolist())
    if len(cat_features) == 0:
        cat_features = (data_frame.drop(columns=[response]).select_dtypes(
            ["category", "object"]).columns.tolist())

    assert all(item in data_frame.columns.tolist() for item in con_features), \
        "ERROR: CONTINUOUS VARIABLE(S) NOT IN DATAFRAME"
    assert all(item in data_frame.columns.tolist() for item in cat_features), \
        "ERROR: CATEGORICAL VARIABLE(S) NOT IN DATAFRAME"
    assert ptypes.is_integer_dtype(data_frame[response].dtype), \
        "ERROR: INVALID RESPONSE DATATYPE FOR COUNT REGRESSION: MUST BE TYPE INT"  # noqaE501
    assert model in ["additive", "interactive"], "ERROR: INVALID MODEL PASSED"
    assert ptypes.is_numeric_dtype(type(alpha)), errors.INVALID_ALPHA_INPUT

    # Scikit Learn Model
    if len(cat_features) != 0:
        X_sk = data_frame[con_features + cat_features]
        y_sk = data_frame[response]
        # NOTE(review): only the categorical columns are listed in the column
        # transformer; with its default handling of the remaining columns the
        # continuous features may not reach the regressor — confirm intended.
        preprocessor = make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore"), cat_features))
        pipeline = make_pipeline(
            preprocessor,
            PoissonRegressor(
                alpha=alpha,
                fit_intercept=True,
            ),
        )
        sk_model = pipeline.fit(X_sk, y_sk)
    else:
        X_sk = data_frame[con_features]
        y_sk = data_frame[response]
        # NOTE(review): ``alpha`` is not used in this branch (fixed to 0) —
        # confirm intended.
        pipeline = make_pipeline(
            PoissonRegressor(alpha=0, fit_intercept=True, max_iter=100))
        sk_model = pipeline.fit(X_sk, y_sk)

    # Formula fragments shared by both inferential models. Joining with
    # " + " replaces the earlier construction that used string identity to
    # detect the first element.
    cat_features = ["C(" + i + ")" for i in cat_features]
    con_list = " + ".join(f"{i}" for i in con_features)
    cat_list = " + ".join(cat_features)

    if model == "additive":
        # Additive inferential model
        if len(cat_list) > 0:
            formula = f"{response} ~ {con_list} + {cat_list}"
        else:
            formula = f"{response} ~ {con_list}"
    else:
        # Interactive inferential model: continuous x categorical terms ...
        interact_list = " + ".join(
            f"{i} * {j}" for i in con_features for j in cat_features)

        # ... plus each unordered pair of distinct continuous features once.
        # NOTE(review): the nesting of the ``continue`` guard was
        # reconstructed from whitespace-collapsed source — verify against the
        # project history.
        equal = set()
        cont_interaction = ""
        for i in con_features[0:]:
            for j in con_features[1:]:
                if i is con_features[0] and j is con_features[1]:
                    cont_interaction = f"{i} * {j}"
                    equal.update([(i, j)])
                    if len(equal) > 0:
                        continue
                if i != j and (j, i) not in equal:
                    equal.update([(i, j)])
                    cont_interaction += f" + {i} * {j}"

        if len(cat_features) > 0 and len(cont_interaction) > 0:
            formula = f"{response} ~ {con_list} + {cat_list} + {interact_list} + {cont_interaction}"  # noqaE501
        elif len(cat_features) == 0 and len(cont_interaction) > 0:
            formula = f"{response} ~ {con_list} + {cont_interaction}"
        elif len(cat_features) > 0 and len(cont_interaction) == 0:
            formula = f"{response} ~ {con_list} + {cat_list} + {interact_list}"
        else:
            formula = f"{response} ~ {con_list}"

    glm_count = smf.glm(formula=formula,
                        data=data_frame,
                        family=sm.families.Poisson()).fit()
    print(glm_count.summary())

    return (sk_model, glm_count)
[(["age"], ContinuousDomain())] + [(["hhninc", "educ"], ContinuousDomain())] ) pipeline = PMMLPipeline([ ("mapper", mapper), ("regressor", regressor) ]) pipeline.fit(visit_X, visit_y) pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13)) store_pkl(pipeline, name) docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"]) store_csv(docvis, name) if "Visit" in datasets: build_visit(GammaRegressor(), "GammaRegressionVisit") build_visit(PoissonRegressor(), "PoissonRegressionVisit") # # Outlier detection # def build_iforest_housing(iforest, name, **pmml_options): mapper = DataFrameMapper([ (housing_X.columns.values, ContinuousDomain()) ]) pipeline = Pipeline([ ("mapper", mapper), ("estimator", iforest) ]) pipeline.fit(housing_X) pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
# meaning that the obtained Poisson deviance is approximate. An alternative # approach could be to use :class:`compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") score_estimator(ridge, df_test) ############################################################################## # Next we fit the Poisson regressor on the target variable. We set the # regularization strength ``alpha`` to 1 over number of samples in order to # mimic the Ridge regressor whose L2 penalty term scales differently with the # number of samples. poisson = make_pipeline( linear_model_preprocessor, PoissonRegressor(alpha=1 / df_train.shape[0], max_iter=1000)) poisson.fit(df_train, df_train["Frequency"], poissonregressor__sample_weight=df_train["Exposure"]) print("PoissonRegressor evaluation:") score_estimator(poisson, df_test) ############################################################################## # Finally, we will consider a non-linear model, namely a random forest. Random # forests do not require the categorical data to be one-hot encoded: instead, # we can encode each category label with an arbitrary integer using # :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will # treat the categorical features as ordered features, which might not be always # a desired behavior. However this effect is limited for deep enough trees # which are able to recover the categorical nature of the features. The main
# Alpha = 100 regr_l2_100 = linear_model.Ridge(alpha=100) scores_length_l2_100_reg = cross_val_score(regr_l2_100, X_train_std, y_train, cv=5, scoring='r2') regr_l2_100.fit(X_train_std, y_train) #print(scores_length_l2_100_reg) #The mean score and the standard deviation are hence given by: print("%0.2f (with L2 alpha = 100) accuracy with a standard deviation of %0.2f" % (scores_length_l2_100_reg.mean(), scores_length_l2_100_reg.std())) #print(patient) # Commented out IPython magic to ensure Python compatibility. # Modeling with Poisson Regressor import sklearn from sklearn.linear_model import PoissonRegressor regr = PoissonRegressor(alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0) regr.fit(X_train_std, y_train) # Make predictions using the testing set y_pred = regr.predict(X_test_std) from sklearn.metrics import r2_score print(r2_score(y_test, y_pred)) # The coefficients # print('Coefficients: \n', regr.coef_) # The mean squared error print('Mean squared error: %.2f' # % mean_squared_error(y_test, y_pred)) # The coefficient of determination: 1 is perfect prediction print('Coefficient of determination: %.2f'
# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty # term scales differently with the number of samples. # # Since the Poisson regressor internally models the log of the expected target # value instead of the expected value directly (log vs identity link function), # the relationship between X and y is not exactly linear anymore. Therefore the # Poisson regressor is called a Generalized Linear Model (GLM) rather than a # vanilla linear model as is the case for Ridge regression. from sklearn.linear_model import PoissonRegressor n_samples = df_train.shape[0] poisson_glm = Pipeline([ ("preprocessor", linear_model_preprocessor), ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)), ]) poisson_glm.fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) print("PoissonRegressor evaluation:") score_estimator(poisson_glm, df_test) # %% # Gradient Boosting Regression Trees for Poisson regression # --------------------------------------------------------- # # Finally, we will consider a non-linear model, namely Gradient Boosting # Regression Trees. Tree-based models do not require the categorical data to be # one-hot encoded: instead, we can encode each category label with an arbitrary
def poissonregressor(self, X_train, X_test, y_train, y_test):
    """Fit a default PoissonRegressor and return its test predictions.

    The data passed as arguments is used; previously every argument was
    ignored in favour of the attributes of the same name on ``self``. An
    argument given as ``None`` still falls back to the matching attribute.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test
        Features to predict on.
    y_test
        Accepted for a uniform signature; not used.

    Returns
    -------
    Predictions of the fitted model for ``X_test``.
    """
    if X_train is None:
        X_train = self.X_train
    if y_train is None:
        y_train = self.y_train
    if X_test is None:
        X_test = self.X_test

    regressor = PoissonRegressor()
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)