Example #1
def test_sklearn_poisson_regression(nps_app_inst: ArrayApplication):
    def dsqr(dev_func, y, _y_pred):
        # D^2, the fraction of deviance explained:
        # 1 - dev(y, y_pred) / dev(y, y_mean)
        dev = dev_func(y, _y_pred)
        y_mean = nps_app_inst.mean(y)
        dev_null = dev_func(y, y_mean)
        return 1 - dev / dev_null

    from sklearn.linear_model import PoissonRegressor as SKPoissonRegressor

    coef = np.array([0.2, -0.1])
    real_X = np.array([[0, 1, 2, 3, 4]]).T
    real_y = np.exp(np.dot(real_X, coef[0]) + coef[1]).reshape(-1)
    X = nps_app_inst.array(real_X, block_shape=real_X.shape)
    y = nps_app_inst.array(real_y, block_shape=real_y.shape)
    param_set = [
        {"tol": 1e-4, "max_iter": 100},
    ]
    for kwargs in param_set:
        lr_model: PoissonRegression = PoissonRegression(**kwargs)
        lr_model.fit(X, y)
        y_pred = lr_model.predict(X).get()
        print("D^2", dsqr(lr_model.deviance, y, y_pred).get())

        sk_lr_model = SKPoissonRegressor(**kwargs)
        sk_lr_model.fit(real_X, real_y)
        sk_y_pred = sk_lr_model.predict(real_X)
        print("D^2", dsqr(lr_model.deviance, y, sk_y_pred).get())
Example #2
def test_poisson_regression_family(regression_data):
    # Make sure the family attribute is read-only to prevent searching over it
    # e.g. in a grid search
    est = PoissonRegressor()
    assert est.family == "poisson"

    msg = "PoissonRegressor.family must be 'poisson'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0
Example #3
def get_trained_model(X, y):
    # Split the data into a training set and a held-out test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=1)
    print('Training:\n')
    # Switching from LinearRegression to PoissonRegressor brought RMSE down
    # from 2614.92 to 2281.12
    ml_model = PoissonRegressor()  # create the model object
    ml_model.fit(X_train, y_train.values.ravel())  # train the model
    return ml_model
Example #4
    def _fit_sklearn(self,
                     dm,
                     binned,
                     alpha,
                     cells=None,
                     retvar=False,
                     noncovwarn=True):
        """
        Fit a GLM using scikit-learn implementation of PoissonRegressor. Uses a regularization
        strength parameter alpha, which is the strength of ridge regularization term. When alpha
        is set to 0, this *should* in theory be the same as _fit_minimize, but in practice it is
        not and seems to exhibit some regularization still.

        Parameters
        ----------
        dm : numpy.ndarray
            Design matrix, in which rows are observations and columns are regressor values. Should
            NOT contain a bias column for the intercept. Scikit-learn handles that.
        binned : numpy.ndarray
            Vector of observed spike counts which we seek to predict. Must be of the same length
            as dm.shape[0]
        alpha : float
            Regularization strength, applied as multiplicative constant on ridge regularization.
        cells : list
            List of cells which should be fit. If None is passed, will default to fitting all cells
            in clu_ids
        retvar : bool
            Whether or not to return variances on the parameters in dm.
        noncovwarn : bool
            Whether to warn about units whose fits did not converge.
        """
        if cells is None:
            cells = self.clu_ids.flatten()
        coefs = pd.Series(index=cells, name='coefficients', dtype=object)
        intercepts = pd.Series(index=cells, name='intercepts')
        variances = pd.Series(index=cells, name='variances', dtype=object)
        nonconverged = []
        for cell in tqdm(cells, 'Fitting units:', leave=False):
            cell_idx = np.argwhere(self.clu_ids == cell)[0, 0]
            cellbinned = binned[:, cell_idx]
            with catch_warnings(record=True) as w:
                fitobj = PoissonRegressor(alpha=alpha,
                                          max_iter=300).fit(dm, cellbinned)
            if len(w) != 0:
                nonconverged.append(cell)
            wts = np.concatenate([[fitobj.intercept_], fitobj.coef_], axis=0)
            biasdm = np.pad(dm.copy(), ((0, 0), (1, 0)),
                            'constant',
                            constant_values=1)
            if retvar:
                wvar = np.diag(
                    np.linalg.inv(dd_neglog(wts, biasdm, cellbinned)))
            else:
                wvar = np.ones((wts.shape[0], wts.shape[0])) * np.nan
            coefs.at[cell] = fitobj.coef_
            variances.at[cell] = wvar[1:]
            intercepts.at[cell] = fitobj.intercept_
        if noncovwarn:
            if len(nonconverged) != 0:
                warn(
                    f'Fitting did not converge for some units: {nonconverged}')
        return coefs, intercepts, variances
Example #5
    def __init__(self, correct_glm_bounds=True, recursive_forecast=False):

        # optional parameters
        self.correct_glm_bounds = correct_glm_bounds
        self.recursive_forecast = recursive_forecast

        # Pipelines for the models. Scaling matters for the Poisson and Gamma
        # regression models because they use an L2 regularization penalty.
        self.pipe_lin_reg_ar = Pipeline([
            ('poly', PolynomialFeatures(1, include_bias=False)),
            ('scale', StandardScaler()), ('reg_lin', LinearRegression())
        ])
        self.pipe_reg_pois = Pipeline([
            ('poly', PolynomialFeatures(2, include_bias=False)),
            ('scale', StandardScaler()),
            ('reg_pois', PoissonRegressor(alpha=0, max_iter=5000))
        ])
        self.pipe_reg_gamm = Pipeline([
            ('poly', PolynomialFeatures(2, include_bias=False)),
            ('scale', StandardScaler()),
            ('reg_gamm', GammaRegressor(alpha=0, max_iter=5000))
        ])
        # Initial data values, used to check whether the estimators have been fit
        self.x = None
        self.y = None
        self.x_ar = None
        self.y_ar = None
        # dictionary for results.
        self.results = {}
Example #6
    def Score(self):
        model_1 = RandomForestRegressor(max_depth=15,random_state=0)
        model_2 = LinearRegression(fit_intercept=True)
        model_3 = Ridge(alpha=5)
        model_4 = Lasso(alpha=10)
        model_5 = SVR(C=2.5, epsilon=0.5)
        model_6 = GradientBoostingRegressor(random_state=0)
        model_7 = PoissonRegressor()
        
        

        RMSE = []
        R2 = []
        for model in [model_1, model_2, model_3, model_4, model_5, model_6, model_7]:
            model_pipeline = Pipeline(steps=[('pre_processing', self.pre_process),
                                             ('scaler', StandardScaler()),
                                             ('reduce_dim', PCA()),
                                             ('model', model)])
            model_pipeline.fit(self.X_train, self.y_train)
            # mean_squared_error(...) ** 0.5 is the root mean squared error (RMSE)
            RMSE.append(mean_squared_error(self.y_train, model_pipeline.predict(self.X_train)) ** 0.5)
            R2.append(r2_score(self.y_train, model_pipeline.predict(self.X_train)))

        print(np.round(RMSE, 2))
        print(np.round(R2, 2))
Example #7
    def get_model(self):

        one_hot = OneHotEncoder(handle_unknown="ignore", sparse=True)

        param_grid = {}

        poisson = PoissonRegressor(
            max_iter=1000,
            alpha=0.2,
        )

        poisson_params = {'clf__alpha': [0.2, 0.4]}

        # Uncomment to actually search over alpha:
        # param_grid.update(poisson_params)

        pipe = Pipeline([
            ('one_hot', one_hot),
            ('clf', poisson),
        ])

        search = GridSearchCV(
            pipe,
            param_grid,
            n_jobs=-1,
            scoring='r2',
        )

        # Note: `search` is built but never fitted; the bare pipeline is
        # what gets stored and returned.
        self.model = pipe

        return pipe
Example #8
def poisson_regression(self, df, split=0.7):
    split = np.random.rand(len(df)) < split
    df = df[self.select_cols]
    df = pd.get_dummies(df, columns=self.dummy_cols, drop_first=False)
    y_train, x_train, y_test, x_test = self.get_split(df, split)
    model = PoissonRegressor()
    result = model.fit(x_train, y_train)
    x_train.to_csv('x_train.csv')
    result_dict = {
        'model': result,
        'score': result.score(x_train, y_train),
        'intercept': result.intercept_,
        'parameters': {
            x_train.columns[j]: result.coef_[j]
            for j in range(len(result.coef_))
        }
    }
    return result_dict
Example #9
def regression(transformed, train_data_index_list, test_data_index_list,
               combined_data, dataset_name, data_path, regression_type):
    X_train1 = transformed[transformed.index.isin(train_data_index_list)]
    X_train1 = np.array(X_train1)

    X_test1 = transformed[transformed.index.isin(test_data_index_list)]
    X_test1 = np.array(X_test1)

    Y_train1 = combined_data[transformed.index.isin(train_data_index_list)]
    Y_train1 = Y_train1['bug']

    Y_test1 = combined_data[transformed.index.isin(test_data_index_list)]
    Y_test1 = Y_test1['bug']

    if regression_type == 'poisson':
        reg = PoissonRegressor().fit(X_train1, Y_train1)
    elif regression_type == 'linear':
        reg = LinearRegression().fit(X_train1, Y_train1)
    else:
        reg = Lasso().fit(X_train1, Y_train1)

    predictions = reg.predict(X_test1)

    FPA_result = str(FPA(predictions))
    CLC_result = str(CLC(predictions))

    if regression_type == 'poisson':
        path_to_save = '../../BTP_results/ml_results/poisson' + '_' + dataset_name
        write_to_file('poisson_' + data_path, FPA_result, CLC_result,
                      path_to_save)
    elif regression_type == 'linear':
        path_to_save = '../../BTP_results/ml_results/linear' + '_' + dataset_name
        write_to_file('linear_' + data_path, FPA_result, CLC_result,
                      path_to_save)
    else:
        path_to_save = '../../BTP_results/ml_results/lasso' + '_' + dataset_name
        write_to_file('lasso_' + data_path, FPA_result, CLC_result,
                      path_to_save)

    print("FPA metric value obtained is: " + FPA_result)
    print("CLC metric value obtained is: " + CLC_result)
    print("MSE is: " + str(mean_squared_error(Y_test1, predictions)))

    print("success!!")
Example #10
def main(lr, train_path, eval_path, save_path, save_img):
    """Problem: Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        save_path: Path to save predictions.
    """
    # Load training set
    train = pd.read_csv(train_path)
    x_train, y_train = train[['x_1', 'x_2', 'x_3',
                              'x_4']], train[['y']].values.ravel()
    glm = PoissonRegressor(tol=1e-5, max_iter=10000000)
    glm.fit(x_train, y_train)

    valid = pd.read_csv(eval_path)
    x_eval, y_eval = valid[['x_1', 'x_2', 'x_3',
                            'x_4']], valid[['y']].values.ravel()
    predictions = glm.predict(x_eval)

    np.savetxt(save_path, predictions)
    util.scatter(y_eval, predictions, save_img)
    print(glm.coef_)
    print(glm.score(x_eval, y_eval))
Example #11
File: main.py Project: 1tux/Bat-Lab
def analyze(data, neuron, args=None, confs=None):

    if args is None:
        args = DEFAULT_ARGS

    if confs is None:
        confs = DEFAULT_CONFS

    firing_rates = transform_spikes(neuron, filter_width=50)
    data, neuron, firing_rates = remove_nans(data, neuron, firing_rates)

    # The second assignment overrides the first, so LinearRegression (not
    # PoissonRegressor) is the model actually used.
    create_model = lambda: Model(PoissonRegressor(), spikes=neuron, n_folds=10)
    create_model = lambda: Model(LinearRegression(), spikes=neuron, n_folds=10)
    best_model = create_model()
    subset = data.columns.to_list()  # starting with all columns
    data_ = transform_data(data[subset], args['bins'])
    best_model(data_, firing_rates)
    plot_model(data_, neuron, firing_rates, best_model, subset)
    # return
    bins = args['bins']

    # naive estimation of SHAP values
    # a real calculation will require 2^k (k=no. features) models, and to avg them with different weights (N choose k)
    features_to_remove = get_one_dim_feature_names() + get_two_dim_feature_names()
    for feature_to_remove in features_to_remove:
        if isinstance(feature_to_remove, str):  # 1D feature
            feature_to_remove = [feature_to_remove]

        model = create_model()
        new_subset = [col for col in subset if col not in feature_to_remove]
        print(new_subset, feature_to_remove)
        new_bins = bins.copy()
        for x in feature_to_remove:
            new_bins.pop(x)

        data_ = transform_data(data[new_subset], new_bins)
        print(data_.shape)
        model(data_, firing_rates)
        if model > best_model:
            subset = new_subset
            best_model = model
            bins = new_bins
        get_avg_ll = lambda m: np.mean(
            [x.results.likelihood for x in m.CVfolds])
        print(feature_to_remove, get_avg_ll(model))

    # shuffles_results = run_shuffles(best_model)

    # Note: this plots the last model fitted in the loop, not best_model
    plot_model(data_, neuron, firing_rates, model, subset)
Example #12
def test_poisson_glmnet():
    """Compare Poisson regression with L2 regularization and LogLink to glmnet"""
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])
    glm = PoissonRegressor(
        alpha=1,
        fit_intercept=True,
        tol=1e-7,
        max_iter=300,
    )
    glm.fit(X, y)
    assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5)
    assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5)
Example #13
    def _fit(self, dm, binned, cells=None, noncovwarn=False):
        """
        Fit a GLM using scikit-learn implementation of PoissonRegressor. Uses a regularization
        strength parameter alpha, which is the strength of ridge regularization term.

        Parameters
        ----------
        dm : numpy.ndarray
            Design matrix, in which rows are observations and columns are regressor values. Should
            NOT contain a bias column for the intercept. Scikit-learn handles that.
        binned : numpy.ndarray
            Vector of observed spike counts which we seek to predict. Must be of the same length
            as dm.shape[0]
        cells : list
            List of cell labels for the columns in binned. Will default to all cells in the
            model if None is passed. Must be of the same length as the number of columns in
            binned. By default None. (The regularization strength is taken from self.alpha
            rather than being passed as an argument.)
        """
        if cells is None:
            cells = self.clu_ids.flatten()
        if cells.shape[0] != binned.shape[1]:
            raise ValueError('Length of cells does not match shape of binned')

        coefs = pd.Series(index=cells, name='coefficients', dtype=object)
        intercepts = pd.Series(index=cells, name='intercepts')
        nonconverged = []
        for cell in tqdm(cells, 'Fitting units:', leave=False):
            cell_idx = np.argwhere(cells == cell)[0, 0]
            cellbinned = binned[:, cell_idx]
            with catch_warnings(record=True) as w:
                fitobj = PoissonRegressor(
                    alpha=self.alpha,
                    max_iter=300,
                    fit_intercept=self.fit_intercept).fit(dm, cellbinned)
            if len(w) != 0:
                nonconverged.append(cell)
            coefs.at[cell] = fitobj.coef_
            if self.fit_intercept:
                intercepts.at[cell] = fitobj.intercept_
            else:
                intercepts.at[cell] = 0
        if noncovwarn:
            if len(nonconverged) != 0:
                warn(
                    f'Fitting did not converge for some units: {nonconverged}')

        return coefs, intercepts
def get_regressors_generalized(nmodels='all'):
    """Return one or all of the generalized linear regressors."""
    # 1. PoissonRegressor
    lr1 = PoissonRegressor()

    # 2. TweedieRegressor
    lr2 = TweedieRegressor()

    # 3. GammaRegressor
    lr3 = GammaRegressor()

    if nmodels == 'all':
        models = [lr1, lr2, lr3]
    else:
        # Return the requested model object rather than its name as a string
        models = [{1: lr1, 2: lr2, 3: lr3}[int(nmodels)]]

    return models
Example #15
def sk_poisson_regression(X_train, X_test, y_train, y_test):
    glm = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=300)
    glm.fit(X_train, y_train)
    print('score: ', glm.score(X_test, y_test))

    y_hat = glm.predict(X_test)

    fig = plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
Example #16
def test_warm_start(solver, fit_intercept, global_random_seed):
    n_samples, n_features = 100, 10
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features - 2,
        bias=fit_intercept * 1.0,
        noise=1.0,
        random_state=global_random_seed,
    )
    y = np.abs(y)  # Poisson requires non-negative targets.
    alpha = 1
    params = {
        # "solver": solver,  # only lbfgs available
        "fit_intercept": fit_intercept,
        "tol": 1e-10,
    }

    glm1 = PoissonRegressor(warm_start=False,
                            max_iter=1000,
                            alpha=alpha,
                            **params)
    glm1.fit(X, y)

    glm2 = PoissonRegressor(warm_start=True, max_iter=1, alpha=alpha, **params)
    # We intentionally set max_iter=1 so that the solver raises a
    # ConvergenceWarning.
    with pytest.warns(ConvergenceWarning):
        glm2.fit(X, y)

    linear_loss = LinearModelLoss(
        base_loss=glm1._get_loss(),
        fit_intercept=fit_intercept,
    )
    sw = np.full_like(y, fill_value=1 / n_samples)

    objective_glm1 = linear_loss.loss(
        coef=np.r_[glm1.coef_,
                   glm1.intercept_] if fit_intercept else glm1.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    objective_glm2 = linear_loss.loss(
        coef=np.r_[glm2.coef_,
                   glm2.intercept_] if fit_intercept else glm2.coef_,
        X=X,
        y=y,
        sample_weight=sw,
        l2_reg_strength=alpha,
    )
    assert objective_glm1 < objective_glm2

    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=2e-4)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5)
Example #17
@pytest.fixture(scope="module")
def regression_data():
    X, y = make_regression(n_samples=107,
                           n_features=10,
                           n_informative=80,
                           noise=0.5,
                           random_state=2)
    return X, y


@pytest.fixture(
    params=itertools.product(
        ["long", "wide"],
        [
            BinomialRegressor(),
            PoissonRegressor(),
            GammaRegressor(),
            # TweedieRegressor(power=3.0),  # too difficult
            # TweedieRegressor(power=0, link="log"),  # too difficult
            TweedieRegressor(power=1.5),
        ],
    ),
    ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.
Example #18
    # the power attribute is properly updated
    power = 2.0
    est = TweedieRegressor(power=power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == power
    assert est.power == power

    new_power = 0
    new_family = TweedieDistribution(power=new_power)
    est.family = new_family
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == new_power
    assert est.power == new_power

    msg = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=msg):
        est.family = None


@pytest.mark.parametrize(
    "estimator, value",
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    assert estimator._get_tags()["requires_positive_y"] is value
Example #19
    features = [
        'year', 'month', 'workingday', 'hour', 'holiday', 'weather', 'atemp',
        'humidity', 'windspeed', 'season'
    ]
    for f in ['holiday', 'atemp']:
        features.remove(f)
    linear_model_preprocessor = ColumnTransformer(
        [
            # ('passthrough_numeric', 'passthrough', features)
            # ('passthrough_numeric', Normalizer(norm='l2'), features)
            ('passthrough_numeric', StandardScaler(), features)
        ],
        remainder='drop')

    # poisson_regressor = PoissonRegressor(len(features))
    poisson_regressor = PoissonRegressor(alpha=0)
    # poisson_regressor = Ridge()
    model = Pipeline([('preprocessor', linear_model_preprocessor),
                      ('regressor', poisson_regressor)])
    print('fitting model...')
    model.fit(train_df, train_df['count'])
    print('evaluating model over train...')
    error = evaluate(model, train_df)
    print('error', error)
    print('evaluating model over test...')
    error = evaluate(model, validation_df)
    print('error', error)
    print('params:', poisson_regressor.coef_)
    print('intercept:', poisson_regressor.intercept_)
    print(dir(poisson_regressor))
Example #20
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
glm.fit(X_train, y_train)
gbdt.fit(X_train, y_train)
print(glm.score(X_test, y_test))
print(gbdt.score(X_test, y_test))
Example #21
# Frequency model -- Poisson distribution
# ---------------------------------------
#
# The number of claims (``ClaimNb``) is a positive integer (0 included).
# Thus, this target can be modelled by a Poisson distribution.
# It is then assumed to be the number of discrete events occurring with a
# constant rate in a given time interval (``Exposure``, in units of years).
# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a
# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`.
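# A sketch of how the ``Frequency`` target used below is assumed to be
# constructed, following the formula stated above (this step is not shown in
# the excerpt):
df["Frequency"] = df["ClaimNb"] / df["Exposure"]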

df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)

# The parameters of the model are estimated by minimizing the Poisson deviance
# on the training set via a quasi-Newton solver: l-BFGS. Some of the features
# are collinear, we use a weak penalization to avoid numerical issues.
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_freq.fit(X_train,
             df_train["Frequency"],
             sample_weight=df_train["Exposure"])

scores = score_estimator(
    glm_freq,
    X_train,
    X_test,
    df_train,
    df_test,
    target="Frequency",
    weights="Exposure",
)
print("Evaluation of PoissonRegressor on target Frequency")
print(scores)
Example #22
def PoissonRegGS(X_train, X_test, y_train, y_test):
    y_train1 = y_train[:, 0]
    y_train2 = y_train[:, 1]
    reg1 = PoissonRegressor()
    reg2 = PoissonRegressor()
    grid_values = {'alpha': list(range(1, 3))}

    grid_reg1 = GridSearchCV(
        reg1,
        param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2',
        n_jobs=-1,
        cv=2,
        verbose=100)
    grid_reg1.fit(X_train, y_train1)
    reg1 = grid_reg1.best_estimator_
    reg1.fit(X_train, y_train1)
    grid_reg2 = GridSearchCV(
        reg2,
        param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2',
        n_jobs=-1,
        cv=2,
        verbose=100)
    grid_reg2.fit(X_train, y_train2)
    reg2 = grid_reg2.best_estimator_
    reg2.fit(X_train, y_train2)
    y_pred1 = reg1.predict(X=X_test)
    y_pred2 = reg2.predict(X=X_test)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

    printMetrics(y_true=y_test, y_pred=y_pred)

    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)
    y_pred1 = reg1.predict(X=X_train)
    y_pred2 = reg2.predict(X=X_train)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)

    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params1: dict = grid_reg1.best_params_
    best_params2: dict = grid_reg2.best_params_
    best_params = {}
    for key in best_params1.keys():
        best_params[key] = [best_params1[key], best_params2[key]]
    saveBestParams(nameOfModel="PoissonRegGS", best_params=best_params)
    logSave(nameOfModel="PoissonRegGS",
            reg=[reg1, reg2],
            metrics=metrics,
            val_metrics=val_metrics)
Example #23
def PoissonReg(X_train, X_test, y_train, y_test):
    y_train1 = y_train[:, 0]
    y_train2 = y_train[:, 1]
    reg1 = PoissonRegressor()
    reg1.fit(X_train, y_train1)
    reg2 = PoissonRegressor()
    reg2.fit(X_train, y_train2)
    y_pred1 = reg1.predict(X=X_test)
    y_pred2 = reg2.predict(X=X_test)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

    printMetrics(y_true=y_test, y_pred=y_pred)

    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)
    y_pred1 = reg1.predict(X=X_train)
    y_pred2 = reg2.predict(X=X_train)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)

    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="PoissonReg",
            reg=[reg1, reg2],
            metrics=metrics,
            val_metrics=val_metrics)
Example #24
    def test_poisson(self):
        # to do
        n = 100
        p = 20
        k = 3
        family = "poisson"
        rho = 0.5
        sigma = 1
        M = 1
        np.random.seed(3)
        data = gen_data(n, p, family=family, k=k, rho=rho, sigma=sigma)
        data2 = gen_data_splicing(family=family, n=n, p=p, k=k, rho=rho, M=M)
        support_size = range(0, 20)

        model = abessPoisson(path_type="seq",
                             support_size=support_size,
                             ic_type='ebic',
                             is_screening=True,
                             screening_size=20,
                             K_max=10,
                             epsilon=10,
                             powell_path=2,
                             s_min=1,
                             s_max=p,
                             lambda_min=0.01,
                             lambda_max=100,
                             is_cv=True,
                             K=5,
                             exchange_num=2,
                             tau=0.1 * np.log(n * p) / n,
                             primary_model_fit_max_iter=10,
                             primary_model_fit_epsilon=1e-6,
                             early_stop=False,
                             approximate_Newton=True,
                             ic_coef=1.,
                             thread=5,
                             sparse_matrix=True)
        group = np.linspace(1, p, p)
        model.fit(data.x, data.y, group=group)

        model2 = abessPoisson(path_type="seq",
                              support_size=support_size,
                              ic_type='ebic',
                              is_screening=True,
                              screening_size=20,
                              K_max=10,
                              epsilon=10,
                              powell_path=2,
                              s_min=1,
                              s_max=p,
                              lambda_min=0.01,
                              lambda_max=100,
                              is_cv=True,
                              K=5,
                              exchange_num=2,
                              tau=0.1 * np.log(n * p) / n,
                              primary_model_fit_max_iter=80,
                              primary_model_fit_epsilon=1e-6,
                              early_stop=False,
                              approximate_Newton=False,
                              ic_coef=1.,
                              thread=5)
        group = np.linspace(1, p, p)
        model2.fit(data.x, data.y, group=group)
        model2.predict(data.x)

        nonzero_true = np.nonzero(data.coef_)[0]
        nonzero_fit = np.nonzero(model2.coef_)[0]
        print(nonzero_true)
        print(nonzero_fit)
        assert (nonzero_true == nonzero_fit).all()

        if sys.version_info[1] >= 6:
            new_x = data.x[:, nonzero_fit]
            reg = PoissonRegressor(alpha=0, tol=1e-6, max_iter=200)
            reg.fit(new_x, data.y)
            print(model2.coef_[nonzero_fit])
            print(reg.coef_)
            assert model2.coef_[nonzero_fit] == approx(reg.coef_,
                                                       rel=1e-2,
                                                       abs=1e-2)
Example #25
def arid_countreg(data_frame,
                  response,
                  con_features=[],
                  cat_features=[],
                  model="additive",
                  alpha=1):  # noqaE501
    """
    Function that performs a count regression on numerical discrete response
    data, using both sklearn and statsmodels model analogs (prediction and
    inference). The function returns both models, each with its respective
    insights.

    Parameters
    ----------
    data_frame : pandas.Dataframe
      The input dataframe to analyze.
    response : str
      A column name of the response variable. Because the function manipulates
      count data, it must be of type int.
    con_features : list
      A list of the continuous explanatory variables to be used in the
      analysis. Defaults to an empty list, meaning all the numerical
      columns in the data frame are used.
    cat_features : list
      A list of the categorical explanatory variables to be used in the
      analysis. Defaults to an empty list, meaning all the categorical
      columns in the data frame are used.
    model: str
      Model type. Either "additive" or "interactive"
    alpha: float
      Constant that controls the regularization strength in the predictive model

    Returns
    -------
    sklearn.linear_model
        A fitted sklearn model configured with the chosen input parameters
    statsmodels.regression.linear_model
        A fitted statsmodel configured with the chosen input parameters

    Examples
    --------
    >>> from aridanalysis import aridanalysis
    >>> aridanalysis.arid_countreg(df,
                                   "income",
                                   con_features=["feat1", "feat5"],
                                   model="additive")
    """
    assert isinstance(con_features, list), "ERROR: INVALID LIST INPUT PASSED"
    assert isinstance(cat_features, list), "ERROR: INVALID LIST INPUT PASSED"

    # Deal with the features column
    if len(con_features) == 0:
        con_features = (data_frame.drop(
            columns=[response]).select_dtypes("number").columns.tolist())
    if len(cat_features) == 0:
        cat_features = (data_frame.drop(columns=[response]).select_dtypes(
            ["category", "object"]).columns.tolist())

    assert isinstance(data_frame, pd.DataFrame), errors.INVALID_DATAFRAME
    assert not data_frame.empty, errors.EMPTY_DATAFRAME
    assert response in data_frame.columns.tolist(), errors.RESPONSE_NOT_FOUND
    assert all(item in data_frame.columns.tolist() for item in con_features), \
        "ERROR: CONTINUOUS VARIABLE(S) NOT IN DATAFRAME"
    assert all(item in data_frame.columns.tolist() for item in cat_features), \
        "ERROR: CATEGORICAL VARIABLE(S) NOT IN DATAFRAME"
    assert ptypes.is_integer_dtype(data_frame[response].dtype), \
        "ERROR: INVALID RESPONSE DATATYPE FOR COUNT REGRESSION: MUST BE TYPE INT" # noqaE501
    assert model in ["additive", "interactive"], "ERROR: INVALID MODEL PASSED"
    assert ptypes.is_numeric_dtype(type(alpha)), errors.INVALID_ALPHA_INPUT

    # Scikit Learn Model
    if len(cat_features) != 0:
        X_sk = data_frame[con_features + cat_features]
        y_sk = data_frame[response]
        preprocessor = make_column_transformer(
            (OneHotEncoder(handle_unknown="ignore"), cat_features))
        pipeline = make_pipeline(
            preprocessor,
            PoissonRegressor(
                alpha=alpha,
                fit_intercept=True,
            ),
        )
        sk_model = pipeline.fit(X_sk, y_sk)
    else:
        X_sk = data_frame[con_features]
        y_sk = data_frame[response]
        pipeline = make_pipeline(
            PoissonRegressor(alpha=0, fit_intercept=True, max_iter=100))
        sk_model = pipeline.fit(X_sk, y_sk)

    # Additive inferential model
    if model == "additive":
        cat_features = ["C(" + i + ")" for i in cat_features]
        con_list = "".join([
            f"{i}" if i is con_features[0] else f" + {i}" for i in con_features
        ]  # noqaE501
                           )
        cat_list = "".join([
            f"{i}" if i is cat_features[0] else f" + {i}" for i in cat_features
        ]  # noqaE501
                           )
        if len(cat_list) > 0:
            formula = f"{response} ~ {con_list} + {cat_list}"
        else:
            formula = f"{response} ~ {con_list}"
        glm_count = smf.glm(formula=formula,
                            data=data_frame,
                            family=sm.families.Poisson()).fit()
        print(glm_count.summary())
    else:
        cat_features = ["C(" + i + ")" for i in cat_features]
        con_list = "".join([
            f"{i}" if i is con_features[0] else f" + {i}" for i in con_features
        ]  # noqaE501
                           )
        cat_list = "".join([
            f"{i}" if i is cat_features[0] else f" + {i}" for i in cat_features
        ]  # noqaE501
                           )
        interact_list = "".join([
            f"{i} * {j}" if j is cat_features[0] and i is con_features[0] else
            f" + {i} * {j}" for i in con_features for j in cat_features
        ])
        equal = set()
        cont_interaction = ""
        for i in con_features[0:]:
            for j in con_features[1:]:
                if i is con_features[0] and j is con_features[1]:
                    cont_interaction = f"{i} * {j}"
                    equal.update([(i, j)])
                    if len(equal) > 0:
                        continue
                if i != j and (j, i) not in equal:
                    equal.update([(i, j)])
                    cont_interaction += f" + {i} * {j}"
        if len(cat_features) > 0 and len(cont_interaction) > 0:
            formula = f"{response} ~ {con_list} + {cat_list} + {interact_list} + {cont_interaction}"  # noqaE501
        elif len(cat_features) == 0 and len(cont_interaction) > 0:
            formula = f"{response} ~ {con_list} + {cont_interaction}"
        elif len(cat_features) > 0 and len(cont_interaction) == 0:
            formula = f"{response} ~ {con_list} + {cat_list} + {interact_list}"
        else:
            formula = f"{response} ~ {con_list}"
        glm_count = smf.glm(formula=formula,
                            data=data_frame,
                            family=sm.families.Poisson()).fit()
        print(glm_count.summary())

    return (sk_model, glm_count)
Example #26
		[(["age"], ContinuousDomain())] +
		[(["hhninc", "educ"], ContinuousDomain())]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(visit_X, visit_y)
	pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
	store_csv(docvis, name)

if "Visit" in datasets:
	build_visit(GammaRegressor(), "GammaRegressionVisit")
	build_visit(PoissonRegressor(), "PoissonRegressionVisit")

#
# Outlier detection
#

def build_iforest_housing(iforest, name, **pmml_options):
	mapper = DataFrameMapper([
		(housing_X.columns.values, ContinuousDomain())
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("estimator", iforest)
	])
	pipeline.fit(housing_X)
	pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
Example #27
# meaning that the obtained Poisson deviance is approximate. An alternative
# approach could be to use :class:`compose.TransformedTargetRegressor`
# meta-estimator to map ``y_pred`` to a strictly positive domain.
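# A minimal sketch of that alternative, assuming the ``make_pipeline`` and
# ``linear_model_preprocessor`` from the surrounding example (illustrative, not
# the original code). Training on log1p-transformed targets and inverting with
# expm1 keeps predictions well away from the negative range while still
# accepting the zero frequencies in the data; note expm1 only guarantees
# predictions > -1, so clipping may still be needed before computing a Poisson
# deviance.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge

ridge_log = make_pipeline(
    linear_model_preprocessor,
    TransformedTargetRegressor(regressor=Ridge(alpha=1e-6),
                               func=np.log1p,
                               inverse_func=np.expm1),
)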

print("Ridge evaluation:")
score_estimator(ridge, df_test)

##############################################################################
# Next we fit the Poisson regressor on the target variable. We set the
# regularization strength ``alpha`` to 1 over the number of samples in order
# to mimic the Ridge regressor whose L2 penalty term scales differently with
# the number of samples.

poisson = make_pipeline(
    linear_model_preprocessor,
    PoissonRegressor(alpha=1 / df_train.shape[0], max_iter=1000))
poisson.fit(df_train,
            df_train["Frequency"],
            poissonregressor__sample_weight=df_train["Exposure"])

print("PoissonRegressor evaluation:")
score_estimator(poisson, df_test)

##############################################################################
# Finally, we will consider a non-linear model, namely a random forest. Random
# forests do not require the categorical data to be one-hot encoded: instead,
# we can encode each category label with an arbitrary integer using
# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will
# treat the categorical features as ordered features, which might not be always
# a desired behavior. However this effect is limited for deep enough trees
# which are able to recover the categorical nature of the features. The main
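# A minimal sketch of the approach just described, with hypothetical column
# names (the original example's pipeline is not shown in this excerpt):
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ["brand", "region"]  # hypothetical feature names
rf_preprocessor = ColumnTransformer(
    [("categorical", OrdinalEncoder(), categorical_columns)],
    remainder="passthrough",  # numeric columns pass through unchanged
)
rf_model = Pipeline([
    ("preprocessor", rf_preprocessor),
    ("regressor", RandomForestRegressor(min_samples_leaf=5, n_jobs=-1)),
])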
Example #28
# Alpha = 100
regr_l2_100 = linear_model.Ridge(alpha=100)
scores_length_l2_100_reg = cross_val_score(regr_l2_100, X_train_std, y_train, cv=5, scoring='r2') 
regr_l2_100.fit(X_train_std, y_train)
#print(scores_length_l2_100_reg)
#The mean score and the standard deviation are hence given by:
print("%0.2f (with L2 alpha = 100) accuracy with a standard deviation of %0.2f" % (scores_length_l2_100_reg.mean(), scores_length_l2_100_reg.std()))
#print(patient)

# Commented out IPython magic to ensure Python compatibility.
# Modeling with Poisson Regressor

import sklearn
from sklearn.linear_model import PoissonRegressor
regr = PoissonRegressor(alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0)
regr.fit(X_train_std, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test_std)

from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

# The coefficients
# print('Coefficients: \n', regr.coef_)
# The mean squared error
from sklearn.metrics import mean_squared_error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))
Example #29
# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty
# term scales differently with the number of samples.
#
# Since the Poisson regressor internally models the log of the expected target
# value instead of the expected value directly (log vs identity link function),
# the relationship between X and y is not exactly linear anymore. Therefore the
# Poisson regressor is called a Generalized Linear Model (GLM) rather than a
# vanilla linear model as is the case for Ridge regression.
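# A minimal self-contained illustration of the log link described above (not
# part of the original example): a fitted PoissonRegressor's predictions equal
# exp(X @ coef_ + intercept_).
import numpy as np
from sklearn.linear_model import PoissonRegressor

_rng = np.random.RandomState(0)
_X = _rng.randn(50, 2)
_y = _rng.poisson(lam=np.exp(0.3 * _X[:, 0]))
_glm = PoissonRegressor().fit(_X, _y)
assert np.allclose(_glm.predict(_X), np.exp(_X @ _glm.coef_ + _glm.intercept_))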

from sklearn.linear_model import PoissonRegressor

n_samples = df_train.shape[0]

poisson_glm = Pipeline([
    ("preprocessor", linear_model_preprocessor),
    ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)),
])
poisson_glm.fit(df_train,
                df_train["Frequency"],
                regressor__sample_weight=df_train["Exposure"])

print("PoissonRegressor evaluation:")
score_estimator(poisson_glm, df_test)

# %%
# Gradient Boosting Regression Trees for Poisson regression
# ---------------------------------------------------------
#
# Finally, we will consider a non-linear model, namely Gradient Boosting
# Regression Trees. Tree-based models do not require the categorical data to be
# one-hot encoded: instead, we can encode each category label with an arbitrary
# integer using :class:`preprocessing.OrdinalEncoder`.
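# A minimal sketch of this step, assuming ``df_train`` and an ordinal-encoding
# preprocessor analogous to the linear one above (illustrative names, not the
# original code):
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

tree_preprocessor = ColumnTransformer(
    [("categorical", OrdinalEncoder(), ["brand", "region"])],  # hypothetical columns
    remainder="passthrough",
)
poisson_gbrt = Pipeline([
    ("preprocessor", tree_preprocessor),
    ("regressor", HistGradientBoostingRegressor(loss="poisson")),
])
poisson_gbrt.fit(df_train, df_train["Frequency"],
                 regressor__sample_weight=df_train["Exposure"])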
Example #30
def poissonregressor(self, X_train, X_test, y_train, y_test):
    # Fit on the passed-in training split and predict on the test split
    regressor = PoissonRegressor()
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)