Example #1
def test_tweedie_score(regression_data, power, link):
    """Test that GLM score equals d2_tweedie_score for Tweedie losses."""
    X, y = regression_data
    # make y positive
    y = np.abs(y) + 1.0
    glm = TweedieRegressor(power=power, link=link).fit(X, y)
    assert glm.score(X, y) == pytest.approx(
        d2_tweedie_score(y, glm.predict(X), power=power))
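The same equivalence can be checked outside the pytest fixtures; a minimal standalone sketch (assuming scikit-learn >= 1.0, where d2_tweedie_score was added):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import d2_tweedie_score

X, y = make_regression(n_samples=50, n_features=3, random_state=0)
y = np.abs(y) + 1.0  # Tweedie targets for power >= 1 must be positive
glm = TweedieRegressor(power=1.5, link="log").fit(X, y)
# score() returns D^2, the deviance-based analogue of R^2
assert np.isclose(glm.score(X, y),
                  d2_tweedie_score(y, glm.predict(X), power=1.5))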
Example #2
def regression(linkfunc, x, y, test):

    reg = TweedieRegressor(power=POWER, alpha=ALPHA, link=linkfunc)

    # reshape when there is only 1 feature (= independent variable)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)  # convert to a "column" vector,
                              # as samples should be in rows
        test = test.reshape(-1, 1)

    variable_cnt = x.shape[1]

    plur = "s" if variable_cnt > 1 else ""
    print()
    print('generalized linear regression parametrized as')
    print(f' -- link = \'{linkfunc}\'')
    print(f' -- {variable_cnt} independent variable{plur}')
    print(reg)

    print()
    print(f'train: {x} -> {y}')
    print(f'test: {test} -> ???')

    reg.fit(x, y)

    predicted = reg.predict(test)

    print()
    print('predicted:')
    print(predicted)

    print()
    print('y = reg.coef_ * x + reg.intercept_')
    print(f'reg.coef_ = {reg.coef_}')
    print(f'reg.intercept_ = {reg.intercept_:.2f}')
    for t in test:
        x_val = t
        y_val = reg.coef_ * t + reg.intercept_
        print(f'{x_val} -> {y_val}')

    strs = []
    if variable_cnt > 1:
        strs.append('sum')
    if linkfunc != 'identity':
        strs.append(f'inverse of link function \'{linkfunc}\'')
    basic_str = 'to be applied!'

    if len(strs) > 0:
        print(' and '.join(strs), basic_str)

    print()
    print(f'n_iter_ = {reg.n_iter_}')

    print()
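A hypothetical driver for the helper above (POWER and ALPHA are module-level constants the snippet assumes but never shows; TweedieRegressor comes from sklearn.linear_model):

import numpy as np

POWER, ALPHA = 0, 0.5  # assumed constants

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.1, 3.9, 6.2, 8.1])
test = np.array([5.0, 6.0])

regression('identity', x, y, test)  # prints the fit and the predictions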
Example #3
def tweedie_test(X_train, y_train, X_test, y_test, pwr, alf):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=pwr, alpha=alf)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_test)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_test, tw_pred)
    return tw_MAE, tw, tw_pred
Example #4
def tweedie(X_train_scaled, y_train):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=.001)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_train_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_train, tw_pred))
    return tw_rmse
Example #5
def tweedie05(X_train_scaled, y_train):
    '''
    runs tweedie algorithm
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=.5)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_train_scaled)
    # Compute mean absolute error
    tw_MAE = mean_absolute_error(y_train, tw_pred)
    return tw_MAE
Example #6
def tweedie_vt(X_train_scaled, X_validate_scaled, y_train, y_validate):
    '''
    fits the tweedie model on train
    and evaluates it on validate
    '''
    # Make Model
    tw = TweedieRegressor(power=0, alpha=0.001)  # 0 = normal distribution
    # Fit Model
    tw.fit(X_train_scaled, y_train)
    # Make Predictions
    tw_pred = tw.predict(X_validate_scaled)
    # Compute root mean squared error
    tw_rmse = sqrt(mean_squared_error(y_validate, tw_pred))
    return tw_rmse
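The four helpers above share the same dependencies; a minimal preamble that makes them runnable might look like this (a sketch, not taken from the original repos):

from math import sqrt

from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error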
Example #7
def train_model_normalize(document, drop):
    std = StandardScaler()
    drop.append('charges')
    datos_normalizados = std.fit_transform(document)
    dataframe_normalizado = pd.DataFrame(datos_normalizados,
                                         index=document.index,
                                         columns=document.columns)
    # print(dataframe_normalizado)
    x = dataframe_normalizado.drop(columns=drop)
    y = dataframe_normalizado['charges']

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=0.7,
                                                        random_state=1)

    #reg = LinearRegression().fit(x_train, y_train)
    #reg = Ridge(alpha=100).fit(x_train, y_train)
    #br = BayesianRidge()
    #reg = br.fit(x_train, y_train)
    #reg = Lasso(alpha=0.0000005, fit_intercept=False, tol=0.000000000000001,
    #      max_iter=1000000000).fit(x_train, y_train)
    reg = TweedieRegressor(alpha=0.1).fit(x_train, y_train)
    corr = x_train.corr()
    sns.heatmap(corr,
                mask=np.zeros_like(corr, dtype=bool),
                cmap=sns.diverging_palette(220, 10, as_cmap=True),
                square=True)
    return reg, x_test, y_test
Example #8
 def __params_pipe(**glm_pars):
     glm_pars.update(self.params)
     return Pipeline([
         ('scaler',
          StandardScaler(with_mean=self.standardize,
                         with_std=self.standardize)),
         ('glm', TweedieRegressor(**glm_pars))
     ])
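The method above closes over self; a standalone sketch of the same idea, where one flag toggles both centering and scaling:

from sklearn.linear_model import TweedieRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def make_glm_pipe(standardize=True, **glm_pars):
    # with_mean/with_std share a single flag, mirroring the snippet above
    return Pipeline([
        ('scaler', StandardScaler(with_mean=standardize, with_std=standardize)),
        ('glm', TweedieRegressor(**glm_pars)),
    ])

pipe = make_glm_pipe(power=1.5, alpha=0.1)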
Example #9
def glm_model(X_tr, y_tr, X_v, y_v, X_te, y_te, d_str, **kwargs):
    '''
    Generalized Linear Model with a Tweedie distribution.
    This estimator can be used to model different GLMs depending 
    on the power parameter, which determines the underlying distribution.
    '''
    # create the model object
    glm = TweedieRegressor(**kwargs)

    # fit the model to our training data
    glm.fit(X_tr, y_tr)

    # predict on train
    glm_pred = glm.predict(X_tr)
    # compute root mean squared error
    glm_rmse = sqrt(mean_squared_error(y_tr, glm_pred))

    # predict on validate
    glm_pred_v = glm.predict(X_v)
    # compute root mean squared error
    glm_rmse_v = sqrt(mean_squared_error(y_v, glm_pred_v))

    # predict on test
    glm_pred_t = glm.predict(X_te)
    # compute root mean squared error
    glm_rmse_t = sqrt(mean_squared_error(y_te, glm_pred_t))
    print(f'RMSE for GLM using {d_str} Distribution \n')
    print('On train data:\n', round(glm_rmse, 6), '\n')
    # print(glm_rmse_v)
    return glm_rmse, glm_rmse_v, glm_rmse_t, glm_pred_t
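A hedged call sketch, assuming the six train/validate/test splits already exist; the keyword arguments are forwarded unchanged to TweedieRegressor:

rmse_tr, rmse_v, rmse_te, pred_te = glm_model(
    X_tr, y_tr, X_v, y_v, X_te, y_te,
    d_str='Poisson', power=1, alpha=0)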
Example #10
def test_tweedie_regression_family(regression_data):
    # Make sure the family attribute is always a TweedieDistribution and that
    # the power attribute is properly updated
    power = 2.0
    est = TweedieRegressor(power=power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == power
    assert est.power == power

    new_power = 0
    new_family = TweedieDistribution(power=new_power)
    est.family = new_family
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == new_power
    assert est.power == new_power

    msg = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=msg):
        est.family = None
Example #11
 def fit(self, X, y=None, sample_weight=None):
     response = X.columns[0] if not self.response else self.response
     self.model_ = DevelopmentML(Pipeline(steps=[
         ('design_matrix', PatsyFormula(self.design_matrix)),
         ('model', TweedieRegressor(
                 link=self.link, power=self.power, max_iter=self.max_iter,
                 tol=self.tol, warm_start=self.warm_start,
                 verbose=self.verbose, fit_intercept=False))]),
                 y_ml=response, weight_ml=self.weight).fit(X)
     return self
Example #12
def hurdle(x, y, log=True, max_iter=1000):
    x, y = remove_nans(x, y)
    n_obs = len(x)

    clf = LogisticRegression(fit_intercept=True,
                             penalty='none',
                             max_iter=max_iter)

    if log:
        reg = TweedieRegressor(fit_intercept=True,
                               power=0,
                               link='log',
                               alpha=0,
                               tol=1e-8,
                               max_iter=max_iter)
    else:
        reg = LinearRegression(fit_intercept=True)

    clf.fit(x, y > 0)
    reg.fit(x[y > 0, :], y[y > 0])

    return HurdleModel(clf, reg, n_obs, log=log, x=x, y=y)
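A usage sketch for hurdle, assuming remove_nans and HurdleModel are project-local helpers; the synthetic target is zero-inflated so both stages have something to fit:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 1))
# roughly 40% structural zeros, log-scale positive part otherwise
positive = np.exp(0.5 * x[:, 0] + rng.normal(scale=0.1, size=200))
y = np.where(rng.random(200) < 0.4, 0.0, positive)

model = hurdle(x, y, log=True)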
Example #13
def get_regressors_generalized(nmodels='all'):
    """
    Returns one or all of the generalized linear regressors.
    """
    # 1. PoissonRegressor
    lr1 = PoissonRegressor()

    # 2. TweedieRegressor
    lr2 = TweedieRegressor()

    # 3. GammaRegressor
    lr3 = GammaRegressor()

    if nmodels == 'all':
        models = [lr1, lr2, lr3]
    else:
        # index into the estimators rather than returning a name string
        models = [[lr1, lr2, lr3][int(nmodels) - 1]]

    return models
Example #14
def test_tweedie_link_argument(name, link_class):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = TweedieRegressor(power=1, link=name).fit(X, y)
    assert isinstance(glm._base_loss.link, link_class)

    glm = TweedieRegressor(power=1, link="not a link")
    with pytest.raises(
            ValueError,
            match=re.escape(
                "The link must be an element of ['auto', 'identity', 'log']"),
    ):
        glm.fit(X, y)
Example #15
def cvTweedie(X_train, y_train, pwr, alf):
    # Tweedie Regressor
    # create loocv procedure
    cvTW = LeaveOneOut()
    # create model
    modelTW = TweedieRegressor(power=pwr, alpha=alf)  # 0 = normal distribution
    # evaluate model
    scoresTW = cross_val_score(modelTW,
                               X_train,
                               y_train,
                               scoring='neg_mean_absolute_error',
                               cv=cvTW,
                               n_jobs=-1)
    # force positive
    scoresTW = absolute(scoresTW)
    # report performance
    print('MAE: %.3f (%.3f)' % (mean(scoresTW), std(scoresTW)))
    meanMAE = mean(scoresTW)
    stddevMAE = std(scoresTW)
    return meanMAE
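A hedged driver for cvTweedie, assuming mean, std, and absolute come from numpy (e.g. from numpy import mean, std, absolute); LeaveOneOut fits one model per sample, so keep the dataset small:

import numpy as np

rng = np.random.default_rng(1)
X_train = rng.normal(size=(40, 3))
y_train = np.exp(X_train @ np.array([0.3, -0.2, 0.1])) + rng.random(40)

mean_mae = cvTweedie(X_train, y_train, pwr=0, alf=0.5)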
Example #16
def sk_tweedie_regression(X_train,
                          X_test,
                          y_train,
                          y_test,
                          set_model='linear'):
    if set_model == 'Poisson':
        reg = TweedieRegressor(
            alpha=0,
            power=1,  # Poisson distribution
            link='log',
            fit_intercept=False,
            max_iter=300)
    elif set_model == 'linear':
        reg = TweedieRegressor(
            alpha=0,
            power=0,  # Normal distribution
            link='identity',
            fit_intercept=False,
            max_iter=300)
    else:
        print("set_model must be 'Poisson' or 'linear'.")
        return

    reg.fit(X_train, y_train)
    print('score: ', reg.score(X_test, y_test))

    y_hat = reg.predict(X_test)

    fig = plt.figure(figsize=(6.0, 6.0))
    plt.plot(X_test, y_test, 'o')
    plt.plot(X_test, y_hat, '*', color='r')
    plt.xlabel('x (total_bill)')
    plt.ylabel('y (tips)')
    plt.xlim(0, 60)
    plt.ylim(0, 12)
    plt.show()
Example #17
    def get_pipe(self):
        try:

            if self.inner_cv is None:
                inner_cv = RepeatedKFold(n_splits=10,
                                         n_repeats=1,
                                         random_state=0)
            else:
                inner_cv = self.inner_cv
            if self.est_kwargs is None:
                self.est_kwargs = {
                    'reg__alpha':
                    np.logspace(-5, 10, self.gridpoints).tolist(
                    ),  #investigate the ideal range for alpha
                    'reg__power':
                    [0, *np.logspace(1, 3, self.gridpoints - 1).tolist()
                     ],  # 0 plus a log-spaced grid of candidate powers
                    'select__max_k': [4, 8, 32]
                }  #maybe look to tweak using k_share
            steps = [('scaler', StandardScaler()),
                     ('select', shrinkBigKTransformer(max_k=8)),
                     ('reg', TweedieRegressor())]
            if self.bestT:
                steps.insert(
                    0, ('xtransform',
                        columnBestTransformer(float_k=len(self.float_idx))))

            outerpipe = GridSearchCV(Pipeline(steps=steps),
                                     param_grid=self.est_kwargs,
                                     cv=inner_cv)
            if self.do_prep:
                steps = [('prep', missingValHandler(prep_dict=self.prep_dict)),
                         ('post', outerpipe)]
                outerpipe = Pipeline(steps=steps)
            return outerpipe
        except Exception:
            self.logger.exception('get_pipe error for flexibleGLM')
Example #18
# implementation does not allow for this (yet).
#
# We will compare the performance of both approaches.
# To quantify the performance of both models, one can compute
# the mean deviance of the train and test data assuming a Compound
# Poisson-Gamma distribution of the total claim amount. This is equivalent to
# a Tweedie distribution with a `power` parameter between 1 and 2.
#
# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power`
# parameter. As we do not know the true value of the `power` parameter, we here
# compute the mean deviances for a grid of possible values, and compare the
# models side by side, i.e. we compare them at identical values of `power`.
# Ideally, we hope that one model will be consistently better than the other,
# regardless of `power`.

glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000)
glm_pure_premium.fit(X_train,
                     df_train["PurePremium"],
                     sample_weight=df_train["Exposure"])

tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]

scores_product_model = score_estimator(
    (glm_freq, glm_sev),
    X_train,
    X_test,
    df_train,
    df_test,
    target="PurePremium",
    weights="Exposure",
    tweedie_powers=tweedie_powers,
)
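The grid comparison described in the commentary can also be run directly with sklearn.metrics.mean_tweedie_deviance; a sketch assuming y_true and y_pred arrays taken from one of the fitted models:

from sklearn.metrics import mean_tweedie_deviance

for p in tweedie_powers:
    dev = mean_tweedie_deviance(y_true, y_pred, power=p)
    print(f'power={p}: mean deviance = {dev:.4f}')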
Example #19
def main_Calib(filename, output, mode, alg, basis, order, figure, verbose, offset, qt, pre, split):
    '''
    main program
    input:  radius: %+.3f, 'str' (in makefile, str is default)
            path: file storage path, 'str'
            fout: file output name as .h5, 'str' ('.h5' not included)
            cut_max: cut-off order of the Legendre expansion
    output: the gathered result EventID, ChannelID, x, y, z
    '''
    if pre != 'r':
        print('begin reading file', flush=True)
        EventID, ChannelID, Q, PETime, photonTime, PulseTime, dETime, x, y, z = pub.ReadFile(filename)
        VertexTruth = (np.vstack((x, y, z))/1e3).T
        if(offset):
            off = pub.LoadBase(offset)
        else:
            off = np.zeros_like(PMTPos[:,0])
        print('total event: %d' % np.size(np.unique(EventID)), flush=True)
        print('begin processing legendre coeff', flush=True)
        # this part for the same vertex

        tmp = time.time()
        EventNo = np.size(np.unique(EventID))
        PMTNo = np.size(PMTPos[:,0])
        if mode == 'PE':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        elif mode == 'time':
            counts = np.bincount(EventID)
            counts = counts[counts!=0]
            PMTPosRep = PMTPos[ChannelID]
            vertex = np.repeat(VertexTruth, counts, axis=0)
        elif mode == 'combined':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)

        if basis == 'Legendre':
            X, cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=True)
        elif basis == 'Zernike':
            from zernike import RZern
            cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=False)
            cart = RZern(order)
            nk = cart.nk
            m = cart.mtab
            n = cart.ntab
            rho = np.linalg.norm(vertex, axis=1)/0.65
            theta = np.arccos(cos_theta)
            X = np.zeros((rho.shape[0], nk))

            for i in np.arange(nk):
                if not i % 5:
                    print(f'process {i}-th event')
                X[:,i] = cart.Zk(i, rho, theta)
            X = X[:,m>=0]
            print(f'rank: {np.linalg.matrix_rank(X)}')    
        print(f'use {time.time() - tmp} s')

        # which info should be used
        if mode == 'PE':
            y = Q
        elif mode == 'time':
            y = PulseTime 
        elif mode == 'combined':
            # PulseTime = PulseTime - np.min(PulseTime)
            # PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            # print(np.min(PulseTime), np.max(PulseTime))
            PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            bins = np.arange(-1, 0.05, 0.1)
            N = 10
            # Legendre coeff
            x = pub.legval(bins, np.eye(N).reshape(N, N, 1))
            # 1st basis
            Y = np.tile(x, len(np.unique(EventID))*len(np.unique(ChannelID))).T
            # 2nd basis
            X = np.repeat(X, bins.shape[0], axis=0)
            # output
            y = np.zeros((len(np.unique(EventID)), len(np.unique(ChannelID)), len(bins)))
            '''
            basis = np.zeros((X.shape[0], X.shape[1]*Y.shape[1]))
            for i_index, i in enumerate(np.arange(X.shape[1])):
                for j_index, j in enumerate(np.arange(Y.shape[1])):
                    total_index = i_index*Y.shape[1] + j_index
                    if not total_index % 10:
                        print(total_index)
                    basis[:, total_index] = X[:,i_index]*Y[:,j_index]
            X = basis
            '''
            split_index = np.unique(EventID).shape[0]
            for k_index, k in enumerate(np.unique(EventID)): # event begin with 1
                if k_index > split_index * split:
                    break
                if not k % 100:
                    print(k)
                index = EventID == k
                CID = ChannelID[index]
                Pulse_t = PulseTime[index]
                for i in np.unique(CID): # PMT begin with 0
                    y[k_index, i, 1:], _ = np.histogram(Pulse_t[CID==i], bins=bins)
            y = np.reshape(y,(-1))
        if verbose:
            print(f'the basis shape is {X.shape}, and the dependent variable shape is {y.shape}')
    if pre == 'w':
        if split != 1:
            split_index = int(split * y.shape[0])
            X = X[:split_index]
            Y = Y[:split_index]
            y = y[:split_index]
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        y = np.atleast_2d(y).T
        #data = np.hstack((X, y, np.ones_like(y)))
        df_X = pd.DataFrame(X)
        X_names = []
        for i in df_X.columns:
            X_names.append('X' + str(i))
        df_X.columns = X_names    
        
        df_Y = pd.DataFrame(Y)
        Y_names = []
        for i in df_Y.columns:
            Y_names.append('Y' + str(i))
        df_Y.columns = Y_names
        
        df_y = pd.DataFrame(y)
        df_y.columns = ['output']
        df = pd.concat([df_X, df_Y, df_y], axis=1)
        table = pa.Table.from_pandas(df)
        
        pq.write_table(table, 'test1.parquet')
        return

    if not pre:
        # Regression methods:
        if alg == 'sms':
            import statsmodels.api as sm
            if mode == 'PE':
                model = sm.GLM(y, X, family=sm.families.Poisson())  # no intercept: statsmodels adds one only via add_constant
                result = model.fit()
                if verbose:
                    print(result.summary())
                AIC = result.aic
                coef_ = result.params
                std = result.bse
                
            elif mode == 'time':
                import pandas as pd
                data = pd.DataFrame(data = np.hstack((X, np.atleast_2d(y).T)))                
                strs = 'y ~ '
                start = data.keys().start
                stop = data.keys().stop
                step = data.keys().step

                cname = []
                cname.append('X0')
                for i in np.arange(start+1, stop, step):
                    if i == start + 1:
                        strs += 'X%d ' % i
                    elif i == stop - step:
                        pass
                    else:
                        strs += ' + X%d ' % i                      

                    if i == stop - step:
                        cname.append('y')
                    else:
                        cname.append('X%d' % i)
                data.columns = cname

                mod = sm.formula.quantreg(strs, data[cname])

                result = mod.fit(q=qt,)
                coef_ = result.params
                AIC = np.zeros_like(coef_)
                std = np.zeros_like(coef_)           
                print('Warning! No AIC and std values')
            elif mode == 'combined':
                # data = pd.DataFrame(data = np.hstack((basis, np.atleast_2d(y).T)))  
                with h5py.File(output,'w') as out:        
                    out.create_dataset('X', data = X)
                    out.create_dataset('Y', data = y)
                print('begin...')
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                coef_ = result.params
                std = result.bse
                AIC = result.aic
            if verbose:
                print(result.summary())

        elif (alg == 'custom'):
            from scipy.optimize import minimize
            x0 = np.zeros_like(X[0]) # initial value (be careful of Zernike order)
            
            if mode == 'PE':
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            elif mode == 'time':
                x0[0] = np.mean(y)
                qt = 0.1
                ts = 2.6
                result = minimize(pub.CalibTime, x0=x0, method='SLSQP', args = (np.hstack((EventID, EventID)), y, X, qt, ts))
            elif mode == 'combined':
                x0 = np.zeros_like(X[0])
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))

            coef_ = np.array(result.x, dtype=float)
            if verbose:
                print(result.message)
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)

            H = pub.MyHessian(result.x, pub.CalibPE, *(y, PMTPos, X))
            # H = pub.MyHessian(result.x, *(Q, PMTPos, X, pub.CalibTime))
            # std = 1/np.sqrt(-np.diag(np.linalg.pinv(H1)))
            print(coef_)
            # print(std)
            print('Warning! No AIC and std values; std is experimental')

        elif alg == 'sk':
            from sklearn.linear_model import TweedieRegressor
            alpha = 0.001
            reg = TweedieRegressor(power=1, alpha=alpha, link='log', max_iter=1000, tol=1e-6, fit_intercept=False)
            reg.fit(X, y)

            # just for point data
            # pred = reg.predict(X[0:30,0:cut+1])

            print('coeff:\n', reg.coef_,'\n')

            coef_ = reg.coef_ 

            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            print('Warning! No AIC and std values')

        elif alg == 'h2o':
            import h2o
            from h2o.estimators.gbm import H2OGradientBoostingEstimator
            from h2o.estimators.glm import H2OGeneralizedLinearEstimator           
            if mode != 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, y, np.ones_like(y)))

                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]

                if mode == 'PE':
                    #offset_col = hf.columns[-1]
                    glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                        #offset_column = offset_col, 
                        lambda_ = 0,
                        compute_p_values = True)

                    glm_model.train(predictors, response_col, training_frame=hf)

                    coef_table = glm_model._model_json['output']['coefficients_table']
                    coef_ = glm_model.coef()

                elif mode == 'time':
                    gbm = H2OGradientBoostingEstimator(distribution="quantile", seed = 1234,
                                                      stopping_metric = "mse", stopping_tolerance = 1e-4)
                    gbm.train(x = predictors, y = response_col, training_frame = hf)
                    breakpoint()
                    print(gbm)
                    exit()
            elif mode == 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, Y, y, np.ones_like(y)))

                h2o.init() 
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]           

            if verbose:
                print(coef_)
                if basis == 'Zernike':
                    print(f'Regression coef shape is {np.array(coef_).shape}, Zernike shape is {nk}')
            if mode == 'PE':  # coef_table only exists for the GLM branch above
                coef_ = coef_table['coefficients']
                std = coef_table['std_error']
                AIC = glm_model.aic()

            h2o.cluster().shutdown()

    elif pre == 'r':
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        from h2o.estimators.glm import H2OGeneralizedLinearEstimator           
        h2o.init()
        hf = h2o.import_file("electron-1.parquet")
        pairs = []
        for i in hf.columns:
            for j in hf.columns:
                if (i.startswith('Z') and j.startswith('L')):
                    if ((i!='X0') and (j != 'Y0')):
                        pairs.append((i,j))
        predictors = hf.columns[2:]
        response_col = hf.columns[0]
        
        print(predictors)
        print(response_col)
        print(pairs)
        if mode == 'PE':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col, 
                lambda_ = 0,
                compute_p_values = True)

            glm_model.train(predictors, response_col, training_frame=hf)
        
        elif mode == 'combined':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col, 
                interaction_pairs=pairs,
                lambda_ = 0,
                #remove_collinear_columns = True, 
                compute_p_values = True)

            glm_model.train(predictors, response_col, training_frame=hf)
        breakpoint()
        coef_table = glm_model._model_json['output']['coefficients_table']
        coef_ = coef_table['coefficients']
        std = coef_table['std_error']
        AIC = glm_model.aic()
        print(f'Regression coef is {np.array(coef_)}')
        if (figure=='ON'):
            import matplotlib.pyplot as plt
            L, K = 500, 500
            ddx = np.linspace(-1.0, 1.0, K)
            ddy = np.linspace(-1.0, 1.0, L)
            xv, yv = np.meshgrid(ddx, ddy)
            cart.make_cart_grid(xv, yv)
            # normal scale
            # im = plt.imshow(np.exp(cart.eval_grid(np.array(coef_), matrix=True)), origin='lower', extent=(-1, 1, -1, 1))
            # log scale
            im = plt.imshow(cart.eval_grid(np.array(coef_), matrix=True), origin='lower', extent=(-1, 1, -1, 1))
            plt.colorbar()
            plt.savefig('test.png')
    else:
        print('unknown regression algorithm')
            
    with h5py.File(output,'w') as out:        
        out.create_dataset('coeff' + str(order), data = coef_)
        out.create_dataset('std' + str(order), data = std)
        out.create_dataset('AIC' + str(order), data = AIC)
Example #20
    # print(gks_test)

    gks_x = gks.iloc[:, :-1].values
    gks_y = gks.iloc[:, -1].values

    gks_x_test = gks_test.iloc[:, :-1].values
    gks_y_test = gks_test.iloc[:, -1].values

    scaler = StandardScaler()

    gks_x = scaler.fit_transform(gks_x)

    # reg = SVR(C=10, epsilon=0.2)

    reg = TweedieRegressor(power=1, alpha=0.5, link='log')

    reg.fit(gks_x, gks_y)

    gks_x_test = scaler.transform(gks_x_test)
    preds = reg.predict(gks_x_test)

    print(mean_squared_error(gks_y_test, preds))

    # print(gks_test_names)

    with open('gks.csv', 'w') as file:
        for idx, val in enumerate(preds):
            file.write(gks_test_names.iloc[idx]['web_name'] + "," + str(val) +
                       "," + str(gks_y_test[idx]))
            file.write('\n')
Example #21
 def __init__(self, data, col_num):
     super().__init__(data, col_num)
     self.model = TweedieRegressor(power=2, link='log')
     self._spread = 0.01
Example #22
def test_tweedie_link_auto(power, expected_link_class):
    """Test that link='auto' delivers the expected link function"""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = TweedieRegressor(link="auto", power=power).fit(X, y)
    assert isinstance(glm._base_loss.link, expected_link_class)
Example #23
def test_tweedie_link_argument(name, link_class):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = TweedieRegressor(power=1, link=name).fit(X, y)
    assert isinstance(glm._base_loss.link, link_class)
Example #24
                           n_informative=80,
                           noise=0.5,
                           random_state=2)
    return X, y


@pytest.fixture(
    params=itertools.product(
        ["long", "wide"],
        [
            BinomialRegressor(),
            PoissonRegressor(),
            GammaRegressor(),
            # TweedieRegressor(power=3.0),  # too difficult
            # TweedieRegressor(power=0, link="log"),  # too difficult
            TweedieRegressor(power=1.5),
        ],
    ),
    ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
    ----------
    type : {"long", "wide"}
        If "long", then n_samples > n_features.
    tick.label.set_fontsize(6)        
axes.tick_params(width=4) 
# change all spines
for axis in ['top','bottom','left','right']:
    axes.spines[axis].set_linewidth(6)



#%%

from sklearn.linear_model import TweedieRegressor
X = np.array(x).reshape(-1,1)
Y = np.array(y)


pr = TweedieRegressor(power = 1, alpha=0, fit_intercept=True)
y_pred_pr = pr.fit(X, Y).predict(X)

fig, axes = utils.plot_make(size_length=5)
sns.scatterplot(data = sc_vs_quickness_group_fill, x = "sc_LR_mean", y= "inverse_quickness", linewidth=0, s=100)
sns.lineplot(x = X.flatten(), y = y_pred_pr)

pr.score(X, Y)




#%%

X2 = sm.add_constant(X)
glm = sm.GLM(Y, X2, family=sm.families.Tweedie())
Example #26
def tweedie_regression():
    reg = TweedieRegressor(power=1, alpha=0.5, link='log')
    reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])
    print(reg.coef_)
    print(reg.intercept_)
Example #27
def test_tweedie_regression_family(regression_data):
    # Make sure the family attribute is always a TweedieDistribution and that
    # the power attribute is properly updated
    power = 2.0
    est = TweedieRegressor(power=power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == power
    assert est.power == power

    new_power = 0
    new_family = TweedieDistribution(power=new_power)
    est.family = new_family
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == new_power
    assert est.power == new_power

    msg = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=msg):
        est.family = None


@pytest.mark.parametrize(
    "estimator, value",
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False),
    ],
)
def test_tags(estimator, value):
    assert estimator._get_tags()["requires_positive_y"] is value
Example #28
def all_models_info():
    '''takes in data
    sets baseline
    sets SSE, MSE, and RMSE
    returns info for all 4 models'''
    # get data
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # pull from add_to_train
    train = evaluate.add_to_train()
    X_train, y_train, X_validate, y_validate, X_test, y_test = evaluate.xtrain_xval_xtest(
    )
    #OLS Model
    lm = LinearRegression(normalize=True)
    lm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lm'] = lm.predict(X_train)
    rmse_train_lm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm)**(1 / 2)
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    rmse_validate_lm = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lm)**(1 / 2)
    #LARS Model
    lars = LassoLars(alpha=1.0)
    lars.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lars'] = lars.predict(X_train)
    rmse_train_lars = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lars)**(1 / 2)
    y_validate['appraised_value_pred_lars'] = lars.predict(X_validate)
    rmse_validate_lars = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lars)**(1 / 2)
    #GLM
    glm = TweedieRegressor(power=1, alpha=0)
    glm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_glm'] = glm.predict(X_train)
    rmse_train_glm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_glm)**(1 / 2)
    y_validate['appraised_value_pred_glm'] = glm.predict(X_validate)
    rmse_validate_glm = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_glm)**(1 / 2)
    # PF
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    X_test_degree2 = pf.transform(X_test)
    # LM2
    lm2 = LinearRegression(normalize=True)
    lm2.fit(X_train_degree2, y_train.appraised_value)
    y_train['appraised_value_pred_lm2'] = lm2.predict(X_train_degree2)
    rmse_train_lm2 = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm2)**(1 / 2)
    y_validate['appraised_value_pred_lm2'] = lm2.predict(X_validate_degree2)
    rmse_validate_lm2 = mean_squared_error(
        y_validate.appraised_value, y_validate.appraised_value_pred_lm2)**(1 / 2)
    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print(
        "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
        rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ",
          rmse_train_lm2, "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
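The parenthesized exponents matter here: ** binds tighter than /, so mse**1 / 2 halves the MSE instead of taking its square root:

mse = 9.0
print(mse ** 1 / 2)    # 4.5 -> (mse ** 1) / 2
print(mse ** (1 / 2))  # 3.0 -> the intended square root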
Example #29
inv_map = {v: k for k, v in eco_vec_map.items()}
df_eco.columns = df_eco.columns.to_series().map(inv_map)
df_eco.index.names = ['Date']

# Extract eco data from Sep 2018 to Jan 2020
df_eco_sel = df_eco.loc['2018-09-01':'2020-01-31']

# put together eco and transaction counts for regression
df_all = pd.concat([df_eco_sel, df_period.set_index(df_eco_sel.index)], axis=1)
y_train = df_all['Transaction_Count'].values
X_train = df_all[[
    'CPI', 'Exchange_Rate_USD', 'GDP', 'Unemployment_Rate', 'TSX'
]]

# generalized linear model
glm = TweedieRegressor(power=1, alpha=0.5, link='log')  # Poisson distribution
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
glm.fit(X_train_scaled, y_train)

# predict eco data for given year and month
df_future = pd.DataFrame(columns=['Date'])
for i, eco_var in enumerate(list(eco_vec_map.keys())):
    print("Forecasting " + eco_var + ' ' + str(Y) + ' ' +
          datetime.strptime(str(M), "%m").strftime("%b"))
    tmp = forecast_eco(df_eco, eco_var, Y, M)
    tmp = tmp[['ds', 'trend']]
    tmp.rename(columns={'ds': 'Date', 'trend': eco_var}, inplace=True)
    df_future = df_future.merge(tmp, on='Date', how='right')

# predict transaction count using the glm model
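The snippet stops right before the prediction step it announces; a plausible completion, assuming df_future now holds the forecast regressors in the training column order:

# hypothetical completion of the announced step
X_future = df_future[X_train.columns]
X_future_scaled = scaler.transform(X_future)
transaction_count_pred = glm.predict(X_future_scaled)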
Example #30
 def tweedieregressor(self, X_train, X_test, y_train, y_test):

     regressor = TweedieRegressor()
     regressor.fit(X_train, y_train)  # fit on the passed-in training split
     return regressor.predict(X_test)