def test_framing_example_moderator():
    # moderation without formulas, generally not useful but test anyway

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income",
                                 data,
                                 return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome,
                           outcome_exog,
                           family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income",
                                  data,
                                  return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [
        outcome_exog.columns.tolist().index("treat"),
        mediator_exog.columns.tolist().index("treat")
    ]
    med_pos = outcome_exog.columns.tolist().index("emo")

    ix = (outcome_exog.columns.tolist().index("age"),
          mediator_exog.columns.tolist().index("age"))
    moderators = {ix: 20}
    med = Mediation(outcome_model,
                    mediator_model,
                    tx_pos,
                    med_pos,
                    moderators=moderators)

    # Just a smoke test
    np.random.seed(4231)
    med_rslt = med.fit(method='parametric', n_rep=100)
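For reference, the fitted MediationResults object can be summarized after the smoke test; a minimal sketch, assuming the setup above has run:

# Tabulate the estimated ACME, ADE, and total effects together with their
# simulation-based confidence intervals (returned as a pandas DataFrame).
print(med_rslt.summary())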
Example #2
def glm(data, xseq, **params):
    """
    Fit GLM
    """
    X = sm.add_constant(data['x'])
    Xseq = sm.add_constant(xseq)

    init_kwargs, fit_kwargs = separate_method_kwargs(params['method_args'],
                                                     sm.GLM, sm.GLM.fit)
    model = sm.GLM(data['y'], X, **init_kwargs)
    results = model.fit(**fit_kwargs)

    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(Xseq)

    if params['se']:
        prediction = results.get_prediction(Xseq)
        ci = prediction.conf_int(alpha=1 - params['level'])
        data['ymin'] = ci[:, 0]
        data['ymax'] = ci[:, 1]

    return data
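A hypothetical call to this smoother on synthetic data (the layout of the params dict, with method_args, se, and level keys, is inferred from the body above; separate_method_kwargs is assumed to come from the host package):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
data = pd.DataFrame({'x': np.linspace(0, 1, 50)})
data['y'] = 2 * data['x'] + rng.normal(0, 0.1, 50)
params = {'method_args': {}, 'se': True, 'level': 0.95}
smoothed = glm(data, xseq=np.linspace(0, 1, 100), **params)
print(smoothed[['x', 'y', 'ymin', 'ymax']].head())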
Example #3
    def __init__(self):

        # generate artificial data
        np.random.seed(98765678)
        nobs = 200
        rvs = np.random.randn(nobs, 6)
        data_exog = rvs
        data_exog = sm.add_constant(data_exog)
        xbeta = 0.1 + 0.1 * rvs.sum(1)
        data_endog = np.random.poisson(np.exp(xbeta))

        # estimate discretemod.Poisson as benchmark
        self.res_discrete = Poisson(data_endog, data_exog).fit(disp=0)

        mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
        self.res_glm = mod_glm.fit()

        # estimate generic MLE
        self.mod = PoissonGMLE(data_endog, data_exog)
        self.res = self.mod.fit(start_params=0.9 * self.res_discrete.params,
                                method='nm',
                                disp=0)
Example #4
def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG):

    mean_count = np.mean(counts / sf, axis=1)[:, np.newaxis]
    index = np.where(disp_conv)[0]

    lowerBound = np.percentile(np.unique(disp_raw[index]), 1)
    upperBound = np.percentile(np.unique(disp_raw[index]), 99)

    idx = np.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = np.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    modGamma = sm.GLM(disp_raw[idx],
                      matrix,
                      family=sm.families.Gamma(sm.families.links.identity()))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = np.where(~np.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if np.sum(disp_fitted > 0) > 0:
        print("Found dispersion fit")

    if CFG['debug']:
        fig = plt.figure(figsize=(8, 6), dpi=100)
        ax = fig.add_subplot(111)
        idx = np.where(~np.isnan(disp_fitted))[0]
        ax.plot(np.mean(np.log10(counts + 1), axis=1)[idx],
                disp_fitted[idx], 'bo')
        ax.set_title('Fitted Dispersion Estimate')
        ax.set_xlabel('Mean expression count')
        ax.set_ylabel('Dispersion')
        plt.savefig('dispersion_fitted.pdf', format='pdf', bbox_inches='tight')
        plt.close(fig)

    return (disp_fitted, Lambda, idx)
Example #5
    def evaluate_payoffs(self):
        if self.is_call:
            self.intr = np.maximum(self.model.spots - self.K, 0.0)
        else:
            self.intr = np.maximum(self.K - self.model.spots, 0.0)
        for step in reversed(self.model.steps):
            curr_spots = self.model.spots[:, step]
            if step == self.model.steps[-1]:
                self.spv[:, step] = self.intr[:, step]
                self.epv[:, step] = self.intr[:, step]
                # for plotting
                next_spv = self.spv[:, step]
            else:
                next_spv = self.spv[:, step + 1] * math.exp(
                    -self.mkt_data.rr * self.model.t_diffs[step])
                basis = np.column_stack((curr_spots,
                                         curr_spots ** 2,
                                         curr_spots ** 3,
                                         curr_spots ** 4))
                ols = sm.GLM(next_spv, sm.add_constant(basis))

                res_ols = ols.fit()
                self.epv[:, step] = res_ols.fittedvalues
                self.spv[:, step] = np.where(
                    self.is_amer
                    and np.logical_and(self.intr[:, step] > self.epv[:, step],
                                       self.epv[:, step] > 0),
                    self.intr[:, step], next_spv)
                self.exercise_times = np.where(
                    self.is_amer
                    and np.logical_and(self.intr[:, step] > self.epv[:, step],
                                       self.epv[:, step] > 0), step,
                    self.exercise_times)
                if self.plot:
                    self.plot_result(self.epv[:, step], curr_spots, next_spv,
                                     "dummy title", self.path)

        self.payoff_evaluated = True
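The regression step above is a Longstaff-Schwartz-style fit of discounted continuation values on a quartic polynomial basis of the current spot; a self-contained sketch with made-up numbers:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(42)
curr_spots = rng.uniform(80.0, 120.0, 1000)    # simulated spot prices
next_spv = np.maximum(100.0 - curr_spots, 0.0) + rng.normal(0.0, 1.0, 1000)

# Intercept plus spot, spot^2, spot^3, spot^4 as regressors; with the
# default Gaussian family this is equivalent to ordinary least squares.
basis = sm.add_constant(np.column_stack([curr_spots ** k for k in range(1, 5)]))
continuation = sm.GLM(next_spv, basis).fit().fittedvalues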
Example #6
def link_functions_gaussian(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(
        path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(
        zipfile.ZipFile(
            h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open(
                "prostate_complete.csv")).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX],
                        y=h2o_data[myY],
                        family="gaussian",
                        link="identity",
                        alpha=[0.5],
                        Lambda=[0],
                        n_folds=0)
    sm_model = sm.GLM(endog=sm_data_response,
                      exog=sm_data_features,
                      family=sm.families.Gaussian(
                          sm.families.links.identity())).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model._model_json['output'][
        'residual_deviance'] / h2o_model._model_json['output']['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
Example #7
def run_GLM(X, y, family=None, link=None):
    '''Run a generalized linear model on the design matrix X given the target y.
    This function adds its own constant term to the design matrix.'''

    # defaults to a Binomial distribution with a logit link
    if link is None:
        link = sm.genmod.families.links.logit()
    if family is None:
        family = sm.families.Binomial(link=link)
    else:
        family = family(link=link)

    # make y a column vector
    if y.ndim == 1:
        y = y[:, np.newaxis]

    # make y a float
    if y.dtype == 'bool':
        y = y.astype('f8')

    # init yhat
    yhat = np.empty_like(y).ravel()
    yhat[:] = np.nan

    # find rows with finite predictors so we don't predict on NaNs
    idx = np.all(np.isfinite(X), axis=1)

    # add a constant term to the design matrix
    constant = np.ones([X.shape[0], 1])
    X = np.concatenate([constant, X], axis=1)

    # fit and predict
    glm_binom = sm.GLM(y, X, family=family, missing='drop')
    glm_result = glm_binom.fit()
    # history_names = glm_binom.exog_names[-2:]
    # res_c = glm_binom.fit_constrained(['{}<=0'.format(history_names[0]),'{}<=0'.format(history_names[1])])
    yhat[idx] = glm_result.predict(X[idx, :])

    return yhat, glm_result
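A hypothetical invocation on simulated data (run_GLM defaults to a Binomial family with a logit link and prepends its own constant):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
p = 1.0 / (1.0 + np.exp(-(X @ np.array([1.0, -0.5, 0.25]))))
y = rng.uniform(size=500) < p    # boolean labels; converted to float internally

yhat, result = run_GLM(X, y)
print(result.params)             # intercept first, then one slope per column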
Example #8
def test_logistic_model_without_regularization_no_rate(n=100):
    test_results = pd.DataFrame()
    weights, y, X = generate_data(n,
                                  loss='logistic',
                                  wt_param=1000,
                                  return_rate=False)
    # statsmodels
    y = np.array(y).ravel()
    # weights = np.array(weights).ravel().astype('int')
    # y = np.multiply(y, weights).astype('int')
    # non_actions = np.subtract(weights, y).astype('int')
    # y_sm = np.array(zip(y, non_actions))
    glm = sm.GLM(y, X, family=sm.families.Binomial())
    res = glm.fit()
    test_results['sm_glm'] = res.params
    # Sklearn (Expects labels not probabilities)
    sk_glm = linear_model.LogisticRegression(
        fit_intercept=False)  # Since design matrix has added intercept
    sk_glm.fit(X, y)
    test_results['sklearn_glm'] = sk_glm.coef_.ravel().tolist()
    print(test_results)
Example #9
def cces_glm(Y, X, W, summarize):

    logit = sm.GLM(Y,
                   X,
                   family=sm.families.Binomial(),
                   freq_weights=W,
                   missing='drop')

    result = logit.fit()

    if summarize:
        print(result.summary())

    params = result.params
    conf_int = result.conf_int()
    conf_int['coef'] = params
    result_df = pd.concat([conf_int, np.exp(conf_int)], axis=1)
    result_df.columns = [
        'coef_2.5%', 'coef_97.5%', 'coef', 'or_2.5%', 'or_97.5%', 'or'
    ]
    result_df['factor'] = result_df.index
    return result_df
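A sketch of calling this helper on synthetic survey-style data (the column names and weights are made up; the design matrix must already contain its constant):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(1)
n = 1000
X = sm.add_constant(pd.DataFrame({'age': rng.uniform(18, 80, n),
                                  'female': rng.integers(0, 2, n)}))
Y = (rng.uniform(size=n) < 0.4).astype(int)
W = rng.integers(1, 4, n)        # integer frequency weights

table = cces_glm(Y, X, W, summarize=False)
print(table[['coef', 'or_2.5%', 'or_97.5%']])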
Example #10
    def setup_class(cls):

        # generate artificial data
        np.random.seed(98765678)
        nobs = 200
        rvs = np.random.randn(nobs, 6)
        data_exog = rvs
        data_exog = sm.add_constant(data_exog, prepend=False)
        xbeta = 0.1 + 0.1 * rvs.sum(1)
        data_endog = np.random.poisson(np.exp(xbeta))

        # estimate discretemod.Poisson as benchmark
        cls.res_discrete = Poisson(data_endog, data_exog).fit(disp=0)

        mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
        cls.res_glm = mod_glm.fit()

        # estimate generic MLE
        cls.mod = PoissonGMLE(data_endog, data_exog)
        cls.res = cls.mod.fit(start_params=0.9 * cls.res_discrete.params,
                              method='bfgs',
                              disp=0)
Example #11
def anova_ml(args):
    if args.acase == 1 and not args.rad: fname = "csv/rio_c1.csv"
    if args.acase == 2 and not args.rad: fname = "csv/rio_c2.csv"
    if args.acase == 1 and args.rad: fname = "csv/rad_c1.csv"
    if args.acase == 2 and args.rad: fname = "csv/rad_c2.csv"
    df = pd.read_csv(fname)
    df.dt = np.abs(df.dt)
    if args.acase == 1 and not args.rad:
        df = df[(df.dt > 20) & (df.sza < 140) & (df.sza > 60)]
    if args.acase == 2 and args.rad: df = df[(df.dt != 0) & (df.dt != 360)]
    df["cossza"] = df.sza.transform(lambda x: np.cos(np.deg2rad(x)))
    df["logfmax"] = df.fmax.transform(lambda x: np.log10(x))
    models = {}
    models["m1"] = sm.GLM(df.dt,
                          df[["cossza", "lat", "logfmax", "lt"]].values,
                          family=sm.families.NegativeBinomial())
    response = []
    for k in models.keys():
        res = models[k].fit()
        response.append(res)
        print(res.summary().as_text())
    return response[0]
Example #12
def get_coefs(dim, dif_other, dur, ch, cond, t_RDK_dur, correct_only=True):
    """

    :param dim:
    :param dif_other:
    :param dur: [tr]
    :param ch: [tr, dim]
    :param cond: [tr, dim]
    :param t_RDK_dur:
    :param correct_only:
    :return: glmres.params, glmres.bse, glmres, glmmodel
    """
    id_dif = np.empty_like(cond)
    for dim1 in range(consts.N_DIM):
        out = np.unique(np.abs(cond[:, dim1]), return_inverse=True)
        _, id_dif[:, dim1] = out

    odim = consts.N_DIM - 1 - dim
    incl = ((t_RDK_dur == dur) & (np.isin(id_dif[:, odim], dif_other)))
    if correct_only:
        incl = (incl & (np.sign(ch[:, odim] - 0.5) == np.sign(cond[:, odim])))
    ch1 = ch[incl, dim]
    coh1 = cond[incl, dim]

    cohs, id_cohs = np.unique(coh1, return_inverse=True)
    if np.issubdtype(ch1.dtype, np.floating):
        # p_ch=1 is given
        ch11 = np.stack(
            [npg.aggregate(id_cohs, ch1),
             npg.aggregate(id_cohs, 1 - ch1)], -1)
    else:
        ch11 = npg.aggregate(np.vstack((id_cohs, 1 - ch1)), 1)

    glmmodel = sm.GLM(ch11,
                      sm.add_constant(cohs),
                      family=sm.families.Binomial())
    glmres = glmmodel.fit()
    return glmres.params, glmres.bse, glmres, glmmodel
Example #13
        def test_on_clear_data(self):
            """
            When applied to non-noisy data, svinfer's LogisticRegression is
            expected to return the same results as a classic logistic
            regression. The benchmark results come from statsmodels.api.GLM.
            """
            # fit by svinfer
            svinfer_model = LogisticRegression(
                self.predictors_clear, self.response,
                [0] * len(self.predictors_clear)).fit(
                    DataFrameProcessor(self.data))
            svinfer_beta = svinfer_model.beta
            svinfer_vcov = svinfer_model.beta_vcov

            # fit by statsmodels
            sm_model = sm.GLM(
                self.data[self.response].values,
                sm.add_constant(self.data[self.predictors_clear].values),
                family=sm.families.Binomial(),
            ).fit(cov_type="HC0")  # use basic sandwich
            sm_beta = sm_model.params
            sm_vcov = sm_model.cov_params()

            self.assertTrue(
                check_if_almost_equal(
                    svinfer_beta,
                    sm_beta,
                    absolute_tolerance=1e-12,
                    relative_tolerance=1e-12,
                ))

            self.assertTrue(
                check_if_almost_equal(
                    svinfer_vcov,
                    sm_vcov,
                    absolute_tolerance=1e-12,
                    relative_tolerance=1e-12,
                ))
Example #14
def linear_model_with_interxns(trainDf, testDf, max_depth=2, printem=False):

    trainDf = trainDf.copy()
    testDf = testDf.copy()

    explanatories = [c for c in trainDf.columns if c[0] == 'x']

    trainDf = interact(trainDf, explanatories, max_depth)
    testDf = interact(testDf, explanatories, max_depth)

    trainDf["x0"] = 1
    testDf["x0"] = 1

    explanatories = [c for c in trainDf.columns if c[0] == 'x']

    if max_depth == 0:
        explanatories = ["x0"]

    linearModel = sm.GLM(trainDf["y"],
                         trainDf[explanatories],
                         family=sm.families.Gaussian())
    linearModel = linearModel.fit()

    preds = np.array(linearModel.predict(testDf[explanatories]))
    acts = np.array(testDf["y"])

    if printem:
        display(preds)
        print("")
        display(acts)
        print("")

    errs = preds - acts

    MAE = sum(abs(errs)) / len(errs)
    RMSE = np.sqrt(sum(errs * errs) / len(errs))

    return MAE, RMSE
Example #15
def make_model(draw=True):
    dict_data = get_data()
    y = dict_data['y_param']

    sparse_matrix, i = [], 1
    for name, value in dict_data.items():
        if name != 'y_param':
            print('x{} = {}'.format(i, name))
            sparse_matrix.append(value)
            i += 1

    ones = np.ones(len(sparse_matrix[0]))
    X = sm.add_constant(np.column_stack((sparse_matrix[0], ones)))
    for ele in sparse_matrix[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))

    glm_binom = sm.GLM(y, X)
    res = glm_binom.fit()

    # general info about Gaussian Model (Exp model)
    print(res.summary())

    # This is a general specification test,
    # for additional non-linear effects in a model.
    # If the p-value of the f-test is below a threshold, e.g. 0.1, then this
    # indicates that there might be additional non-linear effects in the model
    # and that the linear model is mis-specified.
    print(reset_ramsey(res, len(sparse_matrix)))

    check_sum_params(res.params)

    if draw:
        y_sum = sum(y)
        y = [float(item) / y_sum for item in y]
        yhat = res.mu
        make_observed_values(yhat, y)
        make_residual_dependence(yhat, res.resid_pearson)
        make_normalised_distribution(res.resid_deviance.copy())
Example #16
def link_functions_binomial(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(
        path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(
        zipfile.ZipFile(
            h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open(
                "prostate_complete.csv")).values
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX],
                        y=h2o_data[myY].asfactor(),
                        family="binomial",
                        link="logit",
                        alpha=[0.5],
                        Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response,
                      exog=sm_data_features,
                      family=sm.families.Binomial(
                          sm.families.links.logit())).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = h2o_model._model_json['output'][
        'residual_deviance'] / h2o_model._model_json['output']['null_deviance']
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
Example #17
def calculate_examplebehav(sigma=1):
    xc = np.random.uniform(0, 10, (n, 4))  # xc = x continuous
    beta0 = [1, 1, 1, 1]
    beta0 = mrc.normalize(beta0)
    I0 = np.dot(xc, beta0)
    y = 1.5 * (I0 - 8) + np.random.lognormal(0, sigma, n)
    y[y < 0] = 0
    y[y > 10] = 10
    #plt.plot(I0, y, 'o')
    tau_real = st.kendalltau(np.dot(xc, beta0), y)[0]
    x = xc.astype(int)
    #y = y.astype(int)
    #   x[abs(x)<2]=0
    #   x[abs(x)>5]=5
    tau0 = st.kendalltau(np.dot(x, beta0), y)[0]
    I00 = np.dot(x, beta0)
    #plt.plot(I00,y,'o')
    beta_ini = np.random.uniform(-1, 1, dim)
    beta_ori, tauback, iterations = mrc.ori(y, x, 5, 100, beta_ini)
    Ix = np.dot(x, beta_ori)

    xx = sm.add_constant(x)
    Gaussian_model = sm.GLM(y, xx, family=sm.families.Gaussian())
    Gaussian_results = Gaussian_model.fit()
    beta_glm = mrc.normalize(Gaussian_results.params[1:])
    tauglm = st.kendalltau(np.dot(x, beta_glm), y)[0]

    ### STATISTICAL SIGNIFICANCE PART ###
    # For n=100, d=4: shape1 = 8.336884, shape2 = 51.11006.
    # Check whether these beta-distribution estimates apply more generally.
    shape1 = 8.336884
    shape2 = 51.11006
    pval = 1 - bt.cdf(tauback, shape1, shape2, loc=0, scale=1)

    cos_ori = np.dot(beta0, beta_ori)
    cos_glm = np.dot(beta0, beta_glm)
    deltacos = cos_ori - cos_glm
    #plt.plot(Ix,y,'o')
    return (cos_ori, cos_glm, deltacos)
Example #18
    def fit(
        self,
        start_params=None,
        maxiter=100000,
        maxfun=5000,
        disp=False,
        method="bfgs",
        **kwds
    ):
        """
        Fit the model.

        Parameters
        ----------
        start_params : array-like
            A vector of starting values for the regression
            coefficients.  If None, a default is chosen.
        maxiter : integer
            The maximum number of iterations.
        maxfun : integer
            The maximum number of function evaluations.
        disp : bool
            Show convergence stats.
        method : str
            The optimization method to use.
        """

        if start_params is None:
            start_params = (
                sm.GLM(self.endog, self.exog, family=Binomial()).fit(disp=False).params
            )
            start_params = np.append(start_params, [0.5] * self.Z.shape[1])

        return super(Beta, self).fit(
            start_params=start_params,
            maxiter=maxiter,
            maxfun=maxfun,
            method=method,
            disp=disp,
            **kwds
        )
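The warm-start pattern above (seed a richer likelihood with binomial-GLM coefficients, then pad with extra parameters) can be sketched standalone:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(7)
exog = sm.add_constant(rng.normal(size=(300, 2)))
eta = exog @ np.array([0.2, 1.0, -0.5])
endog = (rng.uniform(size=300) < 1.0 / (1.0 + np.exp(-eta))).astype(float)

# Mean-model coefficients from the GLM, plus one starting value (0.5)
# for each extra precision parameter of the richer model.
start_params = sm.GLM(endog, exog, family=sm.families.Binomial()).fit().params
start_params = np.append(start_params, [0.5])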
Example #19
def logitrank(df):
    teams = list(sorted(df['H'].unique()))
    dummies = dict(
        (team,
         (df['H'] == team).astype(int) - (df['V'] == team).astype(int))
        for team in teams)
    df2 = pd.DataFrame(dummies)
    df2['pdiff'] = df['PTS/H'] - df['PTS/V']

    y, X = dmatrices('I(pdiff>0) ~  %s' % ' + '.join(teams[:-1]), df2)
    results = sm.GLM(y, X, family=sm.families.Binomial()).fit()

    strengths = {
        name: val
        for name, val in zip(teams, np.append(results.params[1:], 0))
    }

    return {
        key: i
        for i, (key, _) in enumerate(
            sorted(strengths.items(), key=lambda x: x[1]))
    }
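A hypothetical round-robin frame to feed logitrank (dmatrices comes from patsy; the column names follow the function body):

import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
games = pd.DataFrame({'H': ['A', 'B', 'C', 'A', 'B', 'C'] * 5,
                      'V': ['B', 'C', 'A', 'C', 'A', 'B'] * 5})
games['PTS/H'] = rng.integers(90, 120, len(games))
games['PTS/V'] = rng.integers(90, 120, len(games))

print(logitrank(games))    # team -> rank by fitted strength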
Example #20
def calculate_cauchy(sigma):
    x = np.random.uniform(0, 1, (n, dim))
    beta0 = [2, 1, -1, -1]
    beta0 = mrc.normalize(beta0)
    noise = cauchy.rvs(loc=0, scale=.1, size=n)
    I0 = np.dot(x, beta0)
    y = I0 + sigma * (noise)

    beta_ini = np.random.uniform(-1, 1, dim)  # random starting direction
    beta_ori, tauori, iterations = mrc.ori(y, x, 2, 500, beta_ini)
    Ix = np.dot(x, beta_ori)
    beta_ori = mrc.normalize(beta_ori)

    xx = sm.add_constant(x)
    Gaussian_model = sm.GLM(y, xx, family=sm.families.Gaussian())
    Gaussian_results = Gaussian_model.fit()
    #print(Gaussian_results.summary())
    beta_glm = mrc.normalize(Gaussian_results.params[1:])

    cos_glm = np.abs(np.dot(beta0, beta_glm))
    cos_ori = np.abs(np.dot(beta0, beta_ori))
    deltacos = cos_ori - cos_glm
    return (cos_ori, cos_glm, deltacos)
Example #21
def get_dataset(adjacency_pairs_list):
    normalise = False
    c_count, c_gender, c_plen, y = create_predictors(adjacency_pairs_list, 3, normalise)

    # TODO: compute these betas based on the training set only
    betas = {}

    for w in CATEGORIES:
        c_w = c_count[w]
        g_w = c_gender[w]
        y_w = y[w]

        if sum(y_w) > 0:
            c_w = np.array(c_w)
            g_w = np.array(g_w)
            X = np.array([np.ones(len(c_w)), c_w, g_w, c_w * g_w]).T

            y_w = np.array(y_w)

            res = sm.GLM(y_w, X, family=sm.families.Binomial()).fit()
            # print('params', w, res.params)
            betas[w] = res.params
    return c_count, c_gender, c_plen, y, betas
Example #22
def hic_norm(mat, count_neig, fire):
    """Poisson normalization.

    Parameters
    ----------
    mat : DataFrame
        Pandas DataFrame consisting of genomic regions
    count_neig : str
        DataFrame column name where neighbor count is located
    fire : str
        DataFrame column name where fire score should be stored
    """
    y = mat[count_neig]
    x = mat[["F", "GC", "M"]]
    x = sm.add_constant(x)

    glm = sm.GLM(y, x, family=sm.families.Poisson())
    res = glm.fit()
    mat[fire] = mat[count_neig] / np.exp(res.params.iloc[0] +
                                         mat["F"] * res.params.iloc[1] +
                                         mat["GC"] * res.params.iloc[2] +
                                         mat["M"] * res.params.iloc[3])
    logging.debug("Done calculating Poisson normalization.")
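A minimal sketch of calling hic_norm on a made-up bin table (F, GC, and M stand for the covariate columns the function expects; module-level imports of statsmodels and logging are assumed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
bins = pd.DataFrame({'F': rng.uniform(0.0, 1.0, 200),
                     'GC': rng.uniform(0.3, 0.6, 200),
                     'M': rng.uniform(0.8, 1.0, 200)})
bins['n_reads'] = rng.poisson(10.0, 200) + 1    # strictly positive counts

hic_norm(bins, count_neig='n_reads', fire='fire_score')
print(bins['fire_score'].describe())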
Example #23
    def fit(self, data):
        """
        fit log-linear model to predicted weights
        """
        if self.marginals.is_empty:
            self.calculate_marginals()
        self.actual_counts = group_counts(data, self.demos)
        expected_counts = _get_expected_values(self.marginals, self.demos,
                                               self.population)

        merged = pd.merge(self.actual_counts, expected_counts)
        model_data = sm.add_constant(pd.get_dummies(merged, columns=self.demos),
                                     prepend=False)
        features = model_data.columns[2:]
        model = sm.GLM(model_data["N"],
                       model_data[features],
                       family=sm.families.Poisson(),
                       offset=np.log(model_data["n"])).fit()
        self.model = model
        return self
Example #24
    def fit(self, X, y):
        """ 
        takes in training data and fits a model
        """

        self.classes_ = list(set(y))

        X = sm.add_constant(X)

        #  returns statsmodel link and distribution functions based on user input
        link = self.get_link()
        family = self.get_family(link)

        #  fits and stores statsmodel glm
        model = sm.GLM(y, X, family=family)
        self.fitted_model = model.fit()

        #  adds attributes for explainability
        #  the intercept can't be a multidimensional np array as in
        #  classification, because scoring_base.py's compute_lm_significant
        #  hstack call would fail
        self.coef_ = np.array(
            self.fitted_model.params[1:])  # drop the first value (the intercept)
        self.intercept_ = float(self.fitted_model.params[0])
Example #25
def cal_cv2(data, filename):
    b = data.iloc[:, 1:].T.values
    c = b.copy()
    #
    means = np.mean(c, axis=1)
    variance = np.var(c, axis=1)
    cv2 = variance/means**2
    #
    minMeanForFit = np.quantile(means[np.where(cv2 > 0.5)], 0.95)
    useForFit = means >= minMeanForFit
    exog = np.array([np.repeat(1, means[useForFit].shape[0]),
                     1 / means[useForFit]]).T
    gamma_model = sm.GLM(cv2[useForFit], exog,
                         family=sm.families.Gamma(
                             link=sm.genmod.families.links.identity()))
    gamma_results = gamma_model.fit()
    a0 = gamma_results.params[0]
    a1 = gamma_results.params[1]
    afit = a1/means + a0
    varFitRatio = variance / (afit*(means**2))
    cv2_score = pd.DataFrame({'Feature': data.columns[1:], "cv2_score": varFitRatio})
    cv2_score = cv2_score.sort_values('cv2_score', ascending=False)
    cv2_score.to_csv("{}_cv2.csv".format(filename), index=False)
    data_train = data.reindex(['Label']+list(cv2_score['Feature']), axis=1)
    data_train.to_csv("{}_cv2_data.csv".format(filename), index=None)
    return data_train
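The trend fitted above has the form cv2(m) = a1 / m + a0; the same gamma fit on synthetic data, as a self-contained check:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
means = rng.uniform(1.0, 100.0, 500)
cv2 = (2.0 / means + 0.1) * rng.gamma(50.0, 1.0 / 50.0, 500)    # true a1=2.0, a0=0.1

exog = np.column_stack([np.ones_like(means), 1.0 / means])
fit = sm.GLM(cv2, exog,
             family=sm.families.Gamma(link=sm.families.links.identity())).fit()
print(fit.params)    # roughly [0.1, 2.0]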
Example #26
def define_bin_params(Y, n, X=None):
    n_obs = len(Y)
    p = ncol(X)

    g = max(2, int(n_obs / 2))
    try:
        bin_mod = sm.GLM(endog=np.c_[Y, n],
                         exog=X,
                         family=sm.families.Binomial()).fit()
        bin_params = bin_mod.params
        bin_cov = fill_diag((g / (1 + g)) * bin_mod.cov_params())
    except Exception:
        # fall back to a crude initialization when the GLM fit fails
        if np.sum(Y) > 0:
            binmean = np.sum(Y) / np.sum(n)
            binmean = np.log(binmean / (1 - binmean))
            bin_params = np.zeros(p)
            bin_params[0] = binmean
        else:
            bin_params = np.zeros(p)
            bin_params[0] = np.max([-3, -np.sum(n)])
        bin_cov = np.identity(p)

    return bin_params, bin_cov, p
Example #27
def prostate():

    h2o_data = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    sm_data = pd.read_csv(
        pyunit_utils.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                            nfolds=10,
                                            alpha=0.5)
    h2o_glm.train(x=list(range(2, h2o_data.ncol)), y=1, training_frame=h2o_data)
    sm_glm = sm.GLM(endog=sm_data_response,
                    exog=sm_data_features,
                    family=sm.families.Binomial()).fit()

    print "statsmodels null deviance {0}".format(sm_glm.null_deviance)
    print "h2o null deviance {0}".format(h2o_glm.null_deviance())
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()
               ) < 1e-5, "Expected null deviances to be the same"
Example #28
def _regress_out_chunk(data):
    # data is a tuple containing the selected columns from adata.X
    # and the regressors dataFrame
    data_chunk = data[0]
    regressors = data[1]
    variable_is_categorical = data[2]

    responses_chunk_list = []
    import statsmodels.api as sm
    from statsmodels.tools.sm_exceptions import PerfectSeparationError

    for col_index in range(data_chunk.shape[1]):

        # if all values are identical, statsmodels.api.GLM throws an error;
        # but then no regression is necessary anyway...
        if not (data_chunk[:, col_index] != data_chunk[0, col_index]).any():
            responses_chunk_list.append(data_chunk[:, col_index])
            continue

        if variable_is_categorical:
            regres = np.c_[np.ones(regressors.shape[0]),
                           regressors[:, col_index]]
        else:
            regres = regressors
        try:
            result = sm.GLM(data_chunk[:, col_index],
                            regres,
                            family=sm.families.Gaussian()).fit()
            new_column = result.resid_response
        except PerfectSeparationError:  # this emulates R's behavior
            logg.warning(
                'Encountered PerfectSeparationError, setting to 0 as in R.')
            new_column = np.zeros(data_chunk.shape[0])

        responses_chunk_list.append(new_column)

    return np.vstack(responses_chunk_list)
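A sketch of driving this chunk worker directly (scanpy normally dispatches it over multiprocessing; the arrays here are synthetic, with Gaussian noise so every fit succeeds):

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(9)
covariates = sm.add_constant(rng.normal(size=(100, 2)))    # e.g. total counts, pct mito
expression = covariates @ rng.normal(size=(3, 4)) + rng.normal(0.0, 0.1, (100, 4))

residuals = _regress_out_chunk((expression, covariates, False))
print(residuals.shape)    # (4, 100): one row of residuals per gene column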
Example #29
def looCV(clim, predic, fnc):
    """Calculate the leave-one-out cross-validation error.

    Parameters
    ----------
    clim : damage time series
        DESCRIPTION.
    predic : DataFrame
        DESCRIPTION.
    fnc : LinkFunctionObject
        link function

    Returns
    -------
    out-of-sample error

    """

    err = 0
    for lo_index in range(len(clim)):

        clim_mask = np.ma.array(clim, mask=False)
        clim_mask.mask[lo_index] = True
        clim_lo = clim_mask.compressed()

        predic_lo = predic.reset_index().drop(lo_index,
                                              axis=0).drop('index', axis=1)

        model_res = sm.GLM(clim_lo, predic_lo,
                           family=sm.families.Gamma(fnc)).fit(maxiter=5000,
                                                              scale=1.)

        value_pred = model_res.predict(predic).iloc[lo_index]

        err = err + (clim[lo_index] - value_pred)**2

    return err / len(clim)
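A hypothetical call with synthetic damage data (the link object is passed through to the Gamma family; the design frame must already carry its constant):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(2)
predic = pd.DataFrame({'const': 1.0,
                       'temp': rng.normal(size=30),
                       'precip': rng.normal(size=30)})
clim = rng.gamma(2.0, 50.0, 30)    # strictly positive damages

print(looCV(clim, predic, sm.families.links.log()))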
Example #30
    def fit_beta_reg(y, X, df, group_title):

        curr_best_fit = 0
        curr_best_model = None

        for i in tqdm(range(10)):

            X_sample = df.groupby(group_title).apply(
                lambda temp: temp.sample(int(HELD_OUT_PROP * len(temp))))
            train_idx = pd.Series(X_sample.index.get_level_values(1))
            test_idx = df.index.difference(train_idx).tolist()
            np.random.shuffle(test_idx)
            train_idx = train_idx.tolist()

            # print(train_idx)
            # print(len(train_idx))
            # print(len(test_idx))
            # print(len(set(train_idx).union(set(test_idx))))
            # print(X.shape)
            # print(df.shape)

            X_train = X[train_idx]
            X_test = X[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]

            binom_glm = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            binom_fit_model = binom_glm.fit()
            fit_val = LexicalAnalysis.goodness_of_fit(binom_fit_model, y_test,
                                                      X_test)

            if fit_val > curr_best_fit:
                print("NEW BEST MODEL")
                print(f"R^2 score of: {fit_val}")
                curr_best_fit = fit_val
                curr_best_model = binom_fit_model
        return curr_best_model