예제 #1
0
    def test_formulas(self):
        """Check that the array and formula interfaces of MixedLM agree.

        One simulated data set is fitted through the array constructor,
        through ``from_formula`` with the groups passed as values,
        through ``from_formula`` with the groups passed as a column
        name, and through ``mixedlm`` without a ``re_formula``.  The
        generated names and the fitted parameters are compared.
        """
        np.random.seed(2410)
        triple = [1, 1, 1]
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        groups = np.kron(np.arange(100), triple)
        group_effects = np.kron(np.random.normal(size=100), triple)
        g_errors = exog_re * group_effects
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)

        # Array interface: check the automatically generated names.
        array_model = MixedLM(endog, exog, groups, exog_re)
        array_data = array_model.data
        assert_(array_data.xnames == ["x1", "x2", "x3", "x4"])
        assert_(array_data.exog_re_names == ["x_re1"])
        assert_(array_data.exog_re_names_full == ["x_re1 RE"])
        array_fit = array_model.fit()

        # Formula interface, with the groups given as the actual values.
        df = pd.DataFrame({"endog": endog})
        for k, column in enumerate(exog.T):
            df["exog%d" % k] = column
        df["exog_re"] = exog_re
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        fe_names = ["exog0", "exog1", "exog2", "exog3"]
        values_model = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups)

        values_data = values_model.data
        assert_(values_data.xnames == fe_names)
        assert_(values_data.exog_re_names == ["exog_re"])
        assert_(values_data.exog_re_names_full == ["exog_re RE"])

        values_fit = values_model.fit()
        assert_almost_equal(array_fit.params, values_fit.params)

        # Formula interface, with the groups given as a column name.
        df["groups"] = groups
        named_model = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups="groups")
        named_data = named_model.data
        assert_(named_data.xnames == fe_names)
        assert_(named_data.exog_re_names == ["exog_re"])
        assert_(named_data.exog_re_names_full == ["exog_re RE"])

        named_fit = named_model.fit(start_params=values_fit.params)
        assert_allclose(array_fit.params, named_fit.params, rtol=1e-4)

        # Default variance structure: the array model gets a constant
        # exog_re (which produces a zero estimated variance parameter)
        # and is compared with the formula model without re_formula.
        ones_re = np.ones(len(endog), dtype=np.float64)
        const_model = MixedLM(endog, exog, groups, ones_re)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            const_fit = const_model.fit()
        from statsmodels.formula.api import mixedlm
        default_model = mixedlm(fml, df, groups="groups")
        assert_(default_model.data.exog_re_names == ["groups"])
        assert_(default_model.data.exog_re_names_full == ["groups RE"])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            default_fit = default_model.fit()
        assert_almost_equal(const_fit.params, default_fit.params)
    def test_sparse(self):
        """Dense and sparse fits of the pastes model must give equal results."""
        here = os.path.dirname(os.path.abspath(__file__))
        # NOTE(review): unusual directory name -- confirm that it exists
        # next to this file.
        fname = os.path.join(here, '#1lab_results', 'pastes.csv')

        data = pd.read_csv(fname)
        # Arguments shared by the dense and the sparse model.
        common = dict(groups="batch",
                      re_formula="1",
                      vc_formula={"cask": "0 + cask"},
                      data=data)

        # Dense
        dense_fit = MixedLM.from_formula("strength ~ 1", **common).fit()

        # Sparse
        sparse_model = MixedLM.from_formula("strength ~ 1",
                                            use_sparse=True,
                                            **common)
        sparse_fit = sparse_model.fit()

        assert_allclose(dense_fit.params, sparse_fit.params)
        assert_allclose(dense_fit.bse, sparse_fit.bse)
예제 #3
0
def test_summary_col():
    """Compare ``summary_col`` output for two MixedLM fits to a fixed table."""
    from statsmodels.iolib.summary2 import summary_col

    # The response values are hard coded.  They were simulated with:
    #   ids = np.asarray(ids)
    #   np.random.seed(123987)
    #   y = x + np.array([-1, 0, 1])[ids - 1] + 2 * np.random.randn(len(y))
    response = np.array([
        1.727, -1.037, 2.904, 3.569, 4.629, 5.736, 6.747, 7.020, 5.624, 10.155,
        10.400, 17.164, 17.276, 14.988, 14.453
    ])
    frame = pd.DataFrame({
        'Y': response,
        'X': list(range(1, 16)),
        'IDS': [1] * 5 + [2] * 4 + [3] * 6,
    })

    # Start values are supplied to speed up convergence.
    start_y = np.array([-1.26722599, 1.1617587, 0.19547518])
    fit_y = MixedLM.from_formula(
        'Y ~ X', frame, groups=frame['IDS']).fit(start_params=start_y)
    start_x = np.array([3.48416861, 0.55287862, 1.38537901])
    fit_x = MixedLM.from_formula(
        'X ~ Y', frame, groups=frame['IDS']).fit(start_params=start_x)

    table = summary_col([fit_y, fit_x], stars=True)
    expected = (
        '\n=============================\n              Y         X    \n'
        '-----------------------------\nGroup Var 0.1955    1.3854   \n'
        '          (0.6032)  (2.7377) \nIntercept -1.2672   3.4842*  \n'
        '          (1.6546)  (1.8882) \nX         1.1618***          \n'
        '          (0.1959)           \nY                   0.5529***\n'
        '                    (0.2080) \n=============================\n'
        'Standard errors in\nparentheses.\n* p<.1, ** p<.05, ***p<.01')
    assert_equal(str(table), expected)
예제 #4
0
    def test_formulas(self):
        """Fit the same simulated data via arrays and via formulas.

        Checks the parameter names stored on each model and that the
        fitted parameters of the different interfaces match.
        """
        np.random.seed(2410)
        n_rep = [1, 1, 1]
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        groups = np.kron(np.arange(100), n_rep)
        per_group = np.kron(np.random.normal(size=100), n_rep)
        g_errors = exog_re * per_group
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)

        base_model = MixedLM(endog, exog, groups, exog_re)
        # Names generated for plain array input.
        base_info = base_model.data
        assert_(base_info.xnames == ["x1", "x2", "x3", "x4"])
        assert_(base_info.exog_re_names == ["x_re1"])
        assert_(base_info.exog_re_names_full == ["x_re1 RE"])
        base_fit = base_model.fit()

        # Build the data frame used by all formula based models.
        df = pd.DataFrame({"endog": endog})
        for k, column in enumerate(exog.T):
            df["exog%d" % k] = column
        df["exog_re"] = exog_re
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        expected_fe = ["exog0", "exog1", "exog2", "exog3"]

        # Formula fit with the group labels passed as values.
        by_value = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                        groups=groups)
        assert_(by_value.data.xnames == expected_fe)
        assert_(by_value.data.exog_re_names == ["exog_re"])
        assert_(by_value.data.exog_re_names_full == ["exog_re RE"])
        by_value_fit = by_value.fit()
        assert_almost_equal(base_fit.params, by_value_fit.params)

        # Formula fit with the group labels passed as a column name.
        df["groups"] = groups
        by_name = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                       groups="groups")
        assert_(by_name.data.xnames == expected_fe)
        assert_(by_name.data.exog_re_names == ["exog_re"])
        assert_(by_name.data.exog_re_names_full == ["exog_re RE"])
        by_name_fit = by_name.fit(start_params=by_value_fit.params)
        assert_allclose(base_fit.params, by_name_fit.params, rtol=1e-4)

        # Default variance structure: an array model with a constant
        # exog_re (giving a zero estimated variance parameter) versus a
        # formula model created without re_formula.
        unit_re = np.ones(len(endog), dtype=np.float64)
        intercept_model = MixedLM(endog, exog, groups, unit_re)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            intercept_fit = intercept_model.fit()
        from statsmodels.formula.api import mixedlm
        formula_model = mixedlm(fml, df, groups="groups")
        assert_(formula_model.data.exog_re_names == ["groups"])
        assert_(formula_model.data.exog_re_names_full == ["groups RE"])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            formula_fit = formula_model.fit()
        assert_almost_equal(intercept_fit.params, formula_fit.params)
예제 #5
0
    def test_sparse(self):
        """Fit the pastes model densely and sparsely and compare the fits."""
        test_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(test_dir, 'results', 'pastes.csv')
        data = pd.read_csv(fname)
        vcf = {"cask": "0 + cask"}

        fits = []
        # First pass is the dense fit, second pass adds use_sparse=True.
        for extra in ({}, {"use_sparse": True}):
            model = MixedLM.from_formula(
                "strength ~ 1",
                groups="batch",
                re_formula="1",
                vc_formula=vcf,
                data=data,
                **extra)
            fits.append(model.fit())

        dense_fit, sparse_fit = fits
        assert_allclose(dense_fit.params, sparse_fit.params)
        assert_allclose(dense_fit.bse, sparse_fit.bse)
예제 #6
0
def test_summary_col():
    """Check the text table that ``summary_col`` builds from two fits."""
    from statsmodels.iolib.summary2 import summary_col

    group_ids = [1] * 5 + [2] * 4 + [3] * 6
    regressor = list(range(1, 16))
    # Hard coded simulated response, originally generated with:
    #   ids = np.asarray(ids)
    #   np.random.seed(123987)
    #   y = x + np.array([-1, 0, 1])[ids - 1] + 2 * np.random.randn(len(y))
    response = np.array([
        1.727, -1.037, 2.904, 3.569, 4.629, 5.736, 6.747, 7.020, 5.624, 10.155,
        10.400, 17.164, 17.276, 14.988, 14.453
    ])
    d = pd.DataFrame({'Y': response, 'X': regressor, 'IDS': group_ids})

    # Each entry: (formula, start_params used to speed up convergence).
    specs = [
        ('Y ~ X', np.array([-1.26722599, 1.1617587, 0.19547518])),
        ('X ~ Y', np.array([3.48416861, 0.55287862, 1.38537901])),
    ]
    fitted = []
    for formula, start in specs:
        model = MixedLM.from_formula(formula, d, groups=d['IDS'])
        fitted.append(model.fit(start_params=start))

    out = summary_col(fitted, stars=True)
    expected = (
        '\n=============================\n              Y         X    \n'
        '-----------------------------\nGroup Var 0.1955    1.3854   \n'
        '          (0.6032)  (2.7377) \nIntercept -1.2672   3.4842*  \n'
        '          (1.6546)  (1.8882) \nX         1.1618***          \n'
        '          (0.1959)           \nY                   0.5529***\n'
        '                    (0.2080) \n=============================\n'
        'Standard errors in\nparentheses.\n* p<.1, ** p<.05, ***p<.01')
    assert_equal(str(out), expected)
예제 #7
0
    def test_sparse(self):
        """Dense and sparse fits of the pastes model must agree.

        The test returns early, without checking anything, when the
        installed SciPy is older than 0.16.
        """
        import scipy

        # Compare (major, minor) instead of the minor number alone;
        # otherwise every SciPy 1.x release with a minor version below 16
        # is mistaken for a pre-0.16 release and the test silently passes
        # without running.
        major, minor = (int(part) for part in scipy.__version__.split(".")[:2])
        if (major, minor) < (0, 16):
            return

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        rdir = os.path.join(cur_dir, "results")
        fname = os.path.join(rdir, "pastes.csv")

        # Dense
        data = pd.read_csv(fname)
        vcf = {"cask": "0 + cask"}
        model = MixedLM.from_formula(
            "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf,
            data=data)
        result = model.fit()

        # Sparse: same model, fitted with use_sparse=True.
        model2 = MixedLM.from_formula(
            "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf,
            use_sparse=True, data=data)
        result2 = model2.fit()

        assert_allclose(result.params, result2.params)
        assert_allclose(result.bse, result2.bse)
예제 #8
0
def calcBetaLme(data_full, gain_full, loss_full, linear_full, quad_full, run_group, thrshd=None):
    """
    Fit a mixed linear model ``bold ~ gain + loss`` for every voxel.

    Parameters
    ----------
    data_full : ndarray
        BOLD data; the last axis is treated as time and all leading
        axes are flattened into voxels.
    gain_full, loss_full : sequence
        Gain and loss regressor values, one per time point.
    linear_full, quad_full : sequence
        Values stored in the per-voxel data frame as ``ldrift`` and
        ``qdrift``.  They are not part of the fitted formula.
    run_group : sequence
        Grouping variable passed to ``MixedLM`` as ``groups``.
    thrshd : float, optional
        Voxels whose mean over time is less than or equal to this value
        are not fitted and get an all-zero row.  With ``None`` (the
        default) every voxel is fitted.

    Returns
    -------
    beta : ndarray, shape (number of voxels, 5)
        Per voxel: gain coefficient, gain p-value, loss coefficient,
        loss p-value and the convergence flag of the fit.
    """
    T = data_full.shape[-1]
    time_by_vox = np.reshape(data_full, (-1, T)).T
    n_vox = time_by_vox.shape[1]
    beta = np.empty([n_vox, 5])
    fml = "bold ~ gain + loss"
    for k in range(n_vox):
        bold = time_by_vox[:, k]
        # Threshold used to identify voxels inside the brain: voxels at
        # or below it are skipped.  ``is not None`` keeps a threshold of
        # 0 meaningful.
        if thrshd is not None and np.mean(bold) <= thrshd:
            beta[k, :] = 0
            continue
        dt = pd.DataFrame({'gain': gain_full, 'loss': loss_full,
                           'run_group': run_group,
                           'ldrift': linear_full, 'qdrift': quad_full,
                           'bold': bold})
        mod_lme = MixedLM.from_formula(fml, dt, groups=dt["run_group"])
        lme_result = mod_lme.fit()
        beta[k, :] = [lme_result.fe_params["gain"], lme_result.pvalues["gain"],
                      lme_result.fe_params["loss"], lme_result.pvalues["loss"],
                      lme_result.converged]

    return beta
예제 #9
0
    def test_sparse(self):
        """Compare a dense and a sparse fit of the pastes model.

        Returns early, without checking anything, when the installed
        SciPy is older than 0.16.
        """
        import scipy

        # The version check must use (major, minor).  Looking at the
        # minor number alone treats e.g. SciPy 1.5 as older than 0.16,
        # which silently disables this test.
        version = tuple(int(part)
                        for part in scipy.__version__.split(".")[:2])
        if version < (0, 16):
            return

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        rdir = os.path.join(cur_dir, 'results')
        fname = os.path.join(rdir, 'pastes.csv')

        # Dense
        data = pd.read_csv(fname)
        vcf = {"cask": "0 + cask"}
        model = MixedLM.from_formula("strength ~ 1",
                                     groups="batch",
                                     re_formula="1",
                                     vc_formula=vcf,
                                     data=data)
        result = model.fit()

        # Sparse: same model, fitted with use_sparse=True.
        model2 = MixedLM.from_formula("strength ~ 1",
                                      groups="batch",
                                      re_formula="1",
                                      vc_formula=vcf,
                                      use_sparse=True,
                                      data=data)
        result2 = model2.fit()

        assert_allclose(result.params, result2.params)
        assert_allclose(result.bse, result2.bse)
예제 #10
0
def test_handle_missing():
    """Check that ``missing='drop'`` matches dropping rows beforehand.

    One value is set to NaN in each of the six columns.  For every
    combination of including ``re_formula`` and ``vc_formula``, a model
    fitted on data cleaned with ``dropna`` is compared with a model
    fitted on the raw data using ``missing='drop'``.
    """
    np.random.seed(23423)
    columns = ["y", "g", "x1", "z1", "c1", "c2"]
    df = pd.DataFrame(np.random.normal(size=(100, 6)))
    df.columns = columns
    df["g"] = np.kron(np.arange(50), np.ones(2))
    ranef = np.random.normal(size=(50, 4))
    ranef = np.kron(ranef, np.ones((2, 1)))
    df["y"] = ranef[:, 0] + ranef[:, 1] * df.z1 + ranef[:, 2] * df.c1
    df["y"] += ranef[:, 3] * df.c2 + np.random.normal(size=100)
    # Put one missing value in each column, on rows 1 to 6.  ``np.nan``
    # is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    for row, name in enumerate(columns, start=1):
        df.loc[row, name] = np.nan

    fml = "y ~ x1"
    re_formula = "1 + z1"
    vc_formula = {"a": "0 + c1", "b": "0 + c2"}
    for include_re in False, True:
        for include_vc in False, True:
            kwargs = {}
            # Columns that the current model uses.
            va = ["y", "g", "x1"]
            if include_re:
                kwargs["re_formula"] = re_formula
                va.append("z1")
            if include_vc:
                kwargs["vc_formula"] = vc_formula
                va.extend(["c1", "c2"])

            # Only rows with a missing value in a used column are dropped.
            dx = df[va].dropna()

            # Some of these models are severely misspecified with
            # small n, so produce convergence warnings.  Not relevant
            # to what we are checking here.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                # Drop missing externally
                model1 = MixedLM.from_formula(fml,
                                              groups="g",
                                              data=dx,
                                              **kwargs)
                result1 = model1.fit()

                # MixedLM handles missing
                model2 = MixedLM.from_formula(fml,
                                              groups="g",
                                              data=df,
                                              missing='drop',
                                              **kwargs)
                result2 = model2.fit()

                assert_allclose(result1.params, result2.params)
                assert_allclose(result1.bse, result2.bse)
                assert_equal(len(result1.fittedvalues), result1.nobs)
예제 #11
0
    def test_dietox_slopes(self):
        """Compare REML and ML fits of the dietox data with reference values.

        The model has a random intercept and a random Time slope per Pig.
        The comments record the R calls the expected values refer to:

            library(geepack)
            r = lmer(Weight ~ Time + (1 + Time | Pig), data=dietox)
            r = lmer(Weight ~ Time + (1 + Time | Pig), REML=FALSE, data=dietox)
        """
        here = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(here, 'results', 'dietox.csv')

        # One entry per estimation method:
        # (extra fit kwargs, fixef(r), sqrt(diag(vcov(r))),
        #  attr(VarCorr(r), "sc")^2, as.numeric(VarCorr(r)[[1]]),
        #  rtol for the covariance check, logLik(r))
        cases = [
            # REML
            ({},
             np.r_[15.738650, 6.939014],
             np.r_[0.5501253, 0.0798254],
             6.03745,
             np.r_[19.4934552, 0.2938323, 0.2938323, 0.4160620],
             1e-1,
             -2217.047),
            # ML
            ({"reml": False},
             np.r_[15.73863, 6.93902],
             np.r_[0.54629282, 0.07926954],
             6.037441,
             np.r_[19.190922, 0.293568, 0.293568, 0.409695],
             1e-2,
             -2215.753),
        ]

        for extra, fixef, se, scale, cov_re, cov_rtol, loglik in cases:
            data = pd.read_csv(fname)
            model = MixedLM.from_formula("Weight ~ Time", groups="Pig",
                                         re_formula="1 + Time", data=data)
            result = model.fit(method='powell', **extra)

            assert_allclose(result.fe_params, fixef, rtol=1e-5)
            assert_allclose(result.bse[0:2], se, rtol=1e-3)
            assert_allclose(result.scale, scale, rtol=1e-3)
            assert_allclose(result.cov_re.values.ravel(), cov_re,
                            rtol=cov_rtol)
            assert_allclose(model.loglike(result.params_object), loglik,
                            rtol=1e-5)
예제 #12
0
    def test_dietox_slopes(self):
        """Check random intercept and slope fits of the dietox data.

        Expected values refer to these R calls:

            library(geepack)
            r = lmer(Weight ~ Time + (1 + Time | Pig), data=dietox)
            r = lmer(Weight ~ Time + (1 + Time | Pig), REML=FALSE, data=dietox)
        """
        base = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(base, 'results', 'dietox.csv')

        def fit_model(**fit_kwargs):
            # Read the data afresh, build the model and fit it with powell.
            frame = pd.read_csv(fname)
            mdl = MixedLM.from_formula("Weight ~ Time", groups="Pig",
                                       re_formula="1 + Time", data=frame)
            return mdl, mdl.fit(method='powell', **fit_kwargs)

        # REML
        reml_model, reml_fit = fit_model()
        # fixef(r)
        assert_allclose(reml_fit.fe_params, np.r_[15.738650, 6.939014],
                        rtol=1e-5)
        # sqrt(diag(vcov(r)))
        assert_allclose(reml_fit.bse[0:2], np.r_[0.5501253, 0.0798254],
                        rtol=1e-3)
        # attr(VarCorr(r), "sc")^2
        assert_allclose(reml_fit.scale, 6.03745, rtol=1e-3)
        # as.numeric(VarCorr(r)[[1]])
        reml_cov = np.r_[19.4934552, 0.2938323, 0.2938323, 0.4160620]
        assert_allclose(reml_fit.cov_re.values.ravel(), reml_cov, rtol=1e-1)
        # logLik(r)
        assert_allclose(reml_model.loglike(reml_fit.params_object),
                        -2217.047, rtol=1e-5)

        # ML
        ml_model, ml_fit = fit_model(reml=False)
        # fixef(r)
        assert_allclose(ml_fit.fe_params, np.r_[15.73863, 6.93902],
                        rtol=1e-5)
        # sqrt(diag(vcov(r)))
        assert_allclose(ml_fit.bse[0:2], np.r_[0.54629282, 0.07926954],
                        rtol=1e-3)
        # attr(VarCorr(r), "sc")^2
        assert_allclose(ml_fit.scale, 6.037441, rtol=1e-3)
        # as.numeric(VarCorr(r)[[1]])
        ml_cov = np.r_[19.190922, 0.293568, 0.293568, 0.409695]
        assert_allclose(ml_fit.cov_re.values.ravel(), ml_cov, rtol=1e-2)
        # logLik(r)
        assert_allclose(ml_model.loglike(ml_fit.params_object),
                        -2215.753, rtol=1e-5)
예제 #13
0
    def test_dietox(self):
        """Compare random intercept fits of the dietox data with references.

        Expected values refer to these R calls:

            library(geepack)
            rm = lmer(Weight ~ Time + (1 | Pig), data=dietox)
            rm = lmer(Weight ~ Time + (1 | Pig), REML=FALSE, data=dietox)
        """
        here = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(here, 'results', 'dietox.csv')

        # One entry per estimation method:
        # (fit kwargs, fixef(rm), sqrt(diag(vcov(rm))),
        #  attr(VarCorr(rm), "sc")^2, VarCorr(rm)[[1]][[1]], logLik(rm))
        cases = [
            # REML
            ({},
             np.r_[15.723523, 6.942505],
             np.r_[0.78805374, 0.03338727],
             11.36692,
             40.39395,
             -2404.775),
            # ML
            ({"reml": False},
             np.r_[15.723517, 6.942506],
             np.r_[0.7829397, 0.0333661],
             11.35251,
             39.82097,
             -2402.932),
        ]

        for fit_kwargs, fixef, se, scale, cov_re, loglik in cases:
            data = pd.read_csv(fname)
            model = MixedLM.from_formula("Weight ~ Time", groups="Pig",
                                         data=data)
            result = model.fit(**fit_kwargs)

            assert_allclose(result.fe_params, fixef, rtol=1e-5)
            assert_allclose(result.bse[0:2], se, rtol=1e-5)
            assert_allclose(result.scale, scale, rtol=1e-5)
            assert_allclose(result.cov_re, cov_re, rtol=1e-5)
            assert_allclose(model.loglike(result.params_object), loglik,
                            rtol=1e-5)
예제 #14
0
    def test_dietox(self):
        """Check REML and ML random intercept fits of the dietox data.

        Expected values refer to these R calls:

            library(geepack)
            rm = lmer(Weight ~ Time + (1 | Pig), data=dietox)
            rm = lmer(Weight ~ Time + (1 | Pig), REML=FALSE, data=dietox)
        """
        base = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(base, 'results', 'dietox.csv')
        tol = 1e-5

        def build_model():
            # Read the data afresh and set up the random intercept model.
            frame = pd.read_csv(fname)
            return MixedLM.from_formula("Weight ~ Time", groups="Pig",
                                        data=frame)

        # REML
        reml_model = build_model()
        reml_fit = reml_model.fit()
        # fixef(rm)
        assert_allclose(reml_fit.fe_params, np.r_[15.723523, 6.942505],
                        rtol=tol)
        # sqrt(diag(vcov(rm)))
        assert_allclose(reml_fit.bse[0:2], np.r_[0.78805374, 0.03338727],
                        rtol=tol)
        # attr(VarCorr(rm), "sc")^2
        assert_allclose(reml_fit.scale, 11.36692, rtol=tol)
        # VarCorr(rm)[[1]][[1]]
        assert_allclose(reml_fit.cov_re, 40.39395, rtol=tol)
        # logLik(rm)
        assert_allclose(reml_model.loglike(reml_fit.params_object),
                        -2404.775, rtol=tol)

        # ML
        ml_model = build_model()
        ml_fit = ml_model.fit(reml=False)
        # fixef(rm)
        assert_allclose(ml_fit.fe_params, np.r_[15.723517, 6.942506],
                        rtol=tol)
        # sqrt(diag(vcov(rm)))
        assert_allclose(ml_fit.bse[0:2], np.r_[0.7829397, 0.0333661],
                        rtol=tol)
        # attr(VarCorr(rm), "sc")^2
        assert_allclose(ml_fit.scale, 11.35251, rtol=tol)
        # VarCorr(rm)[[1]][[1]]
        assert_allclose(ml_fit.cov_re, 39.82097, rtol=tol)
        # logLik(rm)
        assert_allclose(ml_model.loglike(ml_fit.params_object),
                        -2402.932, rtol=tol)
예제 #15
0
    def test_pastes_vcomp(self):
        """
        Check REML and ML fits of the lme4 pastes data.

        The corresponding R model formula is:

        strength ~ (1|batch) + (1|batch:cask)
        """
        here = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(here, 'results', 'pastes.csv')

        def build_model():
            # Read the data afresh and set up the variance components model.
            frame = pd.read_csv(fname)
            return MixedLM.from_formula("strength ~ 1",
                                        groups="batch",
                                        re_formula="1",
                                        vc_formula={"cask": "0 + cask"},
                                        data=frame)

        # REML
        reml = build_model().fit()

        assert_allclose(reml.fe_params.iloc[0], 60.0533, rtol=1e-3)
        assert_allclose(reml.bse.iloc[0], 0.6769, rtol=1e-3)
        assert_allclose(reml.cov_re.iloc[0, 0], 1.657, rtol=1e-3)
        assert_allclose(reml.scale, 0.678, rtol=1e-3)
        assert_allclose(reml.llf, -123.49, rtol=1e-1)
        # aic/bic are not provided with REML
        assert_equal(reml.aic, np.nan)
        assert_equal(reml.bic, np.nan)

        expected_resid = np.r_[0.17133538, -0.02866462, -1.08662875,
                               1.11337125, -0.12093607]
        assert_allclose(reml.resid[0:5], expected_resid, rtol=1e-3)

        expected_fit = np.r_[62.62866, 62.62866, 61.18663, 61.18663,
                             62.82094]
        assert_allclose(reml.fittedvalues[0:5], expected_fit, rtol=1e-4)

        # ML
        ml = build_model().fit(reml=False)
        assert_allclose(ml.fe_params.iloc[0], 60.0533, rtol=1e-3)
        assert_allclose(ml.bse.iloc[0], 0.642, rtol=1e-3)
        assert_allclose(ml.cov_re.iloc[0, 0], 1.199, rtol=1e-3)
        assert_allclose(ml.scale, 0.67799, rtol=1e-3)
        assert_allclose(ml.llf, -123.997, rtol=1e-1)
        assert_allclose(ml.aic, 255.9944, rtol=1e-3)
        assert_allclose(ml.bic, 264.3718, rtol=1e-3)
예제 #16
0
def test_handle_missing():
    """Compare ``missing='drop'`` with removing incomplete rows by hand.

    Each of the six columns gets one NaN.  The comparison is run with
    and without ``re_formula`` and with and without ``vc_formula``.
    """
    np.random.seed(23423)
    df = np.random.normal(size=(100, 6))
    df = pd.DataFrame(df)
    df.columns = ["y", "g", "x1", "z1", "c1", "c2"]
    df["g"] = np.kron(np.arange(50), np.ones(2))
    ranef = np.random.normal(size=(50, 4))
    ranef = np.kron(ranef, np.ones((2, 1)))
    df["y"] = ranef[:, 0] + ranef[:, 1] * df.z1 + ranef[:, 2] * df.c1
    df["y"] += ranef[:, 3] * df.c2 + np.random.normal(size=100)
    # ``np.nan`` replaces the ``np.NaN`` alias, which was removed in
    # NumPy 2.0 and raises AttributeError there.
    df.loc[1, "y"] = np.nan
    df.loc[2, "g"] = np.nan
    df.loc[3, "x1"] = np.nan
    df.loc[4, "z1"] = np.nan
    df.loc[5, "c1"] = np.nan
    df.loc[6, "c2"] = np.nan

    fml = "y ~ x1"
    re_formula = "1 + z1"
    vc_formula = {"a": "0 + c1", "b": "0 + c2"}
    for include_re in False, True:
        for include_vc in False, True:
            kwargs = {}
            # Columns that the current model uses.
            va = ["y", "g", "x1"]
            if include_re:
                kwargs["re_formula"] = re_formula
                va.append("z1")
            if include_vc:
                kwargs["vc_formula"] = vc_formula
                va.extend(["c1", "c2"])

            # Rows are dropped only for missing values in used columns.
            dx = df[va].dropna()

            # Some of these models are severely misspecified with
            # small n, so produce convergence warnings.  Not relevant
            # to what we are checking here.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                # Drop missing externally
                model1 = MixedLM.from_formula(
                    fml, groups="g", data=dx, **kwargs)
                result1 = model1.fit()

                # MixedLM handles missing
                model2 = MixedLM.from_formula(
                    fml, groups="g", data=df, missing='drop', **kwargs)
                result2 = model2.fit()

                assert_allclose(result1.params, result2.params)
                assert_allclose(result1.bse, result2.bse)
                assert_equal(len(result1.fittedvalues), result1.nobs)
예제 #17
0
    def test_pastes_vcomp(self):
        """
        Fit the lme4 pastes data by REML and ML and check the results.

        The corresponding R model formula is:

        strength ~ (1|batch) + (1|batch:cask)
        """
        test_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(test_dir, 'results', 'pastes.csv')
        tol = 1e-3

        # REML
        reml_data = pd.read_csv(fname)
        reml_model = MixedLM.from_formula(
            "strength ~ 1", groups="batch", re_formula="1",
            vc_formula={"cask": "0 + cask"}, data=reml_data)
        reml_fit = reml_model.fit()

        assert_allclose(reml_fit.fe_params.iloc[0], 60.0533, rtol=tol)
        assert_allclose(reml_fit.bse.iloc[0], 0.6769, rtol=tol)
        assert_allclose(reml_fit.cov_re.iloc[0, 0], 1.657, rtol=tol)
        assert_allclose(reml_fit.scale, 0.678, rtol=tol)
        assert_allclose(reml_fit.llf, -123.49, rtol=1e-1)
        # aic/bic are not provided with REML
        for criterion in (reml_fit.aic, reml_fit.bic):
            assert_equal(criterion, np.nan)

        first_resid = np.r_[0.17133538, -0.02866462, -1.08662875,
                            1.11337125, -0.12093607]
        assert_allclose(reml_fit.resid[0:5], first_resid, rtol=tol)

        first_fitted = np.r_[62.62866, 62.62866, 61.18663, 61.18663,
                             62.82094]
        assert_allclose(reml_fit.fittedvalues[0:5], first_fitted, rtol=1e-4)

        # ML
        ml_data = pd.read_csv(fname)
        ml_model = MixedLM.from_formula(
            "strength ~ 1", groups="batch", re_formula="1",
            vc_formula={"cask": "0 + cask"}, data=ml_data)
        ml_fit = ml_model.fit(reml=False)
        assert_allclose(ml_fit.fe_params.iloc[0], 60.0533, rtol=tol)
        assert_allclose(ml_fit.bse.iloc[0], 0.642, rtol=tol)
        assert_allclose(ml_fit.cov_re.iloc[0, 0], 1.199, rtol=tol)
        assert_allclose(ml_fit.scale, 0.67799, rtol=tol)
        assert_allclose(ml_fit.llf, -123.997, rtol=1e-1)
        assert_allclose(ml_fit.aic, 255.9944, rtol=tol)
        assert_allclose(ml_fit.bic, 264.3718, rtol=tol)
예제 #18
0
    def test_vcomp_3(self):
        """Fit a formula model with a variance component only.

        The model has ``vc_formula`` but no ``re_formula``.  The first
        four residuals and fitted values are compared with stored
        values.  The test returns early, without checking anything,
        when the installed SciPy is older than 0.16.
        """
        import scipy

        # Compare (major, minor) instead of the minor number alone;
        # otherwise SciPy 1.x releases with a minor version below 16 are
        # mistaken for pre-0.16 releases and the test never runs.
        major, minor = (int(part) for part in scipy.__version__.split(".")[:2])
        if (major, minor) < (0, 16):
            return

        np.random.seed(4279)
        x1 = np.random.normal(size=400)
        groups = np.kron(np.arange(100), np.ones(4))
        slopes = np.random.normal(size=100)
        slopes = np.kron(slopes, np.ones(4)) * x1
        y = slopes + np.random.normal(size=400)
        vc_fml = {"a": "0 + x1"}
        df = pd.DataFrame({"y": y, "x1": x1, "groups": groups})

        model = MixedLM.from_formula(
            "y ~ 1", groups="groups", vc_formula=vc_fml, data=df)
        result = model.fit()
        # Smoke check: building the summary must not raise.
        result.summary()

        assert_allclose(
            result.resid.iloc[0:4],
            np.r_[-1.180753, 0.279966, 0.578576, -0.667916],
            rtol=1e-3)
        assert_allclose(
            result.fittedvalues.iloc[0:4],
            np.r_[-0.101549, 0.028613, -0.224621, -0.126295],
            rtol=1e-3)
예제 #19
0
def calcBetaLme(data_full, gain_full, loss_full, linear_full, quad_full,
                run_group, thrshd):
    """Fit the mixed model ``bold ~ gain + loss`` separately for every voxel.

    Parameters
    ----------
    data_full : ndarray
        BOLD data whose last axis is time; all leading axes are flattened
        into a single voxel axis.
    gain_full, loss_full : sequence
        Gain and loss regressor values, one per time point.
    linear_full, quad_full : sequence
        Linear and quadratic drift terms.  They are stored in the per-voxel
        data frame but are not part of the fitted formula.
    run_group : sequence
        Group label of every time point, used as the grouping variable.
    thrshd : float
        Voxels whose mean signal over time is less than or equal to this
        value are treated as outside the brain and skipped.

    Returns
    -------
    ndarray of shape (n_voxels, 5)
        Per voxel: gain coefficient, gain p-value, loss coefficient,
        loss p-value and the convergence flag of the fit.  Skipped voxels
        hold zeros in all five columns.
    """
    n_time = data_full.shape[-1]
    time_by_vox = np.reshape(data_full, (-1, n_time)).T
    n_vox = time_by_vox.shape[1]
    # Start from zeros so that skipped voxels need no explicit assignment.
    beta = np.zeros([n_vox, 5])
    fml = "bold ~ gain + loss"
    for k in range(n_vox):
        # Honour the caller's threshold; it used to be hard-coded to 400
        # and the thrshd argument was silently ignored.
        if np.mean(time_by_vox[:, k]) <= thrshd:
            continue
        dt = pd.DataFrame({
            'gain': gain_full,
            'loss': loss_full,
            'run_group': run_group,
            'ldrift': linear_full,
            'qdrift': quad_full,
            'bold': time_by_vox[:, k]
        })
        mod_lme = MixedLM.from_formula(fml, dt, groups=dt["run_group"])
        lme_result = mod_lme.fit()
        beta[k, :] = [
            lme_result.fe_params["gain"], lme_result.pvalues["gain"],
            lme_result.fe_params["loss"], lme_result.pvalues["loss"],
            lme_result.converged
        ]
    return beta
예제 #20
0
    def test_formulas(self):
        """Array-based and formula-based models must give the same params."""
        np.random.seed(2410)
        triple = [1, 1, 1]
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        groups = np.kron(np.arange(100), triple)
        group_draws = np.random.normal(size=100)
        g_errors = exog_re * np.kron(group_draws, triple)
        noise = np.random.normal(size=300)
        endog = exog.sum(1) + g_errors + noise

        # Reference fit built directly from arrays.
        rslt1 = MixedLM(endog, exog, groups, exog_re).fit()

        # Same model through the formula interface, groups given as values.
        df = pd.DataFrame({"endog": endog})
        for k, column in enumerate(exog.T):
            df["exog%d" % k] = column
        df["exog_re"] = exog_re
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        rslt2 = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups).fit()
        assert_almost_equal(rslt1.params, rslt2.params)

        # Default variance structure via formula.api: a formula model
        # without re_formula is compared with an explicit column of ones.
        ones_re = np.ones(len(endog), dtype=np.float64)
        rslt3 = MixedLM(endog, exog, groups, ones_re).fit()
        from statsmodels.formula.api import mixedlm

        rslt4 = mixedlm(fml, df, groups=groups).fit()
        assert_almost_equal(rslt3.params, rslt4.params)
예제 #21
0
    def test_vcomp_3(self):
        """Fit a formula model with a variance component and nothing else.

        The model is specified with ``vc_formula`` but no ``re_formula``.
        The first four residuals and fitted values are compared with
        stored reference numbers.
        """
        import scipy

        # The guard is meant for SciPy releases older than 0.16.  Compare
        # (major, minor) as a pair: looking at the minor number alone also
        # matched 1.0-1.15 and made the test return without checking
        # anything on those releases.
        major, minor = (int(part) for part in scipy.__version__.split(".")[:2])
        if (major, minor) < (0, 16):
            return

        # Simulate 100 groups of 4 rows, each group having its own slope
        # on x1.  The order of the random draws fixes the data set.
        np.random.seed(4279)
        x1 = np.random.normal(size=400)
        groups = np.kron(np.arange(100), np.ones(4))
        slopes = np.random.normal(size=100)
        slopes = np.kron(slopes, np.ones(4)) * x1
        y = slopes + np.random.normal(size=400)
        vc_fml = {"a": "0 + x1"}
        df = pd.DataFrame({"y": y, "x1": x1, "groups": groups})

        model = MixedLM.from_formula("y ~ 1",
                                     groups="groups",
                                     vc_formula=vc_fml,
                                     data=df)
        result = model.fit()
        # Smoke test: building the summary must not raise.
        result.summary()

        assert_allclose(result.resid.iloc[0:4],
                        np.r_[-1.180753, 0.279966, 0.578576, -0.667916],
                        rtol=1e-3)
        assert_allclose(result.fittedvalues.iloc[0:4],
                        np.r_[-0.101549, 0.028613, -0.224621, -0.126295],
                        rtol=1e-3)
def lmemodel(data, metadata, 
             fixedEffects = ['Tissue of Origin'],
             randomEffects=['High Confidence Donor ID (HCDID)']):
    """Prepare (and, once finished, fit) a mixed-effects linear model per row.

    NOTE(review): the function is in a debugging state.  The first loop
    iteration hits ``return df``, so the design frame is returned and no
    model is ever built; everything after that statement is unreachable.
    ``results`` is only returned when ``data`` has zero rows.  The code is
    Python 2 only (print statements, ``string.maketrans``).

    Parameters:
       data: table with one dependent variable per row; it must provide
           ``shape`` and ``irow(i)`` (presumably an old pandas DataFrame --
           TODO confirm).
       metadata: table indexed by the fixed- and random-effect column names.
       fixedEffects: metadata columns used as fixed effects in the formula.
       randomEffects: metadata columns intended as the grouping variable.

    Returns:
       As written, the DataFrame assembled for the first row of ``data``.
    """
    df = metadata[fixedEffects].copy()
    df = pd.concat([df, metadata[randomEffects]], axis=1 )
    # Replace spaces and parentheses with underscores so that the column
    # names can be used inside patsy formulas.
    fixedEffects = [c.translate(string.maketrans(' ()', '___')) for c in fixedEffects]
    randomEffects = [c.translate(string.maketrans(' ()', '___')) for c in randomEffects]
    df.columns = [c.translate(string.maketrans(' ()', '___')) for c in df.columns]

    model_string = 'gene ~ '+' + '.join(fixedEffects)
    results = []
    for i in range(data.shape[0]):
        # Add the dependent variable (row i of data) to the dataframe.
        df['gene'] = data.irow(i)
        ################# debugging section starts here
        # Overwrites the donor column with 69 random draws (presumably
        # scipy.stats; binomial with n=1, p=.4).  The size is hard-coded.
        df['High_Confidence_Donor_ID__HCDID_'] = stats.binom.rvs(1, .4, size=69)
        print df.shape, model_string
        # Early exit: nothing below this line runs.
        return df
        df = df.dropna()
        print df.shape
        ################# debugging section ends here
        # Build the model for this row (unreachable, see above).
        mod = MixedLM.from_formula(model_string, df, groups = df[randomEffects])
        return mod
        #df.boxplot(by=fixedEffects)
        #mod = sm.ols(model_fit, df)
        #results.append(mod.fit())
    return results
예제 #23
0
    def test_vcomp_2(self):
        """Compare a fit on simulated data with reference values from R.

        The model has two fixed-effect covariates, two random slopes per
        group and two variance components defined by nested subgroups.
        The simulated data set depends on the exact order of the random
        draws below, so the statements must not be reordered.
        """
        # Simulated data comparison to R

        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        # 100 groups of 16 consecutive rows each.
        groups = np.kron(np.arange(n / 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects: one pair of slopes per group, applied to the
        # row-level covariates in exog_re.
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component: one effect per block of 4 rows, drawn
        # with standard deviation 2.
        subgroups1 = np.kron(np.arange(n / 4), np.ones(4))
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component: one effect per block of 2 rows, drawn
        # with standard deviation 2.
        subgroups2 = np.kron(np.arange(n / 2), np.ones(2))
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        # Collect everything in a data frame for the formula interface.
        df = pd.DataFrame(index=range(n))
        df["y"] = endog
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]
        df["z1"] = exog_re[:, 0]
        df["z2"] = exog_re[:, 1]
        df["v1"] = subgroups1
        df["v2"] = subgroups2

        # Equivalent model in R:
        # df.to_csv("tst.csv")
        # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 |
        # v2), df)

        # Each variance component is defined by the indicator columns of
        # its subgroup variable.
        vcf = {"a": "0 + C(v1)", "b": "0 + C(v2)"}
        model1 = MixedLM.from_formula("y ~ x1 + x2", groups=groups,
                                      re_formula="0+z1+z2",
                                      vc_formula=vcf, data=df)
        result1 = model1.fit()

        # Compare to R
        assert_allclose(result1.fe_params, [
                        0.16527, 0.99911, 0.96217], rtol=1e-4)
        assert_allclose(result1.cov_re, [
                        [1.244,  0.146], [0.146, 1.371]], rtol=1e-3)
        assert_allclose(result1.vcomp, [4.024, 3.997], rtol=1e-3)
        assert_allclose(result1.bse.iloc[0:3], [
                        0.12610, 0.03938, 0.03848], rtol=1e-3)
예제 #24
0
    def test_vcomp_formula(self):
        """Variance components given as arrays or as formulas must agree.

        The same simulated data set is fitted twice: once with the
        ``exog_vc`` dictionary of per-group design matrices (which is
        expected to emit a deprecation warning) and once with
        ``vc_formula``.  All estimates have to match.
        """
        np.random.seed(6241)
        n = 800
        n_groups = n // 4
        expand = np.ones((4, 1))

        exog = np.random.normal(size=(n, 2))
        exog[:, 0] = 1
        groups = np.kron(np.arange(n / 4), np.ones(4))

        # Random slopes on exog_re, one pair per group.
        exog_re = np.random.normal(size=(n, 2))
        re_slopes = np.random.normal(size=(n_groups, 2))
        re_part = (np.kron(re_slopes, expand) * exog_re).sum(1)

        # Variance-component slopes; the last two columns are scaled by 2.
        ex_vc = np.random.normal(size=(n, 4))
        vc_slopes = np.random.normal(size=(n_groups, 4))
        vc_slopes[:, 2:] *= 2
        vc_part = (np.kron(vc_slopes, expand) * ex_vc).sum(1)

        noise = np.random.normal(size=n)
        errors = 0 + re_part + vc_part + noise
        endog = exog.sum(1) + errors

        # Per-group design matrices for components "a" and "b".
        rows = {grp: np.flatnonzero(groups == grp) for grp in range(n_groups)}
        exog_vc = {
            "a": {grp: ex_vc[ix, 0:2] for grp, ix in rows.items()},
            "b": {grp: ex_vc[ix, 2:] for grp, ix in rows.items()},
        }
        with pytest.warns(UserWarning, match="Using deprecated variance"):
            model1 = MixedLM(endog, exog, groups,
                             exog_re=exog_re, exog_vc=exog_vc)
        result1 = model1.fit()

        # Same model through the formula interface.
        df = pd.DataFrame(exog[:, 1:], columns=["x1"])
        df["y"] = endog
        for j in range(2):
            df["re%d" % (j + 1)] = exog_re[:, j]
        for j in range(4):
            df["vc%d" % (j + 1)] = ex_vc[:, j]
        vc_formula = {"a": "0 + vc1 + vc2", "b": "0 + vc3 + vc4"}
        model2 = MixedLM.from_formula(
            "y ~ x1",
            groups=groups,
            re_formula="0 + re1 + re2",
            vc_formula=vc_formula,
            data=df)
        result2 = model2.fit()

        for attr in ("fe_params", "cov_re", "vcomp", "params", "bse"):
            assert_allclose(getattr(result1, attr), getattr(result2, attr),
                            rtol=1e-8)
예제 #25
0
    def test_formulas(self):
        """Array-based and formula-based models must give the same params."""
        np.random.seed(2410)
        triple = [1, 1, 1]
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        groups = np.kron(np.arange(100), triple)
        group_draws = np.random.normal(size=100)
        g_errors = exog_re * np.kron(group_draws, triple)
        noise = np.random.normal(size=300)
        endog = exog.sum(1) + g_errors + noise

        # Reference fit built directly from arrays.
        rslt1 = MixedLM(endog, exog, groups, exog_re).fit()

        # Formula fit with the group labels passed as values.
        df = pd.DataFrame({"endog": endog})
        for k, column in enumerate(exog.T):
            df["exog%d" % k] = column
        df["exog_re"] = exog_re
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        rslt2 = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups).fit()
        assert_almost_equal(rslt1.params, rslt2.params)

        # Formula fit with the groups given as a column name, started
        # from the previous estimates.
        df["groups"] = groups
        mod3 = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups="groups")
        rslt3 = mod3.fit(start_params=rslt2.params)
        assert_allclose(rslt1.params, rslt3.params, rtol=1e-4)

        # Default variance structure: an explicit column of ones in the
        # array interface against a formula model without re_formula.
        ones_re = np.ones(len(endog), dtype=np.float64)
        mod4 = MixedLM(endog, exog, groups, ones_re)
        rslt4 = mod4.fit(start_params=rslt2.params)
        from statsmodels.formula.api import mixedlm

        rslt5 = mixedlm(fml, df, groups="groups").fit(
            start_params=rslt2.params)
        assert_almost_equal(rslt4.params, rslt5.params)
예제 #26
0
    def test_formulas(self):
        """Array-based and formula-based models must give the same params."""
        np.random.seed(2410)
        triple = [1, 1, 1]
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        groups = np.kron(np.arange(100), triple)
        group_draws = np.random.normal(size=100)
        g_errors = exog_re * np.kron(group_draws, triple)
        noise = np.random.normal(size=300)
        endog = exog.sum(1) + g_errors + noise

        # Reference fit built directly from arrays.
        rslt1 = MixedLM(endog, exog, groups, exog_re).fit()

        # Formula fit with the group labels passed as values.
        df = pd.DataFrame({"endog": endog})
        for k, column in enumerate(exog.T):
            df["exog%d" % k] = column
        df["exog_re"] = exog_re
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        rslt2 = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups).fit()
        assert_almost_equal(rslt1.params, rslt2.params)

        # Formula fit with the groups given as a column name, started
        # from the previous estimates.
        df["groups"] = groups
        mod3 = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups="groups")
        rslt3 = mod3.fit(start_params=rslt2.params)
        assert_almost_equal(rslt1.params, rslt3.params, decimal=5)

        # Default variance structure with formula.api: an explicit column
        # of ones against a formula model without re_formula.
        ones_re = np.ones(len(endog), dtype=np.float64)
        mod4 = MixedLM(endog, exog, groups, ones_re)
        rslt4 = mod4.fit(start_params=rslt2.params)
        from statsmodels.formula.api import mixedlm

        rslt5 = mixedlm(fml, df, groups="groups").fit(
            start_params=rslt2.params)
        assert_almost_equal(rslt4.params, rslt5.params)
예제 #27
0
def test_mixed_lm_wrapper():
    """Check the row and column labels of a formula-based fit.

    A model with a random intercept and one random slope is fitted, and
    the names stored on the model data and on every labelled result
    attribute are compared with the expected lists.
    """
    np.random.seed(2410)
    triple = [1, 1, 1]
    exog = np.random.normal(size=(300, 4))
    exog_re = np.random.normal(size=300)
    groups = np.kron(np.arange(100), triple)
    group_draws = np.random.normal(size=100)
    g_errors = exog_re * np.kron(group_draws, triple)
    noise = np.random.normal(size=300)
    endog = exog.sum(1) + g_errors + noise

    # Formula fit with the group labels passed as values.
    df = pd.DataFrame({"endog": endog})
    for k, column in enumerate(exog.T):
        df["exog%d" % k] = column
    df["exog_re"] = exog_re
    mod2 = MixedLM.from_formula(
        "endog ~ 0 + exog0 + exog1 + exog2 + exog3",
        df,
        re_formula="~ exog_re",
        groups=groups)
    result = mod2.fit()
    result.summary()

    xnames = ["exog0", "exog1", "exog2", "exog3"]
    re_names = ["Intercept", "exog_re"]
    re_names_full = ["Intercept RE", "Intercept RE x exog_re RE",
                     "exog_re RE"]
    all_names = xnames + re_names_full

    # Names stored on the model's data object.
    model_names = (
        ("xnames", xnames),
        ("exog_re_names", re_names),
        ("exog_re_names_full", re_names_full),
    )
    for attr, expected in model_names:
        assert_(getattr(mod2.data, attr) == expected)

    # Full parameter vector and the statistics derived from it.
    for attr in ("params", "bse", "tvalues"):
        assert_(getattr(result, attr).index.tolist() == all_names)
    cov_params = result.cov_params()
    assert_(cov_params.index.tolist() == all_names)
    assert_(cov_params.columns.tolist() == all_names)

    # Fixed-effects part.
    for attr in ("fe_params", "bse_fe"):
        assert_(getattr(result, attr).index.tolist() == xnames)

    # Random-effects covariance, scaled and unscaled.
    for attr in ("cov_re", "cov_re_unscaled"):
        matrix = getattr(result, attr)
        assert_(matrix.index.tolist() == re_names)
        assert_(matrix.columns.tolist() == re_names)

    assert_(result.bse_re.index.tolist() == re_names_full)
예제 #28
0
    def test_vcomp_formula(self):
        """Variance components given as arrays or as formulas must agree.

        The same simulated data set is fitted twice: once with the
        ``exog_vc`` dictionary of per-group design matrices and once with
        ``vc_formula``.  All estimates have to match.
        """
        np.random.seed(6241)
        n = 800
        n_groups = n // 4
        expand = np.ones((4, 1))

        exog = np.random.normal(size=(n, 2))
        exog[:, 0] = 1
        groups = np.kron(np.arange(n / 4), np.ones(4))

        # Random slopes on exog_re, one pair per group.
        exog_re = np.random.normal(size=(n, 2))
        re_slopes = np.random.normal(size=(n_groups, 2))
        re_part = (np.kron(re_slopes, expand) * exog_re).sum(1)

        # Variance-component slopes; the last two columns are scaled by 2.
        ex_vc = np.random.normal(size=(n, 4))
        vc_slopes = np.random.normal(size=(n_groups, 4))
        vc_slopes[:, 2:] *= 2
        vc_part = (np.kron(vc_slopes, expand) * ex_vc).sum(1)

        noise = np.random.normal(size=n)
        errors = 0 + re_part + vc_part + noise
        endog = exog.sum(1) + errors

        # Per-group design matrices for components "a" and "b".
        rows = {grp: np.flatnonzero(groups == grp) for grp in range(n_groups)}
        exog_vc = {
            "a": {grp: ex_vc[ix, 0:2] for grp, ix in rows.items()},
            "b": {grp: ex_vc[ix, 2:] for grp, ix in rows.items()},
        }
        result1 = MixedLM(
            endog, exog, groups, exog_re=exog_re, exog_vc=exog_vc).fit()

        # Same model through the formula interface.
        df = pd.DataFrame(exog[:, 1:], columns=["x1"])
        df["y"] = endog
        for j in range(2):
            df["re%d" % (j + 1)] = exog_re[:, j]
        for j in range(4):
            df["vc%d" % (j + 1)] = ex_vc[:, j]
        vc_formula = {"a": "0 + vc1 + vc2", "b": "0 + vc3 + vc4"}
        result2 = MixedLM.from_formula(
            "y ~ x1",
            groups=groups,
            re_formula="0 + re1 + re2",
            vc_formula=vc_formula,
            data=df).fit()

        for attr in ("fe_params", "cov_re", "vcomp", "params", "bse"):
            assert_allclose(getattr(result1, attr), getattr(result2, attr),
                            rtol=1e-8)
예제 #29
0
def mass_uv_mixedlmm(formula, data, uv_data, group_id, re_formula=None):
    """Fit one mixed linear model per column of ``uv_data``.

    Every column is copied into ``data`` under the name ``"Brain"`` and the
    model given by ``formula`` is fitted with ``group_id`` as grouping
    variable.

    Parameters
    ----------
    formula : str
        Model formula; it can refer to the ``"Brain"`` column.
    data : DataFrame
        Design data shared by all fits; it is copied, never modified.
    uv_data : 2-D array
        One column per model; rows have to line up with ``data``.
    group_id : array-like or str
        Passed on as ``groups``.
    re_formula : str, optional
        Random-effects formula, forwarded to ``MixedLM.from_formula``.

    Returns
    -------
    list
        One fitted result per column, or ``None`` where fitting raised.
    """
    n_cols = uv_data.shape[1]
    mods = []
    for d_idx in range(n_cols):
        print("{} of {}".format(d_idx, n_cols), end="\r")
        data_temp = data.copy()
        data_temp["Brain"] = uv_data[:, d_idx]
        # re_formula used to be accepted but silently dropped.
        model = MixedLM.from_formula(formula, data_temp, groups=group_id,
                                     re_formula=re_formula)
        try:
            mod_fit = model.fit()
        except Exception:
            # Best effort: a failed fit is recorded as None.  Catching
            # Exception instead of everything lets Ctrl-C and SystemExit
            # stop the loop.
            mods.append(None)
            continue
        mods.append(mod_fit)
    return mods
예제 #30
0
def mass_uv_mixedlmm(formula, data, uv_data, group_id, re_formula=None):
    """Fit one mixed linear model per (source, destination) pair.

    ``uv_data`` is indexed as ``[row, source, destination]``.  For every
    pair the corresponding vector is copied into ``data`` as ``"Brain"``
    and the model given by ``formula`` is fitted with ``group_id`` as
    grouping variable.  Pairs whose vector is zero everywhere are skipped.

    Parameters
    ----------
    formula : str
        Model formula; it can refer to the ``"Brain"`` column.
    data : DataFrame
        Design data shared by all fits; it is copied, never modified.
    uv_data : 3-D array
        Dependent data, rows lined up with ``data``.
    group_id : array-like or str
        Passed on as ``groups``.
    re_formula : str, optional
        Random-effects formula, forwarded to ``MixedLM.from_formula``.

    Returns
    -------
    list of list
        ``mods[source][destination]`` is the fitted result, or ``None``
        for an all-zero pair.
    """
    n_sources, n_dests = uv_data.shape[1], uv_data.shape[2]
    mods = [[] for _ in range(n_sources)]
    for source_idx in range(n_sources):
        for dest_idx in range(n_dests):
            column = uv_data[:, source_idx, dest_idx]
            if (column == 0).all():
                # Nothing to model for this pair.
                mods[source_idx].append(None)
                continue
            print("Source {}, Destination {}".format(source_idx, dest_idx))
            data_temp = data.copy()
            data_temp["Brain"] = column
            # re_formula used to be accepted but silently dropped.
            model = MixedLM.from_formula(formula, data_temp, groups=group_id,
                                         re_formula=re_formula)
            mods[source_idx].append(model.fit())
    return mods
예제 #31
0
def mass_uv_mixedlmm(formula, data, uv_data, group_id, re_formula=None, exclude=None):
    """Fit one mixed linear model per column and collect one term's stats.

    Every column of ``uv_data`` is copied into ``data`` as ``"Brain"`` and
    the model given by ``formula`` is fitted with ``group_id`` as grouping
    variable.  The t-value and coefficient of the term named by
    ``indep_var`` are kept.

    NOTE(review): ``indep_var`` is not a parameter; it has to exist as a
    module-level name when this function runs -- confirm against the
    calling script.

    Parameters
    ----------
    formula : str
        Model formula; it can refer to the ``"Brain"`` column.
    data : DataFrame
        Design data shared by all fits; it is copied, never modified.
    uv_data : 2-D array
        One column per model; rows have to line up with ``data``.
    group_id : array-like or str
        Passed on as ``groups``.
    re_formula : str, optional
        Random-effects formula, forwarded to ``MixedLM.from_formula``.
    exclude : container of int, optional
        Column indices that are not fitted; they get 0 in both outputs.

    Returns
    -------
    tvals, coeffs : ndarray
        One entry per column of ``uv_data``.
    """
    # None instead of a shared mutable default list.
    skipped = () if exclude is None else exclude
    tvals = []
    coeffs = []
    for d_idx in range(uv_data.shape[1]):
        if d_idx in skipped:
            tvals.append(0)
            coeffs.append(0)
            continue
        data_temp = data.copy()
        data_temp["Brain"] = uv_data[:, d_idx]
        # re_formula used to be accepted but silently dropped.
        model = MixedLM.from_formula(formula, data_temp, groups=group_id,
                                     re_formula=re_formula)
        mod_fit = model.fit()
        tvals.append(mod_fit.tvalues.get(indep_var))
        coeffs.append(mod_fit.params.get(indep_var))
    return np.array(tvals), np.array(coeffs)
예제 #32
0
def test_singular():
    """Issue #7051: fitting and summarizing this model must emit a warning.

    Pure noise is grouped into three classes by row position; the fit and
    the summary are run under ``pytest.warns``.
    """
    np.random.seed(3423)
    n = 100

    df = pd.DataFrame(np.random.randn(n, 2), columns=['Y', 'X'])
    labels = [row % 3 for row in df.index]
    df['class'] = pd.Series(labels, index=df.index)

    with pytest.warns(Warning) as wrn:
        fitted = MixedLM.from_formula("Y ~ X", df, groups=df['class']).fit()
        fitted.summary()
        # Explicit check that at least one warning has been recorded.
        if not wrn:
            pytest.fail("warning expected")
예제 #33
0
def fit_func(rdf):
    """Return the group-level intercept of ``supply_hours`` per ``block_dow``.

    A mixed model with a random intercept and a random ``delta_weeks``
    slope is fitted, grouped by ``block_dow``.  Missing ``supply_hours``
    values are replaced by 0 first.  The result is a DataFrame indexed by
    ``block_dow`` whose ``supply_hours`` column holds the fixed intercept
    plus the random intercept of each group.
    """
    filled = rdf.fillna({'supply_hours': 0.})
    md = MixedLM.from_formula(
        "supply_hours ~ 1 + delta_weeks",
        groups='block_dow',
        re_formula='1 + delta_weeks',
        data=filled)
    mdf = md.fit()

    index = mdf.random_effects.keys()
    group_offsets = [mdf.random_effects[i]['Intercept'] for i in index]
    frame = pd.DataFrame({
        'supply_hours': mdf.params['Intercept'] + group_offsets,
        'block_dow': index,
    })
    return frame.set_index('block_dow')
def lmemodel(data,
             metadata,
             fixedEffects=['Tissue of Origin'],
             randomEffects=['High Confidence Donor ID (HCDID)']):
    """Prepare (and, once finished, fit) a mixed-effects linear model per row.

    NOTE(review): the function is in a debugging state.  The first loop
    iteration hits ``return df``, so the design frame is returned and no
    model is ever built; everything after that statement is unreachable.
    ``results`` is only returned when ``data`` has zero rows.  The code is
    Python 2 only (print statements, ``string.maketrans``).

    Parameters:
       data: table with one dependent variable per row; it must provide
           ``shape`` and ``irow(i)`` (presumably an old pandas DataFrame --
           TODO confirm).
       metadata: table indexed by the fixed- and random-effect column names.
       fixedEffects: metadata columns used as fixed effects in the formula.
       randomEffects: metadata columns intended as the grouping variable.

    Returns:
       As written, the DataFrame assembled for the first row of ``data``.
    """
    df = metadata[fixedEffects].copy()
    df = pd.concat([df, metadata[randomEffects]], axis=1)
    # Replace spaces and parentheses with underscores so that the column
    # names can be used inside patsy formulas.
    fixedEffects = [
        c.translate(string.maketrans(' ()', '___')) for c in fixedEffects
    ]
    randomEffects = [
        c.translate(string.maketrans(' ()', '___')) for c in randomEffects
    ]
    df.columns = [
        c.translate(string.maketrans(' ()', '___')) for c in df.columns
    ]

    model_string = 'gene ~ ' + ' + '.join(fixedEffects)
    results = []
    for i in range(data.shape[0]):
        # Add the dependent variable (row i of data) to the dataframe.
        df['gene'] = data.irow(i)
        ################# debugging section starts here
        # Overwrites the donor column with 69 random draws (presumably
        # scipy.stats; binomial with n=1, p=.4).  The size is hard-coded.
        df['High_Confidence_Donor_ID__HCDID_'] = stats.binom.rvs(1,
                                                                 .4,
                                                                 size=69)
        print df.shape, model_string
        # Early exit: nothing below this line runs.
        return df
        df = df.dropna()
        print df.shape
        ################# debugging section ends here
        # Build the model for this row (unreachable, see above).
        mod = MixedLM.from_formula(model_string, df, groups=df[randomEffects])
        return mod
        #df.boxplot(by=fixedEffects)
        #mod = sm.ols(model_fit, df)
        #results.append(mod.fit())
    return results
예제 #35
0
    """
    try:
        from statsmodels.regression.mixed_linear_model import MixedLM

        raw_df = se_df.copy()

9b50c5aedf52 · D1691991	
        raw_df['delta_weeks'] = (pd.to_datetime(raw_df['week_of']) - pd.to_datetime(recommendation_week)).dt.days / 7
17a83f0a52b1 · D1438409	

        def fit_func(group):
ea0c134be68e · D1540393	
            try:
                md = MixedLM.from_formula("{} ~ 1 + delta_weeks".format(metric),
                                          groups='block_dow',
                                          re_formula='1 + delta_weeks',
                                          data=group.fillna({metric: 0.})
                                          )

                mdf = md.fit()

                index = mdf.random_effects.keys()
                data = {
                    metric: (mdf.params['Intercept'] + [mdf.random_effects[i]['Intercept'] for i in index]),
                    'block_dow': index,
                }
                return pd.DataFrame(data).set_index('block_dow')

            except np.linalg.linalg.LinAlgError as err:
                logging.warning(err)
dca42ed79c75 · D2335295	
예제 #36
0
                cnx_col_inds = list(np.where(cnx_masks[ROI_idx,])[0])
                for col_idx in cnx_col_inds:
                    this_point = this_epo[ROI_idx,col_idx].copy()
                    outname = label_names[col_idx]
                    outhemi = "lh" if "lh" in outname else "rh"
                    data_dict["Brain"].append(this_point)
                    data_dict["Subj"].append(sub)
                    data_dict["Block"].append(cond)
                    data_dict["ROI"].append(ROI)
                    data_dict["OutRegion"].append(outname)
                    data_dict["Hemi"].append(outhemi)
                    data_dict["RT"].append(epo.metadata["RT"].iloc[epo_idx])
                    group_id.append(sub_idx)
# Turn the collected per-observation lists into a data frame and the group
# labels into an array, so that both can be filtered with the same mask.
dm = pd.DataFrame.from_dict(data_dict)
group_id = np.array(group_id)

# NOTE(review): the first formula is overwritten immediately; only
# "Brain ~ RT*Block" is used in the loop below.
formula = "RT ~ Brain*Block + Brain*Block*C(ROI, Treatment('L3969-lh'))"
formula = "Brain ~ RT*Block"

# One maximum-likelihood (reml=False) fit per ROI, restricted to that
# ROI's rows.
mfs = []
for ROI in ROIs:
    this_dm = dm.copy()
    this_dm = this_dm[this_dm["ROI"]==ROI]
    # Same row mask applied to the group labels so they stay aligned.
    this_group_id = group_id[(dm["ROI"]==ROI)]
    mod = MixedLM.from_formula(formula, this_dm, groups=this_group_id)
    mfs.append(mod.fit(reml=False))

# Model of RT on Block alone, fitted on all rows with the default
# fit settings.
formula = "RT ~ Block"
mod_rt = MixedLM.from_formula(formula, dm, groups=group_id)
mf_rt = mod_rt.fit()
예제 #37
0
def test_get_distribution():
    """Draw from ``MixedLM.get_distribution`` and check the overall variance.

    A model with two fixed effects, two correlated random slopes and two
    variance components is set up from simulated covariates.  One
    realization is drawn at known parameter values, and the variance of
    its deviation from the fixed-effects mean is compared with the value
    implied by those parameters.
    """
    np.random.seed(234)

    n = 100
    n_groups = 10
    fe_params = np.r_[1, -2]
    cov_re = np.asarray([[1, 0.5], [0.5, 2]])
    vcomp = np.r_[0.5**2, 1.5**2]
    scale = 1.5

    exog_fe = np.random.normal(size=(n, 2))
    exog_re = np.random.normal(size=(n, 2))
    exog_vca = np.random.normal(size=(n, 2))
    exog_vcb = np.random.normal(size=(n, 2))

    # Builtin int replaces the removed np.int alias, and floor division
    # keeps the repeat count an integer as np.repeat requires.
    groups = np.repeat(np.arange(n_groups, dtype=int), n // n_groups)

    ey = np.dot(exog_fe, fe_params)

    # Correlated random slopes, one pair per group.
    u = np.random.normal(size=(n_groups, 2))
    u = np.dot(u, np.linalg.cholesky(cov_re).T)

    # Independent variance-component effects.
    u1 = np.sqrt(vcomp[0]) * np.random.normal(size=(n_groups, 2))
    u2 = np.sqrt(vcomp[1]) * np.random.normal(size=(n_groups, 2))

    # The response only supplies a valid endog column; the model is never
    # fitted in this test.
    y = ey + (u[groups, :] * exog_re).sum(1)
    y += (u1[groups, :] * exog_vca).sum(1)
    y += (u2[groups, :] * exog_vcb).sum(1)
    y += np.sqrt(scale) * np.random.normal(size=n)

    df = pd.DataFrame({
        "y": y,
        "x1": exog_fe[:, 0],
        "x2": exog_fe[:, 1],
        "z0": exog_re[:, 0],
        "z1": exog_re[:, 1],
        "grp": groups
    })
    df["z2"] = exog_vca[:, 0]
    df["z3"] = exog_vca[:, 1]
    df["z4"] = exog_vcb[:, 0]
    df["z5"] = exog_vcb[:, 1]

    vcf = {"a": "0 + z2 + z3", "b": "0 + z4 + z5"}
    m = MixedLM.from_formula("y ~ 0 + x1 + x2",
                             groups="grp",
                             re_formula="0 + z0 + z1",
                             vc_formula=vcf,
                             data=df)

    # Build a params vector that is comparable to
    # MixedLMResults.params
    import statsmodels
    mp = statsmodels.regression.mixed_linear_model.MixedLMParams
    po = mp.from_components(fe_params=fe_params, cov_re=cov_re, vcomp=vcomp)
    pa = po.get_packed(has_fe=True, use_sqrt=False)
    # Everything after the fixed effects is expressed relative to scale.
    pa[len(fe_params):] /= scale

    # Get a realization
    dist = m.get_distribution(pa, scale, None)
    yr = dist.rvs(0)

    # Check the overall variance: random slopes, both variance components
    # and the residual variance add up.
    v = (np.dot(exog_re, cov_re) * exog_re).sum(1).mean()
    v += vcomp[0] * (exog_vca**2).sum(1).mean()
    v += vcomp[1] * (exog_vcb**2).sum(1).mean()
    v += scale
    assert_allclose(np.var(yr - ey), v, rtol=1e-2, atol=1e-4)
예제 #38
0
    def test_pastes_vcomp(self):
        """Compare REML and ML fits of the lme4 pastes data with R.

        Fit in R using:

        r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data)
        r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data,
                 reml=FALSE)
        """
        here = os.path.dirname(os.path.abspath(__file__))
        data = pd.read_csv(os.path.join(here, 'results', 'pastes.csv'))
        vcf = {"cask": "0 + cask"}

        def build_model():
            # Random batch intercept plus a cask variance component.
            return MixedLM.from_formula(
                "strength ~ 1",
                groups="batch",
                re_formula="1",
                vc_formula=vcf,
                data=data)

        def check_scalars(result, bse, cov_re, scale, llf):
            # fixef(r)
            assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3)
            # sqrt(diag(vcov(r)))
            assert_allclose(result.bse.iloc[0], bse, rtol=1e-3)
            # VarCorr(r)$batch[[1]]
            assert_allclose(result.cov_re.iloc[0, 0], cov_re, rtol=1e-3)
            # attr(VarCorr(r), "sc")^2
            assert_allclose(result.scale, scale, rtol=1e-3)
            # logLik(r)
            assert_allclose(result.llf, llf, rtol=1e-1)

        # REML
        reml_fit = build_model().fit()
        check_scalars(reml_fit, bse=0.6769, cov_re=1.657, scale=0.678,
                      llf=-123.49)

        # aic/bic are not provided with REML
        assert_equal(reml_fit.aic, np.nan)
        assert_equal(reml_fit.bic, np.nan)

        # resid(r)[1:5]
        expected_resid = np.r_[0.17133538, -0.02866462, -1.08662875,
                               1.11337125, -0.12093607]
        assert_allclose(reml_fit.resid[0:5], expected_resid, rtol=1e-3)

        # predict(r)[1:5]
        expected_fit = np.r_[62.62866, 62.62866, 61.18663, 61.18663,
                             62.82094]
        assert_allclose(reml_fit.fittedvalues[0:5], expected_fit, rtol=1e-4)

        # ML
        ml_fit = build_model().fit(reml=False)
        check_scalars(ml_fit, bse=0.642, cov_re=1.199, scale=0.67799,
                      llf=-123.997)

        # AIC(r)
        assert_allclose(ml_fit.aic, 255.9944, rtol=1e-3)

        # BIC(r)
        assert_allclose(ml_fit.bic, 264.3718, rtol=1e-3)
예제 #39
0
# How to estimate a multilevel GLM with the statsmodels package

# Only the Gaussian model is shown, because the gamma model is not implemented in the package;
# see: https://www.statsmodels.org/devel/mixed_glm.html

from pandas import read_csv
from statsmodels.regression.mixed_linear_model import MixedLM

if __name__ == '__main__':
    # The data path is relative, so the working directory has to be the
    # project directory.
    gaussian_data = read_csv("./data/Gaussian_identity_data.csv")

    # Three covariates, with the "group_index" column as grouping variable.
    model = MixedLM.from_formula(
        "y ~ x1 + x2 + x3",
        data=gaussian_data,
        groups=gaussian_data["group_index"],
    )
    model_result = model.fit()

    # Summary table of the fit (includes the fixed effects)
    print(model_result.summary())

    # Estimated random effects of each group
    print(model_result.random_effects)

    # Dispersion parameter (scale)
    print(model_result.scale)
예제 #40
0
def test_random_effects_getters():
    """Check that predicted random effects track the simulated ones.

    Simulation-based test to make sure that the BLUPs and actual random
    effects line up: for every random-effect coordinate the correlation
    between prediction and truth across groups has to exceed 0.8.  The
    predictive covariances are only smoke-tested for their shape.
    """
    np.random.seed(34234)
    ng = 500  # number of groups
    m = 10  # group size

    y, x, z, v0, v1, g, b, c0, c1 = [], [], [], [], [], [], [], [], []

    for i in range(ng):

        # Fixed effects
        xx = np.random.normal(size=(m, 2))
        yy = xx[:, 0] + 0.5 * np.random.normal(size=m)

        # Random effects (re_formula); the first one has the larger scale.
        zz = np.random.normal(size=(m, 2))
        bb = np.random.normal(size=2)
        bb[0] *= 3
        bb[1] *= 1
        yy += np.dot(zz, bb).flat
        b.append(bb)

        # First variance component: first half of the group against the
        # second half.  Builtin int replaces the removed np.int alias.
        vv0 = np.kron(np.r_[0, 1], np.ones(m // 2)).astype(int)
        cc0 = np.random.normal(size=2)
        yy += cc0[vv0]
        v0.append(vv0)
        c0.append(cc0)

        # Second variance component: alternating rows.
        vv1 = np.kron(np.ones(m // 2), np.r_[0, 1]).astype(int)
        cc1 = np.random.normal(size=2)
        yy += cc1[vv1]
        v1.append(vv1)
        c1.append(cc1)

        y.append(yy)
        x.append(xx)
        z.append(zz)
        g.append(["g%d" % i] * m)

    y = np.concatenate(y)
    x = np.concatenate(x)
    z = np.concatenate(z)
    v0 = np.concatenate(v0)
    v1 = np.concatenate(v1)
    g = np.concatenate(g)
    df = pd.DataFrame({
        "y": y,
        "x0": x[:, 0],
        "x1": x[:, 1],
        "z0": z[:, 0],
        "z1": z[:, 1],
        "v0": v0,
        "v1": v1,
        "g": g
    })

    # True effects: one row per group; cc holds the two levels of v0
    # followed by the two levels of v1.
    b = np.asarray(b)
    c0 = np.asarray(c0)
    c1 = np.asarray(c1)
    cc = np.concatenate((c0, c1), axis=1)

    model = MixedLM.from_formula("y ~ x0 + x1",
                                 re_formula="~0 + z0 + z1",
                                 vc_formula={
                                     "v0": "~0+C(v0)",
                                     "v1": "0+C(v1)"
                                 },
                                 groups="g",
                                 data=df)
    result = model.fit()

    # The first two entries of each group's vector are compared with the
    # simulated re_formula effects.
    ref = result.random_effects
    b0 = [ref["g%d" % k][0:2] for k in range(ng)]
    b0 = np.asarray(b0)
    assert (np.corrcoef(b0[:, 0], b[:, 0])[0, 1] > 0.8)
    assert (np.corrcoef(b0[:, 1], b[:, 1])[0, 1] > 0.8)

    # Entries 2-5 are compared with the variance-component effects.
    cf0 = [ref["g%d" % k][2:6] for k in range(ng)]
    cf0 = np.asarray(cf0)
    for k in range(4):
        assert (np.corrcoef(cf0[:, k], cc[:, k])[0, 1] > 0.8)

    # Smoke test for predictive covariances
    refc = result.random_effects_cov
    for key in refc.keys():
        p = ref[key].size
        assert (refc[key].shape == (p, p))
def big_ass_matrix(df, y, x, group=None, short=True):
    """Fit one regression per (response, covariate set) pair and plot the best.

    For every response name in ``y`` and every covariate set returned by
    ``combinatorial(x, short)``, a model is fitted on the rows of ``df``
    that have no missing values in the columns involved:

    * ``group is None``: ``sm.GLS`` with an explicit ``"Intercept"`` column
      of ones appended to the covariates.
    * otherwise: ``MixedLM.from_formula`` with ``df2[group]`` as the groups.

    A model is kept in ``best`` when its AIC is less than 2 above the
    smallest AIC for that response, none of its stored p-values exceeds
    0.05 and, for mixed models only, ``random_effects.abs().mean()[0]`` is
    greater than 0.01.  Each kept model is drawn as an observed-vs-fitted
    scatter plot and written to ``regressions/<response>/`` (PDF) and
    ``../talk/figures/regressions/<response>/`` (PNG).

    NOTE(review): ``combinatorial``, ``delayer``, ``rstr``, ``mmR2`` and
    ``mmPredict`` are defined elsewhere in this file; their exact
    semantics are not visible here.

    Parameters
    ----------
    df : DataFrame holding the response, covariate and grouping columns.
    y : iterable of response column names.
    x : covariate names, forwarded to ``combinatorial``.
    group : name of the grouping column, or None for the GLS path.
    short : forwarded unchanged to ``combinatorial``.

    Returns
    -------
    (best, (models, p, r2, aic)) : dictionaries keyed by response name.
        ``best`` only contains a key for responses with at least one
        selected model; the other four hold one entry per fitted model.

    Side effects
    ------------
    Prints a progress line per response, creates the per-response output
    directories with ``os.mkdir`` (their parents must already exist) and
    saves the figures described above.
    """

    independent = combinatorial(x, short)

    models = {}
    p = {}
    aic = {}
    r2 = {}
    best = {}
    dfs = {}
    bestdf = {}

    for dependent in y:

        # Single-argument parenthesised print: same output on Python 2 and 3.
        print("Regressing for %s" % dependent)

        for covariate in independent:

            if group is None:

                subset = delayer([covariate, dependent])
                df2 = df[subset].dropna()
                df2["Intercept"] = np.ones(len(df2))
                dfs.setdefault(dependent, []).append(df2)

                ols = sm.GLS(endog=df2[dependent], exog=df2[delayer([covariate, "Intercept"])]).fit()

                models.setdefault(dependent, []).append(ols)
                # "Intercept" is the last exog column, so [:-1] drops its p-value.
                p.setdefault(dependent, []).append(ols.pvalues[:-1].values)
                aic.setdefault(dependent, []).append(ols.aic)
                r2.setdefault(dependent, []).append(ols.rsquared)

            else:

                subset = delayer([covariate, dependent, group])
                df2 = df[subset].dropna()
                dfs.setdefault(dependent, []).append(df2)

                ols = MixedLM.from_formula(rstr(y=dependent, x=covariate), data=df2, groups=df2[group]).fit()

                models.setdefault(dependent, []).append(ols)
                # AIC is computed by hand from the log-likelihood and k_fe.
                aic.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
                # Drops the first and the last entry of the p-value vector.
                p.setdefault(dependent, []).append(ols.pvalues[1:-1].values)
                r2.setdefault(dependent, []).append(mmR2(df2, ols))

        bestAIC = np.min(aic[dependent])

        for i, val in enumerate(models[dependent]):

            if aic[dependent][i] < 2 + bestAIC:

                if np.sum(p[dependent][i] > 0.05) == 0:

                    if group is None:

                        best.setdefault(dependent, []).append(val)
                        bestdf.setdefault(dependent, []).append(dfs[dependent][i])

                    else:

                        if val.random_effects.abs().mean()[0] > 0.01:

                            best.setdefault(dependent, []).append(val)
                            bestdf.setdefault(dependent, []).append(dfs[dependent][i])

        # ``in`` replaces the Python 2-only ``dict.has_key``.
        if dependent in best:
            for i, model in enumerate(best[dependent]):

                if not os.path.exists("regressions/%s" % dependent):
                    os.mkdir("regressions/%s" % dependent)

                if not os.path.exists("../talk/figures/regressions/%s" % dependent):
                    os.mkdir("../talk/figures/regressions/%s" % dependent)

                if group is None:

                    dfx = bestdf[dependent][i]
                    plt.scatter(model.fittedvalues.values, dfx[model.model.endog_names].values, c=seaborn.color_palette("deep", 8)[0])
                    # Identity line: observed plotted against itself.
                    plt.plot(dfx[model.model.endog_names].values, dfx[model.model.endog_names].values, c=seaborn.color_palette("deep", 8)[2])
                    plt.ylabel(model.model.endog_names)
                    yl = model.model.exog_names[:]
                    yl.remove("Intercept")
                    plt.xlabel("Estimate using " + ", ".join(yl))
                    plt.title(rstr(dependent, model.model.exog_names).replace(" + Intercept", ""))
                    st = ("$R^2$ = %.03f\n\n" % model.rsquared)
                    for coefnum, coef in enumerate(yl):
                        st += ("%s" % coef)
                        st += (" : %.03f\n" % model.params[coef])
                        st += ("$p$ = %.01e\n\n" % model.pvalues[coefnum])
                    plt.text(0.01, .99, st, va="top", ha="left")
                    plt.xlim([-0.05, 1.05])
                    plt.ylim([-0.05, 1.05])
                    plt.savefig("regressions/%s/lm-%d.pdf" % (dependent, i))
                    # No ``jpeg_quality`` keyword: it is not a savefig
                    # parameter and had no effect on PNG output.
                    plt.savefig("../talk/figures/regressions/%s/lm-%d.png" % (dependent, i), dpi=300)
                    plt.close()

                else:
                    dfx = bestdf[dependent][i]
                    # Named ``yobs`` so the ``y`` parameter being iterated
                    # by the outer loop is not rebound.
                    yobs, yhat = mmPredict(model.model.data.frame, model)
                    plt.scatter(yhat, yobs, c=seaborn.color_palette("deep", 8)[0])
                    plt.plot(yobs, yobs, c=seaborn.color_palette("deep", 8)[2])
                    plt.ylabel(model.model.endog_names)
                    yl = model.model.exog_names[:]
                    yl.remove("Intercept")
                    plt.xlabel("Estimate using " + ", ".join(yl))
                    plt.title(rstr(dependent, model.model.exog_names).replace("Intercept + ", ""))

                    st = ("$R^2$ = %.03f\n\n" % mmR2(dfx, model))
                    for coefnum, coef in enumerate(yl):
                        st += coef
                        # +1 skips the first fixed-effect entry.
                        st += " : %.03f\n" % model.fe_params[1 + coefnum]
                        st += "$p$ = %.01e\n\n" % model.pvalues[coef]
                    st += ("Avg. abs. RE coef. : %.03f" % model.random_effects.abs().mean())
                    plt.text(0.01, .99, st, va="top", ha="left")

                    plt.xlim([-0.05, 1.05])
                    plt.ylim([-0.05, 1.05])
                    plt.savefig("regressions/%s/mm_%d.pdf" % (dependent, i))
                    plt.savefig("../talk/figures/regressions/%s/mm_%d.png" % (dependent, i), dpi=300)
                    plt.close()

    return best, (models, p, r2, aic)
예제 #42
0
    def test_pastes_vcomp(self):
        # Variance-component fit of the "pastes" data set from lme4,
        # checked against R for both REML and ML estimation.
        #
        # The reference numbers were produced in R with:
        #
        # r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data)
        # r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data,
        #          reml=FALSE)

        here = os.path.dirname(os.path.abspath(__file__))
        pastes = pd.read_csv(os.path.join(here, 'results', 'pastes.csv'))

        # Both fits share one model specification; only `reml` differs.
        spec = dict(groups="batch",
                    re_formula="1",
                    vc_formula={"cask": "0 + cask"},
                    data=pastes)

        # ---- REML (the default of fit) ----
        reml_fit = MixedLM.from_formula("strength ~ 1", **spec).fit()

        # fixef(r)
        assert_allclose(reml_fit.fe_params.iloc[0], 60.0533, rtol=1e-3)
        # sqrt(diag(vcov(r)))
        assert_allclose(reml_fit.bse.iloc[0], 0.6769, rtol=1e-3)
        # VarCorr(r)$batch[[1]]
        assert_allclose(reml_fit.cov_re.iloc[0, 0], 1.657, rtol=1e-3)
        # attr(VarCorr(r), "sc")^2
        assert_allclose(reml_fit.scale, 0.678, rtol=1e-3)
        # logLik(r)
        assert_allclose(reml_fit.llf, -123.49, rtol=1e-1)

        # Information criteria are not provided under REML.
        assert_equal(reml_fit.aic, np.nan)
        assert_equal(reml_fit.bic, np.nan)

        # resid(r)[1:5]
        expected_resid = np.array([0.17133538, -0.02866462, -1.08662875,
                                   1.11337125, -0.12093607])
        assert_allclose(reml_fit.resid[:5], expected_resid, rtol=1e-3)

        # predict(r)[1:5]
        expected_fit = np.array([62.62866, 62.62866, 61.18663, 61.18663,
                                 62.82094])
        assert_allclose(reml_fit.fittedvalues[:5], expected_fit, rtol=1e-4)

        # ---- ML ----
        ml_fit = MixedLM.from_formula("strength ~ 1", **spec).fit(reml=False)

        # fixef(r)
        assert_allclose(ml_fit.fe_params.iloc[0], 60.0533, rtol=1e-3)
        # sqrt(diag(vcov(r)))
        assert_allclose(ml_fit.bse.iloc[0], 0.642, rtol=1e-3)
        # VarCorr(r)$batch[[1]]
        assert_allclose(ml_fit.cov_re.iloc[0, 0], 1.199, rtol=1e-3)
        # attr(VarCorr(r), "sc")^2
        assert_allclose(ml_fit.scale, 0.67799, rtol=1e-3)
        # logLik(r)
        assert_allclose(ml_fit.llf, -123.997, rtol=1e-1)
        # AIC(r)
        assert_allclose(ml_fit.aic, 255.9944, rtol=1e-3)
        # BIC(r)
        assert_allclose(ml_fit.bic, 264.3718, rtol=1e-3)
예제 #43
0
        continue
    subj, block, trial = match.group(1), match.group(2), match.group(3)
    print(filename)
    stc = mne.read_source_estimate("{}/stcs/{}".format(proc_dir,filename))
    stc = morphs["ATT_"+subj].apply(stc)
    ev_str = "Subj=='ATT_{}' and Block=='{}' and TrialIdx=={}".format(subj, block, int(trial))
    row_idx = np.where(np.array(df.eval(ev_str)))[0]
    temp_data = mne.extract_label_time_course(stc,labels,fs_src,mode="mean")
    temp_data = temp_data.mean(axis=1)
    for lab_idx, ln in enumerate(stats_label_names):
        df.at[row_idx,ln] = temp_data[lab_idx]

# Cast every label column to float64 before it is used as a response.
df = df.astype({ln:np.float64 for ln in stats_label_names})
# For each label column, fit three mixed models grouped by "Subj", all by
# maximum likelihood (reml=False).
for ln_idx, ln in enumerate(stats_label_names):
    # 1) Null model: intercept only.
    formula = "{} ~ 1".format(ln)
    model = MixedLM.from_formula(formula, df, groups=df["Subj"])
    mod_fit = model.fit(reml=False)
    mod_fit.save("{}{}/null_reg70_lmm_byresp_{}.pickle".format(lmm_dir,band,ln_idx))

    # 2) Additive model: RT and Block as fixed effects, with a random
    #    intercept and random RT slope per group (re_formula "1 + RT").
    formula = "{} ~ RT + Block".format(ln)
    re_formula = "1 + RT"
    model = MixedLM.from_formula(formula, df, groups=df["Subj"],
                                 re_formula=re_formula)
    mod_fit = model.fit(reml=False)
    mod_fit.save("{}{}/simple_reg70_lmm_byresp_{}.pickle".format(lmm_dir,band,ln_idx))

    # 3) Interaction model: RT crossed with Block, 'audio' as the reference
    #    level of Block; same random-effects structure as (2).
    # NOTE(review): this fit is not saved within the visible lines - the
    # snippet appears to be cut off here.
    formula = "{} ~ RT*C(Block, Treatment('audio'))".format(ln)
    re_formula = "1 + RT"
    model = MixedLM.from_formula(formula, df, groups=df["Subj"],
                                 re_formula=re_formula)
    mod_fit = model.fit(reml=False)
예제 #44
0
            #sel_inds = (df["Block"]==cond) & (df["Subj"]==sub)

            dPTE_slices = {}
            for k, v in pairs_info.items():
                temp_inds = list(zip(*v["inds"]))
                dPTE_slices[k] = dPTE[:, temp_inds[0],
                                      temp_inds[1]].mean(axis=1)

            if avg_trials:
                dm = dm.append(df[sel_inds])
                #dm = dm.append({"Subj":sub,"Block":cond},ignore_index=True)
                for k, v in dPTE_slices.items():
                    data[k].append(v.mean())
                group_id.append(sub_idx)
            else:
                for epo_idx in range(len(dPTE)):
                    dm = dm.append(df[sel_inds])
                    #dm = dm.append({"Subj":sub,"Block":cond},ignore_index=True)
                    for k, v in dPTE_slices.items():
                        data[k].append(v[epo_idx, ])
                    group_id.append(sub_idx)

# Fixed-effects formula shared by every model below; 'audio' is the
# reference level of Block.
formula = "Brain ~ Laut + Angenehm + C(Block, Treatment('audio')) + Wav"
#formula = "Brain ~ C(Block, Treatment('rest'))"
# One fitted mixed model per key of `data`, stored under that key.
mod_fits = {}
for k, v in data.items():
    # Work on a copy so the shared design frame `dm` never gains the
    # response column.
    dm_temp = dm.copy()
    # presumably len(v) equals the number of rows in dm - verify against
    # the code that fills `data` and `dm`.
    dm_temp["Brain"] = v
    model = MixedLM.from_formula(formula, dm_temp, groups=group_id)
    # Default fit options (no reml argument is passed).
    mod_fits[k] = model.fit()
예제 #45
0
def test_random_effects_getters():
    # Simulation-based test to make sure that the BLUPs and actual
    # random effects line up.
    #
    # Each of the `ng` groups gets two random slopes (re_formula) and two
    # two-level variance components.  After fitting, the predicted random
    # effects must correlate (> 0.8) with the simulated ones.

    np.random.seed(34234)
    ng = 500  # number of groups
    m = 10  # group size

    y, x, z, v0, v1, g, b, c0, c1 = [], [], [], [], [], [], [], [], []

    # NOTE: the order of the np.random calls below fixes the simulated
    # data for the seed above; do not reorder them.
    for i in range(ng):

        # Fixed effects
        xx = np.random.normal(size=(m, 2))
        yy = xx[:, 0] + 0.5 * np.random.normal(size=m)

        # Random effects (re_formula)
        zz = np.random.normal(size=(m, 2))
        bb = np.random.normal(size=2)
        bb[0] *= 3
        bb[1] *= 1
        yy += np.dot(zz, bb).flat
        b.append(bb)

        # First variance component: first half of the group is level 0,
        # second half level 1.  The builtin `int` replaces the `np.int`
        # alias, which was removed from NumPy.
        vv0 = np.kron(np.r_[0, 1], np.ones(m // 2)).astype(int)
        cc0 = np.random.normal(size=2)
        yy += cc0[vv0]
        v0.append(vv0)
        c0.append(cc0)

        # Second variance component: levels alternate 0, 1, 0, 1, ...
        vv1 = np.kron(np.ones(m // 2), np.r_[0, 1]).astype(int)
        cc1 = np.random.normal(size=2)
        yy += cc1[vv1]
        v1.append(vv1)
        c1.append(cc1)

        y.append(yy)
        x.append(xx)
        z.append(zz)
        g.append(["g%d" % i] * m)

    y = np.concatenate(y)
    x = np.concatenate(x)
    z = np.concatenate(z)
    v0 = np.concatenate(v0)
    v1 = np.concatenate(v1)
    g = np.concatenate(g)
    df = pd.DataFrame({
        "y": y,
        "x0": x[:, 0],
        "x1": x[:, 1],
        "z0": z[:, 0],
        "z1": z[:, 1],
        "v0": v0,
        "v1": v1,
        "g": g,
    })

    b = np.asarray(b)
    c0 = np.asarray(c0)
    c1 = np.asarray(c1)
    cc = np.concatenate((c0, c1), axis=1)

    model = MixedLM.from_formula(
        "y ~ x0 + x1",
        re_formula="~0 + z0 + z1",
        vc_formula={
            "v0": "~0+C(v0)",
            "v1": "0+C(v1)"
        },
        groups="g",
        data=df)
    result = model.fit()

    # Per group, entries 0:2 are compared with the simulated random
    # slopes and entries 2:6 with the four variance-component effects.
    ref = result.random_effects
    b0 = np.asarray([ref["g%d" % k][0:2] for k in range(ng)])
    assert (np.corrcoef(b0[:, 0], b[:, 0])[0, 1] > 0.8)
    assert (np.corrcoef(b0[:, 1], b[:, 1])[0, 1] > 0.8)

    cf0 = np.asarray([ref["g%d" % k][2:6] for k in range(ng)])
    for k in range(4):
        assert (np.corrcoef(cf0[:, k], cc[:, k])[0, 1] > 0.8)

    # Smoke test for predictive covariances: one square matrix per group,
    # sized like that group's random-effects vector.  A separate loop
    # name keeps the `g` array above from being rebound.
    refc = result.random_effects_cov
    for grp in refc:
        p = ref[grp].size
        assert (refc[grp].shape == (p, p))
예제 #46
0
# Flatten the nested sequence of axes into one list, indexable by block.
axes = [ax for sublist in axes for ax in sublist]
# One bar chart per block: mean "Angenehm" rating per wav, with the
# standard error of the mean as error bars.
for block_idx,block in enumerate(blocks):
    angs = []
    for wav in wavs:
        # NOTE(review): the values come from df_ang but both boolean masks
        # are built from df_laut - confirm the two frames share the same
        # index and row order, or whether df_ang masks were intended.
        angs.append(df_ang.loc[df_laut["Wav"]==wav]["Angenehm"][df_laut["Block"]==block].values)
    # assumes every wav yields the same number of ratings so this becomes
    # a 2-D (wav, rating) array - TODO confirm
    angs = np.array(angs)
    angs_mean = np.mean(angs,axis=1)
    print(angs_mean)
    sem = stats.sem(angs,axis=1)
    plt.sca(axes[block_idx])
    plt.bar(np.arange(len(wavs)),angs_mean,yerr=sem,tick_label=wavs)
    plt.title(block)

# Mixed model for the "Laut" rating: Block, Wav and their interaction as
# fixed effects, grouped by subject.
groups = df_laut["Subj"]
formula = "Laut ~ Block*Wav"
laut_model = MixedLM.from_formula(formula, df_laut, groups=groups)
laut_mf = laut_model.fit()
print(laut_mf.summary())

# Same model structure for the "Angenehm" rating.
groups = df_ang["Subj"]
formula = "Angenehm ~ Block*Wav"
ang_model = MixedLM.from_formula(formula, df_ang, groups=groups)
ang_mf = ang_model.fit()
print(ang_mf.summary())

# Large bold font applied through matplotlib's rc settings before the
# next figure is created.
font = {'weight' : 'bold',
        'size'   : 38}
matplotlib.rc('font', **font)

# New 1x2 figure; this rebinds `axes`, replacing the flattened list above.
fig, axes = plt.subplots(1, 2, figsize=(38.4, 21.6))
angs_block = []
예제 #47
0
def test_get_distribution():
    # Check that a draw from MixedLM.get_distribution has the marginal
    # variance implied by the parameters the distribution was built from.

    np.random.seed(234)

    n = 100
    n_groups = 10
    fe_params = np.r_[1, -2]
    cov_re = np.asarray([[1, 0.5], [0.5, 2]])
    vcomp = np.r_[0.5**2, 1.5**2]
    scale = 1.5

    exog_fe = np.random.normal(size=(n, 2))
    exog_re = np.random.normal(size=(n, 2))
    exog_vca = np.random.normal(size=(n, 2))
    exog_vcb = np.random.normal(size=(n, 2))

    # Equal-sized groups.  The repeat count must be an integer, so use
    # floor division (`n / n_groups` is a float on Python 3), and the
    # builtin `int` instead of the removed `np.int` alias.
    groups = np.repeat(np.arange(n_groups, dtype=int),
                       n // n_groups)

    ey = np.dot(exog_fe, fe_params)

    # Correlated random effects with covariance cov_re.
    u = np.random.normal(size=(n_groups, 2))
    u = np.dot(u, np.linalg.cholesky(cov_re).T)

    # Independent effects for the two variance components.
    u1 = np.sqrt(vcomp[0]) * np.random.normal(size=(n_groups, 2))
    u2 = np.sqrt(vcomp[1]) * np.random.normal(size=(n_groups, 2))

    y = ey + (u[groups, :] * exog_re).sum(1)
    y += (u1[groups, :] * exog_vca).sum(1)
    y += (u2[groups, :] * exog_vcb).sum(1)
    y += np.sqrt(scale) * np.random.normal(size=n)

    df = pd.DataFrame({"y": y, "x1": exog_fe[:, 0], "x2": exog_fe[:, 1],
                       "z0": exog_re[:, 0], "z1": exog_re[:, 1],
                       "grp": groups})
    df["z2"] = exog_vca[:, 0]
    df["z3"] = exog_vca[:, 1]
    df["z4"] = exog_vcb[:, 0]
    df["z5"] = exog_vcb[:, 1]

    vcf = {"a": "0 + z2 + z3", "b": "0 + z4 + z5"}
    m = MixedLM.from_formula("y ~ 0 + x1 + x2", groups="grp",
                             re_formula="0 + z0 + z1",
                             vc_formula=vcf, data=df)

    # Build a params vector that is comparable to
    # MixedLMResults.params
    # Import the class explicitly rather than relying on the submodule
    # already being loaded as an attribute of the top-level package.
    from statsmodels.regression.mixed_linear_model import MixedLMParams
    po = MixedLMParams.from_components(fe_params=fe_params, cov_re=cov_re,
                                       vcomp=vcomp)
    pa = po.get_packed(has_fe=True, use_sqrt=False)
    # Everything after the fixed effects is divided by the scale.
    pa[len(fe_params):] /= scale

    # Get a realization
    dist = m.get_distribution(pa, scale, None)
    yr = dist.rvs(0)

    # Check the overall variance: random-effects part, both variance
    # components and the residual scale, averaged over observations.
    v = (np.dot(exog_re, cov_re) * exog_re).sum(1).mean()
    v += vcomp[0] * (exog_vca**2).sum(1).mean()
    v += vcomp[1] * (exog_vcb**2).sum(1).mean()
    v += scale
    assert_allclose(np.var(yr - ey), v, rtol=1e-2, atol=1e-4)
예제 #48
0
        if v["from"][0] == "all":
            from_inds = np.arange(mat_n)
        else:
            from_inds = np.array([label_names.index(x) for x in v["from"]])
        from_mat = these_data[:, from_inds, ]
        from_mat = np.nanmean(from_mat, axis=1)

        if v["to"][0] == "all":
            to_inds = np.arange(mat_n)
        else:
            to_inds = np.array([label_names.index(x) for x in v["to"]])
        to_mat = from_mat[:, to_inds]
        quant = np.nanmean(to_mat, axis=1)

        df["Brain"] = quant
        model = MixedLM.from_formula(formula, df, groups=group_id)
        mod_fit = model.fit(reml=False)
        print(mod_fit.summary())
        stat_cond = "C(Block, Treatment('rest'))[T.task]"
        CIs = mod_fit.conf_int()
        mod_ests[k] = {
            "Rest": mod_fit.params["Intercept"],
            "Task": mod_fit.params[stat_cond],
            "Rest_CIs": np.array([CIs[0]["Intercept"], CIs[1]["Intercept"]]),
            "Task_CIs": np.array([CIs[0][stat_cond], CIs[1][stat_cond]])
        }
    fig, ax = dpte_bar(mod_ests)
else:
    these_data = data.copy()
    triu_inds, tril_inds = np.triu_indices(mat_n, k=1), np.tril_indices(mat_n,
                                                                        k=-1)
# Progress markers use the parenthesised single-argument form, which prints
# the same text on Python 2 and Python 3.

# Linear models (group=None) for the histology columns.
print(" -------- 2")
best_lm_hist, stuff_lm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group=None, short=5)

# Mixed models grouped by "AgeAtDeath" for the `pcols` responses.
print(" -------- 3")
best_mm_phys, stuff_mm_phys = big_ass_matrix(df=sheep, y=pcols, x=imagecols, group="AgeAtDeath", short=5)

# Mixed models grouped by "AgeAtDeath" for the histology columns.
print(" -------- 4")
best_mm_hist, stuff_mm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group="AgeAtDeath", short=5)

# <codecell>

# Three mixed models for the same response, all grouped by "AgeAtDeath":
# one with three covariates and two single-covariate comparisons.
y = "BDHyperplasia"
x = ["Inflammation", "Scale", "Directionality"]

# Full model: keep only rows complete in all columns involved.
# NOTE(review): `delayer` and `rstr` are defined elsewhere; presumably they
# flatten the nested name list and build the formula string - confirm.
dfx = sheep[delayer([x, y, "AgeAtDeath"])].dropna()
model = MixedLM.from_formula(rstr(y, x), data=dfx, groups="AgeAtDeath").fit()
#model = sm.GLS(endog=dfx.Portal_inflammation, exog=dfx[["FociSize", "AgeAtDeath"]]).fit()

# Single covariate: Inflammation.
dfx = sheep[["BDHyperplasia", "Inflammation", "AgeAtDeath"]].dropna()
model2 = MixedLM.from_formula(rstr(y, ["Inflammation"]), data=dfx, groups="AgeAtDeath").fit()

# Single covariate: FociSize.
dfx = sheep[["BDHyperplasia", "FociSize", "AgeAtDeath"]].dropna()
model3 = MixedLM.from_formula(rstr(y, ["FociSize"]), data=dfx, groups="AgeAtDeath").fit()

# <codecell>

# Column whose per-group value is looked up below.
ss = "E"
# For each entry of model.random_effects.index, take column `ss` from the
# first sheep row whose AgeAtDeath equals that entry.
s = np.array([sheep[sheep.AgeAtDeath == model.random_effects.index.values[i]][ss].iloc[0] for i in range(len(model.random_effects.index.values))])
# Min-max rescale in place: shift the minimum to 0, then divide by the new
# maximum.
# NOTE(review): if every value is equal the maximum after the shift is 0
# and this divides by zero; in-place division also depends on the dtype of
# `s` being floating point - confirm.
s -= s.min()
s /= s.max()