Example #1
def test_from_formula():
    mod = RecursiveLS.from_formula('cpi ~ m1', data=dta)
    res = mod.fit()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS.from_formula('cpi ~ m1', data=dta)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)
Example #2
def test_from_formula():
    mod = RecursiveLS.from_formula('cpi ~ m1', data=dta)
    res = mod.fit()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS.from_formula('cpi ~ m1', data=dta)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)
Example #3
def test_escaped_variable_name():
    # Rename 'cpi' column to 'CPI_'
    data = macrodata.load().data
    data.rename(columns={'cpi': 'CPI_'}, inplace=True)

    mod = OLS.from_formula('CPI_ ~ 1 + np.log(realgdp)', data=data)
    res = mod.fit()
    assert 'CPI\\_' in res.summary().as_latex()
    assert 'CPI_' in res.summary().as_text()
Example #4
def test_from_formula():
    with pytest.warns(ValueWarning, match="No frequency information"):
        mod = RecursiveLS.from_formula('cpi ~ m1', data=dta)

    res = mod.fit()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS.from_formula('cpi ~ m1', data=dta)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)
Example #5
def test_from_formula():
    with pytest.warns(ValueWarning, match="No frequency information"):
        mod = RecursiveLS.from_formula('cpi ~ m1', data=dta)

    res = mod.fit()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS.from_formula('cpi ~ m1', data=dta)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)
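The tests above assume a module-level `dta` frame with a dated index. A minimal, self-contained sketch of one way that setup might look (the dataset choice and index construction here are assumptions, not taken from the test module):

import pandas as pd
import statsmodels.api as sm

dta = sm.datasets.macrodata.load_pandas().data
# A quarterly DatetimeIndex built from a plain list carries no freq attribute,
# which is the situation that triggers the "No frequency information" ValueWarning.
dates = pd.date_range('1959-01-01', periods=len(dta), freq='QS')
dta.index = pd.DatetimeIndex(list(dates))

mod = sm.RecursiveLS.from_formula('cpi ~ m1', data=dta)
res = mod.fit()
print(res.params)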
Example #6
    def setup_class(cls):
        from .test_diagnostic import get_duncan_data
        endog, exog, labels = get_duncan_data()
        data = pd.DataFrame(np.column_stack((endog, exog)),
                            columns='y const var1 var2'.split(),
                            index=labels)

        res0 = GLM.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        res1 = OLS.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        cls.infl1 = res1.get_influence()
        cls.infl0 = res0.get_influence()
Example #7
    def setup_class(cls):
        from .test_diagnostic import get_duncan_data
        endog, exog, labels = get_duncan_data()
        data = pd.DataFrame(np.column_stack((endog, exog)),
                            columns='y const var1 var2'.split(),
                            index=labels)

        res0 = GLM.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        res1 = OLS.from_formula('y ~ const + var1 + var2 - 1', data).fit()
        cls.infl1 = res1.get_influence()
        cls.infl0 = res0.get_influence()
Example #8
    def setup_class(cls):
        nobs = 30
        np.random.seed(987128)
        x = np.random.randn(nobs, 3)
        y = x.sum(1) + np.random.randn(nobs)
        index = ['obs%02d' % i for i in range(nobs)]
        # add one extra column to check that it doesn't matter
        cls.data = pd.DataFrame(np.round(np.column_stack((y, x)), 4),
                                columns='y var1 var2 var3'.split(),
                                index=index)

        cls.res = OLS.from_formula('y ~ var1 + var2', data=cls.data).fit()
Example #9
    def setup_class(cls):
        nobs = 30
        np.random.seed(987128)
        x = np.random.randn(nobs, 3)
        y = x.sum(1) + np.random.randn(nobs)
        index = ['obs%02d' % i for i in range(nobs)]
        # add one extra column to check that it doesn't matter
        cls.data = pd.DataFrame(np.round(np.column_stack((y, x)), 4),
                                columns='y var1 var2 var3'.split(),
                                index=index)

        cls.res = OLS.from_formula('y ~ var1 + var2', data=cls.data).fit()
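A short hedged sketch (synthetic data, names made up) illustrating the comment about the extra column: from_formula only pulls the columns named in the formula, so 'var3' has no effect on the fit.

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(987128)
x = rng.randn(30, 3)
y = x.sum(1) + rng.randn(30)
df = pd.DataFrame(np.column_stack((y, x)), columns='y var1 var2 var3'.split())

res_full = sm.OLS.from_formula('y ~ var1 + var2', data=df).fit()
res_trim = sm.OLS.from_formula('y ~ var1 + var2', data=df[['y', 'var1', 'var2']]).fit()
np.testing.assert_allclose(res_full.params, res_trim.params)  # identical estimates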
Example #10
def test_missing_formula_predict():
    # see 2171
    nsample = 30

    data = np.linspace(0, 10, nsample)
    null = np.array([np.nan])
    data = pandas.DataFrame({'x': np.concatenate((data, null))})
    beta = np.array([1, 0.1])
    e = np.random.normal(size=nsample + 1)
    data['y'] = beta[0] + beta[1] * data['x'] + e
    model = OLS.from_formula('y ~ x', data=data)
    fit = model.fit()
    fit.predict(exog=data[:-1])
Example #11
def test_missing_formula_predict():
    # see 2171
    nsample = 30

    data = pandas.DataFrame({'x': np.linspace(0, 10, nsample)})
    null = pandas.DataFrame({'x': np.array([np.nan])})
    data = pandas.concat([data, null])
    beta = np.array([1, 0.1])
    e = np.random.normal(size=nsample+1)
    data['y'] = beta[0] + beta[1] * data['x'] + e
    model = OLS.from_formula('y ~ x', data=data)
    fit = model.fit()
    pred = fit.predict(exog=data[:-1])
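Both variants above rely on the formula interface's default missing-data handling: rows containing NaN are dropped when the design matrices are built, so the trailing NaN row never enters the fit. A small hedged sketch of that behaviour with synthetic data:

import numpy as np
import pandas as pd
import statsmodels.api as sm

df = pd.DataFrame({'x': np.r_[np.linspace(0, 10, 30), np.nan]})
df['y'] = 1 + 0.1 * df['x'] + np.random.normal(size=31)

res = sm.OLS.from_formula('y ~ x', data=df).fit()
print(res.nobs)                        # 30.0 -- the NaN row was dropped
print(res.predict(exog=df[:-1])[:5])   # predictions for the complete rows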
Example #12
    def env_corr(self, env_vars, coeff_plot=False, qq_plot=False):
        """
        Determine correlations with environmental/non-discretionary variables
        using a logit regression. Tobit will be implemented when available
        upstream in statsmodels.

        Takes:
            env_vars: A pandas dataframe of environmental variables

        Returns:
            corr_mod: the statsmodels' model instance containing the inputs
                      and results from the logit model.

        Note that there can be no spaces in the variables' names.
        """

        import matplotlib.pyplot as plt
        from statsmodels.regression.linear_model import OLS
        from statsmodels.graphics.gofplots import qqplot
        from seaborn import coefplot

        env_data = _to_dataframe(env_vars)
        corr_data = env_data.join(self['Efficiency'])
        corr_mod = OLS.from_formula(
            "Efficiency ~ " + " + ".join(env_vars.columns), corr_data)
        corr_res = corr_mod.fit()

        #plot coeffs
        if coeff_plot:
            coefplot("Efficiency ~ " + " + ".join(env_vars.columns),
                     data=corr_data)
            plt.xticks(rotation=45, ha='right')
            plt.title('Regression coefficients and standard errors')

        #plot qq of residuals
        if qq_plot:
            qqplot(corr_res.resid, line='s')
            plt.title('Distribution of residuals')

        print(corr_res.summary())

        return corr_res
Example #13
    def env_corr(self, env_vars, coeff_plot=False, qq_plot=False):
        """
        Determine correlations with environmental/non-discretionary variables
        using a logit regression. Tobit will be implemented when available
        upstream in statsmodels.

        Takes:
            env_vars: A pandas dataframe of environmental variables

        Returns:
            corr_mod: the statsmodels' model instance containing the inputs
                      and results from the logit model.

        Note that there can be no spaces in the variables' names.
        """

        import matplotlib.pyplot as plt
        from statsmodels.regression.linear_model import OLS
        from statsmodels.graphics.gofplots import qqplot
        from seaborn import coefplot

        env_data = _to_dataframe(env_vars)
        corr_data = env_data.join(self['Efficiency'])
        corr_mod = OLS.from_formula(
            "Efficiency ~ " + " + ".join(env_vars.columns), corr_data)
        corr_res = corr_mod.fit()

        #plot coeffs
        if coeff_plot:
            coefplot("Efficiency ~ " + " + ".join(env_vars.columns),
                     data=corr_data)
            plt.xticks(rotation=45, ha='right')
            plt.title('Regression coefficients and standard errors')

        #plot qq of residuals
        if qq_plot:
            qqplot(corr_res.resid, line='s')
            plt.title('Distribution of residuals')

        print(corr_res.summary())

        return corr_res
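The pattern worth noting in this function is how the formula string is assembled from the DataFrame's own column names. A hedged, self-contained sketch of that pattern with made-up variable names:

import numpy as np
import pandas as pd
import statsmodels.api as sm

env = pd.DataFrame(np.random.randn(50, 2), columns=['staff_count', 'region_gdp'])
frame = env.copy()
frame['Efficiency'] = 0.5 * env['staff_count'] + np.random.randn(50)

# Column names must contain no spaces, as the docstring warns.
formula = 'Efficiency ~ ' + ' + '.join(env.columns)
res = sm.OLS.from_formula(formula, data=frame).fit()
print(res.summary())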
Example #14
def test_outlier_test():
    endog, exog, labels = get_duncan_data()
    ndarray_mod = OLS(endog, exog).fit()
    rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359, -1.9309187757, 1.8870465798,
        -1.7604905300, -1.7040324156, 1.6024285876, -1.4332485037,
        -1.1044851583, 1.0688582315, 1.0185271840, -0.9024219332,
        -0.9023876471, -0.8830953936, 0.8265782334, 0.8089220547, 0.7682770197,
        0.7319491074, -0.6665962829, 0.5227352794, -0.5135016547, 0.5083881518,
        0.4999224372, -0.4980818221, -0.4759717075, -0.4293565820,
        -0.4114056499, -0.3779540862, 0.3556874030, 0.3409200462, 0.3062248646,
        0.3038999429, -0.3030815773, -0.1873387893, 0.1738050251, 0.1424246593,
        -0.1292266025, 0.1272066463, -0.0798902878, 0.0788467222, 0.0722556991,
        0.0505098280, 0.0233215136, 0.0007112055
    ]
    unadj_p = [
        0.003177202, 0.021170298, 0.047432955, 0.060427645, 0.066248120,
        0.085783008, 0.095943909, 0.116738318, 0.159368890, 0.275822623,
        0.291386358, 0.314400295, 0.372104049, 0.372122040, 0.382333561,
        0.413260793, 0.423229432, 0.446725370, 0.468363101, 0.508764039,
        0.603971990, 0.610356737, 0.613905871, 0.619802317, 0.621087703,
        0.636621083, 0.669911674, 0.682917818, 0.707414459, 0.723898263,
        0.734904667, 0.760983108, 0.762741124, 0.763360242, 0.852319039,
        0.862874018, 0.887442197, 0.897810225, 0.899398691, 0.936713197,
        0.937538115, 0.942749758, 0.959961394, 0.981506948, 0.999435989
    ]
    bonf_p = [
        0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654, 3.8602354,
        4.3174759, 5.2532243, 7.1716001, 12.4120180, 13.1123861, 14.1480133,
        16.7446822, 16.7454918, 17.2050103, 18.5967357, 19.0453245, 20.1026416,
        21.0763395, 22.8943818, 27.1787396, 27.4660532, 27.6257642, 27.8911043,
        27.9489466, 28.6479487, 30.1460253, 30.7313018, 31.8336506, 32.5754218,
        33.0707100, 34.2442399, 34.3233506, 34.3512109, 38.3543568, 38.8293308,
        39.9348989, 40.4014601, 40.4729411, 42.1520939, 42.1892152, 42.4237391,
        43.1982627, 44.1678127, 44.9746195
    ]
    bonf_p = np.array(bonf_p)
    bonf_p[bonf_p > 1] = 1
    sorted_labels = [
        "minister", "reporter", "contractor", "insurance.agent", "machinist",
        "store.clerk", "conductor", "factory.owner", "mail.carrier",
        "streetcar.motorman", "carpenter", "coal.miner", "bartender",
        "bookkeeper", "soda.clerk", "chemist", "RR.engineer", "professor",
        "electrician", "gas.stn.attendant", "auto.repairman", "watchman",
        "banker", "machine.operator", "dentist", "waiter", "shoe.shiner",
        "welfare.worker", "plumber", "physician", "pilot", "engineer",
        "accountant", "lawyer", "undertaker", "barber", "store.manager",
        "truck.driver", "cook", "janitor", "policeman", "architect", "teacher",
        "taxi.driver", "author"
    ]

    res2 = np.c_[rstudent, unadj_p, bonf_p]
    res = oi.outlier_test(ndarray_mod, method='b', labels=labels, order=True)
    np.testing.assert_almost_equal(res.values, res2, 7)
    np.testing.assert_equal(res.index.tolist(),
                            sorted_labels)  # pylint: disable-msg=E1103

    data = pd.DataFrame(np.column_stack((endog, exog)),
                        columns='y const var1 var2'.split(),
                        index=labels)

    # check `order` with pandas bug in #3971
    res_pd = OLS.from_formula('y ~ const + var1 + var2 - 0', data).fit()

    res_outl2 = oi.outlier_test(res_pd, method='b', order=True)
    assert_almost_equal(res_outl2.values, res2, 7)
    assert_equal(res_outl2.index.tolist(), sorted_labels)

    res_outl1 = res_pd.outlier_test(method='b')
    res_outl1 = res_outl1.sort_values(['unadj_p'], ascending=True)
    assert_almost_equal(res_outl1.values, res2, 7)
    assert_equal(res_outl1.index.tolist(), sorted_labels)
    assert_array_equal(res_outl2.index, res_outl1.index)

    # additional keywords in method
    res_outl3 = res_pd.outlier_test(method='b', order=True)
    assert_equal(res_outl3.index.tolist(), sorted_labels)
    res_outl4 = res_pd.outlier_test(method='b', order=True, cutoff=0.15)
    assert_equal(res_outl4.index.tolist(), sorted_labels[:1])
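For reference, `outlier_test` can also be called directly on fitted results outside the test harness. A minimal hedged sketch with synthetic data and one planted outlier:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
df = pd.DataFrame({'x': rng.normal(size=40)})
df['y'] = 2.0 + 0.5 * df['x'] + rng.normal(size=40)
df.loc[0, 'y'] += 8.0  # plant a single outlier

res = sm.OLS.from_formula('y ~ x', data=df).fit()
# Columns: studentized residual, unadjusted p-value, Bonferroni-adjusted p-value
print(res.outlier_test(method='bonferroni').head())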
Example #15
def test_outlier_test():
    endog, exog, labels = get_duncan_data()
    ndarray_mod = OLS(endog, exog).fit()
    rstudent =  [3.1345185839, -2.3970223990,  2.0438046359, -1.9309187757,
                 1.8870465798, -1.7604905300, -1.7040324156,  1.6024285876,
                 -1.4332485037, -1.1044851583,  1.0688582315,  1.0185271840,
                 -0.9024219332, -0.9023876471, -0.8830953936,  0.8265782334,
                 0.8089220547,  0.7682770197,  0.7319491074, -0.6665962829,
                 0.5227352794, -0.5135016547,  0.5083881518,  0.4999224372,
                 -0.4980818221, -0.4759717075, -0.4293565820, -0.4114056499,
                 -0.3779540862,  0.3556874030,  0.3409200462,  0.3062248646,
                 0.3038999429, -0.3030815773, -0.1873387893,  0.1738050251,
                 0.1424246593, -0.1292266025,  0.1272066463, -0.0798902878,
                 0.0788467222,  0.0722556991,  0.0505098280,  0.0233215136,
                 0.0007112055]
    unadj_p = [0.003177202, 0.021170298, 0.047432955, 0.060427645, 0.066248120,
               0.085783008, 0.095943909, 0.116738318, 0.159368890, 0.275822623,
               0.291386358, 0.314400295, 0.372104049, 0.372122040, 0.382333561,
               0.413260793, 0.423229432, 0.446725370, 0.468363101, 0.508764039,
               0.603971990, 0.610356737, 0.613905871, 0.619802317, 0.621087703,
               0.636621083, 0.669911674, 0.682917818, 0.707414459, 0.723898263,
               0.734904667, 0.760983108, 0.762741124, 0.763360242, 0.852319039,
               0.862874018, 0.887442197, 0.897810225, 0.899398691, 0.936713197,
               0.937538115, 0.942749758, 0.959961394, 0.981506948, 0.999435989]
    bonf_p = [0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654, 3.8602354,
            4.3174759, 5.2532243, 7.1716001, 12.4120180, 13.1123861, 14.1480133,
            16.7446822, 16.7454918, 17.2050103, 18.5967357, 19.0453245,
            20.1026416, 21.0763395, 22.8943818, 27.1787396, 27.4660532,
            27.6257642, 27.8911043, 27.9489466, 28.6479487, 30.1460253,
            30.7313018, 31.8336506, 32.5754218, 33.0707100, 34.2442399,
            34.3233506, 34.3512109, 38.3543568, 38.8293308, 39.9348989,
            40.4014601, 40.4729411, 42.1520939, 42.1892152, 42.4237391,
            43.1982627, 44.1678127, 44.9746195]
    bonf_p = np.array(bonf_p)
    bonf_p[bonf_p > 1] = 1
    sorted_labels = ["minister", "reporter", "contractor", "insurance.agent",
            "machinist", "store.clerk", "conductor", "factory.owner",
            "mail.carrier", "streetcar.motorman", "carpenter", "coal.miner",
            "bartender", "bookkeeper", "soda.clerk", "chemist", "RR.engineer",
            "professor", "electrician", "gas.stn.attendant", "auto.repairman",
            "watchman", "banker", "machine.operator", "dentist", "waiter",
            "shoe.shiner", "welfare.worker", "plumber", "physician", "pilot",
            "engineer", "accountant", "lawyer", "undertaker", "barber",
            "store.manager", "truck.driver", "cook", "janitor", "policeman",
            "architect", "teacher", "taxi.driver", "author"]

    res2 = np.c_[rstudent, unadj_p, bonf_p]
    res = oi.outlier_test(ndarray_mod, method='b', labels=labels, order=True)
    np.testing.assert_almost_equal(res.values, res2, 7)
    np.testing.assert_equal(res.index.tolist(), sorted_labels)  # pylint: disable-msg=E1103

    data = pd.DataFrame(np.column_stack((endog, exog)),
                        columns='y const var1 var2'.split(),
                        index=labels)

    # check `order` with pandas bug in #3971
    res_pd = OLS.from_formula('y ~ const + var1 + var2 - 0', data).fit()

    res_outl2 = oi.outlier_test(res_pd, method='b', order=True)
    assert_almost_equal(res_outl2.values, res2, 7)
    assert_equal(res_outl2.index.tolist(), sorted_labels)

    res_outl1 = res_pd.outlier_test(method='b')
    res_outl1 = res_outl1.sort_values(['unadj_p'], ascending=True)
    assert_almost_equal(res_outl1.values, res2, 7)
    assert_equal(res_outl1.index.tolist(), sorted_labels)
    assert_array_equal(res_outl2.index, res_outl1.index)

    # additional keywords in method
    res_outl3 = res_pd.outlier_test(method='b', order=True)
    assert_equal(res_outl3.index.tolist(), sorted_labels)
    res_outl4 = res_pd.outlier_test(method='b', order=True, cutoff=0.15)
    assert_equal(res_outl4.index.tolist(), sorted_labels[:1])
Example #16
    def setup_class(cls):
        formula_outcome = 'bweight ~ prenatal1_ + mmarried_ + mage + fbaby_'
        mod = OLS.from_formula(formula_outcome, dta_cat)
        tind = np.asarray(dta_cat['mbsmoke_'])
        cls.teff = TreatmentEffect(mod, tind, results_select=res_probit)