Example No. 1
def pacf_ols(x, nlags=40):
    '''Calculate partial autocorrelations

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Number of lags for which pacf is returned.  Lag 0 is not returned.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags + 1 elements (including lag 0)

    Notes
    -----
    This solves a separate OLS estimation for each desired lag.
    Requires numpy as np, scikits.statsmodels as sm, and lagmat from the
    statsmodels tsa tools.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should give a better estimate of the constant
    #maybe we can compare small sample properties with a Monte Carlo
    xlags = lagmat(x, nlags)
    x0 = xlags[:, 0]
    xlags = xlags[:, 1:]
    xlags = sm.add_constant(xlags, prepend=True)
    pacf = [1.]
    for k in range(1, nlags + 1):
        # regress x_t on a constant and its first k lags; the coefficient
        # on the k-th lag is the partial autocorrelation at lag k
        res = sm.OLS(x0[k:], xlags[k:, :k + 1]).fit()
        pacf.append(res.params[-1])
    return np.array(pacf)
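
A minimal usage sketch (assumptions: an AR(1) series generated with numpy, and
pacf_ols defined as above with its imports in place; for a stable AR(1) the
lag-1 partial autocorrelation should be near the AR coefficient and higher
lags near zero):

import numpy as np

np.random.seed(12345)
nobs = 500
e = np.random.randn(nobs)
xar = np.zeros(nobs)
for t in range(1, nobs):          # AR(1): x_t = 0.8 x_{t-1} + e_t
    xar[t] = 0.8 * xar[t-1] + e[t]

print pacf_ols(xar, nlags=5)      # expect roughly [1., 0.8, ~0, ~0, ~0, ~0]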
Example No. 2

    def __init__(self):
        from results.results_discrete import Spector
        data = sm.datasets.spector.load()
        data.exog = sm.add_constant(data.exog)
        self.data = data
        self.res1 = Logit(data.endog, data.exog).fit(method="newton", disp=0)
        res2 = Spector()
        res2.logit()
        self.res2 = res2
Example No. 3

    def __init__(self):
        from results.results_discrete import RandHIE
        data = sm.datasets.randhie.load()
        nobs = len(data.endog)
        exog = sm.add_constant(data.exog.view(float).reshape(nobs, -1))
        self.res1 = Poisson(data.endog, exog).fit(method='newton', disp=0)
        res2 = RandHIE()
        res2.poisson()
        self.res2 = res2
Example No. 4

    def __init__(self):
        from results.results_discrete import Anes
        data = sm.datasets.anes96.load()
        exog = data.exog
        exog[:, 0] = np.log(exog[:, 0] + .1)
        exog = np.column_stack((exog[:, 0], exog[:, 2], exog[:, 5:8]))
        exog = sm.add_constant(exog)
        self.res1 = MNLogit(data.endog, exog).fit(method="newton", disp=0)
        res2 = Anes()
        res2.mnlogit_basezero()
        self.res2 = res2
Example No. 5
    def checkOLS(self, exog, endog, x, y):
        reference = sm.OLS(endog, sm.add_constant(exog)).fit()
        result = ols(y=y, x=x)

        assert_almost_equal(reference.params, result._beta_raw)
        assert_almost_equal(reference.df_model, result._df_model_raw)
        assert_almost_equal(reference.df_resid, result._df_resid_raw)
        assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
        assert_almost_equal(reference.pvalues, result._p_value_raw)
        assert_almost_equal(reference.rsquared, result._r2_raw)
        assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
        assert_almost_equal(reference.resid, result._resid_raw)
        assert_almost_equal(reference.bse, result._std_err_raw)
        assert_almost_equal(reference.t(), result._t_stat_raw)
        assert_almost_equal(reference.cov_params(), result._var_beta_raw)
        assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

        _check_non_raw_results(result)
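
A hypothetical driver for this check, assuming the old pandas.stats.api.ols
interface these tests targeted (exog/endog are plain arrays, x/y the matching
DataFrame/Series):

import numpy as np
from pandas import DataFrame, Series

np.random.seed(0)
exog = np.random.randn(50, 3)
endog = np.dot(exog, [1., 2., 3.]) + np.random.randn(50)
x = DataFrame(exog, columns=['a', 'b', 'c'])
y = Series(endog)
# inside the test class: self.checkOLS(exog, endog, x, y)
# compares the raw pandas results against the statsmodels reference fit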
Example No. 7
    import scikits.statsmodels as sm
    import numpy.lib.recfunctions as nprf

    data = sm.datasets.grunfeld.Load()
    # Baltagi doesn't include American Steel
    endog = data.endog[:-20]
    fullexog = data.exog[:-20]
#    fullexog.sort(order=['firm','year'])
    panel_arr = nprf.append_fields(fullexog, 'investment', endog, float,
            usemask=False)
    panel_panda = LongPanel.fromRecords(panel_arr, major_field='year',
            minor_field='firm')

    # the most cumbersome way of doing it: all of the preprocessing by hand
    exog = fullexog[['value','capital']].view(float).reshape(-1,2)
    exog = sm.add_constant(exog)
    panel = group(fullexog['firm'])
    year = fullexog['year']
    panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm','year'],
            equation='invest value capital')
# note that equation doesn't actually do anything but name the variables
    panel_ols = panel_mod.fit(model='pooled')

    panel_be = panel_mod.fit(model='between', effects='oneway')
    panel_fe = panel_mod.fit(model='fixed', effects='oneway')

    panel_bet = panel_mod.fit(model='between', effects='time')
    panel_fet = panel_mod.fit(model='fixed', effects='time')

    panel_fe2 = panel_mod.fit(model='fixed', effects='twoways')
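
For intuition, the oneway "between" estimator can be reproduced by hand:
average endog and exog within each firm and run OLS on the group means (a
minimal sketch reusing endog, exog, and fullexog from above; exog already
contains the constant):

import numpy as np

firms_u = np.unique(fullexog['firm'])
ybar = np.array([endog[fullexog['firm'] == f].mean() for f in firms_u])
xbar = np.array([exog[fullexog['firm'] == f].mean(axis=0) for f in firms_u])
print sm.OLS(ybar, xbar).fit().params   # compare with panel_be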
Example No. 8

# f0 is the factor array defined earlier in the full script (not shown here)
f2xcoef = np.array([[ 0.1,  3.,   1.,   0. ],
                    [ 0.,   0.,   1.5,  0.1],
                    [ 3.,   2.,   1.,   0. ]])
x0 = np.dot(f0, f2xcoef)
x0 += 0.1*np.random.normal(size=x0.shape)
ytrue = np.dot(f0, [1., 1., 1.])
y0 = ytrue + 0.1*np.random.normal(size=ytrue.shape)

xred, fact, eva, eve = pca(x0, keepdim=0)  # pca from the statsmodels sandbox tools
print eve
print fact[:5]
print f0[:5]

import scikits.statsmodels as sm

res = sm.OLS(y0, sm.add_constant(x0)).fit()
print 'OLS on original data'
print res.params
print res.aic
print res.rsquared

#print 'OLS on Factors'
#for k in range(x0.shape[1]):
#    xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
#    fact_wconst = sm.add_constant(fact)
#    res = sm.OLS(y0, fact_wconst).fit()
#    print 'k =', k
#    print res.params
#    print 'aic:  ', res.aic
#    print 'bic:  ', res.bic
#    print 'llf:  ', res.llf
Example No. 9
# ndts = np.column_stack(dts[col] for col in dts.dtype.names)
# ndta = ndts.swapaxes(1, 0)
# "ndta is ndts" returns False?

# for now we just have detailed information about the different strings
# would this approach ever be inappropriate for a string typed variable
# other than dates?
#    descstats(ndts, [1])
#    raw_input("Enter to try second part")
#    descstats(ndts, [1,20,3])

if __name__ == '__main__':
    import scikits.statsmodels as sm
    import os
    data = sm.datasets.longley.Load()
    data.exog = sm.add_constant(data.exog)
    sum1 = descstats(data.exog)
    sum1a = descstats(data.exog[:,:1])

#    loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv'
#    dta=np.recfromcsv(loc)
#    summary2 = descstats(dta,['stpop'])
#    summary3 =  descstats(dta,['stpop','avginc','vio'])
#TODO: needs a by argument
#    summary4 = descstats(dta) this fails
# this is a bug
# p = dta[['stpop']]
# p.view(dtype = np.float, type = np.ndarray)
# this works
# p.view(dtype = np.int, type = np.ndarray)
Example No. 10

firms = ['General Motors', 'Chrysler', 'General Electric', 'Westinghouse',
         'US Steel']
grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now SUR takes a list of arrays.
# The list alternates between the LHS of an equation and the RHS of that
# equation.
# This is very likely to change.
grun_sys = []
for i in firms:
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    exog = grun_exog[index][['value','capital']].view(float).reshape(-1,2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)

# Note that the results in Greene (5th edition) use a slightly different
# version of the Grunfeld data. To reproduce Table 14.1, the following changes
# are necessary.
grun_sys[-2][5] = 261.6
grun_sys[-2][-3] = 645.2
grun_sys[-1][11,2] = 232.6

grun_mod = SUR(grun_sys)
grun_res = grun_mod.fit()
print "Results for the 2-step GLS"
print "Compare to Greene Table 14.1, 5th edition"
print grun_res.params
# or you can do an iterative fit
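
A sketch of the iterative fit mentioned above, assuming the sandbox SUR
implementation accepts an igls flag (check your version):

grun_imod = SUR(grun_sys)
grun_ires = grun_imod.fit(igls=True)
print grun_ires.params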
Example No. 11
"""
Example: scikits.statsmodels.GLS
"""

import scikits.statsmodels as sm
import numpy as np
data = sm.datasets.longley.Load()
data.exog = sm.add_constant(data.exog)

# The Longley dataset is a time series dataset
# Let's assume that the data is heteroskedastic and that we know
# the nature of the heteroskedasticity.  We can then define
# `sigma` and use it to give us a GLS model

# First we will obtain the residuals from an OLS fit

ols_resid = sm.OLS(data.endog, data.exog).fit().resid

# Assume that the error terms follow an AR(1) process with a trend
# resid[i] = beta_0 + rho*resid[i-1] + e[i]
# where e ~ N(0,some_sigma**2)
# and that rho is simply the correlation of the residuals
# a consistent estimator for rho is to regress the residuals
# on the lagged residuals

resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
print resid_fit.t(0)
print resid_fit.pvalues[0]
# While we don't have strong evidence that the errors follow an AR(1)
# process, we continue anyway.
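
The snippet stops here; a sketch of the natural continuation, following the
standard statsmodels GLS example, builds an AR(1) covariance structure from
the estimated rho and hands it to GLS (add_constant appends the constant in
this version, so params[0] is the slope on the lagged residual):

from scipy.linalg import toeplitz

rho = resid_fit.params[0]
order = toeplitz(np.arange(len(data.endog)))
sigma = rho**order                    # sigma[i, j] = rho**|i - j|
gls_results = sm.GLS(data.endog, data.exog, sigma=sigma).fit()
print gls_results.params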
Example No. 12
"""
Created on Thu Mar 25 22:56:45 2010
Author: josef-pktd
"""

import numpy as np
from numpy.testing import assert_almost_equal
import scikits.statsmodels as sm

np.random.seed(87654589)

nobs = 10 #100
x1 = np.random.randn(nobs)
y1 = 10 + 15*x1 + 2*np.random.randn(nobs)

x1 = sm.add_constant(x1) #, prepend=True)
assert_almost_equal(x1, np.vander(x1[:,0],2), 16)
res1 = sm.OLS(y1, x1).fit()
print res1.params
print np.polyfit(x1[:,0], y1, 1)
assert_almost_equal(res1.params, np.polyfit(x1[:,0], y1, 1), 14)
print res1.summary(xname=['x1','const1'])

#regression 2
x2 = np.random.randn(nobs)
y2 = 19 + 17*x2 + 2*np.random.randn(nobs)
#y2 = 10 + 15*x2 + 2*np.random.randn(nobs)  # if H0 is true

x2 = sm.add_constant(x2) #, prepend=True)
assert_almost_equal(x2, np.vander(x2[:,0],2), 16)
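
The snippet breaks off here; a minimal continuation mirroring regression 1
(an assumption about where the script was headed):

res2 = sm.OLS(y2, x2).fit()
print res2.params
assert_almost_equal(res2.params, np.polyfit(x2[:, 0], y2, 1), 14)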
Example No. 13
import numpy as np
import scikits.statsmodels as sm

data = np.loadtxt("burglary.txt",  skiprows=1, usecols = (1,2))

exog = data[:,1]
endog = data[:,0]

endog1 = endog[endog > 0]
exog1 = exog[endog > 0]

exog1 = sm.add_constant(exog1, prepend=True)

glm = sm.GLM(endog1, exog1, family=sm.family.Poisson())
res = glm.fit()

print "res.deviance=" + str(res.deviance)
print "res.scale=" + str(res.scale)
print "res.params=" + str(res.params)
print "res.pearson_chi2=" + str(res.pearson_chi2)
print "res.df_model=" + str(res.df_model)
print "res.null_deviance=" + str(res.null_deviance)
print "res.t()=" + str(res.t())
print "\n"

exog = sm.add_constant(exog, prepend=True)

glm = sm.GLM(endog, exog, family=sm.family.NegativeBinomial())
res = glm.fit()

print "res.deviance=" + str(res.deviance)
Example No. 14
# The proportion of low income families "LOWINC"
# The proportions of minority students,"PERASIAN","PERBLACK","PERHISP"
# The percentage of minority teachers "PERMINTE",
# The median teacher salary including benefits in 1000s "AVSALK"
# The mean teacher experience in years "AVYRSEXP",
# The per-pupil expenditures in thousands "PERSPENK"
# The parent-teacher ratio "PTRATIO"
# The percent of students taking college credit courses "PCTAF",
# The percentage of charter schools in the district "PCTCHRT"
# The percent of schools in the district operating year round "PCTYRRND"
# The following are interaction terms "PERMINTE_AVYRSEXP","PERMINTE_AVSAL",
# "AVYRSEXP_AVSAL","PERSPEN_PTRATIO","PERSPEN_PCTAF","PTRATIO_PCTAF",
# "PERMINTE_AVYRSEXP_AVSAL","PERSPEN_PTRATIO_PCTAF"

data = sm.datasets.star98.Load()
data.exog = sm.add_constant(data.exog)

print """The response variable is (success, failure).  Eg., the first 
observation is """, data.endog[0]
print"""Giving a total number of trials for this observation of
""", data.endog[0].sum()

glm_binom = sm.GLM(data.endog, data.exog, family=sm.family.Binomial())

### In order to fit this model, you must (for now) specify the number of
### trials per observation, i.e., success + failure.
### This is the only time the data_weights argument should be used.

trials = data.endog.sum(axis=1)
binom_results = glm_binom.fit(data_weights=trials)
print """The fitted values are
"""Example: scikits.statsmodels.discretemod
"""

import numpy as np
import scikits.statsmodels as sm

# load the data from Spector and Mazzeo (1980)
# Examples follow Greene's Econometric Analysis Ch. 21 (5th Edition).
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog)

# Linear Probability Model using OLS
lpm_mod = sm.OLS(spector_data.endog,spector_data.exog)
lpm_res = lpm_mod.fit()

# Logit Model
logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
logit_res = logit_mod.fit()

# Probit Model
probit_mod = sm.Probit(spector_data.endog, spector_data.exog)
probit_res = probit_mod.fit()

print "This example is based on Greene Table 21.1 5th Edition"
print "Linear Model"
print lpm_res.params
print "Logit Model"
print logit_res.params
print "Probit Model"
print probit_res.params
#print "Typo in Greene for Weibull, replaced with logWeibull or Gumbel"
Example No. 16
def grangercausalitytests(x, maxlag):
    '''Four tests for Granger causality of two time series.

    This is a proof-of-concept implementation: not cleaned up, it has some
    duplicate calculations, it is memory intensive (builds the full lag array
    for both variables), it prints its results, and it has not been verified
    against other packages.  All four tests give similar results (tests 1 and
    4 are identical).

    Parameters
    ----------
    x : array, 2d, (nobs, 2)
        data to test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag

    Returns
    -------
    None
        all test results are currently printed

    Notes
    -----
    TODO: convert to a function that returns results and compare with other
    packages

    '''
    from scipy import stats # lazy import
    import scikits.statsmodels as sm  # absolute import for now

    for mlg in range(1, maxlag+1):
        print '\nGranger Causality'
        print 'number of lags (no zero)', mlg
        mxlg = mlg + 1  # lagmat counts lags starting at zero, so add one

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        #add constant
        dtaown = sm.add_constant(dta[:,1:mxlg])
        dtajoint = sm.add_constant(dta[:,1:])

        #run ols on both models without and with lags of second variable
        res2down = sm.OLS(dta[:,0], dtaown).fit()
        res2djoint = sm.OLS(dta[:,0], dtajoint).fit()

        #print results
        #for ssr based tests see: http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = (res2down.ssr-res2djoint.ssr)/res2djoint.ssr/(mxlg-1)*res2djoint.df_resid
        print 'ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \
              (fgc1, stats.f.sf(fgc1, mxlg-1, res2djoint.df_resid), res2djoint.df_resid, mxlg-1)

        # Granger Causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs*(res2down.ssr-res2djoint.ssr)/res2djoint.ssr
        print 'ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, df=%d' %  \
              (fgc2, stats.chi2.sf(fgc2, mxlg-1), mxlg-1)

        #likelihood ratio test pvalue:
        lr = -2*(res2down.llf-res2djoint.llf)
        print 'likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %  \
              (lr, stats.chi2.sf(lr, mxlg-1), mxlg-1)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg-1,mxlg-1)), np.eye(mxlg-1, mxlg-1),\
                                   np.zeros((mxlg-1, 1))))
        ftres = res2djoint.f_test(rconstr)
        print 'parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \
              (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num)
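
A minimal usage sketch (assuming lagmat2ds is importable from the statsmodels
tsa tools; the second column is built to lead the first, so the tests should
reject non-causality):

import numpy as np

np.random.seed(42)
n = 200
xb = np.random.randn(n)
xa = np.zeros(n)
for t in range(1, n):               # xa depends on lagged xb
    xa[t] = 0.5*xa[t-1] + 0.8*xb[t-1] + 0.1*np.random.randn()
grangercausalitytests(np.column_stack((xa, xb)), maxlag=2)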
Example No. 17

def anova_ols(y, x):
    # data2dummy (defined elsewhere in the script) expands the group labels
    # in x into a dummy-variable matrix
    X = sm.add_constant(data2dummy(x))
    res = sm.OLS(y, X).fit()
    return res.fvalue, res.f_pvalue, res.rsquared, np.sqrt(res.mse_resid)
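
A sketch of the data2dummy helper this relies on (hypothetical implementation;
the original is not shown in this snippet):

import numpy as np

def data2dummy(x, returnall=False):
    # one dummy column per distinct group label in x; drop the last column by
    # default so that adding a constant does not create perfect collinearity
    groups = np.unique(x)
    dummies = (x[:, None] == groups[None, :]).astype(float)
    return dummies if returnall else dummies[:, :-1]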
Example No. 18
def add_trend(X, trend="c", prepend=False):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    X : array-like
        Original array of data.
    trend : str {"c","t","ct","ctt"}
        "c" add constant only
        "t" add trend only
        "ct" add constant and linear trend
        "ctt" add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.

    Notes
    -----
    Returns columns as ["ctt","ct","c"] whenever applicable.  There is currently
    no checking for an existing constant or trend.

    See also
    --------
    scikits.statsmodels.add_constant
    """
    #TODO: could be generalized for a trend of arbitrary order
    trend = trend.lower()
    if trend == "c":    # handles structured arrays
        return sm.add_constant(X, prepend=prepend)
    elif trend == "ct" or trend == "t":
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2
    else:
        raise ValueError("trend %s not understood" % trend)
    X = np.asanyarray(X)
    nobs = len(X)
    trendarr = np.vander(np.arange(1,nobs+1, dtype=float), trendorder+1)
    if trend == "t":
        trendarr = trendarr[:,0]
    if not X.dtype.names:
        if not prepend:
            X = np.column_stack((X, trendarr))
        else:
            X = np.column_stack((trendarr, X))
    else:
        return_rec = X.__class__ is np.recarray
        if trendorder == 1:
            if trend == "ct":
                dt = [('trend',float),('const',float)]
            else:
                dt = [('trend', float)]
        elif trendorder == 2:
            dt = [('trend_squared', float),('trend',float),('const',float)]
        trendarr = trendarr.view(dt)
        if prepend:
            X = nprf.append_fields(trendarr, X.dtype.names, [X[i] for i
                in X.dtype.names], usemask=False, asrecarray=return_rec)
        else:
            X = nprf.append_fields(X, trendarr.dtype.names, [trendarr[i] for i
                in trendarr.dtype.names], usemask=False, asrecarray=return_rec)
    return X
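
A quick usage sketch (assuming numpy as np, scikits.statsmodels as sm, and
numpy.lib.recfunctions as nprf are imported at module level):

import numpy as np

Xd = np.random.randn(5, 2)
Xct = add_trend(Xd, trend="ct")     # appends [trend, const] columns
print Xct.shape                     # (5, 4)
print Xct[:, -1]                    # constant column of ones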
Example No. 19
'''
it does not work for all types of R models.

There are also R scripts included with most of the datasets to run
some basic models for comparisons of results to statsmodels.
'''

from rpy import r
import numpy as np
import scikits.statsmodels as sm


examples = [1, 2]

if 1 in examples:
    data = sm.datasets.longley.load()
    y,x = data.endog, sm.add_constant(data.exog)
    des_cols = ['x.%d' % (i+1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    print results.keys()
    print results['coefficients']

if 2 in examples:
    data2 = sm.datasets.star98.load()
    y2,x2 = data2.endog, sm.add_constant(data2.exog)
    import rpy
    y2 = y2[:,0]/y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i+1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
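
The second example breaks off before the model call; a continuation mirroring
example 1 (an assumption about intent):

results2 = r.lm(formula2, data=frame2)
print results2['coefficients']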
"""

import numpy as np
import numpy.testing as npt
from scipy import signal
import scikits.statsmodels as sm
from scikits.statsmodels.regression import GLSAR, yule_walker

examples_all = range(10) + ["test_copy"]

examples = examples_all  # [5]

if 0 in examples:
    print "\n Example 0"
    X = np.arange(1, 8)
    X = sm.add_constant(X)
    Y = np.array((1, 3, 4, 5, 8, 10, 9))
    rho = 2
    model = GLSAR(Y, X, 2)
    for i in range(6):
        results = model.fit()
        print "AR coefficients:", model.rho
        rho, sigma = yule_walker(results.resid, order=model.order)
        model = GLSAR(Y, X, rho)

    par0 = results.params
    print par0
    model0if = GLSAR(Y, X, 2)
    res = model0if.iterative_fit(6)
    print "iterativefit beta", res.params
    results.t()  # is this correct? it does equal params/bse
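
A quick check of the question in that last comment (an assumption about the
old API: results.t() should equal the parameters divided by their standard
errors):

print results.t()
print results.params / results.bse   # should match element-wise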