def notyet_atst():
    d = macrodata.load().data

    realinv = d['realinv']
    realgdp = d['realgdp']
    realint = d['realint']
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint], prepend=True)
    res_ols1 = OLS(endog, exog).fit()

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()

    #the following were done accidentally with res_ols1 in R,
    #with original Greene data
    params = np.array([-272.3986041341653, 0.1779455206941112,
                       0.2149432424658157])
    cov_hac_4 = np.array([1321.569466333051, -0.2318836566017612,
        37.01280466875694, -0.2318836566017614, 4.602339488102263e-05,
        -0.0104687835998635, 37.012804668757, -0.0104687835998635,
        21.16037144168061]).reshape(3, 3, order='F')
    cov_hac_10 = np.array([2027.356101193361, -0.3507514463299015,
        54.81079621448568, -0.350751446329901, 6.953380432635583e-05,
        -0.01268990195095196, 54.81079621448564, -0.01268990195095195,
        22.92512402151113]).reshape(3, 3, order='F')

    #goldfeld-quandt
    het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')
    het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98,
                       pvalue=1.)
    het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98,
                         pvalue=1.246141976112324e-30, distr='f')

    #goldfeld-quandt, fraction = 0.5
    het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47,
                            pvalue=2.154956842194898e-33, distr='f')

    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_t_est(gq, het_gq_greater, decimal=(13, 14))
    assert_equal(gq[-1], 'increasing')

    harvey_collier = dict(stat=2.28042114041313, df=199,
                          pvalue=0.02364236161988260, distr='t')
    #hc = harvtest(fm, order.by=ggdp, data = list())
    harvey_collier_2 = dict(stat=0.7516918462158783, df=199,
                            pvalue=0.4531244858006127, distr='t')
def coint(y1, y2, regression="c"):
    """
    This is a simple cointegration test. Uses unit-root test on residuals to
    test for cointegrated relationship

    See Hamilton (1994) 19.2

    Parameters
    ----------
    y1 : array_like, 1d
        first element in cointegrating vector
    y2 : array_like
        remaining elements in cointegrating vector
    regression : str {'c'}
        Included in regression
        * 'c' : Constant

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels.

    Notes
    -----
    The null hypothesis is that there is no cointegration, the alternative
    hypothesis is that there is a cointegrating relationship. If the pvalue
    is small, below a critical size, then we can reject the hypothesis that
    there is no cointegrating relationship.

    P-values are obtained through regression surface approximation from
    MacKinnon 1994.

    References
    ----------
    MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for
        unit-root and cointegration tests." `Journal of Business and Economic
        Statistics` 12, 167-76.
    """
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("regression option %s not understood" % regression)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if regression == 'c':
        y2 = add_constant(y2)
    st1_resid = OLS(y1, y2).fit().resid  #stage one residuals
    lgresid_cons = add_constant(st1_resid[0:-1])
    uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit()
    coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0]
    pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None)
    crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1))
    return coint_t, pvalue, crit_value
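#Added usage sketch for coint on synthetic data (an illustration, not part of
#the original tests): y2 is a random walk and y1 a noisy linear function of
#it, so the two series are cointegrated and the p-value should be small.
def example_coint():
    np.random.seed(12345)
    y2 = np.cumsum(np.random.normal(size=500))        # random walk
    y1 = 2.0 + 0.5 * y2 + np.random.normal(size=500)  # cointegrated with y2
    coint_t, pvalue, crit_value = coint(y1, y2, regression="c")
    # a small pvalue rejects the null of no cointegration
    return coint_t, pvalue, crit_value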
def test_cov_cluster_2groups():
    # comparing cluster robust standard errors to Petersen
    # requires Petersen's test_data
    # http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt
    import os
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir, "test_data.txt")
    pet = np.genfromtxt(fpath)
    endog = pet[:, -1]
    group = pet[:, 0].astype(int)
    time = pet[:, 1].astype(int)
    exog = add_constant(pet[:, 2], prepend=True)
    res = OLS(endog, exog).fit()

    cov01, covg, covt = sw.cov_cluster_2groups(res, group, group2=time)

    # Reference numbers from Petersen
    # http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.htm
    bse_petw = [0.0284, 0.0284]
    bse_pet0 = [0.0670, 0.0506]
    bse_pet1 = [0.0234, 0.0334]   # year
    bse_pet01 = [0.0651, 0.0536]  # firm and year

    bse_0 = sw.se_cov(covg)
    bse_1 = sw.se_cov(covt)
    bse_01 = sw.se_cov(cov01)
    # print res.HC0_se, bse_petw - res.HC0_se
    # print bse_0, bse_0 - bse_pet0
    # print bse_1, bse_1 - bse_pet1
    # print bse_01, bse_01 - bse_pet01

    assert_almost_equal(bse_petw, res.HC0_se, decimal=4)
    assert_almost_equal(bse_0, bse_pet0, decimal=4)
    assert_almost_equal(bse_1, bse_pet1, decimal=4)
    assert_almost_equal(bse_01, bse_pet01, decimal=4)
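# Added sketch: one-way cluster-robust (Liang-Zeger) covariance computed by
# hand, for comparison with the sw.cov_cluster_2groups results above. This
# omits any finite-sample correction the sw functions may apply, so the
# numbers will differ slightly.
def example_cluster_se(res, groups):
    X = res.model.exog
    u = res.resid
    XtX_inv = np.linalg.inv(np.dot(X.T, X))
    S = np.zeros((X.shape[1], X.shape[1]))
    for g in np.unique(groups):
        score_g = np.dot(X[groups == g].T, u[groups == g])  # per-cluster score
        S += np.outer(score_g, score_g)
    cov = np.dot(XtX_inv, np.dot(S, XtX_inv))  # sandwich estimator
    return np.sqrt(np.diag(cov))               # cluster-robust standard errors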
def test_hac_simple():

    from gwstatsmodels.datasets import macrodata
    d2 = macrodata.load().data
    g_gdp = 400 * np.diff(np.log(d2["realgdp"]))
    g_inv = 400 * np.diff(np.log(d2["realinv"]))
    exogg = add_constant(np.c_[g_gdp, d2["realint"][:-1]], prepend=True)
    res_olsg = OLS(g_inv, exogg).fit()

    # > NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE,
    #             verbose=TRUE, adjust=TRUE)
    # Lag truncation parameter chosen: 4
    #              (Intercept)          ggdp          lint
    cov1_r = [
        [1.40643899878678802, -0.3180328707083329709, -0.060621111216488610],
        [-0.31803287070833292, 0.1097308348999818661, 0.000395311760301478],
        [-0.06062111121648865, 0.0003953117603014895, 0.087511528912470993],
    ]

    # > NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE,
    #             verbose=TRUE, adjust=FALSE)
    # Lag truncation parameter chosen: 4
    #              (Intercept)          ggdp          lint
    cov2_r = [
        [1.3855512908840137, -0.313309610252268500, -0.059720797683570477],
        [-0.3133096102522685, 0.108101169035130618, 0.000389440793564339],
        [-0.0597207976835705, 0.000389440793564336, 0.086211852740503622],
    ]

    cov1, se1 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=True)
    cov2, se2 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=False)
    assert_almost_equal(cov1, cov1_r, decimal=14)
    assert_almost_equal(cov2, cov2_r, decimal=14)
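# Added sketch: the Bartlett-kernel (Newey-West) sandwich covariance written
# out by hand. Without a small-sample adjustment this corresponds to the
# use_correction=False variant above, up to implementation details.
def example_newey_west_cov(res, nlags=4):
    X = res.model.exog
    u = res.resid
    Xu = X * u[:, None]                  # moment contributions x_t * u_t
    S = np.dot(Xu.T, Xu)                 # lag-0 term
    for lag in range(1, nlags + 1):
        w = 1.0 - lag / (nlags + 1.0)    # Bartlett weight
        gamma = np.dot(Xu[lag:].T, Xu[:-lag])
        S += w * (gamma + gamma.T)       # weighted autocovariance terms
    XtX_inv = np.linalg.inv(np.dot(X.T, X))
    return np.dot(XtX_inv, np.dot(S, XtX_inv))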
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    ols_res = OLS(data.endog, data.exog).fit()
    gls_res = GLS(data.endog, data.exog).fit()
    cls.res1 = gls_res
    cls.res2 = ols_res
def pacf_ols(x, nlags=40):
    '''Calculate partial autocorrelations

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Number of lags for which pacf is returned.  Lag 0 is not returned.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, maxlag+1 elements

    Notes
    -----
    This solves a separate OLS estimation for each desired lag.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should have a better estimate of the constant
    #maybe we can compare small sample properties with a MonteCarlo
    xlags, x0 = lagmat(x, nlags, original='sep')
    #xlags = sm.add_constant(lagmat(x, nlags), prepend=True)
    xlags = add_constant(xlags, prepend=True)
    pacf = [1.]
    for k in range(1, nlags + 1):
        res = OLS(x0[k:], xlags[k:, :k + 1]).fit()
        #np.take(xlags[k:], range(1,k+1)+[-1],
        pacf.append(res.params[-1])
    return np.array(pacf)
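#Added usage sketch: for an AR(1) process the OLS-based pacf should be close
#to the autoregressive coefficient at lag 1 and near zero at higher lags.
def example_pacf_ols():
    np.random.seed(12345)
    nobs, phi = 1000, 0.7
    x = np.zeros(nobs)
    for t in range(1, nobs):
        x[t] = phi * x[t - 1] + np.random.normal()
    p = pacf_ols(x, nlags=5)
    # p[0] == 1.0, p[1] approximately 0.7, p[2:] approximately 0
    return p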
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    res1 = OLS(data.endog, data.exog).fit()
    R = np.array([[0, 1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 1, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 0, 0],
                  [0, 0, 0, 0, 0, 1, 0]])
    q = np.array([0, 0, 0, 1, 0])
    cls.Ftest1 = res1.f_test(R, q)
def setupClass(cls):
    from results.results_regression import Longley
    data = longley.load()
    data.exog = add_constant(data.exog)
    res1 = OLS(data.endog, data.exog).fit()
    res2 = Longley()
    res2.wresid = res1.wresid  # workaround hack
    cls.res1 = res1
    cls.res2 = res2

    res_qr = OLS(data.endog, data.exog).fit(method="qr")
    cls.res_qr = res_qr
def test_perfect_pred():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
                         delimiter=",", skip_header=1)
    y = iris[:, -1]
    X = iris[:, :-1]
    X = X[y != 2]
    y = y[y != 2]
    X = add_constant(X, prepend=True)
    glm = GLM(y, X, family=sm.families.Binomial())
    assert_raises(PerfectSeparationError, glm.fit)
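#Added note: with the third iris class dropped, the remaining two classes
#are separated perfectly by the predictors, so the logit MLE diverges and
#fit is expected to raise. A minimal synthetic illustration of the same
#failure mode (the data here are hypothetical):
def example_perfect_separation():
    y = np.array([0, 0, 0, 1, 1, 1.])
    x = add_constant(np.arange(6.), prepend=True)  # threshold at 2.5 separates y
    glm = GLM(y, x, family=sm.families.Binomial())
    assert_raises(PerfectSeparationError, glm.fit)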
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled
                 by the standard deviation of the given sample and have the
                 mean added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = zip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = max(end_pts[0])
        end_pts[1] = min(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        return  # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([.25, .75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
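#Added usage sketch (assumes matplotlib is importable; stats is the
#scipy.stats module already used above): build a hand-rolled normal Q-Q
#plot and add a fitted regression reference line.
def example_qqline():
    import matplotlib.pyplot as plt
    sample = np.sort(np.random.normal(size=200))
    probs = (np.arange(1, 201) - 0.5) / 200.
    theoretical = stats.norm.ppf(probs)        # theoretical normal quantiles
    fig, ax = plt.subplots()
    ax.plot(theoretical, sample, 'bo')
    qqline(ax, 'r', x=theoretical, y=sample)   # regression reference line
    return fig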
def setupClass(cls):
    # if skipR:
    #     raise SkipTest, "Rpy not installed"
    # try:
    #     r.library('car')
    # except RPyRException:
    #     raise SkipTest, "car library not installed for R"
    R = np.zeros(7)
    R[4:6] = [1, -1]
    # self.R = R
    data = longley.load()
    data.exog = add_constant(data.exog)
    res1 = OLS(data.endog, data.exog).fit()
    cls.Ttest1 = res1.t_test(R)
def setupClass(cls):
    from results.results_regression import LongleyGls

    data = longley.load()
    exog = add_constant(np.column_stack(
        (data.exog[:, 1], data.exog[:, 4])))
    tmp_results = OLS(data.endog, exog).fit()
    rho = np.corrcoef(tmp_results.resid[1:],
                      tmp_results.resid[:-1])[0][1]  # by assumption
    order = toeplitz(np.arange(16))
    sigma = rho**order
    GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
    cls.res1 = GLS_results
    cls.res2 = LongleyGls()
def setupClass(cls):
    from results.results_glm import Cpunish
    from gwstatsmodels.datasets.cpunish import load
    data = load()
    data.exog[:, 3] = np.log(data.exog[:, 3])
    data.exog = add_constant(data.exog)
    exposure = [100] * len(data.endog)
    cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(),
                   exposure=exposure).fit()
    cls.res1.params[-1] += np.log(100)  # add exposure back in to param
                                        # to make the results the same
    cls.res2 = Cpunish()
def __init__(self):
    d = macrodata.load().data
    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)
    exogg3 = add_constant(np.c_[gs_l_realgdp], prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()
    res_ols3 = OLS(endogg, exogg3).fit()

    self.res = res_ols
    self.res2 = res_ols2
    self.res3 = res_ols3
    self.endog = self.res.model.endog
    self.exog = self.res.model.exog
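#Added worked example of the growth-rate transform used above:
#np.diff(np.log(x)) is the quarterly log growth rate, and multiplying by
#400 scales it to an annualized rate in percent.
def example_growth_rate():
    x = np.array([100.0, 101.0, 102.5])
    g = 400 * np.diff(np.log(x))  # approx [3.98, 5.89] percent annualized
    return g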
def __init__(self):
    '''
    Tests Poisson family with canonical log link.

    Test results were obtained by R.
    '''
    from results.results_glm import Cpunish
    from gwstatsmodels.datasets.cpunish import load
    self.data = load()
    self.data.exog[:, 3] = np.log(self.data.exog[:, 3])
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Poisson()).fit()
    self.res2 = Cpunish()
def __init__(self):
    '''
    Test Gaussian family with canonical identity link
    '''
    # Test Precisions
    self.decimal_resids = DECIMAL_3
    self.decimal_params = DECIMAL_2
    self.decimal_bic = DECIMAL_0
    self.decimal_bse = DECIMAL_3

    from gwstatsmodels.datasets.longley import load
    self.data = load()
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Gaussian()).fit()
    from results.results_glm import Longley
    self.res2 = Longley()
def __init__(self):
    '''
    Test Binomial family with canonical logit link using star98 dataset.
    '''
    self.decimal_resids = DECIMAL_1
    self.decimal_bic = DECIMAL_2

    from gwstatsmodels.datasets.star98 import load
    from results.results_glm import Star98
    data = load()
    data.exog = add_constant(data.exog)
    self.res1 = GLM(data.endog, data.exog,
                    family=sm.families.Binomial()).fit()
    #NOTE: if you want to replicate with RModel
    #res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
    #              family=r.binomial, weights=trials)
    self.res2 = Star98()
def __init__(self):
    '''
    Tests Gamma family with canonical inverse link (power -1)
    '''
    # Test Precisions
    self.decimal_aic_R = -1  #TODO: off by about 1, we are right with Stata
    self.decimal_resids = DECIMAL_2

    from gwstatsmodels.datasets.scotland import load
    from results.results_glm import Scotvote
    data = load()
    data.exog = add_constant(data.exog)
    res1 = GLM(data.endog, data.exog,
               family=sm.families.Gamma()).fit()
    self.res1 = res1
    # res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
    res2 = Scotvote()
    res2.aic_R += 2  # R doesn't count degree of freedom for scale with gamma
    self.res2 = res2
def __init__(self):
    '''
    Test Negative Binomial family with canonical log link
    '''
    # Test Precision
    self.decimal_resid = DECIMAL_1
    self.decimal_params = DECIMAL_3
    self.decimal_resids = -1  # 1 % mismatch at 0
    self.decimal_fittedvalues = DECIMAL_1

    from gwstatsmodels.datasets.committee import load
    self.data = load()
    self.data.exog[:, 2] = np.log(self.data.exog[:, 2])
    interaction = self.data.exog[:, 2] * self.data.exog[:, 1]
    self.data.exog = np.column_stack((self.data.exog, interaction))
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.NegativeBinomial()).fit()
    from results.results_glm import Committee
    res2 = Committee()
    res2.aic_R += 2  # They don't count a degree of freedom for the scale
    self.res2 = res2
def test_add_constant_has_constant2d(self):
    x = np.asarray([[1, 1, 1, 1], [1, 2, 3, 4.]])
    y = tools.add_constant(x)
    assert_equal(x, y)
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    cls.res1 = OLS(data.endog, data.exog).fit()
    R = np.identity(7)
    cls.Ttest = cls.res1.t_test(R)
def grangercausalitytests(x, maxlag, addconst=True, verbose=True):
    '''four tests for granger causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent based on the F test, which
    is identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array, 2d, (nobs,2)
        data for test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    addconst : bool
        include a constant in the model
    verbose : bool
        print results if true

    Returns
    -------
    results : dictionary
        all test results, dictionary keys are the number of lags. For each
        lag the values are a tuple, with the first element a dictionary with
        test statistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the unrestricted
        model and the restriction (contrast) matrix for the parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the first
    column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired size
    of the test.

    'params_ftest', 'ssr_ftest' are based on F test

    'ssr_chi2test', 'lrtest' are based on chi-square test
    '''
    from scipy import stats  # lazy import

    resli = {}

    for mlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            print '\nGranger Causality'
            print 'number of lags (no zero)', mlg
        mxlg = mlg  #+ 1 # Note number of lags starting at zero in lagmat

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        #add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:mxlg + 1])
            dtajoint = add_constant(dta[:, 1:])
        else:
            raise ValueError('Not Implemented')
            dtaown = dta[:, 1:mxlg]
            dtajoint = dta[:, 1:]

        #run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        #print results
        #for ssr based tests see:
        #http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) /
                res2djoint.ssr / mxlg * res2djoint.df_resid)
        if verbose:
            print 'ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \
                  (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                   res2djoint.df_resid, mxlg)
        result['ssr_ftest'] = (fgc1,
                               stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print 'ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, df=%d' % \
                  (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        #likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print 'likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' % \
                  (lr, stats.chi2.sf(lr, mxlg), mxlg)
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)),
                                   np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print 'parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \
                  (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num)
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
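#Added usage sketch on synthetic data: x2 leads x1 by one period, so the
#tests should reject the null that x2 does not Granger cause x1.
def example_grangercausalitytests():
    np.random.seed(12345)
    nobs = 200
    x2 = np.random.normal(size=nobs)
    x1 = 0.6 * np.roll(x2, 1) + np.random.normal(scale=0.5, size=nobs)
    data = np.column_stack((x1[1:], x2[1:]))  # drop obs affected by roll wraparound
    res = grangercausalitytests(data, maxlag=2, verbose=False)
    # res[1][0]['ssr_ftest'] -> (F statistic, p-value, df_denom, df_num)
    return res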
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    res1 = OLS(data.endog, data.exog).fit()
    R2 = [[0, 1, -1, 0, 0, 0, 0],
          [0, 0, 0, 0, 1, -1, 0]]
    cls.Ftest1 = res1.f_test(R2)
def test_pandas_const_df_prepend():
    dta = longley.load_pandas().exog
    dta = tools.add_constant(dta, prepend=True)
    assert_string_equal('const', dta.columns[0])
    assert_equal(dta.var(0)[0], 0)
def test_pandas_const_series_prepend():
    dta = longley.load_pandas()
    series = dta.exog['GNP']
    series = tools.add_constant(series, prepend=True)
    assert_string_equal('const', series.columns[0])
    assert_equal(series.var(0)[0], 0)
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    cls.res1 = OLS(data.endog, data.exog).fit()
    cls.res2 = WLS(data.endog, data.exog).fit()
def test_add_constant_1d(self):
    x = np.arange(1, 5)
    x = tools.add_constant(x, prepend=True)
    y = np.asarray([[1, 1, 1, 1], [1, 2, 3, 4.]]).T
    assert_equal(x, y)
def test_add_constant_has_constant1d(self):
    x = np.ones(5)
    x = tools.add_constant(x)
    assert_equal(x, np.ones(5))
if __name__ == '__main__':

    import gwstatsmodels.api as sm

    examples = ['ivols', 'distquant'][:]

    if 'ivols' in examples:
        exampledata = ['ols', 'iv', 'ivfake'][1]
        nobs = nsample = 500
        sige = 3
        corrfactor = 0.025

        x = np.linspace(0, 10, nobs)
        X = tools.add_constant(np.column_stack((x, x**2)))
        beta = np.array([1, 0.1, 10])

        def sample_ols(exog):
            endog = np.dot(exog, beta) + sige * np.random.normal(size=nobs)
            return endog, exog, None

        def sample_iv(exog):
            print 'using iv example'
            X = exog.copy()
            e = sige * np.random.normal(size=nobs)
            endog = np.dot(X, beta) + e
            exog[:, 0] = X[:, 0] + corrfactor * e
            z0 = X[:, 0] + np.random.normal(size=nobs)
            z1 = X.sum(1) + np.random.normal(size=nobs)
            z2 = X[:, 1]
def test_all(self):

    d = macrodata.load().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]],
                         prepend=True)

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]], prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)  #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136

    #     coefficient   std. error   t-ratio   p-value   95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990, 0.990456, -9.602, 3.65e-018, -11.4631, -7.55670],  # ***
        [4.37040, 0.208146, 21.00, 2.93e-052, 3.95993, 4.78086],  # ***
        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]])  # **

    #Statistics based on the rho-differenced data:
    result_gretl_g1 = dict(
        endog_mean=("Mean dependent var", 3.113973),
        endog_std=("S.D. dependent var", 18.67447),
        ssr=("Sum squared resid", 22530.90),
        mse_resid_sqrt=("S.E. of regression", 10.66735),
        rsquared=("R-squared", 0.676973),
        rsquared_adj=("Adjusted R-squared", 0.673710),
        fvalue=("F(2, 198)", 221.0475),
        f_pvalue=("P-value(F)", 3.56e-51),
        resid_acf1=("rho", -0.003481),
        dw=("Durbin-Watson", 1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]
    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests
    res = res_g1  #with rho from Gretl

    #basic
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 6)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1],
                        significant=2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests
    res = res_g2  #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 3)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2, 4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2, 4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    '''
    Performing iterative calculation of rho...

                     ITER       RHO        ESS
                       1     -0.10734   22530.9
                       2     -0.10814   22530.9

    Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv
    rho = -0.108136

                     coefficient   std. error   t-ratio    p-value
      -------------------------------------------------------------
      const           -9.50990      0.990456    -9.602    3.65e-018 ***
      ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
      realint_1       -0.579253     0.268009    -2.161    0.0319    **

    Statistics based on the rho-differenced data:

    Mean dependent var   3.113973   S.D. dependent var   18.67447
    Sum squared resid    22530.90   S.E. of regression   10.66735
    R-squared            0.676973   Adjusted R-squared   0.673710
    F(2, 198)            221.0475   P-value(F)           3.56e-51
    rho                 -0.003481   Durbin-Watson        1.993858
    '''

    '''
    RESET test for specification (squares and cubes)
    Test statistic: F = 5.219019,
    with p-value = P(F(2,197) > 5.21902) = 0.00619

    RESET test for specification (squares only)
    Test statistic: F = 7.268492,
    with p-value = P(F(1,198) > 7.26849) = 0.00762

    RESET test for specification (cubes only)
    Test statistic: F = 5.248951,
    with p-value = P(F(1,198) > 5.24895) = 0.023:
    '''

    '''
    Test for ARCH of order 4

                 coefficient   std. error   t-ratio   p-value
      --------------------------------------------------------
      alpha(0)   97.0386       20.3234       4.775    3.56e-06  ***
      alpha(1)    0.176114      0.0714698    2.464    0.0146    **
      alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
      alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
      alpha(4)    0.0384531     0.0725763    0.5298   0.5968

      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
    '''

    '''
    Variance Inflation Factors

    Minimum possible value = 1.0
    Values > 10.0 may indicate a collinearity problem

       ds_l_realgdp    1.002
          realint_1    1.002

    VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation
    coefficient between variable j and the other independent variables

    Properties of matrix X'X:

     1-norm = 6862.0664
     Determinant = 1.0296049e+009
     Reciprocal condition number = 0.013819244
    '''

    '''
    Test for ARCH of order 4 -
      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491

    Test of common factor restriction -
      Null hypothesis: restriction is acceptable
      Test statistic: F(2, 195) = 0.426391
      with p-value = P(F(2, 195) > 0.426391) = 0.653468

    Test for normality of residual -
      Null hypothesis: error is normally distributed
      Test statistic: Chi-square(2) = 20.2792
      with p-value = 3.94837e-005:
    '''

    #no idea what this is
    '''
    Augmented regression for common factor test
    OLS, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv

                       coefficient   std. error   t-ratio    p-value
      ---------------------------------------------------------------
      const            -10.9481      1.35807      -8.062    7.44e-014 ***
      ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
      realint_1         -0.662644    0.334872     -1.979    0.0492    **
      ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
      ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
      realint_2          0.0769695   0.341527      0.2254   0.8219

      Sum of squared residuals = 22432.8

    Test of common factor restriction

      Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
    '''

    ################ with OLS, HAC errors

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient   std. error   t-ratio   p-value   95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    partable = np.array([
        [-9.48167, 1.17709, -8.055, 7.17e-014, -11.8029, -7.16049],  # ***
        [4.37422, 0.328787, 13.30, 2.62e-029, 3.72587, 5.02258],  # ***
        [-0.613997, 0.293619, -2.091, 0.0378, -1.19300, -0.0349939]])  # **

    result_gretl_g1 = dict(
        endog_mean=("Mean dependent var", 3.257395),
        endog_std=("S.D. dependent var", 18.73915),
        ssr=("Sum squared resid", 22799.68),
        mse_resid_sqrt=("S.E. of regression", 10.70380),
        rsquared=("R-squared", 0.676978),
        rsquared_adj=("Adjusted R-squared", 0.673731),
        fvalue=("F(2, 199)", 90.79971),
        f_pvalue=("P-value(F)", 9.53e-29),
        llf=("Log-likelihood", -763.9752),
        aic=("Akaike criterion", 1533.950),
        bic=("Schwarz criterion", 1543.875),
        hqic=("Hannan-Quinn", 1537.966),
        resid_acf1=("rho", -0.107341),
        dw=("Durbin-Watson", 2.213805))

    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <= 0).any(1).sum() = 69  ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"]  #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"]  # break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]

    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breush_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
    het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir,
                         'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                        converters={0: lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                            converters={0: lambda s: s})

    lev.dtype.names = names

    res = res_ols  #for easier copying

    cov_hac, bse_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)

    assert_almost_equal(res.params, partable[:, 0], 5)
    assert_almost_equal(bse_hac, partable[:, 1], 5)
    #TODO

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1],
                        decimal=6)  #FAIL
    assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1],
                        decimal=6)  #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    #f-value is based on cov_hac I guess
    #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6, 5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6, 5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2]]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def add_trend(X, trend="c", prepend=False):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    X : array-like
        Original array of data.
    trend : str {"c","t","ct","ctt"}
        "c" add constant only
        "t" add trend only
        "ct" add constant and linear trend
        "ctt" add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.

    Notes
    -----
    Returns columns as ["ctt","ct","c"] whenever applicable. There is
    currently no checking for an existing constant or trend.

    See also
    --------
    gwstatsmodels.add_constant
    """
    #TODO: could be generalized for trend of arbitrary order
    trend = trend.lower()
    if trend == "c":  # handles structured arrays
        return add_constant(X, prepend=prepend)
    elif trend == "ct" or trend == "t":
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2
    else:
        raise ValueError("trend %s not understood" % trend)
    X = np.asanyarray(X)
    nobs = len(X)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=float), trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]
    if not X.dtype.names:
        if not prepend:
            X = np.column_stack((X, trendarr))
        else:
            X = np.column_stack((trendarr, X))
    else:
        return_rec = X.__class__ is np.recarray
        if trendorder == 1:
            if trend == "ct":
                dt = [('const', float), ('trend', float)]
            else:
                dt = [('trend', float)]
        elif trendorder == 2:
            dt = [('const', float), ('trend', float),
                  ('trend_squared', float)]
        trendarr = trendarr.view(dt)
        if prepend:
            X = nprf.append_fields(trendarr, X.dtype.names,
                                   [X[i] for i in X.dtype.names],
                                   usemask=False, asrecarray=return_rec)
        else:
            X = nprf.append_fields(X, trendarr.dtype.names,
                                   [trendarr[i] for i in trendarr.dtype.names],
                                   usemask=False, asrecarray=return_rec)
    return X
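#Added usage sketch: with trend="ct" and prepend=False the trend columns are
#appended after the data, in the order constant then linear trend.
def example_add_trend():
    x = np.arange(5.0)
    xt = add_trend(x, trend="ct")
    # xt columns: [x, const, trend] with trend = 1, 2, ..., 5
    return xt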