def test_missing(self):
    # Build an OLS model with missing='drop' after blanking three responses;
    # the assertions below expect 13 rows to remain in both endog and exog.
    longley_data = longley.load()
    longley_data.exog = add_constant(longley_data.exog, prepend=False)
    nan_rows = [3, 7, 14]
    longley_data.endog[nan_rows] = np.nan
    model = OLS(longley_data.endog, longley_data.exog, missing='drop')
    for design_part in (model.endog, model.exog):
        assert_equal(design_part.shape[0], 13)
def setupClass(cls):
    """Fit OLS and default GLS on the Longley data and store both results."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    ols_fit = OLS(endog, exog).fit()
    # GLS without an explicit sigma is kept as res1, the OLS fit as res2.
    cls.res1 = GLS(endog, exog).fit()
    cls.res2 = ols_fit
def setupClass(cls):
    """Run a single-contrast t-test on the Longley OLS fit."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    ols_fit = OLS(dataset.endog, dataset.exog).fit()
    # Contrast: coefficient 4 minus coefficient 5 (zero-based positions).
    contrast = np.zeros(7)
    contrast[4:6] = [1, -1]
    cls.Ttest1 = ols_fit.t_test(contrast)
def setupClass(cls):
    """Compute the same joint F-test from a matrix and from a string."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    ols_fit = OLS(dataset.endog, dataset.exog).fit()
    # Two linear restrictions expressed as rows of a contrast matrix ...
    restrictions = [
        [0, 1, -1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, -1, 0],
    ]
    cls.Ftest1 = ols_fit.f_test(restrictions)
    # ... and the string form of the hypothesis.
    cls.NewFtest1 = ols_fit.f_test('x2 = x3, x5 = x6')
def setupClass(cls):
    """Run t-tests from an identity matrix and from a hypothesis string."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    cls.res1 = OLS(dataset.endog, dataset.exog).fit()
    # One restriction per row of the 7x7 identity matrix.
    cls.Ttest = cls.res1.t_test(np.identity(7))
    hypotheses = 'x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0'
    cls.NewTTest = cls.res1.t_test(hypotheses)
def __init__(self):
    """Load cpunish and build row-repeated copies using random weights."""
    from statsmodels.datasets.cpunish import load
    cpunish = load()
    self.data = cpunish
    self.endog, self.exog = cpunish.endog, cpunish.exog
    # Fixed seed so the integer weights are reproducible.
    np.random.seed(1234)
    n_obs = len(self.endog)
    self.weight = np.random.randint(5, 100, n_obs)
    # Repeat every observation as many times as its weight.
    self.endog_big = np.repeat(self.endog, self.weight)
    self.exog_big = np.repeat(self.exog, self.weight, axis=0)
def setupClass(cls):
    """Fit OLS on Longley without adding a constant; keep pinv and QR fits."""
    from .results.results_regression import LongleyRTO
    dataset = longley.load()
    endog, exog = dataset.endog, dataset.exog
    cls.res1 = OLS(endog, exog).fit()
    reference = LongleyRTO()
    reference.wresid = cls.res1.wresid  # workaround hack
    cls.res2 = reference
    cls.res_qr = OLS(endog, exog).fit(method="qr")
def setupClass(cls):
    """Run an F-test given as a ``(R, q)`` tuple on the Longley OLS fit."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    ols_fit = OLS(dataset.endog, dataset.exog).fit()
    restriction_matrix = np.array([
        [0, 1, 1, 0, 0, 0, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0],
    ])
    restriction_values = np.array([0, 0, 0, 1, 0])
    cls.Ftest1 = ols_fit.f_test((restriction_matrix, restriction_values))
def setup_class(cls):
    """Fit OLS and GLS on Longley and keep an unfitted scalar-sigma GLS."""
    dataset = longley.load(as_pandas=False)
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    ols_fit = OLS(endog, exog).fit()
    gls_fit = GLS(endog, exog).fit()
    # Stored as a model, not a results instance: .fit() is not called here.
    scalar_sigma_model = GLS(endog, exog, sigma=1)
    cls.endog = endog
    cls.exog = exog
    cls.res1 = gls_fit
    cls.res2 = ols_fit
    cls.res3 = scalar_sigma_model
def setupClass(cls):
    """Fit a Poisson GLM on cpunish with a constant exposure of 100.

    The fitted intercept is shifted by ``log(100)`` afterwards so the
    parameters can be compared with the ``Cpunish`` reference results.
    """
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_glm import Cpunish
    from statsmodels.datasets.cpunish import load
    data = load()
    data.exog[:, 3] = np.log(data.exog[:, 3])
    # The constant must be the LAST column because params[-1] is adjusted
    # below; state that explicitly rather than rely on the default.
    data.exog = add_constant(data.exog, prepend=False)
    exposure = [100] * len(data.endog)
    cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(),
                   exposure=exposure).fit()
    # add exposure back in to param to make the results the same
    cls.res1.params[-1] += np.log(100)
    cls.res2 = Cpunish()
def setupClass(cls):
    """Collect OLS, WLS and GLS fits on Longley using constant weights."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    y, X = dataset.endog, dataset.exog
    unit_weights = np.ones(y.shape[0])
    cls.results = [
        OLS(y, X).fit(),
        WLS(y, X, unit_weights).fit(),
        GLS(y, X, 100*unit_weights).fit(),
        GLS(y, X, np.diag(0.1*unit_weights)).fit(),
    ]
def setupClass(cls):
    """Fit OLS on Longley (default and QR) next to the reference results."""
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_regression import Longley
    data = longley.load()
    data.exog = add_constant(data.exog, prepend=False)
    res1 = OLS(data.endog, data.exog).fit()
    res2 = Longley()
    res2.wresid = res1.wresid  # workaround hack
    cls.res1 = res1
    cls.res2 = res2

    res_qr = OLS(data.endog, data.exog).fit(method="qr")
    cls.res_qr = res_qr
def setupClass(cls):
    """Fit GLS on two Longley regressors with a ``rho**|i-j|`` sigma.

    ``rho`` is the lag-1 correlation of the residuals from a preliminary
    OLS fit on the same design.
    """
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_regression import LongleyGls

    data = longley.load()
    exog = add_constant(np.column_stack(
        (data.exog[:, 1], data.exog[:, 4])))
    tmp_results = OLS(data.endog, exog).fit()
    rho = np.corrcoef(tmp_results.resid[1:],
                      tmp_results.resid[:-1])[0][1]  # by assumption
    # Toeplitz matrix of |i - j| for 16 observations, used as the exponent.
    order = toeplitz(np.arange(16))
    sigma = rho**order
    GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
    cls.res1 = GLS_results
    cls.res2 = LongleyGls()
def setupClass(cls):
    """Collect WLS and GLS fits on Longley using seeded random weights."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    y, X = dataset.endog, dataset.exog
    # Fixed seed keeps the random weights reproducible between runs.
    np.random.seed(5)
    w = np.random.uniform(0.5, 1, y.shape[0])
    w_inv = 1. / w
    cls.results = [
        WLS(y, X, w).fit(),
        WLS(y, X, 0.01 * w).fit(),
        GLS(y, X, 100 * w_inv).fit(),
        GLS(y, X, np.diag(0.1 * w_inv)).fit(),
    ]
def setupClass(cls):
    """Run a single-contrast t-test on the Longley OLS fit."""
    # NOTE: commented-out rpy / R 'car' skip checks used to sit here; they
    # were dead code and are not executed.
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog)
    ols_fit = OLS(dataset.endog, dataset.exog).fit()
    # Contrast: coefficient 4 minus coefficient 5 (zero-based positions).
    contrast = np.zeros(7)
    contrast[4:6] = [1, -1]
    cls.Ttest1 = ols_fit.t_test(contrast)
def __init__(self):
    '''
    Tests Poisson family with canonical log link.

    Test results were obtained by R.
    '''
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_glm import Cpunish
    from statsmodels.datasets.cpunish import load
    self.data = load()
    # Column 3 enters the model on the log scale.
    self.data.exog[:, 3] = np.log(self.data.exog[:, 3])
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Poisson()).fit()
    self.res2 = Cpunish()
def __init__(self):
    '''
    Tests Poisson family with canonical log link.

    Test results were obtained by R.
    '''
    from .results.results_glm import Cpunish
    from statsmodels.datasets.cpunish import load
    cpunish = load()
    self.data = cpunish
    # Column 3 enters the model on the log scale.
    cpunish.exog[:, 3] = np.log(cpunish.exog[:, 3])
    cpunish.exog = add_constant(cpunish.exog, prepend=False)
    poisson_family = sm.families.Poisson()
    self.res1 = GLM(cpunish.endog, cpunish.exog,
                    family=poisson_family).fit()
    self.res2 = Cpunish()
    # compare with discrete, start close to save time
    warm_start = self.res1.params * 0.9
    discrete_model = discrete.Poisson(cpunish.endog, cpunish.exog)
    self.resd = discrete_model.fit(start_params=warm_start, disp=False)
def __init__(self):
    '''
    Test Gaussian family with canonical identity link
    '''
    # Test Precisions
    self.decimal_resids = DECIMAL_3
    self.decimal_params = DECIMAL_2
    self.decimal_bic = DECIMAL_0
    self.decimal_bse = DECIMAL_3

    from statsmodels.datasets.longley import load
    self.data = load()
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Gaussian()).fit()
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_glm import Longley
    self.res2 = Longley()
def setup_class(cls):
    """Fit GLS on two Longley regressors with a ``rho**|i-j|`` sigma.

    ``rho`` is the lag-1 correlation of the residuals from a preliminary
    OLS fit; sigma, exog and endog are also stored on the class.
    """
    from .results.results_regression import LongleyGls
    dataset = longley.load(as_pandas=False)
    regressors = np.column_stack((dataset.exog[:, 1], dataset.exog[:, 4]))
    exog = add_constant(regressors, prepend=False)
    pilot_fit = OLS(dataset.endog, exog).fit()
    lagged_corr = np.corrcoef(pilot_fit.resid[1:], pilot_fit.resid[:-1])
    rho = lagged_corr[0][1]  # by assumption
    # Toeplitz matrix of |i - j| for 16 observations, used as the exponent.
    lag_order = toeplitz(np.arange(16))
    sigma = rho**lag_order
    cls.res1 = GLS(dataset.endog, exog, sigma=sigma).fit()
    cls.res2 = LongleyGls()
    # attach for test_missing
    cls.sigma = sigma
    cls.exog = exog
    cls.endog = dataset.endog
def __init__(self):
    '''
    Test Binomial family with canonical logit link using star98 dataset.
    '''
    self.decimal_resids = DECIMAL_1
    self.decimal_bic = DECIMAL_2

    from statsmodels.datasets.star98 import load
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_glm import Star98
    data = load()
    data.exog = add_constant(data.exog)
    self.res1 = GLM(data.endog, data.exog,
                    family=sm.families.Binomial()).fit()
    #NOTE: if you want to replicate with RModel
    #res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
    #        family=r.binomial, weights=trials)
    self.res2 = Star98()
def test_wtd_patsy_missing():
    # Formula-built GLM with freq_weights on data containing NaNs: the
    # assertions expect weights, endog and exog to end up with 12 rows each.
    from statsmodels.datasets.cpunish import load
    import pandas as pd

    data = load()
    # Inject missing values: one in exog, four in endog.
    data.exog[0, 0] = np.nan
    data.endog[[2, 4, 6, 8]] = np.nan
    data.pandas = pd.DataFrame(data.exog, columns=data.exog_name)
    data.pandas['EXECUTIONS'] = data.endog
    n_total = len(data.endog)
    weights = np.arange(1, n_total+1)
    formula = """EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + VC100k96 + SOUTH + DEGREE"""
    mod_missing = GLM.from_formula(formula, data=data.pandas,
                                   freq_weights=weights)

    assert_equal(mod_missing.freq_weights.shape[0],
                 mod_missing.endog.shape[0])
    assert_equal(mod_missing.freq_weights.shape[0],
                 mod_missing.exog.shape[0])
    assert_equal(mod_missing.freq_weights.shape[0], 12)
    # Weights expected to survive the row dropping.
    expected_weights = np.array([2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17])
    assert_equal(mod_missing.freq_weights, expected_weights)
def __init__(self):
    '''
    Tests Gamma family with canonical inverse link (power -1)
    '''
    # Test Precisions
    self.decimal_aic_R = -1  # TODO: off by about 1, we are right with Stata
    self.decimal_resids = DECIMAL_2

    from statsmodels.datasets.scotland import load
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_glm import Scotvote
    data = load()
    data.exog = add_constant(data.exog)
    res1 = GLM(data.endog, data.exog,
               family=sm.families.Gamma()).fit()
    self.res1 = res1
    # res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
    res2 = Scotvote()
    # R doesn't count degree of freedom for scale with gamma
    res2.aic_R += 2
    self.res2 = res2
def setupClass(cls):
    """Fit OLS on Longley by default, by QR, and by QR with preset factors."""
    from .results.results_regression import Longley
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog

    cls.res1 = OLS(endog, exog).fit()
    reference = Longley()
    reference.wresid = cls.res1.wresid  # workaround hack
    cls.res2 = reference

    cls.res_qr = OLS(endog, exog).fit(method="qr")

    # Second QR fit on a model whose QR attributes are assigned by hand.
    manual_model = OLS(endog, exog)
    q_factor, r_factor = np.linalg.qr(exog)
    manual_model.exog_Q = q_factor
    manual_model.exog_R = r_factor
    manual_model.normalized_cov_params = np.linalg.inv(
        np.dot(r_factor.T, r_factor))
    manual_model.rank = np_matrix_rank(r_factor)
    cls.res_qr_manual = manual_model.fit(method="qr")
def __init__(self):
    '''
    Test Negative Binomial family with canonical log link
    '''
    # Test Precision
    self.decimal_resid = DECIMAL_1
    self.decimal_params = DECIMAL_3
    self.decimal_resids = -1  # 1 % mismatch at 0
    self.decimal_fittedvalues = DECIMAL_1

    from statsmodels.datasets.committee import load
    self.data = load()
    # Column 2 enters on the log scale; its product with column 1 is
    # appended as an extra regressor before the constant is added.
    self.data.exog[:, 2] = np.log(self.data.exog[:, 2])
    interaction = self.data.exog[:, 2]*self.data.exog[:, 1]
    self.data.exog = np.column_stack((self.data.exog, interaction))
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.NegativeBinomial()).fit()
    # Explicit relative import: the implicit form ``from results...`` is
    # not resolved relative to the package on Python 3.
    from .results.results_glm import Committee
    res2 = Committee()
    res2.aic_R += 2  # They don't count a degree of freedom for the scale
    self.res2 = res2
def setupClass(cls):
    """Build the formula model from ``load().data``, then run parent setup."""
    raw_data = load().data
    cls.model = ols(longley_formula, raw_data)
    super(TestFormulaRecArray, cls).setupClass()
def setupClass(cls):
    """Load the dataset once and share it through ``cls.data``."""
    dataset = load()
    cls.data = dataset
"""Example: statsmodels.OLS """ from statsmodels.datasets.longley import load import statsmodels.api as sm from statsmodels.iolib.table import SimpleTable, default_txt_fmt import numpy as np data = load() data_orig = (data.endog.copy(), data.exog.copy()) # Note: In this example using zscored/standardized variables has no effect on # regression estimates. Are there no numerical problems? rescale = 0 # 0: no rescaling, 1:demean, 2:standardize, 3:standardize and transform back rescale_ratio = data.endog.std() / data.exog.std(0) if rescale > 0: # rescaling data.endog -= data.endog.mean() data.exog -= data.exog.mean(0) if rescale > 1: data.endog /= data.endog.std() data.exog /= data.exog.std(0) # skip because mean has been removed, but dimension is hardcoded in table data.exog = sm.tools.add_constant(data.exog, prepend=False) ols_model = sm.OLS(data.endog, data.exog)
def setupClass(cls):
    """Store the Longley endog/exog and an unfitted OLS model on the class."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    cls.endog = endog
    cls.exog = exog
    cls.ols_model = OLS(endog, exog)
"""Ordinary Least Squares """ from statsmodels.datasets.longley import load import statsmodels.api as sm import numpy as np data = load() data.exog = sm.tools.add_constant(data.exog) ols_model = sm.OLS(data.endog, data.exog) ols_results = ols_model.fit() # the Longley dataset is well known to have high multicollinearity # one way to find the condition number is as follows # normalize the independent variables to have unit length, Greene 4.9 norm_x = np.ones_like(data.exog) for i in range(int(ols_model.df_model)): norm_x[:,i] = data.exog[:,i]/np.linalg.norm(data.exog[:,i]) norm_xtx = np.dot(norm_x.T,norm_x) eigs = np.linalg.eigvals(norm_xtx) collin = np.sqrt(eigs.max()/eigs.min()) print collin # clearly there is a big problem with multicollinearity # the rule of thumb is any number of 20 requires attention # for instance, consider the longley dataset with the last observation dropped ols_results2 = sm.OLS(data.endog[:-1], data.exog[:-1,:]).fit() # all of our coefficients change considerably in percentages
def setup_class(cls):
    """Load the dataset with ``as_pandas=False`` and share it via ``cls.data``."""
    dataset = load(as_pandas=False)
    cls.data = dataset
y_true = np.dot(X, beta)
y = y_true + np.random.normal(size=nsample)
res3 = sm.OLS(y, X).fit()
# Parenthesised so the line is valid on both Python 2 and Python 3.
print(res3.f_test(R))

#Multicollinearity
#-----------------

#Data
#^^^^

# The Longley dataset is well known to have high multicollinearity, that is,
# the exogenous predictors are highly correlated. This is problematic because
# it can affect the stability of our coefficient estimates as we make minor
# changes to model specification.
from statsmodels.datasets.longley import load
y = load().endog
X = load().exog
X = sm.tools.add_constant(X, prepend=False)

#Fit and summary
#^^^^^^^^^^^^^^^
ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()
print(ols_results.summary())

#Condition number
#^^^^^^^^^^^^^^^^

# One way to assess multicollinearity is to compute the condition number.
# Values over 20 are worrisome (see Greene 4.9). The first step is to normalize
# the independent variables to have unit length:
norm_x = np.ones_like(X)
def setup_class(cls):
    """Run a joint F-test on the Longley OLS fit, leaving out the last row."""
    dataset = longley.load(as_pandas=False)
    dataset.exog = add_constant(dataset.exog, prepend=False)
    cls.res1 = OLS(dataset.endog, dataset.exog).fit()
    # 7x7 identity with its last row removed: six restrictions.
    restrictions = np.identity(7)[:-1, :]
    cls.Ftest = cls.res1.f_test(restrictions)
def setup_class(cls):
    """Fit default GLS (res1) and OLS (res2) on the Longley data."""
    dataset = longley.load(as_pandas=False)
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    cls.res1 = GLS(endog, exog).fit()
    cls.res2 = OLS(endog, exog).fit()
def setup_class(cls):
    """Store the Longley endog/exog and an unfitted OLS model on the class."""
    dataset = longley.load(as_pandas=False)
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    cls.endog = endog
    cls.exog = exog
    cls.ols_model = OLS(endog, exog)
def setupClass(cls):
    """Fit OLS (res1) and WLS without explicit weights (res2) on Longley."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    cls.res1 = OLS(endog, exog).fit()
    cls.res2 = WLS(endog, exog).fit()
def setupClass(cls):
    """Run a joint F-test on the Longley OLS fit, leaving out the last row."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    cls.res1 = OLS(dataset.endog, dataset.exog).fit()
    # 7x7 identity with its last row removed: six restrictions.
    restrictions = np.identity(7)[:-1, :]
    cls.Ftest = cls.res1.f_test(restrictions)
beta = np.array((3.5, 5.7, 150)) Y = np.dot(X, beta) + np.random.standard_normal(40) mod2 = sm.OLS(Y, X) res2 = mod2.fit() f2 = lambda params: -1 * mod2.loglike(params) resfmin = optimize.fmin(f2, np.ones(3), ftol=1e-10) print('OLS') print(res2.params) print('MLE') print(resfmin) print('\nExample 2: Longley Data, high multicollinearity') print('-----------------------------------------------\n') from statsmodels.datasets.longley import load data = load(as_pandas=False) data.exog = sm.add_constant(data.exog, prepend=False) mod = sm.OLS(data.endog, data.exog) f = lambda params: -1 * mod.loglike(params) score = lambda params: -1 * mod.score(params) #now you're set up to try and minimize or root find, but I couldn't get this one to work #note that if you want to get the results, it's also a property of mod, so you can do res = mod.fit() #print mod.results.params print('OLS') print(res.params) print('MLE') #resfmin2 = optimize.fmin(f, mod.results.params*0.9, maxfun=5000, maxiter=5000, xtol=1e-10, ftol= 1e-10) resfmin2 = optimize.fmin(f,
print(results.qr.summary()) results_pivot = model.fit(method="qr-pivot") print(results_pivot.summary()) assert all( (results.params - results_pivot.params) < 1e-8), 'Some of the params are not identical' assert all( (results.pvalues - results_pivot.pvalues) < 1e-8), 'Some of the params are not identical' predict = results.predict(X) predict_pivot = results_pivot.predict(X) ################################### data = longley.load(as_pandas=False) data.exog = sm.add_constant(data.exog, prepend=False) ols_m = sm.OLS(data.endog, data.exog) res = ols_m.fit() res_qr = ols_m.fit(method='qr') res_pivot = ols_m.fit(method='qr-pivot') print(res.summary()) print(res_qr.summary()) print(res_pivot.summary()) print('end') # Test multiple regression with multicollinearity X3 is highly correlated with X3 = sm.add_constant(