def test_compare_OLS(self):
    """
    With a Gaussian family and an independence working correlation,
    the GEE parameter estimates, the naive standard errors and the
    naive t-values are expected to match an OLS fit of the same
    formula to 10 decimals.
    """

    cov_struct = Independence()
    gaussian = Gaussian()

    # The draw order (Y, X1, X2, X3) is kept so that a run with a
    # fixed random state produces the same data.
    nobs = 100
    columns = {}
    for name in ("Y", "X1", "X2", "X3"):
        columns[name] = np.random.normal(size=nobs)

    # 20 groups with 5 consecutive observations each
    group_labels = np.kron(lrange(20), np.ones(5))
    frame = pd.DataFrame(columns)

    formula = "Y ~ X1 + X2 + X3"
    gee_model = GEE.from_formula(formula, group_labels, frame,
                                 family=gaussian, cov_struct=cov_struct)
    gee_fit = gee_model.fit()
    ols_fit = sm.ols(formula, data=frame).fit()

    # Point estimates
    assert_almost_equal(ols_fit.params.values, gee_fit.params, decimal=10)

    # Naive standard errors
    naive_se = gee_fit.standard_errors(covariance_type="naive")
    assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

    # t-values built from the diagonal of the naive covariance
    naive_sd = np.sqrt(np.diag(gee_fit.naive_covariance))
    assert_almost_equal(gee_fit.params / naive_sd, ols_fit.tvalues,
                        decimal=10)
def test_default_time(self):
    """
    A model built without a `time` argument should give the same
    estimates and standard errors as one that is handed explicit
    within-group positions 0, 1, 2, ... as its time values.
    """

    endog, exog, group = load_data("gee_logistic_1.csv")

    # Explicit time values for the autoregressive model: the position
    # of each observation inside its own group.
    explicit_time = np.zeros(len(endog))
    for label in set(group):
        rows = np.flatnonzero(group == label)
        explicit_time[rows] = np.arange(rows.size)

    binomial = Binomial()
    # One covariance structure instance is shared by both models.
    autoreg = Autoregressive()

    fit_default = GEE(endog, exog, group, family=binomial,
                      cov_struct=autoreg).fit()
    fit_explicit = GEE(endog, exog, group, time=explicit_time,
                       family=binomial, cov_struct=autoreg).fit()

    assert_almost_equal(fit_default.params, fit_explicit.params,
                        decimal=6)
    assert_almost_equal(fit_default.standard_errors(),
                        fit_explicit.standard_errors(),
                        decimal=6)
def test_compare_OLS(self):
    # With a Gaussian family and an independence working correlation,
    # the GEE parameter estimates, the naive standard errors and the
    # naive t-values are expected to match an OLS fit of the same
    # formula to 10 decimals.

    cov_struct = Independence()
    gaussian = Gaussian()

    # The draw order (Y, X1, X2, X3) is kept so that a run with a
    # fixed random state produces the same data.
    nobs = 100
    columns = {}
    for name in ("Y", "X1", "X2", "X3"):
        columns[name] = np.random.normal(size=nobs)

    # 20 groups with 5 consecutive observations each
    group_labels = np.kron(lrange(20), np.ones(5))
    frame = pd.DataFrame(columns)

    formula = "Y ~ X1 + X2 + X3"
    gee_model = GEE.from_formula(formula, group_labels, frame,
                                 family=gaussian, cov_struct=cov_struct)
    gee_fit = gee_model.fit()

    # Compare against the unwrapped OLS results; the assert helpers
    # are applied to the raw results object rather than the wrapper.
    ols_raw = smf.ols(formula, data=frame).fit()._results

    # Point estimates
    assert_almost_equal(ols_raw.params, gee_fit.params, decimal=10)

    # Naive standard errors
    naive_se = gee_fit.standard_errors(cov_type="naive")
    assert_almost_equal(ols_raw.bse, naive_se, decimal=10)

    # t-values built from the diagonal of the naive covariance
    naive_sd = np.sqrt(np.diag(gee_fit.cov_naive))
    assert_almost_equal(gee_fit.params / naive_sd, ols_raw.tvalues,
                        decimal=10)
def test_missing_formula(self):
    # Formula interface: fitting with missing='drop' on data that
    # contains NaNs should match fitting on the same data with the
    # incomplete rows removed beforehand.

    nobs = 100
    y = np.random.normal(size=nobs)
    x1 = np.random.normal(size=nobs)
    x2 = np.random.normal(size=nobs)
    x3 = np.random.normal(size=nobs)
    group_labels = np.kron(lrange(20), np.ones(5))

    # Five incomplete rows in total: 0, 5, 6 (response) and 10, 11
    # (second covariate).
    y[0] = np.nan
    y[5:7] = np.nan
    x2[10:12] = np.nan

    frame = pd.DataFrame({"endog": y,
                          "exog1": x1,
                          "exog2": x2,
                          "exog3": x3,
                          "groups": group_labels})
    formula = "endog ~ exog1 + exog2 + exog3"

    dropping_model = GEE.from_formula(formula, group_labels, frame,
                                      missing='drop')
    dropping_fit = dropping_model.fit()

    # 100 - 5 rows remain; intercept plus three covariates.
    assert_almost_equal(len(dropping_model.endog), 95)
    assert_almost_equal(np.asarray(dropping_model.exog.shape),
                        np.r_[95, 4])

    # Reference fit: remove the incomplete rows by hand and keep the
    # group labels aligned with the surviving index values.
    complete = frame.dropna()
    complete_groups = group_labels[complete.index.values]
    reference_fit = GEE.from_formula(formula, complete_groups, complete,
                                     missing='none').fit()

    assert_almost_equal(dropping_fit.params.values,
                        reference_fit.params.values)
    assert_almost_equal(dropping_fit.bse.values,
                        reference_fit.bse.values)
def test_default_time(self):
    # A model built without a `time` argument should give the same
    # estimates and standard errors as one that is handed explicit
    # within-group positions 0, 1, 2, ... as its time values.

    endog, exog, group = load_data("gee_logistic_1.csv")

    # Explicit time values for the autoregressive model
    time_values = np.zeros(len(endog))
    for group_id in set(group):
        members = np.flatnonzero(group == group_id)
        time_values[members] = lrange(len(members))

    family = Binomial()
    # The same covariance structure instance is used by both models.
    ar_struct = Autoregressive()

    model_without_time = GEE(endog, exog, group, family=family,
                             cov_struct=ar_struct)
    result_without_time = model_without_time.fit()

    model_with_time = GEE(endog, exog, group, time=time_values,
                          family=family, cov_struct=ar_struct)
    result_with_time = model_with_time.fit()

    assert_almost_equal(result_without_time.params,
                        result_with_time.params,
                        decimal=6)
    assert_almost_equal(result_without_time.standard_errors(),
                        result_with_time.standard_errors(),
                        decimal=6)
def t_est_missing(self):
    # Formula interface with missing='drop': five rows contain a NaN
    # (0, 5, 6 in the response and 10, 11 in X2), so 95 rows and four
    # design columns should remain.
    # The name does not start with "test", so collectors that select
    # on that prefix will not pick this method up.

    nobs = 100
    Y = np.random.normal(size=nobs)
    X1 = np.random.normal(size=nobs)
    X2 = np.random.normal(size=nobs)
    X3 = np.random.normal(size=nobs)
    group_labels = np.kron(lrange(20), np.ones(5))

    Y[0] = np.nan
    Y[5:7] = np.nan
    X2[10:12] = np.nan

    frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                          "groups": group_labels})

    model = GEE.from_formula("Y ~ X1 + X2 + X3", frame, None,
                             groups=frame["groups"], missing='drop')
    # The fit is run but its result is not inspected.
    model.fit()

    assert len(model.endog) == 95
    assert model.exog.shape == (95, 4)
def test_missing(self):
    # Array interface: fitting with missing='drop' on data that
    # contains NaNs should match fitting on the complete rows only.
    # NOTE(review): an earlier comment here said missing data handling
    # did not work for formulas at the time -- confirm whether that
    # still applies.

    nobs = 100
    response = np.random.normal(size=nobs)
    design = np.random.normal(size=(nobs, 3))
    design[:, 0] = 1  # first column acts as the intercept
    group_labels = np.kron(lrange(20), np.ones(5))

    # Five incomplete rows in total: 0, 5, 6 (response) and 10, 11
    # (second design column).
    response[0] = np.nan
    response[5:7] = np.nan
    design[10:12, 1] = np.nan

    dropping_model = GEE(response, design, group_labels, missing='drop')
    dropping_fit = dropping_model.fit()

    assert_almost_equal(len(dropping_model.endog), 95)
    assert_almost_equal(np.asarray(dropping_model.exog.shape),
                        np.r_[95, 3])

    # Reference fit on the rows where every value is finite.
    complete = np.isfinite(response) & np.isfinite(design).all(1)
    reference_model = GEE(response[complete], design[complete, :],
                          group_labels[complete], missing='none')
    reference_fit = reference_model.fit()

    assert_almost_equal(dropping_fit.params, reference_fit.params)
    assert_almost_equal(dropping_fit.bse, reference_fit.bse)
def test_missing(self):
    """
    Array interface: a fit with missing='drop' on data containing
    NaNs should match a fit on the complete rows only.

    NOTE(review): an earlier docstring here said missing data handling
    did not work for formulas at the time -- confirm whether that
    still applies.
    """

    n_rows = 100
    y = np.random.normal(size=n_rows)
    x = np.random.normal(size=(n_rows, 3))
    x[:, 0] = 1  # first column acts as the intercept
    grp = np.kron(lrange(20), np.ones(5))

    # Rows 0, 5, 6 lose the response; rows 10, 11 lose the second
    # design column. That leaves 95 complete rows.
    y[0] = np.nan
    y[5:7] = np.nan
    x[10:12, 1] = np.nan

    model_drop = GEE(y, x, grp, missing='drop')
    result_drop = model_drop.fit()

    assert_almost_equal(len(model_drop.endog), 95)
    assert_almost_equal(np.asarray(model_drop.exog.shape), np.r_[95, 3])

    # Same data with the incomplete rows filtered out by hand.
    keep = np.isfinite(y) & np.isfinite(x).all(1)
    result_clean = GEE(y[keep], x[keep, :], grp[keep],
                       missing='none').fit()

    assert_almost_equal(result_drop.params, result_clean.params)
    assert_almost_equal(result_drop.bse, result_clean.bse)
def test_formulas(self):
    """
    The formula interface should reproduce the array interface
    whether groups and time are supplied as arrays or as the names
    of columns in the data frame.
    """

    n = 100
    Y = np.random.normal(size=n)
    X1 = np.random.normal(size=n)
    # Design matrix for the array interface: intercept column and X1
    mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
    Time = np.random.uniform(size=n)
    groups = np.kron(lrange(20), np.ones(5))

    data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time,
                         "groups": groups})

    # One covariance structure and one family instance are shared by
    # all five models.
    va = Autoregressive()
    family = Gaussian()

    array_fit = GEE(Y, mat, groups, time=Time, family=family,
                    cov_struct=va).fit()

    # Formula fits in the order: (array, array), (array, name),
    # (name, array), (name, name) for (groups, time).
    formula_fits = []
    for groups_arg in (groups, "groups"):
        for time_arg in (Time, "Time"):
            model = GEE.from_formula("Y ~ X1", groups_arg, data,
                                     time=time_arg, family=family,
                                     cov_struct=va)
            formula_fits.append(model.fit())

    for formula_fit in formula_fits:
        assert_almost_equal(array_fit.params, formula_fit.params,
                            decimal=8)

    # Wrapper check on the first formula-based result
    check_wrapper(formula_fits[0])
def test_combinations():
    # Each case is (iterable, r, expected list of r-length tuples).
    cases = (
        ('ABCD', 2,
         [('A', 'B'), ('A', 'C'), ('A', 'D'),
          ('B', 'C'), ('B', 'D'), ('C', 'D')]),
        (lrange(4), 3,
         [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)]),
    )
    for iterable, r, desired in cases:
        actual = list(combinations(iterable, r))
        assert_(actual == desired,
                '%r not equal %r' % (actual, desired))
def test_combinations():
    # Pairs drawn from a four-character string
    pairs = list(combinations("ABCD", 2))
    expected_pairs = [(first, second)
                      for first, second in ("AB", "AC", "AD",
                                            "BC", "BD", "CD")]
    assert_(pairs == expected_pairs,
            "%r not equal %r" % (pairs, expected_pairs))

    # Triples drawn from the integers 0..3
    triples = list(combinations(lrange(4), 3))
    expected_triples = [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)]
    assert_(triples == expected_triples,
            "%r not equal %r" % (triples, expected_triples))
def test_formulas(self):
    """
    Fit the same model through the array interface and through four
    formula calls that pass groups and time either as arrays or as
    data frame column names; all parameter estimates should agree.
    """

    nobs = 100
    response = np.random.normal(size=nobs)
    covariate = np.random.normal(size=nobs)
    # Intercept column followed by the covariate
    design = np.concatenate((np.ones((nobs, 1)), covariate[:, None]),
                            axis=1)
    time_values = np.random.uniform(size=nobs)
    group_labels = np.kron(lrange(20), np.ones(5))

    frame = pd.DataFrame({"Y": response,
                          "X1": covariate,
                          "Time": time_values,
                          "groups": group_labels})

    # These two instances are reused by every model below.
    ar_struct = Autoregressive()
    gaussian = Gaussian()
    shared = dict(family=gaussian, cov_struct=ar_struct)

    reference = GEE(response, design, group_labels, time=time_values,
                    **shared).fit()

    # (groups argument, time argument) in the order the fits are run
    variants = [
        (group_labels, time_values),
        (group_labels, "Time"),
        ("groups", time_values),
        ("groups", "Time"),
    ]
    fits = []
    for groups_arg, time_arg in variants:
        fits.append(GEE.from_formula("Y ~ X1", groups_arg, frame,
                                     time=time_arg, **shared).fit())

    for fit in fits:
        assert_almost_equal(reference.params, fit.params, decimal=8)

    # Wrapper check on the result of the first formula variant
    check_wrapper(fits[0])
def t_est_missing(self):
    # Formula interface with missing='drop'. NaNs are placed in rows
    # 0, 5, 6 of the response and rows 10, 11 of X2, so the model is
    # expected to keep 95 rows and a four-column design matrix.
    # The name does not begin with "test", so prefix-based test
    # collection will skip this method.

    size = 100
    draws = [np.random.normal(size=size) for _ in range(4)]
    Y, X1, X2, X3 = draws
    groups = np.kron(lrange(20), np.ones(5))

    Y[0] = np.nan
    Y[5:7] = np.nan
    X2[10:12] = np.nan

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                      "groups": groups})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                          groups=D["groups"], missing='drop')
    # Fitting is exercised; the result itself is not checked.
    md.fit()

    n_kept = len(md.endog)
    assert n_kept == 95
    assert md.exog.shape == (95, 4)
def test_logistic(self):
    # Binomial GEE fits are compared with reference values from the
    # R gee package.
    #
    # R code for comparing results:
    #
    # library(gee)
    # Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
    # Y = Z[,2]
    # Id = Z[,1]
    # X1 = Z[,3]
    # X2 = Z[,4]
    # X3 = Z[,5]
    #
    # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
    #          corstr="independence")
    # smi = summary(mi)
    # u = coefficients(smi)
    # cfi = paste(u[,1], collapse=",")
    # sei = paste(u[,4], collapse=",")
    #
    # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
    #          corstr="exchangeable")
    # sme = summary(me)
    # u = coefficients(sme)
    # cfe = paste(u[,1], collapse=",")
    # see = paste(u[,4], collapse=",")
    #
    # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
    #          corstr="AR-M")
    # sma = summary(ma)
    # u = coefficients(sma)
    # cfa = paste(u[,1], collapse=",")
    # sea = paste(u[,4], collapse=",")
    #
    # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
    # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

    endog, exog, group = load_data("gee_logistic_1.csv")

    # Time values for the autoregressive model: position of each
    # observation inside its group.
    T = np.zeros(len(endog))
    for label in set(group):
        rows = np.flatnonzero(group == label)
        T[rows] = lrange(len(rows))

    family = Binomial()
    ve = Exchangeable()
    vi = Independence()
    va = Autoregressive()

    # From R gee. Rows: independence, exchangeable, AR.
    cf = [
        [0.0167272965285882, 1.13038654425893,
         -1.86896345082962, 1.09397608331333],
        [0.0178982283915449, 1.13118798191788,
         -1.86133518416017, 1.08944256230299],
        [0.0109621937947958, 1.13226505028438,
         -1.88278757333046, 1.09954623769449],
    ]
    se = [
        [0.127291720283049, 0.166725808326067,
         0.192430061340865, 0.173141068839597],
        [0.127045031730155, 0.165470678232842,
         0.192052750030501, 0.173174779369249],
        [0.127240302296444, 0.170554083928117,
         0.191045527104503, 0.169776150974586],
    ]

    # Array interface
    for row, cov_struct in enumerate((vi, ve, va)):
        fit = GEE(endog, exog, group, T, family, cov_struct).fit()
        if cov_struct is va:
            # The autoregressive model is fitted but its estimates
            # are not compared with the R values.
            continue
        assert_almost_equal(fit.params, cf[row], decimal=6)
        assert_almost_equal(fit.standard_errors(), se[row],
                            decimal=6)

    # Test with formulas: response, group id and every exog column
    # except the first are placed in a data frame.
    stacked = np.concatenate((endog[:, None], group[:, None],
                              exog[:, 1:]), axis=1)
    covariate_names = ["X%d" % k for k in range(1, exog.shape[1])]
    D = pd.DataFrame(stacked)
    D.columns = ["Y", "Id"] + covariate_names

    for row, cov_struct in enumerate((vi, ve)):
        fit = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                               family=family,
                               cov_struct=cov_struct).fit()
        assert_almost_equal(fit.params, cf[row], decimal=6)
        assert_almost_equal(fit.standard_errors(), se[row],
                            decimal=6)
def test_logistic(self):
    """
    Compare binomial GEE fits with reference values from the R gee
    package.

    R code for comparing results:

    library(gee)
    Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]

    mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="independence")
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="exchangeable")
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
             corstr="AR-M")
    sma = summary(ma)
    u = coefficients(sma)
    cfa = paste(u[,1], collapse=",")
    sea = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
    sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
    """

    endog, exog, group = load_data("gee_logistic_1.csv")

    # Time values for the autoregressive model: 0, 1, 2, ... within
    # each group.
    T = np.zeros(len(endog))
    for group_id in set(group):
        members = np.flatnonzero(group == group_id)
        T[members] = lrange(len(members))

    family = Binomial()
    ve = Exchangeable()
    vi = Independence()
    va = Autoregressive()

    # From R gee. Row order: independence, exchangeable, AR.
    cf = [
        [0.0167272965285882, 1.13038654425893,
         -1.86896345082962, 1.09397608331333],
        [0.0178982283915449, 1.13118798191788,
         -1.86133518416017, 1.08944256230299],
        [0.0109621937947958, 1.13226505028438,
         -1.88278757333046, 1.09954623769449],
    ]
    se = [
        [0.127291720283049, 0.166725808326067,
         0.192430061340865, 0.173141068839597],
        [0.127045031730155, 0.165470678232842,
         0.192052750030501, 0.173174779369249],
        [0.127240302296444, 0.170554083928117,
         0.191045527104503, 0.169776150974586],
    ]

    # Array interface: all three structures are fitted, but only the
    # independence and exchangeable fits are compared with R.
    for j, v in enumerate((vi, ve, va)):
        result = GEE(endog, exog, group, T, family, v).fit()
        if v is not va:
            assert_almost_equal(result.params, cf[j], decimal=6)
            assert_almost_equal(result.standard_errors(), se[j],
                                decimal=6)

    # Test with formulas: build a frame holding the response, the
    # group id and every exog column except the first.
    columns = ["Y", "Id"]
    columns = columns + ["X%d" % k for k in range(1, exog.shape[1])]
    D = pd.DataFrame(np.concatenate((endog[:, None],
                                     group[:, None],
                                     exog[:, 1:]), axis=1))
    D.columns = columns

    for j, v in enumerate((vi, ve)):
        model = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                                 groups=D.loc[:, "Id"],
                                 family=family, cov_struct=v)
        result = model.fit()
        assert_almost_equal(result.params, cf[j], decimal=6)
        assert_almost_equal(result.standard_errors(), se[j],
                            decimal=6)
def ols_high_d_category_multi_results(data_df, models, table_header):
    """
    Fit several model specifications on one dataframe and tabulate
    their results side by side.

    When analysing large and complicated data there are usually several
    candidate model assumptions; this function makes it easy to compare
    them. Every entry of ``models`` is fitted with
    ``ols_high_d_category`` (``formula=None``, ``robust=False``,
    ``c_method='cgm'``, ``epsilon=1e-5``, ``max_iter=1e6``). The table
    has three rows per coefficient name (estimate, p-value, standard
    error) and one column per model. A model that does not contain a
    given coefficient shows ``'Nan'`` in its column. The table is
    printed and also returned.

    :param data_df: Dataframe with relevant data
    :param models: List of mappings, each providing the keys
        ``'consist_input'``, ``'out_input'``, ``'category_input'`` and
        ``'cluster_input'``
    :param table_header: Passed to ``SimpleTable`` as its second
        positional argument (presumably the column headers, one per
        model -- confirm against the ``SimpleTable`` signature)
    :return: summary table of results of the different models
    :raises IndexError: if ``models`` is empty
    """
    results = [
        ols_high_d_category(data_df,
                            model['consist_input'],
                            model['out_input'],
                            model['category_input'],
                            model['cluster_input'],
                            formula=None,
                            robust=False,
                            c_method='cgm',
                            epsilon=1e-5,
                            max_iter=1e6)
        for model in models
    ]

    # Coefficient names of each fitted model, in that model's own order
    consist_name_list = [result.params.index.to_list() for result in results]

    # Union of all coefficient names, ordered by first appearance
    consist_name_total = list(consist_name_list[0])
    for names in consist_name_list[1:]:
        for name in names:
            if name not in consist_name_total:
                consist_name_total.append(name)

    # Row labels: each coefficient is followed by its p-value and
    # standard-error rows.
    index_name = []
    for name in consist_name_total:
        index_name.extend([name, 'pvalue', 'std err'])

    table_content = []
    for name in consist_name_total:
        coeff_list = []
        pvalue_list = []
        std_list = []
        for result, names in zip(results, consist_name_list):
            if name in names:
                coeff_list.append("%#7.4g" % (result.params[name]))
                pvalue_list.append("%#8.2g" % (result.pvalues[name]))
                # bse is looked up by the coefficient's position in
                # this model's own parameter list.
                std_list.append("%#8.2f" % (result.bse[names.index(name)]))
            else:
                # Placeholder for a coefficient this model lacks
                coeff_list.append('Nan')
                pvalue_list.append('Nan')
                std_list.append('Nan')
        table_content.append(tuple(coeff_list))
        table_content.append(tuple(pvalue_list))
        table_content.append(tuple(std_list))

    text_format = dict(
        fmt='txt',
        # basic table formatting
        table_dec_above='=',
        table_dec_below='-',
        title_align='l',
        # basic row formatting
        row_pre='',
        row_post='',
        header_dec_below='-',
        row_dec_below=None,
        colwidths=None,
        colsep=' ',
        data_aligns="l",
        # data formats
        data_fmts=["%s"],
        # labeled alignments
        stub_align='l',
        header_align='r',
        # labeled formats
        header_fmt='%s',
        stub_fmt='%s',
        header='%s',
        stub='%s',
        empty_cell='',
        empty='',
        missing='--',
    )

    summary_table = SimpleTable(table_content,
                                table_header,
                                index_name,
                                title='multi',
                                txt_fmt=text_format)
    print(summary_table)
    # Returned as well as printed, so the documented return value holds.
    return summary_table