def breusch_pagan_test(resid, exog):
    '''Perform the Breusch-Pagan test and print the results.

    Parameters:
        resid - OLS residuals, Series or array
        exog  - DataFrame or matrix-like structure of regressors
    '''
    print('(Calculation: sigma_i = sigma * f(alpha_0 + alpha * z_i))')
    name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    test = sms.het_breuschpagan(resid, exog)
    table = [[n, v] for n, v in zip(name, test)]
    headers = ["Statistic", "Value"]
    print(tabulate(table, headers, tablefmt="simple"))
    return test
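# A minimal usage sketch for breusch_pagan_test on synthetic data. The
# imports, dataset, and variable names below are assumptions for
# illustration, not part of the original function.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
from tabulate import tabulate

rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 2 * x + rng.normal(scale=1 + np.abs(x))  # deliberately heteroskedastic errors

X = sm.add_constant(pd.DataFrame({'x': x}))
results = sm.OLS(y, X).fit()
lm_stat, lm_pval, f_stat, f_pval = breusch_pagan_test(results.resid, results.model.exog)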
print(plot_leverage_resid2(results))

# Other plotting options can be found on the [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
#
# Condition number:

np.linalg.cond(results.model.exog)

# ## Heteroskedasticity tests
#
# Breusch-Pagan test:

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)

# Goldfeld-Quandt test:

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)

# ## Linearity
#
# Harvey-Collier multiplier test for the null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
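# For context, a self-contained sketch of the same diagnostic sequence on
# synthetic data; the model below is a hypothetical stand-in for `results`.
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

rng = np.random.default_rng(1)
X = sm.add_constant(rng.normal(size=(200, 2)))
y = X @ np.array([1.0, 0.5, -0.3]) + rng.normal(size=200)
results = sm.OLS(y, X).fit()

print('Condition number:', np.linalg.cond(results.model.exog))
print(lzip(['LM stat', 'LM p-value', 'F stat', 'F p-value'],
           sms.het_breuschpagan(results.resid, results.model.exog)))
print(lzip(['F statistic', 'p-value'],
           sms.het_goldfeldquandt(results.resid, results.model.exog)[:2]))
print(lzip(['t value', 'p value'], sms.linear_harvey_collier(results)))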
# This block creates the objects for each statistic we'd like printed to Excel.
#
# It will report R^2, adjusted R^2, the residuals, the F-test p-value, the
# AIC, the fitted model parameters, normality of the residuals (Jarque-Bera),
# the Breusch-Pagan test for heteroskedasticity, and the Harvey-Collier test
# for linearity.

r_squared = model.rsquared
r_square_adj = model.rsquared_adj
residuals = model.resid
p = model.f_pvalue
aic = model.aic
pvalues = pd.DataFrame(model.pvalues)
params = pd.DataFrame(model.params)
normality = sms.jarque_bera(model.resid)
breusch_pagan_hska = sms.het_breuschpagan(model.resid, model.model.exog)
harvey_collier = sms.linear_harvey_collier(model)

# Print the regression results to Excel

Range("Results", "O6").value = "R^2"
Range("Results", "P6").value = r_squared
Range("Results", "O7").value = "R^2 Adjusted"
Range("Results", "P7").value = r_square_adj
Range("Results", "O8").value = "p-value"
def do_stats(df):
    # Only keep respondents who are employed and received paid vacation
    df.is_employed.replace(0.0, np.nan, inplace=True)
    df.paid_vacation.replace(0.0, np.nan, inplace=True)
    df.dropna(inplace=True)
    # No longer need this dummy
    df.drop('is_employed', axis=1, inplace=True)

    # Summary stats
    if not f_exists(SUMMARY_TXT):
        summary = df.describe().T
        summary = np.round(summary, decimals=3)
        with open(SUMMARY_TXT, 'w') as f:
            f.write(summary.to_string())

    # Check for collinearity: scatter matrix, correlation matrix, then run OLS
    if not f_exists(SCAT_MATRIX_PNG):
        scatter_matrix(df, alpha=0.2, figsize=(64, 64), diagonal='hist')
        pylab.savefig(SCAT_MATRIX_PNG, bbox_inches='tight')

    if not f_exists(CORR_TXT):
        corr = df.corr()
        order = sorted(corr.columns, key=COL_ORDER.index)
        corr = corr.reindex(index=order, columns=order)
        # Blank out the upper triangle, which mirrors the lower one
        for i, k in enumerate(corr):
            row = corr[k]
            for j in range(len(row)):
                if j > i:
                    row.iloc[j] = np.nan
        with open(CORR_TXT, 'w') as f:
            f.write(np.round(corr, decimals=3).to_string(na_rep=''))

    if not f_exists(OLS1_TXT):
        ols_results = smf.ols(
            formula='vacation ~ paid_vacation + np.square(paid_vacation) + '
                    'age + fam_size + is_female + income83 + salary + '
                    'np.square(salary)',
            data=df).fit()
        with open(OLS1_TXT, 'w') as f:
            f.write(str(ols_results.summary()))
            f.write('\n\nCondition Number: {}'.format(
                np.linalg.cond(ols_results.model.exog)))

    # Need to drop salary: too much collinearity with the other regressors
    df.drop('salary', axis=1, inplace=True)

    # Test for heteroskedasticity
    if not f_exists(HET_BP_TXT):
        ols_results = smf.ols(
            formula='vacation ~ paid_vacation + np.square(paid_vacation) + '
                    'age + fam_size + is_female + income83',
            data=df).fit()
        names = ['LM', 'LM P val.', 'F Stat.', 'F Stat. P val.']
        test = sms.het_breuschpagan(ols_results.resid, ols_results.model.exog)
        f_p = test[3]
        with open(HET_BP_TXT, 'w') as f:
            str_ = '\n'.join('{}: {}'.format(n, v) for n, v in zip(names, test))
            f.write(str_ + '\n\n')
            # A small p-value rejects the null of homoskedasticity
            if f_p < .01:
                f.write('Warning: Heteroskedasticity found!\n')
            else:
                f.write('No Heteroskedasticity found.\n')
    # no heteroskedasticity found

    # Final OLS results
    if not f_exists(OLS2_TXT):
        ols_results = smf.ols(
            formula='vacation ~ paid_vacation + np.square(paid_vacation) + '
                    'age + fam_size + is_female + income83',
            data=df).fit().get_robustcov_results(cov_type='HAC', maxlags=1)
        with open(OLS2_TXT, 'w') as f:
            f.write(str(ols_results.summary()))
            f.write('\n\nCondition Number: {}'.format(
                np.linalg.cond(ols_results.model.exog)))
    return df
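# A hedged sketch of the module-level setup do_stats assumes: the file-name
# constants, the f_exists helper, and the column order. All names below
# mirror those used in the function; the input file is hypothetical.
import os
import numpy as np
import pandas as pd
import pylab
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from pandas.plotting import scatter_matrix

SUMMARY_TXT = 'summary.txt'
SCAT_MATRIX_PNG = 'scatter_matrix.png'
CORR_TXT = 'corr.txt'
OLS1_TXT = 'ols1.txt'
HET_BP_TXT = 'het_bp.txt'
OLS2_TXT = 'ols2.txt'
COL_ORDER = ['vacation', 'paid_vacation', 'age', 'fam_size',
             'is_female', 'income83', 'salary', 'is_employed']

def f_exists(path):
    # Skip a step when its output file already exists
    return os.path.isfile(path)

df = pd.read_csv('consumer_expenditure.csv')  # hypothetical input file
df = do_stats(df)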
data_mod = data_mod.dropna(
    axis=0, subset=['knowmeth', 'electric', 'radio', 'tv', 'bicycle'])
answer4 = data_mod.shape[0] * data_mod.shape[1]

# answers 5, 6
m1 = smf.ols('ceb ~ heduc + urban + electric + radio + tv + bicycle + '
             'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans + '
             'age + educ + religion + idlnchld + knowmeth + usemeth + '
             'agefm', data=data_mod)
fitted = m1.fit()

# answer 7
bp = sms.het_breuschpagan(fitted.resid, fitted.model.exog)[1]
m2 = smf.ols('ceb ~ heduc + urban + electric + radio + tv + bicycle + '
             'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans + '
             'age + educ + religion + idlnchld + knowmeth + usemeth + '
             'agefm', data=data_mod)
fitted2 = m2.fit(cov_type='HC1')

# answer 8
m3 = smf.ols('ceb ~ heduc + urban + electric + bicycle + '
             'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans + '
             'age + educ + idlnchld + knowmeth + usemeth + '
             'agefm', data=data_mod)
fitted3 = m3.fit()
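# The Breusch-Pagan p-value `bp` computed above is what motivates refitting
# with heteroskedasticity-robust (HC1) standard errors. A minimal sketch of
# that decision rule; the 0.05 threshold is an assumed convention, not part
# of the original snippet.
if bp < 0.05:
    fitted2 = m2.fit(cov_type='HC1')  # robust (HC1) standard errors
else:
    fitted2 = m2.fit()                # classical OLS standard errors suffice
print(fitted2.summary())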
model_new = sm.OLS.from_formula(formula=model.model.formula, data=data)
data_new = pd.DataFrame(
    pp.scale(data.values[:, :-1]),
    columns=['Beds', 'Healing_days', 'Income', 'Salary', 'Costs'])
model_new1 = sm.OLS.from_formula(formula=model.model.formula, data=data_new).fit()
print(model_new1.summary())

import patsy
y, X = patsy.dmatrices(model.model.formula, data, return_type='dataframe')
model_n = lm.LinearRegression()

# Cross-validation
k_fold = KFold(n_splits=10)
scores = cross_val_score(model_n, X, y, cv=k_fold, scoring='r2')
predicted = cross_val_predict(model_n, X, y, cv=k_fold)
slope, intercept, r_value, p_value, std_err = st.linregress(y.values[:, 0], predicted[:, 0])
print(r_value * r_value)

# Homoskedasticity (Breusch-Pagan, Goldfeld-Quandt)
test = sms.het_breuschpagan(res11.resid, res11.model.exog)
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print(lzip(name, test))
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(res11.resid, res11.model.exog)
print(lzip(name, test))

# Q-Q plots
st.probplot(res4.resid, plot=plt)
sm.qqplot(res11.resid, line='s')
plt.show()

# Durbin-Watson
dw = sms.stattools.durbin_watson(res11.resid)
print(dw)
def regress_excel(filename, sheetname):
    """
    Run a regression on data read from an Excel sheet and write the plots and
    diagnostics back out to Excel.

    Inputs:
    * filename - name of the Excel file containing the dates, dependent
      variable, and independent variables as columns, in that order; used to
      name the resulting _plots.xls and _results.xlsx files saved in the
      current working directory
    * sheetname - name of the desired sheet in the Excel file; also used to
      name the resulting _plots.xls and _results.xlsx files

    Outputs: along with the [filename]_[sheetname]_plots.xls and
    _results.xlsx files in the working directory, the function returns
    * results - has the following important attributes and methods:
      * .summary() - shows a summary of the regression results
      * .params - a pandas Series of the parameters fitted by the model
    """
    dates, dependent, independent = extract_data(filename, sheetname)
    results = regress_data(dependent, independent)
    parameters = pd.Series(results.params)
    X = sm.add_constant(independent)
    fignum = 0

    # Generate test results for the Breusch-Pagan LM test
    bptest = sms.het_breuschpagan(results.resid, results.model.exog)

    plt.close()
    sm.qqplot(results.resid, fit=True, line='45')
    plt.title('Q-Q Plot')
    plt.savefig('qq.jpg')
    plt.close()

    predicted = results.predict(X)
    plt.plot(dates, predicted, 'r--', label='Fitted')
    plt.plot(dates, dependent, 'b--', label=dependent.name)
    plt.title('In Sample Backtesting')
    plt.xlabel('Date')
    plt.ylabel(dependent.name)
    plt.legend()
    plt.savefig('backtest.jpg')
    plt.close()

    w = Workbook()
    ws = w.add_sheet('Plots')
    plot_to_excel('qq.jpg', w, ws, fignum)
    fignum += 1
    plot_to_excel('backtest.jpg', w, ws, fignum)
    fignum += 1

    for i in range(len(results.model.exog_names)):
        sm.graphics.plot_fit(results, results.model.exog_names[i])
        fig_name = results.model.exog_names[i] + '_fitted.jpg'
        plt.savefig(fig_name)
        plt.close()
        plot_to_excel(fig_name, w, ws, fignum)
        fignum += 1

    for i in range(len(independent.columns)):
        plt.scatter(independent.iloc[:, i], results.resid)
        plt.xlabel(independent.columns[i])
        plt.ylabel('Residuals')
        title_string = independent.columns[i] + ' vs. Residuals'
        plt.title(title_string)
        fig_name = independent.columns[i] + '_residuals.jpg'
        plt.savefig(fig_name)
        plt.close()
        plot_to_excel(fig_name, w, ws, fignum)
        fignum += 1

    save_plots = filename[0:-5] + '_' + sheetname + '_plots.xls'
    save_results = filename[0:-5] + '_' + sheetname + '_results.xlsx'

    coef_plot(results, 'Coefficients.jpg')
    plt.close()
    plot_to_excel('Coefficients.jpg', w, ws, fignum)
    fignum += 1

    writer = pd.ExcelWriter(save_results, engine='xlsxwriter')

    bptest = pd.DataFrame([bptest[0], bptest[1]], ['LM', 'P-Value'])
    bptest.to_excel(writer, sheet_name="Breusch-Pagan")

    parameters = pd.concat(
        [results.params, results.bse, results.pvalues, results.conf_int()],
        axis=1)
    parameters.columns = [
        'Parameter', 'Std_Err', 'P-Value', 'Lower_Bound', 'Upper_Bound'
    ]
    parameters.to_excel(writer, sheet_name='Parameters')

    outliers = pd.concat([dates, results.outlier_test()], axis=1)
    outliers = outliers.loc[outliers['bonf(p)'] < 0.05]
    outliers.to_excel(writer, sheet_name='Outliers')

    rsquared_adj = pd.DataFrame([results.rsquared_adj])
    rsquared_adj.to_excel(writer, sheet_name='Adjusted R^2')
    aic = pd.DataFrame([results.aic])
    aic.to_excel(writer, sheet_name='AIC')
    bic = pd.DataFrame([results.bic])
    bic.to_excel(writer, sheet_name='BIC')

    dw = sm.stats.stattools.durbin_watson(results.resid)
    dw = pd.DataFrame([dw])
    dw.to_excel(writer, sheet_name='Durbin Watson')

    ftest = pd.DataFrame([results.fvalue, results.f_pvalue], ['F', 'P-Value'])
    ftest.to_excel(writer, sheet_name="F Test")

    mean_err = pd.DataFrame(
        [results.mse_model, results.mse_resid, results.mse_total],
        ['MSR', 'MSE', 'MSTO'])
    mean_err.to_excel(writer, sheet_name="Mean Squared Error")

    cov_mat = pd.DataFrame(np.cov(np.transpose(independent)))
    cov_mat.columns = independent.columns
    cov_mat.index = independent.columns
    cov_mat.to_excel(writer, sheet_name='Covariance Matrix')

    writer.save()
    w.save(save_plots)

    # Clean up the temporary image files
    for file in os.listdir(os.getcwd()):
        if file.endswith('.bmp') or file.endswith('.jpg'):
            os.remove(file)
    return results
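# Hypothetical usage sketch for regress_excel. It assumes the helper
# functions referenced above (extract_data, regress_data, plot_to_excel,
# coef_plot) are defined in the same module, plus these imports at the top
# of the file; the input file name is an illustration only.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.api as sms
from xlwt import Workbook

results = regress_excel('portfolio_data.xlsx', 'Sheet1')  # hypothetical file
print(results.summary())
print(results.params)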
import pandas as pd
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

df = pd.read_csv("trafficking_data.csv")
results = smf.ols('Q("Adult victims") ~ gdp + Q("policy index")', data=df).fit()
print(sms.het_breuschpagan(results.resid, results.model.exog))
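# The raw tuple returned by het_breuschpagan is hard to read on its own; a
# small sketch labeling its four elements (label strings are our own).
labels = ['LM statistic', 'LM p-value', 'F statistic', 'F p-value']
bp = sms.het_breuschpagan(results.resid, results.model.exog)
for label, value in zip(labels, bp):
    print('{}: {:.4f}'.format(label, value))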
def Fig_OLS_Checks():
    #fs = 10  # font size used across figures
    SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    Iterations = 100
    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS: lists to hold coefficients and their p-values
    Rare_MacIntercept_pVals, Rare_MacIntercept_Coeffs = [], []
    Rich_MacIntercept_pVals, Rich_MacIntercept_Coeffs = [], []
    Dom_MacIntercept_pVals, Dom_MacIntercept_Coeffs = [], []
    Even_MacIntercept_pVals, Even_MacIntercept_Coeffs = [], []
    Rare_MicIntercept_pVals, Rare_MicIntercept_Coeffs = [], []
    Rich_MicIntercept_pVals, Rich_MicIntercept_Coeffs = [], []
    Dom_MicIntercept_pVals, Dom_MicIntercept_Coeffs = [], []
    Even_MicIntercept_pVals, Even_MicIntercept_Coeffs = [], []
    Rare_MacSlope_pVals, Rare_MacSlope_Coeffs = [], []
    Rich_MacSlope_pVals, Rich_MacSlope_Coeffs = [], []
    Dom_MacSlope_pVals, Dom_MacSlope_Coeffs = [], []
    Even_MacSlope_pVals, Even_MacSlope_Coeffs = [], []
    Rare_MicSlope_pVals, Rare_MicSlope_Coeffs = [], []
    Rich_MicSlope_pVals, Rich_MicSlope_Coeffs = [], []
    Dom_MicSlope_pVals, Dom_MicSlope_Coeffs = [], []
    Even_MicSlope_pVals, Even_MicSlope_Coeffs = [], []

    # Lists to hold model R2 and the significance of model R2
    RareR2List, RarepFList = [], []
    RichR2List, RichpFList = [], []
    DomR2List, DompFList = [], []
    EvenR2List, EvenpFList = [], []

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes
    # 3. The relationship is linear (Rainbow and Lagrange multiplier tests)
    #RarepLinListHC = []
    RarepLinListRainB, RarepLinListLM = [], []
    #RichpLinListHC = []
    RichpLinListRainB, RichpLinListLM = [], []
    #DompLinListHC = []
    DompLinListRainB, DompLinListLM = [], []
    #EvenpLinListHC = []
    EvenpLinListRainB, EvenpLinListLM = [], []
    # 4. There are no significant outliers...need to find tests or measures
    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG, RarepCorrListF = [], []
    RichpCorrListBG, RichpCorrListF = [], []
    DompCorrListBG, DompCorrListF = [], []
    EvenpCorrListBG, EvenpCorrListF = [], []
    # 6. Homoscedasticity (White and Breusch-Pagan tests)
    RarepHomoHW, RarepHomoHB = [], []
    RichpHomoHW, RichpHomoHB = [], []
    DompHomoHW, DompHomoHB = [], []
    EvenpHomoHW, EvenpHomoHB = [], []
    # 7. Normally distributed residuals (errors): Omnibus, Jarque-Bera,
    #    Lilliefors (Kolmogorov-Smirnov with estimated mean and variance),
    #    and Anderson-Darling (unknown mean and variance) tests
    RarepNormListOmni, RarepNormListJB, RarepNormListKS, RarepNormListAD = [], [], [], []
    RichpNormListOmni, RichpNormListJB, RichpNormListKS, RichpNormListAD = [], [], [], []
    DompNormListOmni, DompNormListJB, DompNormListKS, DompNormListAD = [], [], [], []
    EvenpNormListOmni, EvenpNormListJB, EvenpNormListKS, EvenpNormListAD = [], [], [], []

    NLIST = []
    for SampSize in SampSizes:
        # Per-sample-size ('s'-prefixed) counterparts of the lists above
        sRare_MacIntercept_pVals, sRare_MacIntercept_Coeffs = [], []
        sRich_MacIntercept_pVals, sRich_MacIntercept_Coeffs = [], []
        sDom_MacIntercept_pVals, sDom_MacIntercept_Coeffs = [], []
        sEven_MacIntercept_pVals, sEven_MacIntercept_Coeffs = [], []
        sRare_MicIntercept_pVals, sRare_MicIntercept_Coeffs = [], []
        sRich_MicIntercept_pVals, sRich_MicIntercept_Coeffs = [], []
        sDom_MicIntercept_pVals, sDom_MicIntercept_Coeffs = [], []
        sEven_MicIntercept_pVals, sEven_MicIntercept_Coeffs = [], []
        sRare_MacSlope_pVals, sRare_MacSlope_Coeffs = [], []
        sRich_MacSlope_pVals, sRich_MacSlope_Coeffs = [], []
        sDom_MacSlope_pVals, sDom_MacSlope_Coeffs = [], []
        sEven_MacSlope_pVals, sEven_MacSlope_Coeffs = [], []
        sRare_MicSlope_pVals, sRare_MicSlope_Coeffs = [], []
        sRich_MicSlope_pVals, sRich_MicSlope_Coeffs = [], []
        sDom_MicSlope_pVals, sDom_MicSlope_Coeffs = [], []
        sEven_MicSlope_pVals, sEven_MicSlope_Coeffs = [], []
        sRareR2List, sRarepFList = [], []
        sRichR2List, sRichpFList = [], []
        sDomR2List, sDompFList = [], []
        sEvenR2List, sEvenpFList = [], []
        #sRarepLinListHC = []
        sRarepLinListRainB, sRarepLinListLM = [], []
        #sRichpLinListHC = []
        sRichpLinListRainB, sRichpLinListLM = [], []
        #sDompLinListHC = []
        sDompLinListRainB, sDompLinListLM = [], []
        #sEvenpLinListHC = []
        sEvenpLinListRainB, sEvenpLinListLM = [], []
        sRarepCorrListBG, sRarepCorrListF = [], []
        sRichpCorrListBG, sRichpCorrListF = [], []
        sDompCorrListBG, sDompCorrListF = [], []
        sEvenpCorrListBG, sEvenpCorrListF = [], []
        sRarepHomoHW, sRarepHomoHB = [], []
        sRichpHomoHW, sRichpHomoHB = [], []
        sDompHomoHW, sDompHomoHB = [], []
        sEvenpHomoHW, sEvenpHomoHB = [], []
        sRarepNormListOmni, sRarepNormListJB, sRarepNormListKS, sRarepNormListAD = [], [], [], []
        sRichpNormListOmni, sRichpNormListJB, sRichpNormListKS, sRichpNormListAD = [], [], [], []
        sDompNormListOmni, sDompNormListJB, sDompNormListKS, sDompNormListAD = [], [], [], []
        sEvenpNormListOmni, sEvenpNormListJB, sEvenpNormListKS, sEvenpNormListAD = [], [], [], []

        for iteration in range(Iterations):
            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [[], [], [], [], [], [], []]
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [[], [], [], [], [], [], []]
            NmaxList, rareSkews, KindList = [[], [], []]
            NSlist = []
            ct = 0
            radDATA = []
            datasets = []
            GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN',
                         'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI',
                         'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']
            # all microbe data is MGRAST
            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir + 'data/' + m):
                    if name not in GoodNames:
                        continue
                    path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0
            radDATA = []
            for d in datasets:
                name, kind, numlines = d
                lines = np.random.choice(range(1, numlines + 1), SampSize, replace=True)
                path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'
                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)
                #print(name, kind, numlines, len(radDATA))

            for data in radDATA:
                data = data.split()
                if len(data) == 0:
                    print('no data')
                    continue
                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a,
                 preston_S) = data
                N = float(N)
                S = float(S)
                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N / S)))
                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)
                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP) * float(N))))
                EHeiplist.append(float(EHeip))

                # log-modulo transformation of skewness
                skew = float(skew)
                sign = -1 if skew < 0 else 1
                lms = sign * np.log(np.abs(skew) + 1)
                #if lms > 3: print(name, N, S)
                rareSkews.append(float(lms))

                if kind == 'macro':
                    numMac += 1
                elif kind == 'micro':
                    numMic += 1
                ct += 1
            #print('Sample Size:', SampSize, ' Mic:', numMic, 'Mac:', numMac)

            # Multiple (dummy variable) regressions for Rarity, Richness,
            # Dominance, and Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)
            RarityResults = smf.ols('Rarity ~ N * Kind', d).fit()
            #print(RarityResults.summary())

            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)
            RichnessResults = smf.ols('Richness ~ N * Kind', d).fit()
            #print(RichnessResults.summary())

            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)
            DomResults = smf.ols('Dominance ~ N * Kind', d).fit()
            #print(DomResults.summary())

            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)
            EvenResults = smf.ols('Evenness ~ N * Kind', d).fit()
            #print(EvenResults.summary())

            # residuals of the models
            RareResids = RarityResults.resid
            RichResids = RichnessResults.resid
            DomResids = DomResults.resid
            EvenResids = EvenResults.resid

            # MODEL RESULTS/FIT: F-test p-value and coefficient of determination
            RareFpval, Rarer2 = RarityResults.f_pvalue, RarityResults.rsquared
            RichFpval, Richr2 = RichnessResults.f_pvalue, RichnessResults.rsquared
            DomFpval, Domr2 = DomResults.f_pvalue, DomResults.rsquared
            EvenFpval, Evenr2 = EvenResults.f_pvalue, EvenResults.rsquared

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params.tolist()
            Rarepvals = RarityResults.pvalues.tolist()
            Richparams = RichnessResults.params.tolist()
            Richpvals = RichnessResults.pvalues.tolist()
            Domparams = DomResults.params.tolist()
            Dompvals = DomResults.pvalues.tolist()
            Evenparams = EvenResults.params.tolist()
            Evenpvals = EvenResults.pvalues.tolist()

            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])
            sRich_MacIntercept_pVals.append(Richpvals[0])
            sRich_MacIntercept_Coeffs.append(Richparams[0])
            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])
            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            # Mic intercepts and slopes are recorded regardless of significance
            sRare_MicIntercept_pVals.append(Rarepvals[1])
            sRare_MicIntercept_Coeffs.append(Rareparams[1])
            sRich_MicIntercept_pVals.append(Richpvals[1])
            sRich_MicIntercept_Coeffs.append(Richparams[1])
            sDom_MicIntercept_pVals.append(Dompvals[1])
            sDom_MicIntercept_Coeffs.append(Domparams[1])
            sEven_MicIntercept_pVals.append(Evenpvals[1])
            sEven_MicIntercept_Coeffs.append(Evenparams[1])

            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])
            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])
            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])
            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])

            sRare_MicSlope_pVals.append(Rarepvals[3])
            sRare_MicSlope_Coeffs.append(Rareparams[3])
            sRich_MicSlope_pVals.append(Richpvals[3])
            sRich_MicSlope_Coeffs.append(Richparams[3])
            sDom_MicSlope_pVals.append(Dompvals[3])
            sDom_MicSlope_Coeffs.append(Domparams[3])
            sEven_MicSlope_pVals.append(Evenpvals[3])
            sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...presumably yes
            # Variables are measured at the continuous level...definitely yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY
            # MODELED AS LINEAR. Rainbow test: the null hypothesis is that
            # the regression is correctly modeled as linear.
            #HC = smd.linear_harvey_collier(RarityResults)  # Harvey-Collier test for linearity
            #sRarepLinListHC.append(HC)
            RB = smd.linear_rainbow(RarityResults)
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(RichnessResults)
            sRichpLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(DomResults)
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(EvenResults)
            sEvenpLinListRainB.append(RB[1])

            # Lagrange multiplier test for linearity
            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog)
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog)
            sRichpLinListLM.append(LM[1])
            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog)
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog)
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in
            # residuals). Breusch-Godfrey Lagrange multiplier test for
            # residual autocorrelation; returns the LM statistic, its
            # p-value, the F statistic, and the F-test p-value.
            BGtest = smd.acorr_breusch_godfrey(RarityResults, nlags=None, store=False)
            #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breusch_godfrey(RichnessResults, nlags=None, store=False)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breusch_godfrey(DomResults, nlags=None, store=False)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breusch_godfrey(EvenResults, nlags=None, store=False)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...need tests or measures/metrics

            # HOMOSCEDASTICITY. These tests return:
            # 1. Lagrange multiplier statistic,
            # 2. p-value of the Lagrange multiplier test,
            # 3. F-statistic of the hypothesis that the error variance does
            #    not depend on x,
            # 4. p-value for the F-statistic
            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])
            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breuschpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])

            # NORMALITY OF ERROR TERMS
            # Omnibus test for normality
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            # Residual skewness, kurtosis, and the Jarque-Bera test for normality
            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(JB[1])
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(JB[1])
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(JB[1])
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(JB[1])

            # Lilliefors test for normality: Kolmogorov-Smirnov test with
            # estimated mean and variance
            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(KS[1])
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(KS[1])
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(KS[1])
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(KS[1])

            # Anderson-Darling test for a normal distribution with unknown
            # mean and variance
            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(AD[1])
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(AD[1])
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(AD[1])
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(AD[1])

            print('Sample size:', SampSize, 'iteration:', iteration)

        NLIST.append(SampSize)

        # Average each per-sample-size list into the across-sizes lists
        Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals))
        Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs))
        Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals))
        Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs))
        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))
        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))
        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))
        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))
        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))
        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))
        Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals))
        Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs))
        Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals))
        Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs))
        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))
        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))
        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))
        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))
        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))
        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))

        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # 3. The relationship is linear
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedasticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))

        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))
        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))
        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))
        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))

    # Rarity R2 vs. sample size
    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0, 1)
    plt.xscale('log')
    plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Rarity coefficients vs. sample size
    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Rarity assumption-test p-values vs. sample size
    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # 3. The relationship is linear
    #plt.plot(NLIST, RarepLinListRainB, c='m')
    plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, RarepCorrListBG, c='c')
    plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, RarepHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, RarepNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, RarepNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, RarepNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # Dominance R2 vs. sample size
    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Dominance coefficients vs. sample size
    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Dominance assumption-test p-values vs. sample size
    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    #plt.yscale('log')
    plt.ylim(0, 0.6)
    #plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')
    #plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, DompHomoHB, c='r', ls='-')
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # Evenness R2 vs. sample size
    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Evenness coefficients vs. sample size
    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Evenness assumption-test p-values vs. sample size
    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    #plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')
    #plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # Richness R2 vs. sample size
    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Richness coefficients vs. sample size
    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Richness assumption-test p-values vs. sample size
    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    #plt.plot(NLIST, RichpLinListRainB, c='m')
    plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity')
    #plt.plot(NLIST, RichpCorrListBG, c='c')
    plt.plot(NLIST, RichpCorrListF, c='c', ls='-', label='autocorrelation')
    plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, RichpHomoHB, c='r', ls='-')
    plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, RichpNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, RichpNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, RichpNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    #plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png',
                dpi=600, bbox_inches="tight")
    #plt.close()
    #plt.show()
    return
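# A hedged sketch of the module-level setup Fig_OLS_Checks assumes: the
# imports it references and a `mydir` root containing the data/ and figs/
# directories it reads from and writes to. The path below is a hypothetical
# placeholder, not the author's actual project layout.
import os
import linecache
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.stats.diagnostic as smd

mydir = os.path.expanduser('~/project/')  # assumed project root
Fig_OLS_Checks()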
def Breusch_Pagan(X, y):
    ols_results = ols(X, y)
    name = ['LM statistic', 'p-value of LM test',
            'f-statistic of the hypothesis', 'f p-value']
    test = sms.het_breuschpagan(ols_results.resid, ols_results.model.exog)
    return lzip(name, test)
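# Usage sketch for Breusch_Pagan. The `ols(X, y)` helper it calls is not
# shown in the original snippet, so the version below is an assumed stand-in
# that fits an OLS model with an added intercept; the data are hypothetical.
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip

def ols(X, y):
    # assumed helper: fit OLS with an intercept column added
    return sm.OLS(y, sm.add_constant(X)).fit()

rng = np.random.default_rng(2)
X = rng.normal(size=(150, 2))
y = X @ np.array([1.0, -0.5]) + rng.normal(scale=1 + np.abs(X[:, 0]))
print(Breusch_Pagan(X, y))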
def Fig_OLS_Checks(): #fs = 10 # font size used across figures #color = str() #OrC = 'open' SampSizes = [ 5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100 ] Iterations = 100 fig = plt.figure(figsize=(12, 8)) # MODEL PARAMETERS Rare_MacIntercept_pVals = [] # List to hold coefficient p-values Rare_MacIntercept_Coeffs = [] # List to hold coefficients Rich_MacIntercept_pVals = [] Rich_MacIntercept_Coeffs = [] Dom_MacIntercept_pVals = [] Dom_MacIntercept_Coeffs = [] Even_MacIntercept_pVals = [] Even_MacIntercept_Coeffs = [] Rare_MicIntercept_pVals = [] Rare_MicIntercept_Coeffs = [] Rich_MicIntercept_pVals = [] Rich_MicIntercept_Coeffs = [] Dom_MicIntercept_pVals = [] Dom_MicIntercept_Coeffs = [] Even_MicIntercept_pVals = [] Even_MicIntercept_Coeffs = [] Rare_MacSlope_pVals = [] Rare_MacSlope_Coeffs = [] Rich_MacSlope_pVals = [] Rich_MacSlope_Coeffs = [] Dom_MacSlope_pVals = [] Dom_MacSlope_Coeffs = [] Even_MacSlope_pVals = [] Even_MacSlope_Coeffs = [] Rare_MicSlope_pVals = [] Rare_MicSlope_Coeffs = [] Rich_MicSlope_pVals = [] Rich_MicSlope_Coeffs = [] Dom_MicSlope_pVals = [] Dom_MicSlope_Coeffs = [] Even_MicSlope_pVals = [] Even_MicSlope_Coeffs = [] RareR2List = [] # List to hold model R2 RarepFList = [] # List to hold significance of model R2 RichR2List = [] # List to hold model R2 RichpFList = [] # List to hold significance of model R2 DomR2List = [] # List to hold model R2 DompFList = [] # List to hold significance of model R2 EvenR2List = [] # List to hold model R2 EvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC = [] RarepLinListRainB = [] RarepLinListLM = [] #RichpLinListHC = [] RichpLinListRainB = [] RichpLinListLM = [] #DompLinListHC = [] DompLinListRainB = [] DompLinListLM = [] #EvenpLinListHC = [] EvenpLinListRainB = [] EvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG = [] RarepCorrListF = [] RichpCorrListBG = [] RichpCorrListF = [] DompCorrListBG = [] DompCorrListF = [] EvenpCorrListBG = [] EvenpCorrListF = [] # 6. Homoscedacticity RarepHomoHW = [] RarepHomoHB = [] RichpHomoHW = [] RichpHomoHB = [] DompHomoHW = [] DompHomoHB = [] EvenpHomoHW = [] EvenpHomoHB = [] # 7. 
Normally distributed residuals (errors) RarepNormListOmni = [] # Omnibus test for normality RarepNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality RarepNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RarepNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance RichpNormListOmni = [] # Omnibus test for normality RichpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality RichpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RichpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance DompNormListOmni = [] # Omnibus test for normality DompNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality DompNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance DompNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance EvenpNormListOmni = [] # Omnibus test for normality EvenpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality EvenpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance EvenpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance NLIST = [] for SampSize in SampSizes: sRare_MacIntercept_pVals = [] # List to hold coefficient p-values sRare_MacIntercept_Coeffs = [] # List to hold coefficients sRich_MacIntercept_pVals = [] # List to hold coefficient p-values sRich_MacIntercept_Coeffs = [] # List to hold coefficients sDom_MacIntercept_pVals = [] sDom_MacIntercept_Coeffs = [] sEven_MacIntercept_pVals = [] sEven_MacIntercept_Coeffs = [] sRare_MicIntercept_pVals = [] sRare_MicIntercept_Coeffs = [] sRich_MicIntercept_pVals = [] sRich_MicIntercept_Coeffs = [] sDom_MicIntercept_pVals = [] sDom_MicIntercept_Coeffs = [] sEven_MicIntercept_pVals = [] sEven_MicIntercept_Coeffs = [] sRare_MacSlope_pVals = [] sRare_MacSlope_Coeffs = [] sRich_MacSlope_pVals = [] sRich_MacSlope_Coeffs = [] sDom_MacSlope_pVals = [] sDom_MacSlope_Coeffs = [] sEven_MacSlope_pVals = [] sEven_MacSlope_Coeffs = [] sRare_MicSlope_pVals = [] sRare_MicSlope_Coeffs = [] sRich_MicSlope_pVals = [] sRich_MicSlope_Coeffs = [] sDom_MicSlope_pVals = [] sDom_MicSlope_Coeffs = [] sEven_MicSlope_pVals = [] sEven_MicSlope_Coeffs = [] sRareR2List = [] # List to hold model R2 sRarepFList = [] # List to hold significance of model R2 sRichR2List = [] # List to hold model R2 sRichpFList = [] # List to hold significance of model R2 sDomR2List = [] # List to hold model R2 sDompFList = [] # List to hold significance of model R2 sEvenR2List = [] # List to hold model R2 sEvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #sRarepLinListHC = [] sRarepLinListRainB = [] sRarepLinListLM = [] #sRichpLinListHC = [] sRichpLinListRainB = [] sRichpLinListLM = [] #sDompLinListHC = [] sDompLinListRainB = [] sDompLinListLM = [] #sEvenpLinListHC = [] sEvenpLinListRainB = [] sEvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. 
Independence of observations (no serial correlation in residuals) sRarepCorrListBG = [] sRarepCorrListF = [] sRichpCorrListBG = [] sRichpCorrListF = [] sDompCorrListBG = [] sDompCorrListF = [] sEvenpCorrListBG = [] sEvenpCorrListF = [] # 6. Homoscedacticity sRarepHomoHW = [] sRarepHomoHB = [] sRichpHomoHW = [] sRichpHomoHB = [] sDompHomoHW = [] sDompHomoHB = [] sEvenpHomoHW = [] sEvenpHomoHB = [] # 7. Normally distributed residuals (errors) sRarepNormListOmni = [] # Omnibus test for normality sRarepNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sRarepNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRarepNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sRichpNormListOmni = [] # Omnibus test for normality sRichpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sRichpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRichpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sDompNormListOmni = [] # Omnibus test for normality sDompNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sDompNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sDompNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sEvenpNormListOmni = [] # Omnibus test for normality sEvenpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sEvenpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sEvenpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance for iteration in range(Iterations): Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [ [], [], [], [], [], [], [] ] klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [ [], [], [], [], [], [], [] ] NmaxList, rareSkews, KindList = [[], [], []] NSlist = [] ct = 0 radDATA = [] datasets = [] GoodNames = [ 'EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA' ] # all microbe data is MGRAST mlist = ['micro', 'macro'] for m in mlist: for name in os.listdir(mydir + 'data/' + m): if name in GoodNames: pass else: continue path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, m, num_lines]) numMac = 0 numMic = 0 radDATA = [] for d in datasets: name, kind, numlines = d lines = [] lines = np.random.choice(range(1, numlines + 1), SampSize, replace=True) path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) #print name, kind, numlines, len(radDATA) for data in radDATA: data = data.split() if len(data) == 0: print 'no data' continue name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) S = float(S) Nlist.append(float(np.log(N))) Slist.append(float(np.log(S))) NSlist.append(float(np.log(N / S))) Evarlist.append(float(np.log(float(Evar)))) ESimplist.append(float(np.log(float(ESimp)))) KindList.append(kind) 
BPlist.append(float(BP)) NmaxList.append(float(np.log(float(BP) * float(N)))) EHeiplist.append(float(EHeip)) # lines for the log-modulo transformation of skewnness skew = float(skew) sign = 1 if skew < 0: sign = -1 lms = np.log(np.abs(skew) + 1) lms = lms * sign #if lms > 3: print name, N, S rareSkews.append(float(lms)) if kind == 'macro': numMac += 1 elif kind == 'micro': numMic += 1 ct += 1 #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Rarity'] = list(rareSkews) d['Kind'] = list(KindList) RarityResults = smf.ols( 'Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Richness'] = list(Slist) d['Kind'] = list(KindList) RichnessResults = smf.ols( 'Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RichnessResults.summary(), '\n' # Multiple regression for Dominance d = pd.DataFrame({'N': list(Nlist)}) d['Dominance'] = list(NmaxList) d['Kind'] = list(KindList) DomResults = smf.ols( 'Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model #print DomResults.summary(), '\n' # Multiple regression for Evenness d = pd.DataFrame({'N': list(Nlist)}) d['Evenness'] = list(ESimplist) d['Kind'] = list(KindList) EvenResults = smf.ols( 'Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' RareResids = RarityResults.resid # residuals of the model RichResids = RichnessResults.resid # residuals of the model DomResids = DomResults.resid # residuals of the model EvenResids = EvenResults.resid # residuals of the model # MODEL RESULTS/FIT RareFpval = RarityResults.f_pvalue Rarer2 = RarityResults.rsquared # coefficient of determination #Adj_r2 = RareResults.rsquared_adj # adjusted RichFpval = RichnessResults.f_pvalue Richr2 = RichnessResults.rsquared # coefficient of determination #Adj_r2 = RichnessResults.rsquared_adj # adjusted DomFpval = DomResults.f_pvalue Domr2 = DomResults.rsquared # coefficient of determination #Adj_r2 = DomResults.rsquared_adj # adjusted EvenFpval = EvenResults.f_pvalue Evenr2 = EvenResults.rsquared # coefficient of determination #Adj_r2 = EvenResuls.rsquared_adj # adjusted # MODEL PARAMETERS and p-values Rareparams = RarityResults.params Rareparams = Rareparams.tolist() Rarepvals = RarityResults.pvalues Rarepvals = Rarepvals.tolist() Richparams = RichnessResults.params Richparams = Richparams.tolist() Richpvals = RichnessResults.pvalues Richpvals = Richpvals.tolist() Domparams = DomResults.params Domparams = Domparams.tolist() Dompvals = DomResults.pvalues Dompvals = Dompvals.tolist() Evenparams = EvenResults.params Evenparams = Evenparams.tolist() Evenpvals = EvenResults.pvalues Evenpvals = Evenpvals.tolist() sRare_MacIntercept_pVals.append(Rarepvals[0]) sRare_MacIntercept_Coeffs.append(Rareparams[0]) sRich_MacIntercept_pVals.append(Rarepvals[0]) sRich_MacIntercept_Coeffs.append(Rareparams[0]) sDom_MacIntercept_pVals.append(Dompvals[0]) sDom_MacIntercept_Coeffs.append(Domparams[0]) sEven_MacIntercept_pVals.append(Evenpvals[0]) sEven_MacIntercept_Coeffs.append(Evenparams[0]) sRare_MicIntercept_pVals.append(Rarepvals[1]) if Rarepvals[1] > 0.05: sRare_MicIntercept_Coeffs.append(Rareparams[1]) else: sRare_MicIntercept_Coeffs.append(Rareparams[1]) sRich_MicIntercept_pVals.append(Richpvals[1]) if Richpvals[1] > 0.05: sRich_MicIntercept_Coeffs.append(Richparams[1]) else: 
                skew = float(skew)
                sign = 1
                if skew < 0:
                    sign = -1
                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                # if lms > 3: print(name, N, S)
                rareSkews.append(float(lms))

                if kind == 'macro':
                    numMac += 1
                elif kind == 'micro':
                    numMic += 1
                ct += 1

            # print('Sample Size:', SampSize, ' Mic:', numMic, 'Mac:', numMac)
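            # Each 'Y ~ N * Kind' formula expands (via patsy) to an intercept,
            # a Kind[T.micro] dummy, N, and the N:Kind[T.micro] interaction,
            # so params/pvalues are ordered: [0] macrobe intercept,
            # [1] microbe intercept shift, [2] macrobe slope, [3] microbe
            # slope shift.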
            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)
            RarityResults = smf.ols('Rarity ~ N * Kind', d).fit()  # fit the dummy variable regression model
            # print(RarityResults.summary(), '\n')

            # Multiple regression for Richness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)
            RichnessResults = smf.ols('Richness ~ N * Kind', d).fit()
            # print(RichnessResults.summary(), '\n')

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)
            DomResults = smf.ols('Dominance ~ N * Kind', d).fit()
            # print(DomResults.summary(), '\n')

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)
            EvenResults = smf.ols('Evenness ~ N * Kind', d).fit()
            # print(EvenResults.summary(), '\n')

            RareResids = RarityResults.resid  # residuals of the model
            RichResids = RichnessResults.resid
            DomResids = DomResults.resid
            EvenResids = EvenResults.resid

            # MODEL RESULTS/FIT
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared  # coefficient of determination
            # Adj_r2 = RarityResults.rsquared_adj  # adjusted
            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared
            # Adj_r2 = RichnessResults.rsquared_adj  # adjusted
            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared
            # Adj_r2 = DomResults.rsquared_adj  # adjusted
            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared
            # Adj_r2 = EvenResults.rsquared_adj  # adjusted

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params.tolist()
            Rarepvals = RarityResults.pvalues.tolist()
            Richparams = RichnessResults.params.tolist()
            Richpvals = RichnessResults.pvalues.tolist()
            Domparams = DomResults.params.tolist()
            Dompvals = DomResults.pvalues.tolist()
            Evenparams = EvenResults.params.tolist()
            Evenpvals = EvenResults.pvalues.tolist()

            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])
            sRich_MacIntercept_pVals.append(Richpvals[0])
            sRich_MacIntercept_Coeffs.append(Richparams[0])
            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])
            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            sRare_MicIntercept_pVals.append(Rarepvals[1])
            sRare_MicIntercept_Coeffs.append(Rareparams[1])
            sRich_MicIntercept_pVals.append(Richpvals[1])
            sRich_MicIntercept_Coeffs.append(Richparams[1])
            sDom_MicIntercept_pVals.append(Dompvals[1])
            sDom_MicIntercept_Coeffs.append(Domparams[1])
            sEven_MicIntercept_pVals.append(Evenpvals[1])
            sEven_MicIntercept_Coeffs.append(Evenparams[1])

            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])
            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])
            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])
            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])

            sRare_MicSlope_pVals.append(Rarepvals[3])
            sRare_MicSlope_Coeffs.append(Rareparams[3])
            sRich_MicSlope_pVals.append(Richpvals[3])
            sRich_MicSlope_Coeffs.append(Richparams[3])
            sDom_MicSlope_pVals.append(Dompvals[3])
            sDom_MicSlope_Coeffs.append(Domparams[3])
            sEven_MicSlope_pVals.append(Evenpvals[3])
            sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # 1. Error in predictor variables is negligible...presumably yes
            # 2. Variables are measured at the continuous level...definitely yes

            # 3. TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY
            # MODELED AS LINEAR.
            # Harvey-Collier test for linearity; the null hypothesis is that
            # the regression is correctly modeled as linear.
            # HC = smd.linear_harvey_collier(RarityResults)
            # sRarepLinListHC.append(HC)
            # HC = smd.linear_harvey_collier(DomResults)
            # sDompLinListHC.append(HC)
            # HC = smd.linear_harvey_collier(EvenResults)
            # sEvenpLinListHC.append(HC)

            # Rainbow test for linearity; the null hypothesis is that the
            # regression is correctly modeled as linear.
            RB = smd.linear_rainbow(RarityResults)
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(RichnessResults)
            sRichpLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(DomResults)
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(EvenResults)
            sEvenpLinListRainB.append(RB[1])

            # Lagrange multiplier test for linearity
            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog)
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog)
            sRichpLinListLM.append(LM[1])
            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog)
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog)
            sEvenpLinListLM.append(LM[1])

            # 5. INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            # Breusch-Godfrey Lagrange multiplier test for residual
            # autocorrelation. Returns the Lagrange multiplier statistic, its
            # p-value, the F statistic, and the F-test p-value.
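            # Note: newer statsmodels releases correct the spelling of several
            # diagnostics; there, use smd.acorr_breusch_godfrey and
            # sms.het_breuschpagan in place of the older names called below.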
            BGtest = smd.acorr_breush_godfrey(RarityResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breush_godfrey(RichnessResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breush_godfrey(DomResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breush_godfrey(EvenResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # 4. There are no significant outliers...need tests or measures/metrics

            # 6. HOMOSCEDASTICITY
            # These tests return:
            #   1. the Lagrange multiplier statistic,
            #   2. the p-value of the Lagrange multiplier test,
            #   3. the F statistic for the hypothesis that the error variance
            #      does not depend on x, and
            #   4. the p-value for that F statistic.
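            # White's test augments the regressors with their squares and
            # cross-products, so it also picks up nonlinear forms of
            # heteroskedasticity; the Breusch-Pagan test checks only a linear
            # dependence of the error variance on the regressors.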
            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])
            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breushpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])
            HB = sms.het_breushpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])

            # 7. NORMALITY OF ERROR TERMS
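            # Four complementary normality checks on the residuals; only the
            # p-value (index 1) of each test is retained. The omnibus test is
            # the D'Agostino-Pearson K^2 test reported in OLS summaries.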
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            # Jarque-Bera test (residual skewness and kurtosis)
            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(JB[1])
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(JB[1])
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(JB[1])
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(JB[1])

            # Lilliefors test (Kolmogorov-Smirnov with estimated mean and variance)
            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(KS[1])
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(KS[1])
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(KS[1])
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(KS[1])

            # Anderson-Darling test for normality, unknown mean and variance
            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(AD[1])
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(AD[1])
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(AD[1])
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(AD[1])

            print('Sample size:', SampSize, 'iteration:', iteration)

        NLIST.append(SampSize)

        # Mean coefficient p-values and coefficients across iterations
        Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals))
        Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs))
        Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals))
        Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs))
        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))
        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))

        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))
        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))
        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))
        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))

        Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals))
        Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs))
        Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals))
        Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs))
        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))
        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))

        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))
        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))
        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))
        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))

        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        # RarepLinListHC.append(np.mean(sRarepLinListHC))
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        # RichpLinListHC.append(np.mean(sRichpLinListHC))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        # DompLinListHC.append(np.mean(sDompLinListHC))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        # EvenpLinListHC.append(np.mean(sEvenpLinListHC))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedasticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))

        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))
        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))
        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))
        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))
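    # Summary figure: a 4 x 3 grid with one row per response (Rarity,
    # Dominance, Evenness, Richness) and one column each for R^2, the fitted
    # slope coefficients (microbes vs. macrobes), and the assumption-test
    # p-values, all plotted against sample size; the dashed line marks
    # p = 0.05.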
    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0, 1)
    plt.xscale('log')
    # Rarity R2 vs. Sample Size
    plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    # Rarity Coeffs vs. Sample Size
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # Rarity p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, RarepLinListRainB, c='m')
    plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, RarepCorrListBG, c='c')
    plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, RarepHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, RarepNormListJB, c='Lime', ls='-')
    # plt.plot(NLIST, RarepNormListKS, c='Lime', ls='--', lw=3)
    # plt.plot(NLIST, RarepNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance R2 vs. Sample Size
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance Coeffs vs. Sample Size
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    # plt.yscale('log')
    plt.ylim(0, 0.6)
    # Dominance p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, DompHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    # plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    # plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Evenness R2 vs. Sample Size
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Evenness Coeffs vs. Sample Size
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    # Evenness p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    # plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    # plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)
    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Richness R2 vs. Sample Size
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Richness Coeffs vs. Sample Size
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    # Richness p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, RichpLinListRainB, c='m')
    plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, RichpCorrListBG, c='c')
    plt.plot(NLIST, RichpCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, RichpHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, RichpNormListJB, c='Lime', ls='-')
    # plt.plot(NLIST, RichpNormListKS, c='Lime', ls='--', lw=3)
    # plt.plot(NLIST, RichpNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png',
                dpi=600, bbox_inches="tight")
    # plt.close()
    # plt.show()
    return
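# The assumption battery above can be collected into one reusable helper. The
# sketch below is illustrative rather than part of the original analysis: it
# assumes a recent statsmodels release, where the misspelled diagnostics used
# above carry their corrected names (het_breuschpagan, acorr_breusch_godfrey),
# and it demonstrates the helper on synthetic data shaped like the
# regressions above.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.stats.diagnostic as smd


def assumption_pvals(results):
    """Run the linearity, autocorrelation, heteroskedasticity, and
    normality checks used above on a fitted OLS results object and
    return their p-values."""
    resid = results.resid
    exog = results.model.exog
    return {
        'rainbow (linearity)': smd.linear_rainbow(results)[1],
        'lm (linearity)': smd.linear_lm(resid, exog)[1],
        'breusch-godfrey F (autocorrelation)': smd.acorr_breusch_godfrey(results)[3],
        'white (homoscedasticity)': sms.het_white(resid, exog)[3],
        'breusch-pagan (homoscedasticity)': sms.het_breuschpagan(resid, exog)[3],
        'omnibus (normality)': sms.omni_normtest(resid)[1],
        'jarque-bera (normality)': sms.jarque_bera(resid)[1],
        'lilliefors (normality)': smd.kstest_normal(resid)[1],
        'anderson-darling (normality)': smd.normal_ad(resid)[1],
    }


# Demonstration on synthetic data
rng = np.random.RandomState(0)
demo = pd.DataFrame({'N': rng.normal(size=200),
                     'Kind': rng.choice(['macro', 'micro'], 200)})
demo['Rarity'] = 0.1 * demo['N'] + rng.normal(size=200)
fit = smf.ols('Rarity ~ N * Kind', demo).fit()
for test, p in assumption_pvals(fit).items():
    print(test, ':', round(p, 3))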
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy import stats
from scipy.stats import norm, skew, kurtosis


def do_regression(data_frame, response_var, predictors):
    Y = np.array(data_frame[response_var])
    X = np.array(data_frame[predictors])
    X = sm.add_constant(X)
    linear_model = sm.OLS(Y, X)
    lr = linear_model.fit()

    # make a PDF of diagnostic plots
    fnou = response_var + "_vs_" + "(" + ("+".join(predictors)) + ")" + ".pdf"
    pp = PdfPages(fnou)
    if len(predictors) == 1:
        # plot the data points
        plt.clf()
        plt.scatter(X[:, 1], Y, s=0.5, c='b')
        # plot the fitted line
        x1 = np.arange(min(X[:, 1]), max(X[:, 1]),
                       (max(X[:, 1]) - min(X[:, 1])) * 0.01)
        y1 = x1 * lr.params[1] + lr.params[0]
        plt.plot(x1, y1, 'b--')
        plt.xlabel(predictors[0])
        plt.ylabel(response_var)
        plt.title(response_var + "_vs_" + predictors[0])
        pp.savefig(bbox_inches='tight', papertype='a4')

        # plot residual vs. x
        plt.clf()
        plt.scatter(X[:, 1], lr.resid, s=5.0, c='b', alpha=0.4, linewidth=0.0)
        plt.xlabel(predictors[0])
        plt.ylabel("Residual")
        pp.savefig(bbox_inches='tight', papertype='a4')

        # plot squared residual vs. x
        plt.clf()
        plt.scatter(X[:, 1], lr.resid**2, s=5.0, c='b', alpha=0.4, linewidth=0.0)
        plt.xlabel(predictors[0])
        plt.ylabel("Residual Square")
        pp.savefig(bbox_inches='tight', papertype='a4')

    y_pred = lr.predict(X)

    # plot residual vs. predicted y
    plt.clf()
    plt.scatter(y_pred, lr.resid, s=5.0, c='r', alpha=0.4, linewidth=0.0)
    plt.xlabel(response_var)
    plt.ylabel("Residual")
    pp.savefig(bbox_inches='tight', papertype='a4')

    # plot squared residual vs. predicted y
    plt.clf()
    plt.scatter(y_pred, lr.resid**2, s=5.0, c='r', alpha=0.4, linewidth=0.0)
    plt.xlabel(response_var)
    plt.ylabel("Residual Square")
    pp.savefig(bbox_inches='tight', papertype='a4')

    # plot observed vs. predicted
    plt.clf()
    plt.scatter(y_pred, Y, s=5.0, c='g', alpha=0.4, linewidth=0.0)
    y1 = np.arange(min(Y), max(Y), (max(Y) - min(Y)) * 0.01)
    plt.plot(y1, y1, 'b--')
    plt.xlabel("Predicted")
    plt.ylabel("Observed")
    # plt.title("Aggregated Regression")
    plt.axis('equal')
    pp.savefig(bbox_inches='tight', papertype='a4')

    # histogram of the residuals with a fitted normal curve
    plt.clf()
    plt.hist(lr.resid, bins=40)
    plt.title("Regression Residual Distribution")
    res_hist = np.histogram(lr.resid, bins=40)
    bin_w = res_hist[1][1] - res_hist[1][0]
    res_mean = np.mean(lr.resid)
    res_std = np.std(lr.resid)
    x1 = np.arange(min(lr.resid), max(lr.resid),
                   (max(lr.resid) - min(lr.resid)) * 0.01)
    y1 = norm.pdf((x1 - res_mean) / res_std) * len(lr.resid) * bin_w / res_std
    plt.plot(x1, y1, 'b--')
    pp.savefig(bbox_inches='tight', papertype='a4')
    pp.close()

    print("fitting parameters : ")
    for i in range(len(lr.params)):
        print(" %4d %6.4g" % (i, lr.params[i]))
    print("standard error : ")
    for i in range(len(lr.bse)):
        print(" %4d %6.4g" % (i, lr.bse[i]))
    print("p-value : ")
    for i in range(len(lr.pvalues)):
        print(" %4d %6.4g" % (i, lr.pvalues[i]))
    # print(lr.ssr)
    print("r-square : ", lr.rsquared)

    print("** residual analysis **")
    print("skewness of residual : %6.4g" % skew(lr.resid))
    print("kurtosis of residual : %6.4g" % kurtosis(lr.resid))

    print("** Breusch-Pagan test **")
    test = sms.het_breushpagan(lr.resid, lr.model.exog)  # het_breuschpagan in newer statsmodels
    name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    for i in range(len(test)):
        print("%s : %5.3g" % (name[i], test[i]))

    print("** Normality of residual **")
    k2, p = stats.normaltest(lr.resid)
    print("p-value = ", p)

    # append a line of results to the output data file
    buf = "%d %s %s" % (len(predictors), response_var, " ".join(predictors))
    buf = buf + " %6.4g" % lr.rsquared
    for i in range(len(lr.params)):
        buf = buf + " %6.4g" % lr.params[i]
    for i in range(len(lr.pvalues)):
        buf = buf + " %6.4g" % lr.pvalues[i]
    with open("data_output.txt", "a") as fpou:
        fpou.write(buf + "\n")
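# A hypothetical usage sketch for do_regression: the DataFrame, column names,
# and noise model below are illustrative only, and it assumes the statsmodels
# release that provides the het_breushpagan spelling called above. The call
# fits y on x1, writes the diagnostic plots to y_vs_(x1).pdf, and appends a
# summary line to data_output.txt.

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
demo = pd.DataFrame({'x1': rng.uniform(0, 10, 300)})
demo['y'] = 2.0 + 0.5 * demo['x1'] + rng.normal(scale=1.0, size=300)
do_regression(demo, 'y', ['x1'])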