def plot_obs_pred(obs, pred, radius, loglog, ax = None, inset = False, sites = None):
    """Generic function to generate an observed vs predicted figure with 1:1 line.

    obs, pred -- arrays of observed and predicted values (numpy-indexable)
    radius    -- point-density radius passed to plot_color_by_pt_dens
    loglog    -- if truthy, axes/statistics are computed on log10 scale
    ax        -- existing axes to draw into; a new 3.5x3.5in figure is created if None
    inset     -- if True, adds an inset histogram of site-level r^2 values
    sites     -- site labels, required when inset is True (passed to hist_mete_r2)

    Returns the axes object drawn into.
    """
    if not ax:
        fig = plt.figure(figsize = (3.5, 3.5))
        ax = plt.subplot(111)
    # Lower bound from the smallest strictly-positive value (log-safe)
    axis_min = 0.9 * min(list(obs[obs > 0]) + list(pred[pred > 0]))
    if loglog:
        axis_max = 3 * max(list(obs)+list(pred))
    else:
        axis_max = 1.1 * max(list(obs)+list(pred))
    macroecotools.plot_color_by_pt_dens(np.array(pred), np.array(obs), radius, loglog=loglog, plot_obj = ax)
    # 1:1 reference line
    plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
    plt.xlim(axis_min, axis_max)
    plt.ylim(axis_min, axis_max)
    ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
    if loglog:
        # R^2 on log10 scale, restricted to pairs where both obs and pred are nonzero
        # (the boolean product acts as elementwise AND)
        plt.annotate(r'$R^2$ = %0.2f' %macroecotools.obs_pred_rsquare(np.log10(obs[(obs != 0) * (pred != 0)]),
                     np.log10(pred[(obs != 0) * (pred != 0)])),
                     xy = (0.05, 0.85), xycoords = 'axes fraction', fontsize = 7)
    else:
        plt.annotate(r'$R^2$ = %0.2f' %macroecotools.obs_pred_rsquare(obs, pred),
                     xy = (0.05, 0.85), xycoords = 'axes fraction', fontsize = 7)
    if inset:
        axins = inset_axes(ax, width="30%", height="30%", loc=4)
        if loglog:
            hist_mete_r2(sites[(obs != 0) * (pred != 0)],
                         np.log10(obs[(obs != 0) * (pred != 0)]),
                         np.log10(pred[(obs != 0) * (pred != 0)]))
        else:
            hist_mete_r2(sites, obs, pred)
        plt.setp(axins, xticks=[], yticks=[])
    return ax
def bootstrap_SDR(name_site_combo, model, in_dir = './data/', out_dir = './out_files/', Niter = 200):
    """A general function of bootstrapping for ISD applying to all four models.

    Inputs:
    name_site_combo: a list with dat_name and site
    model - takes one of four values 'ssnt_0', 'ssnt_1', 'asne', or 'agsne'
    in_dir - directory of raw data
    out_dir - directory used both in input (obs_pred.csv file) and output
    Niter - number of bootstrap samples

    Output:
    Writes to one file on disk for R^2.
    """
    dat_name, site = name_site_combo
    dat = wk.import_raw_data(in_dir + dat_name + '.csv')
    dat_site = dat[dat['site'] == site]
    dat_clean = clean_data_agsne(dat_site)
    G, S, N, E = get_GSNE(dat_clean)
    lambda1, beta, lambda3 = agsne.get_agsne_lambdas(G, S, N, E)
    # Per-species parameters: [m, n] where n = abundance of the species and
    # m = number of congeneric species (species richness of its genus)
    par_list = []
    for sp in np.unique(dat_clean['sp']):
        dat_sp = dat_clean[dat_clean['sp'] == sp]
        n = len(dat_sp)
        genus_sp = dat_sp['genus'][0]
        m = len(np.unique(dat_clean[dat_clean['genus'] == genus_sp]['sp']))
        par_list.append([m, n])
    pred_obs = wk.import_obs_pred_data(out_dir + dat_name + '_obs_pred_sdr_' + model + '.csv')
    pred = pred_obs[pred_obs['site'] == site]['pred']
    obs = pred_obs[pred_obs['site'] == site]['obs']
    out_list_rsquare = [dat_name, site, str(mtools.obs_pred_rsquare(np.log10(obs), np.log10(pred)))]
    # Intraspecific ISD for each of the four models
    iisd_agsne = mete_distributions.theta_agsne([G, S, N, E], [lambda1, beta, lambda3, agsne.agsne_lambda3_z(lambda1, beta, S) / lambda3])
    iisd_asne = mete_distributions.theta_epsilon(S, N, E)
    # Rescale dbh so the minimum is 1
    dbh_scaled = np.array(dat_clean['dbh'] / min(dat_clean['dbh']))
    iisd_ssnt_0 = ssnt_isd_bounded(1, N / (sum(dbh_scaled ** 1) - N))
    # NOTE(review): 2/3 assumes true division (e.g. `from __future__ import division`);
    # under plain Python 2 integer division it would evaluate to 0 — confirm.
    iisd_ssnt_1 = ssnt_isd_bounded(2/3, N / (sum(dbh_scaled ** (2/3)) - N))
    dist_for_model = {'ssnt_0': iisd_ssnt_0, 'ssnt_1': iisd_ssnt_1, 'asne': iisd_asne, 'agsne': iisd_agsne}
    dist = dist_for_model[model]
    for i in range(Niter):
        # rvs signatures differ by model: SSNT takes n only; ASNE and AGSNE
        # take additional arguments (presumably n and, for AGSNE, m) — verify
        # against the mete_distributions implementations.
        if model in ['ssnt_0', 'ssnt_1']:
            obs_boot = np.array([np.mean((dist.rvs(par[1])) ** 2) for par in par_list]) # Here par[1] is n for each species
        elif model == 'asne':
            obs_boot = np.array([np.mean(np.array(dist.rvs(par[1], par[1]))) for par in par_list])
        else:
            obs_boot = np.array([np.mean(np.array(dist.rvs(par[1], par[1], par[0]))) for par in par_list])
        out_list_rsquare.append(str(mtools.obs_pred_rsquare(np.log10(obs_boot), np.log10(pred))))
    wk.write_to_file(out_dir + 'SDR_bootstrap_' + model + '_rsquare.txt', ",".join(str(x) for x in out_list_rsquare))
def fig5(SADModels): """ This function generates a 2x2 figure, with these subplots: One subplot for each model: r-squared vs. N list of r-squared values and list of Ns plotted against each other """ fig = plt.figure() for i, model in enumerate(SADModels): fig.add_subplot(2, 2, i+1) obs_pred_data = import_obs_pred_data(mydir + '/Results/' + model + '.txt') obs = ((obs_pred_data["obs"])) pred = ((obs_pred_data["pred"])) site = ((obs_pred_data["site"])) obs_data = [] pred_data = [] for sites in np.unique(site): obs_data.append(obs[sites==site]) pred_data.append(pred[sites==site]) Ns = [] r2s = [] for j, sad in enumerate(obs_data): r2 = macroecotools.obs_pred_rsquare(np.array(sad), np.array(pred_data[j])) r2s.append(r2) N = sum(sad) # Find Total Abundance Ns.append(N) plt.scatter(np.log(Ns).tolist(), r2s, color='Maroon', label=model, alpha = 0.5) # label is for the legend plt.xlabel('Log Abundance', fontsize=8) plt.ylabel('Rsquared Value', fontsize=8) plt.subplots_adjust(wspace = .35, hspace = .35) plt.axhline(y = 0) if model == 'SimBrokenStick': plt.title("Broken Stick R^2 v N", fontsize = 10) elif model == 'SimLogNormInt': plt.title("Log Norm R^2 v N", fontsize = 10) elif model == 'SimpleRandomFraction': plt.title("Random Fraction R^2 v N", fontsize = 10) elif model == 'SimParetoInt': plt.title("Pareto Int R^2 v N", fontsize = 10) print model + ': Done' # insert code to plot a legend plt.savefig('/Users/Nathan_Hillis/GitHub/SADModels/Results/R2vN.png', dpi=600, bbox_inches = 'tight', pad_inches=0.03) plt.show() return
def bootstrap_SAD(name_site_combo, model, in_dir = './data/', out_dir = './out_files/', Niter = 200):
    """A general function of bootstrapping for SAD applying to all four models.

    Inputs:
    name_site_combo: a list with dat_name and site
    model - takes one of four values 'ssnt_0', 'ssnt_1', 'asne', or 'agsne'
    in_dir - directory of raw data
    out_dir - directory used both in input (obs_pred.csv file) and output
    Niter - number of bootstrap samples

    Output:
    Writes to disk, with one file for R^2 and one for KS statistic.
    """
    dat_name, site = name_site_combo
    dat = wk.import_raw_data(in_dir + dat_name + '.csv')
    dat_site = dat[dat['site'] == site]
    dat_clean = clean_data_agsne(dat_site)
    G, S, N, E = get_GSNE(dat_clean)
    # Model parameters derived from the state variables
    beta_ssnt = mete.get_beta(S, N, version = 'untruncated')
    beta_asne = mete.get_beta(S, N)
    lambda1, beta, lambda3 = agsne.get_agsne_lambdas(G, S, N, E)
    sad_agsne = mete_distributions.sad_agsne([G, S, N, E], [lambda1, beta, lambda3, agsne.agsne_lambda3_z(lambda1, beta, S) / lambda3])
    # Both SSNT variants share the same untruncated log-series SAD
    dist_for_model = {'ssnt_0': stats.logser(np.exp(-beta_ssnt)),
                      'ssnt_1': stats.logser(np.exp(-beta_ssnt)),
                      'asne': md.trunc_logser(np.exp(-beta_asne), N),
                      'agsne': sad_agsne}
    dist = dist_for_model[model]
    pred_obs = wk.import_obs_pred_data(out_dir + dat_name + '_obs_pred_rad_' + model + '.csv')
    # RADs are stored in descending order; reverse to ascending for the CDF comparison
    pred = pred_obs[pred_obs['site'] == site]['pred'][::-1]
    obs = pred_obs[pred_obs['site'] == site]['obs'][::-1]
    out_list_rsquare = [dat_name, site, str(mtools.obs_pred_rsquare(np.log10(obs), np.log10(pred)))]
    emp_cdf = mtools.get_emp_cdf(obs)
    # KS statistic: max deviation between empirical and model CDFs at observed abundances
    out_list_ks = [dat_name, site, str(max(abs(emp_cdf - np.array([dist.cdf(x) for x in obs]))))]
    for i in range(Niter):
        obs_boot = np.array(sorted(dist.rvs(S)))  # bootstrap sample of S abundances
        cdf_boot = np.array([dist.cdf(x) for x in obs_boot])
        emp_cdf_boot = mtools.get_emp_cdf(obs_boot)
        out_list_rsquare.append(str(mtools.obs_pred_rsquare(np.log10(obs_boot), np.log10(pred))))
        out_list_ks.append(str(max(abs(emp_cdf_boot - np.array(cdf_boot)))))
    wk.write_to_file(out_dir + 'SAD_bootstrap_' + model + '_rsquare.txt', ",".join(str(x) for x in out_list_rsquare))
    wk.write_to_file(out_dir + 'SAD_bootstrap_' + model + '_ks.txt', ",".join(str(x) for x in out_list_ks))
def obs_pred_r2_analysis(datasets, data_dir='./data/'):
    """Calculate the coefficient of determination for the METE SAD.

    For each dataset, reads its obs-pred file from data_dir and prints the
    R^2 between log10(observed) and log10(predicted) abundances.

    datasets : iterable of dataset name codes
    data_dir : directory containing '<dataset>_obs_pred.csv' files
    """
    # enumerate index was unused; iterate the names directly
    for dataset in datasets:
        obs_pred_data = import_obs_pred_data(data_dir + dataset + '_obs_pred.csv')
        obs = obs_pred_data["obs"]
        pred = obs_pred_data["pred"]
        loglog_R2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
        print("%s: log-log R^2 = %s" % (dataset.upper(), loglog_R2))
def obs_pred_r2_multi(methods, datasets, data_dir='/home/kenlocey/data1/'): # TAKEN FROM THE mete_sads.py script print 'generating 1:1 line R-square values for dataset(s)' for i, dataset in enumerate(datasets): for j, method in enumerate(methods): obs_pred_data = import_obs_pred_data(data_dir + dataset + '/' + dataset + '_obs_pred.txt') obs = ((obs_pred_data["obs"])) pred = ((obs_pred_data["pred"])) print method, dataset,' ', macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
def plot_r2_comp(name_site_combo, dat_dir='./out_files/', out_fig_dir='./out_figs/'):
    """Plot r2 of the three patterns separately for each community.

    Produces a 1x3 figure (SAD, ISD, SDR). In each panel the R^2 of ASNE is
    plotted on the x-axis against the R^2 of each of the other three models
    on the y-axis, one point per community, with a 1:1 reference line.
    Saves the figure to out_fig_dir + 'r2_comp.png'.
    """
    models = ['asne', 'agsne', 'ssnt_0', 'ssnt_1']
    model_names = ['ASNE', 'AGSNE', 'SSNT_N', 'SSNT_M']
    patterns = ['rad', 'isd', 'sdr']
    pattern_names = ['SAD', 'ISD', 'SDR']
    col_list = ['b', '#787878', 'r']
    symbol_list = ['o', 's', '*']
    fig = plt.figure(figsize=(10.5, 3.5))
    for i, pattern in enumerate(patterns):
        # R^2 per model for this pattern, plus a flat list for axis limits
        r2_dic = {'asne': [], 'agsne': [], 'ssnt_0': [], 'ssnt_1': []}
        r2_list = []
        for j, model in enumerate(models):
            for dat_name, site in name_site_combo:
                pred_obs_model_pattern = wk.import_obs_pred_data(dat_dir + dat_name + '_obs_pred_' + pattern + '_' + model + '.csv')
                pred_obs_site = pred_obs_model_pattern[pred_obs_model_pattern['site'] == site]
                r2 = mtools.obs_pred_rsquare(np.log10(pred_obs_site['obs']), np.log10(pred_obs_site['pred']))
                r2_dic[model].append(r2)
                r2_list.append(r2)
        ax = plt.subplot(1, 3, i + 1)
        # Scatter the three non-ASNE models against ASNE
        for j in range(1, 4):
            model = models[j]
            plt.scatter(r2_dic['asne'], r2_dic[model], s=20, marker=symbol_list[j - 1],
                        facecolors=col_list[j - 1], edgecolors='none', label=model_names[j])
        # Pad axis limits by 10% in the correct direction for the sign
        min_val, max_val = min(r2_list), max(r2_list)
        if min_val < 0: axis_min = 1.1 * min_val
        else: axis_min = 0.9 * min_val
        if max_val < 0: axis_max = 0.9 * max_val
        else: axis_max = 1.1 * max_val
        plt.plot([axis_min, axis_max], [axis_min, axis_max], 'k-')
        plt.xlim(axis_min, axis_max)
        plt.ylim(axis_min, axis_max)
        ax.tick_params(axis='both', which='major', labelsize=6)
        ax.set_xlabel(r'$R^2$ of ASNE', labelpad=4, size=10)
        ax.set_ylabel(r'$R^2$ of the other models', labelpad=4, size=10)
        ax.set_title(pattern_names[i], size=16)
        if i == 0: ax.legend(loc=2, prop={'size': 10})
    plt.subplots_adjust(left=0.08, wspace=0.3)
    plt.tight_layout()
    plt.savefig(out_fig_dir + 'r2_comp.png', dpi=400)
def obs_pred_r2_multi(methods, data_dir = mydir + '/results/'): # TAKEN FROM THE mete_sads.py script print 'generating 1:1 line R-square values for dataset(s)' for j, method in enumerate(methods): obs_pred_data = import_obs_pred_data(data_dir + dataset + '/' + dataset + '_obs_pred.txt') obs = ((obs_pred_data["obs"])) pred = ((obs_pred_data["pred"])) print method,' ', macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
def figSuppp(figname = 'SuppFig3', data_dir=mydir, radius=2):
    """Two-panel supplementary figure: observed vs predicted Nmax (left) and
    S (right) from the stratified lognormal NSR2 results, each with a 1:1
    line and the modified r^2 annotated. Saves to mydir + 'figures/<figname>.png'.
    """
    fig = plt.figure()
    plot_dim = 2
    count = 0
    IN_Obs_Pred = importData.import_NSR2_data(mydir + 'data/NSR2/Stratified/lognorm_pln_NSR2_stratify.txt')
    N = np.asarray(list(((IN_Obs_Pred["N"]))))
    S = np.asarray(list(((IN_Obs_Pred["S"]))))
    NmaxObs = np.asarray(list(((IN_Obs_Pred["NmaxObs"]))))
    # Predicted Nmax and S for every community
    NmaxPred = []
    SPred = []
    for i in range(len(N)):
        NmaxPred_i = importPredictS.predictS(N[i], NmaxObs[i], predictNmax=True).getNmax()
        SPred_i = importPredictS.predictS(N[i], NmaxObs[i], predictNmax=True).getS()
        NmaxPred.append(NmaxPred_i)
        SPred.append(SPred_i)
    NmaxPred = np.asarray(NmaxPred)
    SPred = np.asarray(SPred)
    toIteratePred = [NmaxPred, SPred]
    toIterateObs = [NmaxObs, S]
    # Panel 0: Nmax; panel 1: S
    for x in range(2):
        axis_min = 0
        axis_max = 2 * max(toIteratePred[x])
        #print plot_dim
        ax = fig.add_subplot(plot_dim-1, plot_dim, count+1)
        if x == 0:
            ax.set_title(r"$\mathbf{N_{max}}$")
        else:
            ax.set_title(r"$\mathbf{S}$")
        macroecotools.plot_color_by_pt_dens(toIteratePred[x], toIterateObs[x], radius, loglog=1,
                                            plot_obj=plt.subplot(plot_dim-1,plot_dim,count+1))
        plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
        plt.xlim(axis_min, axis_max)
        plt.ylim(0, axis_max)
        # Modified r^2 on log10 scale, annotated in the panel corner
        r2_all = macroecotools.obs_pred_rsquare(np.log10(toIterateObs[x]), np.log10(toIteratePred[x]))
        r2text = r"${}^{{2}}_{{m}} = {:.{p}f} $".format('r',r2_all , p=2)
        plt.text(0.18, 0.93, r2text, fontsize=10, horizontalalignment='center',
                 verticalalignment='center',transform = ax.transAxes)
        plt.tick_params(axis='both', which='major', labelsize=7)
        plt.subplots_adjust(wspace=0.5, hspace=0.3)
        #axins = inset_axes(ax, width="30%", height="30%", loc=4)
        ax.set(adjustable='box-forced', aspect='equal')
        #plt.setp(axins, xticks=[], yticks=[])
        count += 1
    # Shared axis labels for the whole figure
    fig.text(0.50, 0.04, r'$Predicted$', ha='center', va='center')
    fig.text(0.05, 0.5, r'$Observed$', ha='center', va='center', rotation='vertical')
    fig_name = str(mydir + 'figures/' + figname + '.png')
    plt.savefig(fig_name, dpi=600)#, bbox_inches = 'tight')#, pad_inches=0)
    plt.close()
def hist_mete_r2(sites, obs, pred):
    """Plot a normalized histogram of the r^2 values for obs-pred plots,
    one r^2 per site.

    sites     : array of site labels, aligned with obs and pred
    obs, pred : arrays of observed and predicted values

    Draws the per-bin proportion of sites against bin position on the
    current axes, with both axes fixed to [0, 1].
    """
    r2s = []
    # Iterate over the distinct sites, not over every record: the original
    # `for site in sites` computed one (identical) r^2 per record, weighting
    # each site by its number of records. Its sibling copy of this function
    # already uses np.unique.
    for site in np.unique(sites):
        obs_site = obs[sites==site]
        pred_site = pred[sites==site]
        r2 = macroecotools.obs_pred_rsquare(obs_site, pred_site)
        r2s.append(r2)
    hist_r2 = np.histogram(r2s, range=(0, 1))
    # Shift bin edges by one bin width, then drop the last edge
    xvals = hist_r2[1] + (hist_r2[1][1] - hist_r2[1][0])
    xvals = xvals[0:len(xvals)-1]
    # float() guards against Python 2 integer division, which would floor
    # every per-bin proportion to 0
    yvals = hist_r2[0] / float(len(r2s))
    plt.plot(xvals, yvals, 'k-', linewidth=2)
    plt.axis([0, 1, 0, 1])
def hist_mete_r2(sites, obs, pred): # TAKEN FROM Macroecotools or the mete_sads.py script used for White et al. (2012)
    """Plot a histogram of site-level r^2 values for obs-pred comparisons.

    Computes one r^2 per distinct site, bins the values over [0, 1], and
    draws the per-bin counts on the current axes.
    """
    # One r^2 for each distinct site label
    site_r2s = [macroecotools.obs_pred_rsquare(obs[sites==site], pred[sites==site])
                for site in np.unique(sites)]
    counts, edges = np.histogram(site_r2s, range=(0, 1))
    # Shift the edges right by one bin width and drop the final edge so the
    # x positions line up one-per-bin with the counts
    bin_width = edges[1] - edges[0]
    xvals = (edges + bin_width)[:-1]
    yvals = counts
    plt.plot(xvals, yvals, 'k-', linewidth=2)
    plt.axis([0, 1, 0, 1.1 * max(yvals)])
def generate_obs_pred_data(datasets, methods):
    """For each method/dataset pair, read per-site SADs from
    mydir + '/MicroMETE/data/<dataset>_SADs.txt' (one whitespace-separated
    SAD per line), fit the predicted RAD (geometric series or METE
    log-series), and print the per-site log-log r^2.

    Sites with fewer than 10 species (after dropping singletons) are
    skipped. Writing of obs-pred pairs to disk is currently commented out.
    """
    for method in methods:
        for dataset in datasets:
            gN = 0  # running total abundance across all sites in the dataset
            #OUT = open(mydir+'/data/'+method+'_'+dataset+'_obs_pred.txt','w+')
            IN = mydir+'/MicroMETE/data/'+dataset+'_SADs.txt'
            # First pass just counts lines for the countdown display
            num_lines = sum(1 for line in open(IN))
            for line in open(IN):
                line = line.split()
                obs = map(int, line)
                # Drop singletons (abundance <= 1)
                obs = list([x for x in obs if x > 1])
                N = sum(obs)
                gN += N
                print N
                S = len(obs)
                if S < 10: continue  # skip species-poor sites
                # Rank abundances in descending order
                obs.sort()
                obs.reverse()
                print method, dataset, N, S, 'countdown:', num_lines,
                if method == 'geom': # Predicted geometric series
                    pred = predRADs.get_GeomSeries(N, S, False) # False mean no zeros allowed
                elif method == 'mete': # Predicted log-series
                    logSeries = mete.get_mete_rad(S, N)
                    pred = logSeries[0]
                r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
                print " r2:", r2
                # write to file, by cite, observed and expected ranked abundances
                #for i, sp in enumerate(pred):
                #    print>> OUT, obs[i], pred[i]
                num_lines -= 1
            print 'N(HMP): ',gN
            #OUT.close()
            print dataset
def plot_obs_pred_sad(SADModels, data_dir, radius=2):
    # TAKEN FROM THE mete_sads.py script used for White et al. (2012)
    # Used for Figure 3 Locey and White (2013)
    ########################################################################################
    """Multiple obs-predicted plotter.

    One 2x2 subplot per model: density-colored scatter of predicted vs
    observed abundances on log axes with a 1:1 line. Prints the log-log r^2
    per model and saves the figure to mydir + '/Results/obs_pred_plots.png'.
    """
    fig = plt.figure()
    for i, model in enumerate(SADModels):
        fig.add_subplot(2, 2, i+1)
        obs_pred_data = import_obs_pred_data(data_dir + model + '.txt')
        site = ((obs_pred_data["site"]))
        obs = ((obs_pred_data["obs"]))
        pred = ((obs_pred_data["pred"]))
        # Axis limits padded around the observed range
        axis_min = 0.5 * min(obs)
        axis_max = 2 * max(obs)
        macroecotools.plot_color_by_pt_dens(pred, obs, radius, loglog=1,
                                            plot_obj=plt.subplot(2, 2, i+1))
        plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
        plt.xlim(axis_min, axis_max)
        plt.ylim(axis_min, axis_max)
        plt.tick_params(axis='both', which='major', labelsize=8)
        plt.subplots_adjust(wspace=0.5, hspace=0.3)
        r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
        print model, r2
        # Create inset for histogram of site level r^2 values
        #axins = inset_axes(ax, width="30%", height="30%", loc=4)
        #hist_mete_r2(site, np.log10(obs), np.log10(pred))
        #plt.setp(axins, xticks=[], yticks=[])
        plt.title(model)
        #plt.text(1, 2000, r'$R^2$' + '='+ str(round(r2,3)))
        plt.ylabel('Observed abundance',rotation='90',fontsize=12)
        plt.xlabel('Predicted abundance',fontsize=12)
    plt.savefig(mydir+'/Results/obs_pred_plots.png', dpi=600)#, bbox_inches = 'tight')#, pad_inches=0)
    plt.show()
def plot_r2_comp(name_site_combo, dat_dir = './out_files/', out_fig_dir = './out_figs/'):
    """Plot r2 of the three patterns separately for each community.

    One panel per pattern (SAD, ISD, SDR); within each panel the R^2 of
    ASNE (x) is compared against the R^2 of the other three models (y) with
    a 1:1 line. Saves the figure to out_fig_dir + 'r2_comp.png'.

    NOTE(review): this appears to duplicate another plot_r2_comp in this
    file that differs only in formatting — confirm which copy is canonical.
    """
    models = ['asne', 'agsne', 'ssnt_0', 'ssnt_1']
    model_names = ['ASNE', 'AGSNE', 'SSNT_N', 'SSNT_M']
    patterns = ['rad', 'isd', 'sdr']
    pattern_names = ['SAD', 'ISD', 'SDR']
    col_list = ['b', '#787878', 'r']
    symbol_list = ['o', 's', '*']
    fig = plt.figure(figsize = (10.5, 3.5))
    for i, pattern in enumerate(patterns):
        # Collect per-community R^2 for each model under this pattern
        r2_dic = {'asne':[], 'agsne':[], 'ssnt_0':[], 'ssnt_1':[]}
        r2_list = []
        for j, model in enumerate(models):
            for dat_name, site in name_site_combo:
                pred_obs_model_pattern = wk.import_obs_pred_data(dat_dir + dat_name + '_obs_pred_' + pattern + '_' + model + '.csv')
                pred_obs_site = pred_obs_model_pattern[pred_obs_model_pattern['site'] == site]
                r2 = mtools.obs_pred_rsquare(np.log10(pred_obs_site['obs']), np.log10(pred_obs_site['pred']))
                r2_dic[model].append(r2)
                r2_list.append(r2)
        ax = plt.subplot(1, 3, i + 1)
        # Models 1..3 plotted against ASNE (model 0)
        for j in range(1, 4):
            model = models[j]
            plt.scatter(r2_dic['asne'], r2_dic[model], s = 20, marker = symbol_list[j - 1],
                        facecolors = col_list[j - 1], edgecolors = 'none', label = model_names[j])
        # 10% padding on limits, direction depending on sign
        min_val, max_val = min(r2_list), max(r2_list)
        if min_val < 0: axis_min = 1.1 * min_val
        else: axis_min = 0.9 * min_val
        if max_val < 0: axis_max = 0.9 * max_val
        else: axis_max= 1.1 * max_val
        plt.plot([axis_min, axis_max], [axis_min, axis_max], 'k-')
        plt.xlim(axis_min, axis_max)
        plt.ylim(axis_min, axis_max)
        ax.tick_params(axis = 'both', which = 'major', labelsize = 6)
        ax.set_xlabel(r'$R^2$ of ASNE', labelpad = 4, size = 10)
        ax.set_ylabel(r'$R^2$ of the other models', labelpad = 4, size = 10)
        ax.set_title(pattern_names[i], size = 16)
        if i == 0: ax.legend(loc = 2, prop = {'size': 10})
    plt.subplots_adjust(left = 0.08, wspace = 0.3)
    plt.tight_layout()
    plt.savefig(out_fig_dir + 'r2_comp.png', dpi = 400)
def create_null_dataset(Svals, Nvals, Niter, dataset_name, data_dir='./data/',
                        dic_filename='beta_lookup_table.pck', return_obs_pred=0):
    """Create simulated fits to uniform abundance distribution data

    Create list of coefficients of determination for simulated observed vs.
    predicted abundance relationships for a dataset. If these values are
    similar to those observed then the constraints alone largely determine
    the fit to the data. If they are weaker than the observed then the
    application of maximum entropy is also important.

    Svals : a list of values of observed species richnesses to match. Each
        row is a community (e.g., a site, year, etc.)
    Nvals : a list of values of observed community abundances to match. The
        ordering of rows should match that of Svals so that each row
        represents the S and N values for a single community.
    Niter: number of simulations
    dataset_name : short code that will indicate the name of the dataset in
        the output file names
    data_dir : directory in which to store output
    return_obs_pred : if 1, also return the (last iteration's) simulated
        obs and pred lists
    """
    # 'wb' mode for csv.writer is the Python 2 convention
    resultfile = open(data_dir + dataset_name + '_sim_r2.csv', 'wb')
    out = csv.writer(resultfile, dialect = 'excel')
    dic_beta = mete.get_beta_dict(dic_filename)
    for i in range(Niter):
        # Simulate every community in parallel for this iteration
        pool = multiprocessing.Pool()
        curried_args = itertools.izip(Svals, Nvals, itertools.repeat(dic_beta))  # izip: Python 2 itertools
        site_sim_results = pool.map(sim_null_curry, curried_args)
        pool.close()
        # Flatten per-site (obs, pred) pairs into two lists
        sim_obs = []
        sim_pred = []
        for site in site_sim_results:
            sim_obs.extend((site[0]))
            sim_pred.extend((site[1]))
        r2 = macroecotools.obs_pred_rsquare(np.array(np.log10(sim_obs)),
                                            np.array(np.log10(sim_pred)))
        # One row per iteration: (iteration index, r^2)
        results = ((np.column_stack((i, r2))))
        out.writerows(results)
    resultfile.close()
    if return_obs_pred == 1:
        return sim_obs, sim_pred
def plot_obs_pred_sad(methods, datasets, data_dir='/home/kenlocey/data1/', radius=2):
    # TAKEN FROM THE mete_sads.py script used for White et al. (2012)
    # Used for Figure 3 Locey and White (2013)
    ########################################################################################
    """Multiple obs-predicted plotter.

    One subplot per method (indexed by j into a 2x2 grid): density-colored
    scatter of predicted vs observed abundances on log axes with a 1:1 line
    and the log-log R^2 annotated. Saves to mydir + '/obs_pred_plots.png'.
    """
    for i, dataset in enumerate(datasets):
        for j, method in enumerate(methods):
            #if method == 'mete' and dataset == 'EMP': continue
            obs_pred_data = import_obs_pred_data(mydir+'/data/truedata/'+method+'_'+dataset+'_obs_pred.txt')
            #site = ((obs_pred_data["site"]))
            obs = ((obs_pred_data["obs"]))
            pred = ((obs_pred_data["pred"]))
            # Axis limits padded around the observed range
            axis_min = 0.5 * min(obs)
            axis_max = 2 * max(obs)
            macroecotools.plot_color_by_pt_dens(pred, obs, radius, loglog=1,
                                                plot_obj=plt.subplot(2, 2, j+1))
            plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
            plt.xlim(axis_min, axis_max)
            plt.ylim(axis_min, axis_max)
            plt.tick_params(axis='both', which='major', labelsize=8)
            plt.subplots_adjust(wspace=0.5, hspace=0.3)
            r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
            print method, dataset, r2
            #Create inset for histogram of site level r^2 values
            #axins = inset_axes(ax, width="30%", height="30%", loc=4)
            #hist_mete_r2(site, np.log10(obs), np.log10(pred))
            #plt.setp(axins, xticks=[], yticks=[])
            if method == 'mete': plt.title("Log-series")
            else: plt.title("Geometric series")
            plt.text(1, 2000, r'$R^2$' + '='+ str(round(r2,3)))
            plt.ylabel('Observed abundance',rotation='90',fontsize=12)
            plt.xlabel('Predicted abundance',fontsize=12)
    plt.savefig(mydir+'/obs_pred_plots.png', dpi=600)#, bbox_inches = 'tight')#, pad_inches=0)
def plot_sim_results(datasets, colors, data_dir='./data/'):
    """Plot, per dataset, a kernel density estimate of the simulated r^2
    values alongside a vertical line at the observed r^2.

    datasets : iterable of dataset name codes; each needs both a
        '<dataset>_sim_r2.csv' and a '<dataset>_obs_pred.csv' in data_dir
    colors   : one plot color per dataset, same ordering

    Saves the multi-panel figure to 'sim_results.png'.
    """
    # Per-dataset x-axis lower bounds, tuned by hand for display
    all_lowerbounds = {'bbs': -0.7, 'cbc': -2.3, 'fia': 0, 'gentry': 0, 'mcdb': -0.75, 'nabc': -0.5}
    lowerbounds = [all_lowerbounds[dataset] for dataset in datasets]
    fig = plt.figure()
    for i, dataset in enumerate(datasets):
        sim_data = import_sim_data(data_dir + dataset + '_sim_r2.csv')
        obs_pred_data = import_obs_pred_data(data_dir + dataset + '_obs_pred.csv')
        obs_r2 = macroecotools.obs_pred_rsquare(np.log10(obs_pred_data['obs']),
                                                np.log10(obs_pred_data['pred']))
        # KDE of the simulated r^2 distribution, trimmed where density ~ 0
        sim_kde = stats.kde.gaussian_kde(sim_data['r2'])
        xvals = np.arange(lowerbounds[i], 1, 0.01)
        yvals = sim_kde.evaluate(xvals)
        xvals = xvals[yvals > 0.000001]
        yvals = yvals[yvals > 0.000001]
        ax = fig.add_subplot(3,2,i+1)
        longdashes = [10,5]
        plot_obj, = plt.plot(xvals, yvals, 'k--', linewidth=2, color=colors[i])
        plot_obj.set_dashes(longdashes)
        # Vertical marker at the observed r^2
        plt.plot([obs_r2, obs_r2], [0, max(yvals)], color=colors[i], linewidth=2)
        plt.axis([lowerbounds[i], 1, 0, 1.1 * max(yvals)])
    plt.savefig('sim_results.png', dpi=400, bbox_inches = 'tight', pad_inches=0)
import numpy as np import matplotlib.pylab as plt import sys from macroecotools import plot_color_by_pt_dens, obs_pred_rsquare def plot_obs_pred(obs_pred_data, adj=0, dest_file='./obs_pred.png'): plot_color_by_pt_dens(obs_pred_data['pred'] + adj, obs_pred_data['obs'] + adj, 3, loglog=1) plt.loglog([min(obs_pred_data['pred'] + adj), max(obs_pred_data['pred'] + adj)], [min(obs_pred_data['pred'] + adj), max(obs_pred_data['pred'] + adj)], 'k-') plt.savefig(dest_file, dpi = 400) if len(sys.argv) > 1: datasets = [sys.argv[1]] else: datasets = ['bbs_2012', 'bbs_2008_2012', 'cbc', 'gentry', 'naba'] for dataset in datasets: for datatype in ['fit', 'test']: for predtype in ['rad']: obs_pred_data = read_csv('./results/' + dataset + '_' + predtype+ '_' + datatype + '_obs_pred.csv') adj = 0 log_pred = [log(float(i + adj)) for i in obs_pred_data['pred'].values] log_obs = [log(float(i + adj)) for i in obs_pred_data['obs'].values] print obs_pred_rsquare(np.array(log_pred), np.array(log_obs)) fig_name = './figs/' + dataset + '_' + datatype +'_obs_pred_' + predtype + '.png' plot_obs_pred(obs_pred_data, adj=adj, dest_file=fig_name)
def Supp(figname = 'Supp', data_dir=mydir, radius=2):
    # TAKEN FROM THE mete_sads.py script used for White et al. (2012)
    # Used for Figure 3 Locey and White (2013)
    """Multiple obs-predicted plotter.

    Two-panel supplementary figure comparing observed rank abundances
    against the 75:25 simulation predictions (left) and the lognormal MLE
    predictions (right), each with a 1:1 line and the modified r^2
    annotated. Saves to mydir + 'figures/<figname>.png'.
    """
    fig = plt.figure()
    count = 0
    plot_dim = 2
    IN_Obs_Pred = importData.import_obs_pred_data(mydir + 'data/ObsPred/Stratified/lognorm_75_25_obs_pred_stratify_test.txt')
    site = np.asarray(list(((IN_Obs_Pred["site"]))))
    obs = np.asarray(list(((IN_Obs_Pred["obs"]))))
    pred7525 = np.asarray(list(((IN_Obs_Pred["pred7525"]))))
    predPln = np.asarray(list(((IN_Obs_Pred["predPln"]))))
    # Panel 0: 75:25 simulation; panel 1: lognormal MLE
    toIterate = [pred7525, predPln]
    for x in range(2):
        axis_min = 0
        axis_max = 2 * max(obs)
        #print plot_dim
        ax = fig.add_subplot(plot_dim, plot_dim, count+1)
        if x == 0:
            ax.set_title(r"$\mathbf{75:25\, Simulation}$")
        else:
            ax.set_title(r"$\mathbf{Lognormal\, MLE}$")
        macroecotools.plot_color_by_pt_dens(toIterate[x], obs, radius, loglog=1,
                                            plot_obj=plt.subplot(plot_dim,plot_dim,count+1))
        #
        #plt.text(0.1, 0.9,'matplotlib', ha='center', va='center', transform=ax.transAxes)
        plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
        plt.xlim(0, axis_max)
        plt.ylim(0, axis_max)
        #r2s = ((INh2["R2"]))
        #r2s = r2s.astype(float)
        # insert r2 of all data
        r2_all = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(toIterate[x]))
        r2text = r"${}^{{2}}_{{m}} = {:.{p}f} $".format('r',r2_all , p=2)
        plt.text(0.18, 0.93, r2text, fontsize=10, horizontalalignment='center',
                 verticalalignment='center',transform = ax.transAxes)
        plt.tick_params(axis='both', which='major', labelsize=7)
        plt.subplots_adjust(wspace=0.5, hspace=0.3)
        axins = inset_axes(ax, width="30%", height="30%", loc=4)
        #hist_r2 = np.histogram(r2s, range=(0, 1))
        #xvals = hist_r2[1] + (hist_r2[1][1] - hist_r2[1][0])
        #xvals = xvals[0:len(xvals)-1]
        #yvals = hist_r2[0]
        #plt.plot(xvals, yvals, 'k-', linewidth=2)
        #plt.axis([0, 1, 0, 1.1 * max(yvals)])
        ax.set(adjustable='box-forced', aspect='equal')
        #plt.setp(axins, xticks=[], yticks=[])
        count += 1
    # Shared axis labels for the whole figure
    fig.text(0.50, 0.04, r'$Predicted \; rank-abundance$', ha='center', va='center')
    fig.text(0.05, 0.5, r'$Observed \; rank-abundance$', ha='center', va='center', rotation='vertical')
    fig_name = str(mydir + 'figures/' + figname + '.png')
    plt.savefig(fig_name, dpi=600)#, bbox_inches = 'tight')#, pad_inches=0)
    plt.close()
def bootstrap_ISD(name_site_combo, model, in_dir='./data/', out_dir='./out_files/', Niter=200):
    """A general function of bootstrapping for ISD applying to all four models.

    Inputs:
    name_site_combo: a list with dat_name and site
    model - takes one of four values 'ssnt_0', 'ssnt_1', 'asne', or 'agsne'
    in_dir - directory of raw data
    out_dir - directory used both in input (obs_pred.csv file) and output
    Niter - number of bootstrap samples

    Output:
    Writes to disk, with one file for R^2 and one for KS statistic.
    """
    dat_name, site = name_site_combo
    dat = wk.import_raw_data(in_dir + dat_name + '.csv')
    dat_site = dat[dat['site'] == site]
    dat_clean = clean_data_agsne(dat_site)
    G, S, N, E = get_GSNE(dat_clean)
    lambda1, beta, lambda3 = agsne.get_agsne_lambdas(G, S, N, E)
    # Community-level ISD for each of the four models
    isd_agsne = mete_distributions.psi_agsne([G, S, N, E], [
        lambda1, beta, lambda3,
        agsne.agsne_lambda3_z(lambda1, beta, S) / lambda3
    ])
    isd_asne = mete_distributions.psi_epsilon_approx(S, N, E)
    # Rescale dbh so the minimum is 1
    dbh_scaled = np.array(dat_clean['dbh'] / min(dat_clean['dbh']))
    isd_ssnt_0 = ssnt_isd_bounded(1, N / (sum(dbh_scaled**1) - N))
    isd_ssnt_1 = ssnt_isd_bounded(2 / 3, N / (sum(dbh_scaled**(2 / 3)) - N))
    dist_for_model = {
        'ssnt_0': isd_ssnt_0,
        'ssnt_1': isd_ssnt_1,
        'asne': isd_asne,
        'agsne': isd_agsne
    }
    dist = dist_for_model[model]
    pred_obs = wk.import_obs_pred_data(out_dir + dat_name + '_obs_pred_isd_' + model + '.csv')
    pred = pred_obs[pred_obs['site'] == site]['pred']
    obs = pred_obs[pred_obs['site'] == site]['obs']
    out_list_rsquare = [
        dat_name, site,
        str(mtools.obs_pred_rsquare(np.log10(obs), np.log10(pred)))
    ]
    # Empirical results are written immediately; bootstrap values are
    # appended to the same (unterminated) line as they are generated.
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_rsquare.txt',
                     ",".join(str(x) for x in out_list_rsquare),
                     new_line=False)
    emp_cdf = mtools.get_emp_cdf(obs)
    out_list_ks = [
        dat_name, site,
        str(max(abs(emp_cdf - np.array([dist.cdf(x) for x in obs]))))
    ]
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt',
                     ",".join(str(x) for x in out_list_ks),
                     new_line=False)
    num_pools = 8  # Assuming that 8 pools are to be created
    for i in xrange(Niter):
        obs_boot = []
        cdf_boot = []
        # Draw samples in parallel until at least N individuals are collected
        while len(obs_boot) < N:
            pool = multiprocessing.Pool(num_pools)
            out_sample = pool.map(wk.generate_isd_sample, [dist for j in xrange(num_pools)])
            for combo in out_sample:
                cdf_sublist, sample_sublist = combo
                obs_boot.extend(sample_sublist)
                cdf_boot.extend(cdf_sublist)
            pool.close()
            pool.join()
        if model in ['asne', 'agsne']:
            obs_boot = np.sort(obs_boot[:N])**0.5  # Convert to diameter
        else:
            obs_boot = np.sort(obs_boot[:N])
        sample_rsquare = mtools.obs_pred_rsquare(np.log10(obs_boot), np.log10(pred))
        sample_ks = max(abs(emp_cdf - np.sort(cdf_boot[:N])))
        wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_rsquare.txt',
                         "".join([',', str(sample_rsquare)]),
                         new_line=False)
        wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt',
                         "".join([',', str(sample_ks)]),
                         new_line=False)
    # Terminate each record with a tab
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_rsquare.txt', '\t')
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt', '\t')
    # NOTE(review): this final write repeats the empirical KS row already
    # written above, appending it again after the bootstrap values —
    # confirm whether it is intentional or a leftover.
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt',
                     ",".join(str(x) for x in out_list_ks))
def bootstrap_SAD(name_site_combo, model, in_dir='./data/', out_dir='./out_files/', Niter=200):
    """A general function of bootstrapping for SAD applying to all four models.

    Inputs:
    name_site_combo: a list with dat_name and site
    model - takes one of four values 'ssnt_0', 'ssnt_1', 'asne', or 'agsne'
    in_dir - directory of raw data
    out_dir - directory used both in input (obs_pred.csv file) and output
    Niter - number of bootstrap samples

    Output:
    Writes to disk, with one file for R^2 and one for KS statistic.

    NOTE(review): this appears to duplicate another bootstrap_SAD in this
    file that differs only in formatting — confirm which copy is canonical.
    """
    dat_name, site = name_site_combo
    dat = wk.import_raw_data(in_dir + dat_name + '.csv')
    dat_site = dat[dat['site'] == site]
    dat_clean = clean_data_agsne(dat_site)
    G, S, N, E = get_GSNE(dat_clean)
    # Model parameters derived from the state variables
    beta_ssnt = mete.get_beta(S, N, version='untruncated')
    beta_asne = mete.get_beta(S, N)
    lambda1, beta, lambda3 = agsne.get_agsne_lambdas(G, S, N, E)
    sad_agsne = mete_distributions.sad_agsne([G, S, N, E], [
        lambda1, beta, lambda3,
        agsne.agsne_lambda3_z(lambda1, beta, S) / lambda3
    ])
    # Both SSNT variants share the same untruncated log-series SAD
    dist_for_model = {
        'ssnt_0': stats.logser(np.exp(-beta_ssnt)),
        'ssnt_1': stats.logser(np.exp(-beta_ssnt)),
        'asne': md.trunc_logser(np.exp(-beta_asne), N),
        'agsne': sad_agsne
    }
    dist = dist_for_model[model]
    pred_obs = wk.import_obs_pred_data(out_dir + dat_name + '_obs_pred_rad_' + model + '.csv')
    # RADs are stored in descending order; reverse to ascending for the CDF comparison
    pred = pred_obs[pred_obs['site'] == site]['pred'][::-1]
    obs = pred_obs[pred_obs['site'] == site]['obs'][::-1]
    out_list_rsquare = [
        dat_name, site,
        str(mtools.obs_pred_rsquare(np.log10(obs), np.log10(pred)))
    ]
    emp_cdf = mtools.get_emp_cdf(obs)
    # KS statistic: max deviation between empirical and model CDFs
    out_list_ks = [
        dat_name, site,
        str(max(abs(emp_cdf - np.array([dist.cdf(x) for x in obs]))))
    ]
    for i in range(Niter):
        obs_boot = np.array(sorted(dist.rvs(S)))  # bootstrap sample of S abundances
        cdf_boot = np.array([dist.cdf(x) for x in obs_boot])
        emp_cdf_boot = mtools.get_emp_cdf(obs_boot)
        out_list_rsquare.append(
            str(mtools.obs_pred_rsquare(np.log10(obs_boot), np.log10(pred))))
        out_list_ks.append(str(max(abs(emp_cdf_boot - np.array(cdf_boot)))))
    wk.write_to_file(out_dir + 'SAD_bootstrap_' + model + '_rsquare.txt',
                     ",".join(str(x) for x in out_list_rsquare))
    wk.write_to_file(out_dir + 'SAD_bootstrap_' + model + '_ks.txt',
                     ",".join(str(x) for x in out_list_ks))
def fig4(figname="Fig4", data_dir=mydir, radius=2, saveAs="eps"): fig = plt.figure() fig.subplots_adjust(bottom=0.15) plot_dim = 1 count = 0 IN_Obs_Pred = importData.import_NSR2_data(mydir + "data/NSR2/Stratified/lognorm_pln_NSR2_stratify.txt") N = np.asarray(list(((IN_Obs_Pred["N"])))) S = np.asarray(list(((IN_Obs_Pred["S"])))) NmaxObs = np.asarray(list(((IN_Obs_Pred["NmaxObs"])))) models = ["geom", "lognorm", "mete", "zipf"] modelSlopes = [0.647520323289, 0.942904468437, 0.769214774397, 0.954497727096] modelInterepts = [0.116508916992, 0.292527611072, 0.19240314275, 0.189954627996] for g, model in enumerate(models): NmaxPred = [] SPred = [] for i in range(len(N)): NmaxPred_i = mo.predictS(N[i], NmaxObs[i], predictNmax=True).getNmax( b=modelInterepts[g], slope=modelSlopes[g] ) SPred_i = mo.predictS(N[i], NmaxObs[i], predictNmax=True).getS() NmaxPred.append(NmaxPred_i) SPred.append(SPred_i) NmaxPred = np.asarray(NmaxPred) SPred = np.asarray(SPred) axis_min = 0 axis_max = 2 * max(NmaxObs) ax = fig.add_subplot(2, 2, count + 1) if model == "zipf": OUT2 = importData.import_NSR2_data( data_dir + "data/NSR2/Stratified_Test/" + model + "_mle_NSR2_stratify.txt" ) elif model == "lognorm": OUT2 = importData.import_NSR2_data( data_dir + "data/NSR2/Stratified_Test/" + model + "_pln_NSR2_stratify.txt" ) else: OUT2 = importData.import_NSR2_data(data_dir + "data/NSR2/Stratified_Test/" + model + "_NSR2_stratify.txt") NmaxObs_BS = np.asarray(list(((OUT2["NmaxObs"])))) NmaxPred_BS = np.asarray(list(((OUT2["NmaxPred"])))) if model == "geom": ax.set_title("Broken-stick") elif model == "lognorm": ax.set_title("Lognormal") elif model == "mete": ax.set_title("Log-series") elif model == "zipf": ax.set_title("Zipf") macroecotools.plot_color_by_pt_dens(NmaxPred, NmaxObs, radius, loglog=1, plot_obj=plt.subplot(2, 2, count + 1)) plt.plot([axis_min, axis_max], [axis_min, axis_max], "k-") plt.xlim(axis_min, axis_max) plt.ylim(0, axis_max) r2_all = 
macroecotools.obs_pred_rsquare(np.log10(NmaxObs), np.log10(NmaxPred)) r2text = r"${}^{{2}}_{{m}} = {:.{p}f} $".format("r", r2_all, p=2) plt.text( 0.72, 0.12, r2text, fontsize=13, horizontalalignment="center", verticalalignment="center", transform=ax.transAxes, ) plt.tick_params(axis="both", which="major", labelsize=12) plt.subplots_adjust(wspace=0.00001, hspace=0.3) ax.set(adjustable="box-forced", aspect="equal") count += 1 fig.text(0.50, 0.055, "Predicted, " + r"$log_{10}(N_{max})$", ha="center", va="center", fontsize=19) fig.text( 0.09, 0.5, "Observed, " + r"$log_{10}(N_{max})$", ha="center", va="center", rotation="vertical", fontsize=19 ) fig_name = str(mydir + "figures/" + figname + "_RGB." + saveAs) plt.savefig(fig_name, dpi=600, format=saveAs) # , bbox_inches = 'tight')#, pad_inches=0) plt.close()
def bootstrap_SDR(name_site_combo, model, in_dir='./data/', out_dir='./out_files/', Niter=200):
    """A general function of bootstrapping for SDR applying to all four models.

    Inputs:
    name_site_combo: a list with dat_name and site
    model - takes one of four values 'ssnt_0', 'ssnt_1', 'asne', or 'agsne'
    in_dir - directory of raw data
    out_dir - directory used both in input (obs_pred.csv file) and output
    Niter - number of bootstrap samples

    Output:
    Writes to one file on disk for R^2.
    """
    dat_name, site = name_site_combo
    dat = wk.import_raw_data(in_dir + dat_name + '.csv')
    dat_site = dat[dat['site'] == site]
    dat_clean = clean_data_agsne(dat_site)
    # State variables: G genera, S species, N individuals, E total metabolic rate.
    G, S, N, E = get_GSNE(dat_clean)
    lambda1, beta, lambda3 = agsne.get_agsne_lambdas(G, S, N, E)
    # Per-species parameters: [m, n] where n is the species' abundance and m is
    # the number of species in its genus.
    par_list = []
    for sp in np.unique(dat_clean['sp']):
        dat_sp = dat_clean[dat_clean['sp'] == sp]
        n = len(dat_sp)
        genus_sp = dat_sp['genus'][0]
        m = len(np.unique(dat_clean[dat_clean['genus'] == genus_sp]['sp']))
        par_list.append([m, n])
    pred_obs = wk.import_obs_pred_data(out_dir + dat_name + '_obs_pred_sdr_' + model + '.csv')
    pred = pred_obs[pred_obs['site'] == site]['pred']
    obs = pred_obs[pred_obs['site'] == site]['obs']
    out_list_rsquare = [
        dat_name, site,
        str(mtools.obs_pred_rsquare(np.log10(obs), np.log10(pred)))
    ]
    iisd_agsne = mete_distributions.theta_agsne([G, S, N, E], [
        lambda1, beta, lambda3, agsne.agsne_lambda3_z(lambda1, beta, S) / lambda3
    ])
    iisd_asne = mete_distributions.theta_epsilon(S, N, E)
    dbh_scaled = np.array(dat_clean['dbh'] / min(dat_clean['dbh']))
    iisd_ssnt_0 = ssnt_isd_bounded(1, N / (sum(dbh_scaled ** 1) - N))
    # BUG FIX: the exponent was written as 2/3, which is integer division in
    # Python 2 and evaluates to 0, collapsing the SSNT 2/3-scaling model to an
    # exponent of zero. Use a float literal so the intended 2/3 power is used.
    iisd_ssnt_1 = ssnt_isd_bounded(2.0 / 3, N / (sum(dbh_scaled ** (2.0 / 3)) - N))
    dist_for_model = {
        'ssnt_0': iisd_ssnt_0,
        'ssnt_1': iisd_ssnt_1,
        'asne': iisd_asne,
        'agsne': iisd_agsne
    }
    dist = dist_for_model[model]
    for i in range(Niter):
        # One bootstrap sample of the mean metabolic rate per species; the
        # models differ in how many parameters their rvs() takes.
        if model in ['ssnt_0', 'ssnt_1']:
            obs_boot = np.array([
                np.mean((dist.rvs(par[1])) ** 2) for par in par_list
            ])  # Here par[1] is n for each species
        elif model == 'asne':
            obs_boot = np.array([
                np.mean(np.array(dist.rvs(par[1], par[1]))) for par in par_list
            ])
        else:
            obs_boot = np.array([
                np.mean(np.array(dist.rvs(par[1], par[1], par[0]))) for par in par_list
            ])
        out_list_rsquare.append(
            str(mtools.obs_pred_rsquare(np.log10(obs_boot), np.log10(pred))))
    wk.write_to_file(out_dir + 'SDR_bootstrap_' + model + '_rsquare.txt',
                     ",".join(str(x) for x in out_list_rsquare))
# Force matplotlib to not use any Xwindows backend. matplotlib.use('Agg') from pandas import read_csv from math import log import numpy as np import matplotlib.pylab as plt import sys from macroecotools import plot_color_by_pt_dens, obs_pred_rsquare def plot_obs_pred(obs_pred_data, dest_file='./obs_pred.png'): plot_color_by_pt_dens(obs_pred_data['pred'], obs_pred_data['obs'], 3, loglog=1) plt.loglog([min(obs_pred_data['pred']), max(obs_pred_data['pred'])], [min(obs_pred_data['pred']), max(obs_pred_data['pred'])], 'k-') plt.savefig(dest_file, dpi = 400) if len(sys.argv) > 1: datasets = [sys.argv[1]] else: datasets = ['bbs_2012', 'bbs_2008_2012', 'cbc', 'fia', 'gentry', 'naba'] for dataset in datasets: for datatype in ['fit', 'test']: for predtype in ['rare']: obs_pred_data = read_csv('./results/' + dataset + '_' + predtype+ '_' + datatype + '_obs_pred.csv') print obs_pred_rsquare(np.array(obs_pred_data['obs'].values), np.array(obs_pred_data['pred'].values)) fig_name = './figs/' + dataset + '_' + datatype +'_obs_pred_rarity.png' plot_obs_pred(obs_pred_data, dest_file=fig_name)
def fig4(figname = 'Fig4', data_dir=mydir, radius=1.5, saveAs = 'png'):
    """Plot observed vs. predicted Nmax for the four SAD models in a 2x2 grid.

    Unlike the hard-coded-coefficient variant, this version estimates the
    dominance regression slope/intercept from the Stratified_Test NSR2 files.
    Writes <mydir>figures/<figname>_RGB.<saveAs> to disk.
    """
    fig = plt.figure()
    fig.subplots_adjust(bottom= 0.15)
    plot_dim = 1
    count = 0
    models = ['geom', 'lognorm', 'mete', 'zipf']
    #modelSlopes = [0.647520323289, 0.942904468437, 0.769214774397, 0.954497727096]
    #modelInterepts = [0.116508916992, 0.292527611072, 0.19240314275, 0.189954627996]
    modelSlopes = []
    modelInterepts = []
    for g, model in enumerate(models):
        # Each model reads its own stratified NSR2 file plus the matching
        # Stratified_Test file (estimator suffix differs per model).
        if model == 'geom':
            IN_Obs_Pred = importData.import_NSR2_data(mydir + \
                'data/NSR2/Stratified/geom_NSR2_stratify.txt')
            nsr2 = importData.import_NSR2_data(data_dir + \
                'data/NSR2/Stratified_Test/' + model + '_NSR2_stratify.txt')
        elif model == 'lognorm':
            IN_Obs_Pred = importData.import_NSR2_data(mydir + \
                'data/NSR2/Stratified/lognorm_pln_NSR2_stratify.txt')
            nsr2 = importData.import_NSR2_data(data_dir + \
                'data/NSR2/Stratified_Test/' + model + '_'+ 'pln' + '_NSR2_stratify.txt')
        elif model == 'mete':
            IN_Obs_Pred = importData.import_NSR2_data(mydir + \
                'data/NSR2/Stratified/mete_NSR2_stratify.txt')
            nsr2 = importData.import_NSR2_data(data_dir + \
                'data/NSR2/Stratified_Test/' + model + '_NSR2_stratify.txt')
        elif model == 'zipf':
            IN_Obs_Pred = importData.import_NSR2_data(mydir + \
                'data/NSR2/Stratified/zipf_mle_NSR2_stratify.txt')
            nsr2 = importData.import_NSR2_data(data_dir + \
                'data/NSR2/Stratified_Test/' + model + '_mle' + '_NSR2_stratify.txt')
        N = np.asarray(list(((IN_Obs_Pred["N"]))))
        N_All = np.asarray(list(((nsr2["N"]))))
        # Mean dominance regression coefficients from the test runs; intercept
        # is stored in log10 space, hence the 10 ** back-transform.
        domSlope = np.mean(((nsr2["NmaxPredSlope"])))
        domIntercept = 10 ** np.mean(((nsr2["NmaxPredIntercept"])))
        NmaxObs = np.asarray(list(((IN_Obs_Pred["NmaxObs"]))))
        NmaxObsAll = np.asarray(list(((nsr2["NmaxObs"]))))
        NmaxPred = []
        NmaxPredAll = []
        for i in range(len(N)):
            NmaxPred_i = mo.predictNmax(N[i]).getNmax(b = domIntercept, slope = domSlope)
            NmaxPred.append(NmaxPred_i)
        NmaxPred = np.asarray(NmaxPred)
        # Drop sites with extreme observed dominance (Nmax >= 200000).
        NmaxPred_obs = [k for k in zip(NmaxObs, NmaxPred) if k[0] < 200000 ]
        NmaxObs = np.asarray([k[0] for k in NmaxPred_obs])
        NmaxPred = np.asarray([k[1] for k in NmaxPred_obs])
        axis_min = 10
        axis_max = 1000000
        ax = fig.add_subplot(2, 2, count+1)
        if model == 'geom':
            ax.set_title("Broken-stick")
        elif model == 'lognorm':
            ax.set_title("Lognormal")
        elif model == 'mete':
            ax.set_title("Log-series")
        elif model == 'zipf':
            ax.set_title("Zipf")
        #plot_color_by_pt_dens(NmaxPred, NmaxObs, radius, loglog=1,
        #                        plot_obj=plt.subplot(2,2,count+1))
        #if model == 'lognorm':
        #    radius = 1.3
        macroecotools.plot_color_by_pt_dens(NmaxPred, NmaxObs, radius, loglog=1,
                        plot_obj=plt.subplot(2,2,count+1))
        # 1:1 reference line.
        plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
        plt.xlim(axis_min, axis_max)
        plt.ylim(axis_min, axis_max)
        ax.set_xlim(axis_min, axis_max)
        ax.set_ylim(axis_min, axis_max )
        r2_all = macroecotools.obs_pred_rsquare(np.log10(NmaxObs), np.log10(NmaxPred))
        r2text = r"${}^{{2}}_{{m}} = {:.{p}f} $".format('r',r2_all , p=2)
        plt.text(0.72, 0.12, r2text, fontsize=13,
                 horizontalalignment='center',
                 verticalalignment='center',transform = ax.transAxes)
        plt.tick_params(axis='both', which='major', labelsize=12)
        plt.subplots_adjust(wspace=0.00001, hspace=0.3)
        ax.set(adjustable='box-forced', aspect='equal')
        count += 1
    # Shared axis labels for the 2x2 grid.
    fig.text(0.50, 0.055 , 'Predicted, ' +r'$log_{10}(N_{max})$', ha='center', va='center', fontsize = 19)
    fig.text(0.09, 0.5, 'Observed, ' +r'$log_{10}(N_{max})$', ha='center', va='center', rotation='vertical',\
        fontsize = 19)
    fig_name = str(mydir + 'figures/' + figname + '_RGB.' + saveAs)
    plt.savefig(fig_name, dpi=600, format = saveAs)#, bbox_inches = 'tight')#, pad_inches=0)
    plt.close()
3, loglog=1) plt.loglog( [min(obs_pred_data['pred'] + adj), max(obs_pred_data['pred'] + adj)], [min(obs_pred_data['pred'] + adj), max(obs_pred_data['pred'] + adj)], 'k-') plt.savefig(dest_file, dpi=400) if len(sys.argv) > 1: datasets = [sys.argv[1]] else: datasets = ['bbs_2012', 'bbs_2008_2012', 'cbc', 'fia', 'gentry', 'naba'] for dataset in datasets: for datatype in ['fit', 'test']: for predtype in ['sad']: obs_pred_data = read_csv('./results/' + dataset + '_' + predtype + '_' + datatype + '_obs_pred_norm.csv') adj = 1 log_pred = [ log(float(i + adj)) for i in obs_pred_data['pred'].values ] log_obs = [ log(float(i + adj)) for i in obs_pred_data['obs'].values ] print obs_pred_rsquare(np.array(log_pred), np.array(log_obs)) fig_name = './figs/' + dataset + '_' + datatype + '_obs_pred_' + predtype + '.png' plot_obs_pred(obs_pred_data, adj=adj, dest_file=fig_name)
def bootstrap_ISD(name_site_combo, model, in_dir = './data/', out_dir = './out_files/', Niter = 200):
    """A general function of bootstrapping for ISD applying to all four models.

    Inputs:
    name_site_combo: a list with dat_name and site
    model - takes one of four values 'ssnt_0', 'ssnt_1', 'asne', or 'agsne'
    in_dir - directory of raw data
    out_dir - directory used both in input (obs_pred.csv file) and output
    Niter - number of bootstrap samples

    Output:
    Writes to disk, with one file for R^2 and one for KS statistic.
    """
    dat_name, site = name_site_combo
    dat = wk.import_raw_data(in_dir + dat_name + '.csv')
    dat_site = dat[dat['site'] == site]
    dat_clean = clean_data_agsne(dat_site)
    # State variables: G genera, S species, N individuals, E total metabolic rate.
    G, S, N, E = get_GSNE(dat_clean)
    lambda1, beta, lambda3 = agsne.get_agsne_lambdas(G, S, N, E)
    isd_agsne = mete_distributions.psi_agsne([G, S, N, E], [lambda1, beta, lambda3, agsne.agsne_lambda3_z(lambda1, beta, S) / lambda3])
    isd_asne = mete_distributions.psi_epsilon_approx(S, N, E)
    dbh_scaled = np.array(dat_clean['dbh'] / min(dat_clean['dbh']))
    isd_ssnt_0 = ssnt_isd_bounded(1, N / (sum(dbh_scaled ** 1) - N))
    # BUG FIX: the exponent was written as 2/3, which is integer division in
    # Python 2 and evaluates to 0, collapsing the SSNT 2/3-scaling model to an
    # exponent of zero. Use a float literal so the intended 2/3 power is used.
    isd_ssnt_1 = ssnt_isd_bounded(2.0 / 3, N / (sum(dbh_scaled ** (2.0 / 3)) - N))
    dist_for_model = {'ssnt_0': isd_ssnt_0, 'ssnt_1': isd_ssnt_1,
                      'asne': isd_asne, 'agsne': isd_agsne}
    dist = dist_for_model[model]
    pred_obs = wk.import_obs_pred_data(out_dir + dat_name + '_obs_pred_isd_' + model + '.csv')
    pred = pred_obs[pred_obs['site'] == site]['pred']
    obs = pred_obs[pred_obs['site'] == site]['obs']
    out_list_rsquare = [dat_name, site, str(mtools.obs_pred_rsquare(np.log10(obs), np.log10(pred)))]
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_rsquare.txt',
                     ",".join(str(x) for x in out_list_rsquare), new_line = False)
    emp_cdf = mtools.get_emp_cdf(obs)
    # KS statistic: max deviation between the empirical CDF and the model CDF.
    out_list_ks = [dat_name, site, str(max(abs(emp_cdf - np.array([dist.cdf(x) for x in obs]))))]
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt',
                     ",".join(str(x) for x in out_list_ks), new_line = False)
    num_pools = 8  # Assuming that 8 pools are to be created
    for i in xrange(Niter):
        obs_boot = []
        cdf_boot = []
        # Draw ISD samples in parallel until at least N individuals are obtained.
        while len(obs_boot) < N:
            pool = multiprocessing.Pool(num_pools)
            out_sample = pool.map(wk.generate_isd_sample, [dist for j in xrange(num_pools)])
            for combo in out_sample:
                cdf_sublist, sample_sublist = combo
                obs_boot.extend(sample_sublist)
                cdf_boot.extend(cdf_sublist)
            pool.close()
            pool.join()
        if model in ['asne', 'agsne']:
            obs_boot = np.sort(obs_boot[:N]) ** 0.5  # Convert to diameter
        else:
            obs_boot = np.sort(obs_boot[:N])
        sample_rsquare = mtools.obs_pred_rsquare(np.log10(obs_boot), np.log10(pred))
        sample_ks = max(abs(emp_cdf - np.sort(cdf_boot[:N])))
        # Append each bootstrap statistic to the same row (no newline).
        wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_rsquare.txt',
                         "".join([',', str(sample_rsquare)]), new_line = False)
        wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt',
                         "".join([',', str(sample_ks)]), new_line = False)
    # Terminate the rows for this site.
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_rsquare.txt', '\t')
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt', '\t')
    wk.write_to_file(out_dir + 'ISD_bootstrap_' + model + '_ks.txt',
                     ",".join(str(x) for x in out_list_ks))
def sample_lines_mete_geom_test(datasets, SAD_number, iterations, percents):
    """Subsample the largest-N SADs of each dataset and record mean fit statistics.

    For each of the top-`SAD_number` sites (by N), repeatedly draw multinomial
    subsamples at each fraction in `percents`, fit the log-series (mete) and
    geometric-series (geom) RADs, and write per-site mean N, S, Nmax and R^2
    to the SubSampled-Data output files.
    """
    #percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    SAD_number = int(SAD_number)
    iterations = int(iterations)
    methods = ['geom', 'mete']
    for i, dataset in enumerate(datasets):
        signal.signal(signal.SIGALRM, timeout_handler)
        # Resolve the SAD input file and the matching NSR2 results per dataset.
        if dataset == 'MGRAST': # fix subset l8r
            IN = mydir + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST'+dataset+'_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs_NAP.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_'+dataset+'_NSR2.txt')
        else:
            IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_'+dataset+'_NSR2.txt')
        # Sort sites by N (descending) and keep the top SAD_number.
        nsr2_data_mete_geom_N_site = np.column_stack((nsr2_data_mete_geom["site"], nsr2_data_mete_geom["N"]))
        nsr2_data_mete_geom_sorted = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:,1].argsort()[::-1]]
        nsr2_data_mete_geom_top100 = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:,1].argsort()[::-1]][:SAD_number,]
        # Get the SAD numbers
        mete_geom_numbers = nsr2_data_mete_geom_top100[:,0]
        mete_geom_numbers = mete_geom_numbers.astype(int)
        OUT1 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_geom_SubSampled_Data.txt', 'w+')
        OUT2 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_mete_SubSampled_Data.txt', 'w+')
        num_lines = sum(1 for line in open(IN))
        test_lines = 0
        succeess_lines_geom = SAD_number
        succeess_lines_mete = SAD_number
        while (succeess_lines_geom > 0) and (succeess_lines_mete > 0):
            site = nsr2_data_mete_geom_sorted[test_lines,0]
            for j,line in enumerate(open(IN)):
                # Scan for the line matching the current site index.
                if (j != site):
                    continue
                else:
                    if dataset == "HMP":
                        # HMP lines are CSV with a leading site-name field.
                        line = line.strip().split(',')
                        line = [x.strip(' ') for x in line]
                        line = [x.strip('[]') for x in line]
                        site_name = line[0]
                        line.pop(0)
                    else:
                        line = eval(line)
                    obs = map(int, line)
                    # Calculate relative abundance of each OTU
                    # Use that as weights
                    N_0 = float(sum(obs))
                    S_0 = len(obs)
                    N_max = max(obs)
                    if S_0 < 10 or N_0 <= S_0:
                        test_lines += 1
                        continue
                    line_ra = map(lambda x: x/N_0, obs)
                    # Calculate relative abundance of each OTU
                    # Use that as weights
                    sample_sizes = map(lambda x: round(x*N_0), percents)
                    if any(sample_size <= 10 for sample_size in sample_sizes) == True:
                        test_lines += 1
                        continue
                    gm_lines = SAD_number
                    geom_means = [N_0, S_0, N_max]
                    mete_means = [N_0, S_0, N_max]
                    print dataset, N_0, S_0, ' countdown: ', succeess_lines_geom
                    # separate this. get percents for Zipf and mete/geom
                    # then go on with the sampling
                    failed_percents = 0
                    for k, percent in enumerate(percents):
                        sample_size = round(percent * N_0)
                        if sample_size <= 10 or failed_percents > 0:
                            continue
                        mg_iter = iterations
                        N_max_list_mg = []
                        N_0_list_mg = []
                        S_0_list_mg = []
                        r2_list_BS = []
                        r2_list_METE = []
                        iter_count_current = 0
                        iter_count = iterations
                        fail_threshold = 20
                        iter_failed = 0
                        while (mg_iter > 0) and (iter_failed < fail_threshold):
                            # One multinomial subsample at this percent.
                            sample_k = np.random.multinomial(sample_size, line_ra, size = None)
                            sample_k_sorted = -np.sort( -sample_k[sample_k != 0] )
                            N_k = sum(sample_k_sorted)
                            S_k = sample_k_sorted.size
                            if S_k < 10 or N_k <= S_k:
                                iter_failed += 1
                                continue
                            N_max_k = max(sample_k_sorted)
                            logSeries = mete.get_mete_rad(S_k, N_k)
                            pred_mete = logSeries[0]
                            r2_mete = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_mete))
                            pred_BS = get_GeomSeries(N_k, S_k, False) # False mean no zeros allowed
                            r2_BS = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_BS))
                            r2_list = [r2_mete, r2_BS]
                            # NOTE(review): 'r2 == float('Nan')' is always False
                            # because NaN never compares equal to itself; NaN R^2
                            # values are NOT filtered here -- confirm and consider
                            # np.isnan.
                            if any( (r2 == -float('inf') ) or (r2 == float('inf') ) or (r2 == float('Nan') ) for r2 in r2_list):
                                #mg_iter += 1
                                iter_failed += 1
                                continue
                            N_max_list_mg.append(N_max_k)
                            N_0_list_mg.append(N_k)
                            S_0_list_mg.append(S_k)
                            r2_list_BS.append(r2_BS)
                            r2_list_METE.append(r2_mete)
                            mg_iter -= 1
                        if len(N_max_list_mg) != iterations:
                            test_lines += 1
                            continue
                        # Append mean statistics for this percent to both rows.
                        N_0_mg_mean = np.mean(N_0_list_mg)
                        geom_means.append(N_0_mg_mean)
                        mete_means.append(N_0_mg_mean)
                        S_0_mean = np.mean(S_0_list_mg)
                        geom_means.append(S_0_mean)
                        mete_means.append(S_0_mean)
                        N_max_mg_mean = np.mean(N_max_list_mg)
                        geom_means.append(N_max_mg_mean)
                        mete_means.append(N_max_mg_mean)
                        r2_BS_mg_mean = np.mean(r2_list_BS)
                        geom_means.append(r2_BS_mg_mean)
                        r2_METE_mg_mean = np.mean(r2_list_METE)
                        mete_means.append(r2_METE_mg_mean)
                    '''Now we check if the lists are the right length
                    there are 6 iterations for the percentage
                    mete/ geom, append four items each iteration.
                    4*6 = 24, add three original = 27
                    likewise, for zipf, (5*6) + 3 = 33 '''
                    test_lines += 1
                    if (len(geom_means) == 27):
                        succeess_lines_geom -= 1
                        geom_means_str = ' '.join(map(str, geom_means))
                        #OUT1.write(','.join(map(repr, geom_means_str[i]))
                        print>> OUT1, j, geom_means_str
                    if (len(mete_means) == 27):
                        succeess_lines_mete -= 1
                        mete_means_str = ' '.join(map(str, mete_means))
                        print>> OUT2, j, mete_means_str
                    print dataset, percent
def fig5(SADModels):
    """
    This function generates a 2x2 figure, with these subplots:

    One subplot for each model: r-squared vs. N
    list of r-squared values and list of Ns plotted against each other
    """
    fig = plt.figure()
    for i, model in enumerate(SADModels):
        fig.add_subplot(2, 2, i + 1)
        obs_pred_data = import_obs_pred_data(mydir + '/Results/' + model + '.txt')
        obs = ((obs_pred_data["obs"]))
        pred = ((obs_pred_data["pred"]))
        site = ((obs_pred_data["site"]))
        # Group obs/pred rows by site so each site yields one SAD.
        obs_data = []
        pred_data = []
        for sites in np.unique(site):
            obs_data.append(obs[sites == site])
            pred_data.append(pred[sites == site])
        Ns = []
        r2s = []
        for j, sad in enumerate(obs_data):
            # Per-site R^2 of observed vs. predicted abundances.
            r2 = macroecotools.obs_pred_rsquare(np.array(sad), np.array(pred_data[j]))
            r2s.append(r2)
            N = sum(sad)  # Find Total Abundance
            Ns.append(N)
        plt.scatter(np.log(Ns).tolist(), r2s, color='Maroon', label=model, alpha=0.5)  # label is for the legend
        plt.xlabel('Log Abundance', fontsize=8)
        plt.ylabel('Rsquared Value', fontsize=8)
        plt.subplots_adjust(wspace=.35, hspace=.35)
        plt.axhline(y=0)
        if model == 'SimBrokenStick':
            plt.title("Broken Stick R^2 v N", fontsize=10)
        elif model == 'SimLogNormInt':
            plt.title("Log Norm R^2 v N", fontsize=10)
        elif model == 'SimpleRandomFraction':
            plt.title("Random Fraction R^2 v N", fontsize=10)
        elif model == 'SimParetoInt':
            plt.title("Pareto Int R^2 v N", fontsize=10)
        print model + ': Done'
        # insert code to plot a legend
    # NOTE(review): output path is hard-coded to one user's home directory.
    plt.savefig('/Users/Nathan_Hillis/GitHub/SADModels/Results/R2vN.png',
                dpi=600, bbox_inches='tight', pad_inches=0.03)
    plt.show()
    return
def test_func(ab, dist_name, *pars):
    """Return the R^2 between the observed abundances, ranked from most to
    least abundant, and the prediction from the named distribution."""
    ranked_obs = sorted(ab, reverse = True)
    predicted = get_pred_multi_dists(len(ab), dist_name, *pars)
    return macroecotools.obs_pred_rsquare(ranked_obs, predicted)
mete_r2s = []
pln_r2s = []
shuffle(RADs)
for i, obs in enumerate(RADs):
    N = int(sum(obs))
    S = int(len(obs))
    # NOTE(review): under Python 2 'obs.count(1) / len(obs)' is integer
    # division, so the singleton-fraction test is 0 unless every species is a
    # singleton -- confirm whether a float ratio was intended.
    if S > 4 and N > 10 and obs.count(1) / len(obs) < 0.5:
        # Log-series (METE) fit.
        result = mete.get_mete_rad(S, N)
        pred1 = np.log10(result[0])
        obs1 = np.log10(obs)
        mete_r2 = mct.obs_pred_rsquare(np.array(obs1), np.array(pred1))
        mete_r2s.append(mete_r2)
        # Poisson-lognormal fit.
        pred = get_rad_from_obs(obs, 'pln')
        pred1 = np.log10(pred)
        pln_r2 = mct.obs_pred_rsquare(np.array(obs1), np.array(pred1))
        pln_r2s.append(pln_r2)
        print i, 'N:', N, ' S:', S, ' n:', len(
            pln_r2s), ' | mete:', mete_r2, ' pln:', pln_r2
        if len(pln_r2s) > 20:
            break
# Kernel-density curve of the log-series R^2 values.
kernel = 0.5
D = get_kdens_choose_kernel(mete_r2s, kernel)
plt.plot(D[0], D[1], color='0.5', lw=3, alpha=0.99, label='log-series')
site = np.asarray(list(((data["site"])))) obs = np.asarray(list(((data["obs"])))) pred = np.asarray(list(((data["pred"])))) axis_min = 0 axis_max = 2 * max(obs) radius=2 mct.plot_color_by_pt_dens(pred, obs, radius, loglog=1, plot_obj=ax1) plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-') plt.xlim(0, axis_max) plt.ylim(0, axis_max) r2_all = mct.obs_pred_rsquare(np.log10(obs), np.log10(pred)) r2text = r"${}^{{2}} = {:.{p}f} $".format('r',r2_all , p=2) plt.text(2, 30000, r2text, fontsize=14) plt.text(28, 800000, 'Log-series', fontsize=14) plt.text(5, 0.1, 'Predicted rank-abundance', fontsize=10) plt.text(0.1, 60000, 'Observed rank-abundance', rotation='vertical', fontsize=10) plt.tick_params(axis='both', which='major', labelsize=7) #plt.subplots_adjust(wspace=0.5, hspace=0.3) #axins = inset_axes(ax, width="30%", height="30%", loc=4) #plt.setp(axins, xticks=[], yticks=[]) ax2 = fig.add_subplot(2, 2, 2)
def fig4(figname = 'Fig4', data_dir=mydir, radius=2):
    """Plot observed vs. predicted Nmax for the four SAD models in a 2x2 grid.

    Uses hard-coded per-model regression slopes/intercepts (index-aligned with
    `models`) and writes <mydir>figures/<figname>.png to disk.
    """
    fig = plt.figure()
    plot_dim = 1
    count = 0
    IN_Obs_Pred = importData.import_NSR2_data(mydir + \
        'data/NSR2/Stratified/lognorm_pln_NSR2_stratify.txt')
    N = np.asarray(list(((IN_Obs_Pred["N"]))))
    S = np.asarray(list(((IN_Obs_Pred["S"]))))
    NmaxObs = np.asarray(list(((IN_Obs_Pred["NmaxObs"]))))
    # order
    models = ['geom', 'lognorm', 'mete', 'zipf']
    modelSlopes = [0.647520323289, 0.942904468437, 0.769214774397, 0.954497727096]
    modelInterepts = [0.116508916992, 0.292527611072, 0.19240314275, 0.189954627996]
    for g, model in enumerate(models):
        NmaxPred = []
        SPred = []
        for i in range(len(N)):
            NmaxPred_i = importPredictS.predictS(N[i], NmaxObs[i], \
                predictNmax=True).getNmax(b = modelInterepts[g], slope = modelSlopes[g])
            SPred_i = importPredictS.predictS(N[i], NmaxObs[i], predictNmax=True).getS()
            NmaxPred.append(NmaxPred_i)
            SPred.append(SPred_i)
        NmaxPred = np.asarray(NmaxPred)
        SPred = np.asarray(SPred)
        axis_min = 0
        axis_max = 2 * max(NmaxObs)
        ax = fig.add_subplot(2, 2, count+1)
        #ax.set_title(r"$\mathbf{N_{max}}$", y=1.03)
        if model == 'geom':
            ax.set_title(r"$\mathbf{Broken-stick}$")
        elif model == 'lognorm':
            ax.set_title(r"$\mathbf{Lognormal}$")
        elif model == 'mete':
            ax.set_title(r"$\mathbf{Log-series}$")
        elif model == 'zipf':
            ax.set_title(r"$\mathbf{Zipf}$")
        macroecotools.plot_color_by_pt_dens(NmaxPred, NmaxObs, radius, loglog=1,
                        plot_obj=plt.subplot(2,2,count+1))
        # 1:1 reference line.
        plt.plot([axis_min, axis_max],[axis_min, axis_max], 'k-')
        plt.xlim(axis_min, axis_max)
        plt.ylim(0, axis_max)
        print max(NmaxPred)
        r2_all = macroecotools.obs_pred_rsquare(np.log10(NmaxObs), np.log10(NmaxPred))
        r2text = r"${}^{{2}}_{{m}} = {:.{p}f} $".format('r',r2_all , p=2)
        plt.text(0.22, 0.91, r2text, fontsize=13,
                 horizontalalignment='center',
                 verticalalignment='center',transform = ax.transAxes)
        plt.tick_params(axis='both', which='major', labelsize=7)
        plt.subplots_adjust(wspace=0.5, hspace=0.3)
        #axins = inset_axes(ax, width="30%", height="30%", loc=4)
        ax.set(adjustable='box-forced', aspect='equal')
        #plt.setp(axins, xticks=[], yticks=[])
        count += 1
    # Shared axis labels for the 2x2 grid.
    fig.text(0.50, 0.04, r'$Predicted\, log_{10}(N_{max})$', ha='center', va='center', fontsize = 16)
    fig.text(0.04, 0.5, r'$Observed\,log_{10}(N_{max})$', ha='center', va='center', rotation='vertical',\
        fontsize = 16)
    fig_name = str(mydir + 'figures/' + figname + '.png')
    plt.savefig(fig_name, dpi=600)#, bbox_inches = 'tight')#, pad_inches=0)
    plt.close()
def generate_obs_pred_data(datasets, methods, size): for method in methods: for dataset in datasets: #OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred.txt','w+') #OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2.txt','w+') #OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred_subset.txt','w+') #OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2_subset.txt','w+') if dataset == "HMP": IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt' num_lines = sum(1 for line in open(IN)) OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred.txt','w+') OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2.txt','w+') elif dataset == 'EMPclosed' or dataset == 'EMPpen': IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt' num_lines = sum(1 for line in open(IN)) random_sites = np.random.randint(num_lines,size=size) num_lines = size OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred_subset.txt','w+') OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2_subset.txt','w+') num_lines = sum(1 for line in open(IN)) else: IN = mydir + 'MGRAST-Data/' + dataset + '/' + 'MGRAST-' + dataset + '-SADs.txt' num_lines = sum(1 for line in open(IN)) OUT1 = open(mydir + "ObsPred/" + method +'_'+ 'MGRAST' + dataset+'_obs_pred.txt','w+') OUT2 = open(mydir + "NSR2/" + method +'_'+ 'MGRAST' + dataset+'_NSR2.txt','w+') for j,line in enumerate(open(IN)): if dataset == "HMP": line = line.split() elif size == 0: line = eval(line) else: line = eval(line) if j not in random_sites: continue #line.strip("[]") #line.split() obs = map(int, line) N = sum(obs) S = len(obs) if S < 10 or N <= S: num_lines += 1 continue obs.sort() obs.reverse() print method, dataset, N, S, 'countdown:', num_lines, if method == 'geom': # Predicted geometric series pred = get_GeomSeries(N, S, False) # False mean no zeros allowed elif method == 'mete': # Predicted log-series logSeries = mete.get_mete_rad(S, N) pred = logSeries[0] r2 = 
macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred)) print " r2:", r2 if r2 == -float('inf') or r2 == float('inf') or r2 == float('Nan'): print r2 + " is Nan or inf, removing..." continue print>> OUT2, j, N, S, r2 # write to file, by cite, observed and expected ranked abundances for i, sp in enumerate(pred): print>> OUT1, j, obs[i], pred[i] num_lines -= 1 OUT1.close() print dataset
from macroecotools import plot_color_by_pt_dens, obs_pred_rsquare


def plot_obs_pred(obs_pred_data, dest_file='./obs_pred.png'):
    # Density-colored scatter of predicted vs. observed, with a 1:1 line
    # spanning the range of the predictions; saved to dest_file.
    plot_color_by_pt_dens(obs_pred_data['pred'], obs_pred_data['obs'], 3, loglog=1)
    plt.loglog([min(obs_pred_data['pred']), max(obs_pred_data['pred'])],
               [min(obs_pred_data['pred']), max(obs_pred_data['pred'])], 'k-')
    plt.savefig(dest_file, dpi=400)

# Optional command-line argument selects a single dataset; default is all six.
if len(sys.argv) > 1:
    datasets = [sys.argv[1]]
else:
    datasets = ['bbs_2012', 'bbs_2008_2012', 'cbc', 'fia', 'gentry', 'naba']

for dataset in datasets:
    for datatype in ['fit', 'test']:
        for predtype in ['rare']:
            obs_pred_data = read_csv('./results/' + dataset + '_' + predtype + '_' + datatype + '_obs_pred.csv')
            # Print R^2 of observed vs. predicted rarity to stdout.
            print obs_pred_rsquare(np.array(obs_pred_data['obs'].values), np.array(obs_pred_data['pred'].values))
            fig_name = './figs/' + dataset + '_' + datatype + '_obs_pred_rarity.png'
            plot_obs_pred(obs_pred_data, dest_file=fig_name)
def test_func(ab, dist_name, *pars):
    """Compute R^2 of the observed RAD against the prediction for dist_name."""
    n_species = len(ab)
    pred = get_pred_multi_dists(n_species, dist_name, *pars)
    obs_ranked = sorted(ab, reverse=True)
    r2 = macroecotools.obs_pred_rsquare(obs_ranked, pred)
    return r2
shuffle(RADs)
for i, obs in enumerate(RADs):
    N = int(sum(obs))
    S = int(len(obs))
    print i, N, S, len(pln_r2s)
    if S >= 10 and N > 50:
        if N < 10000:
            # Log-series (METE) fit only for moderate N (get_mete_rad is slow
            # for large N -- presumably the reason for the cap; confirm).
            result = mete.get_mete_rad(S, N)
            predRAD = result[0]
            mete_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            mete_r2s.append(mete_r2)
        #zipf_pred = dist.zipf(obs)
        #predRAD = zipf_pred.from_cdf()
        #zipf_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
        #zipf_r2s.append(zipf_r2)
        # Poisson-lognormal fit.
        predRAD = get_rad_from_obs(obs, 'pln')
        pln_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
        pln_r2s.append(pln_r2)
    if len(pln_r2s) > 200:
        break
fig = plt.figure(111)
kernel = 0.5
def test_zipf_num_est(datasets, estimators, SAD_number, iterations, fail_threshold):
    """Subsample the largest-N SADs and record mean Zipf fit statistics per estimator.

    For each of the top-`SAD_number` sites (by N) of each dataset, repeatedly
    draw multinomial subsamples, fit a Zipf distribution with each estimator
    (guarded by a SIGALRM timeout), and write per-site mean N, S, Nmax, R^2
    and gamma to the SubSampled-Data output file.
    """
    percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    for dataset in datasets:
        signal.signal(signal.SIGALRM, gf.timeout_handler)
        # Resolve the SAD input file and matching Zipf NSR2 results per dataset.
        if dataset == 'MGRAST': # fix subset l8r
            IN = mydir + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' +'zipf_MGRAST'+dataset+'_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs_NAP.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_'+dataset+'_NSR2.txt')
        else:
            IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_'+dataset+'_NSR2.txt')
        nsr2_data_zipf_N_site = np.column_stack((nsr2_data_zipf["site"], nsr2_data_zipf["N"]))
        # Sort these arrays
        nsr2_data_zipf_sorted = nsr2_data_zipf_N_site[nsr2_data_zipf_N_site[:,1].argsort()[::-1]]
        nsr2_data_zipf_top100 = nsr2_data_zipf_sorted[:SAD_number,]
        # Get the SAD numbers
        zipf_numbers = nsr2_data_zipf_top100[:,0]
        zipf_numbers = zipf_numbers.astype(int)
        successful_SADs_samplings = SAD_number
        for estimator in estimators:
            OUT = open(mydir + 'SubSampled-Data' + '/' + dataset + '_zipf_' + \
                str(estimator) + '_SubSampled_Data.txt', 'w+')
            num_lines = sum(1 for line in open(IN))
            test_lines = 0
            succeess_lines = SAD_number
            while succeess_lines > 0:
                site = nsr2_data_zipf_sorted[test_lines,0]
                for j,line in enumerate(open(IN)):
                    # Scan for the line matching the current site index.
                    if (j != site):
                        continue
                    else:
                        if dataset == "HMP":
                            # HMP lines are CSV with a leading site-name field.
                            line = line.strip().split(',')
                            line = [x.strip(' ') for x in line]
                            line = [x.strip('[]') for x in line]
                            site_name = line[0]
                            line.pop(0)
                        else:
                            line = eval(line)
                        obs = map(int, line)
                        # Calculate relative abundance of each OTU
                        # Use that as weights
                        N_0 = float(sum(obs))
                        S_0 = len(obs)
                        N_max = max(obs)
                        if S_0 < 10 or N_0 <= S_0:
                            test_lines += 1
                            continue
                        line_ra = map(lambda x: x/N_0, obs)
                        sample_sizes = map(lambda x: round(x*N_0), percents)
                        if any(sample_size <= 10 for sample_size in sample_sizes) == True:
                            test_lines += 1
                            continue
                        zipf_means = [N_0, S_0, N_max]
                        failed_percents = 0
                        for k, percent in enumerate(percents):
                            if failed_percents > 0:
                                continue
                            N_max_list_zipf = []
                            N_0_list_zipf = []
                            S_0_list_zipf = []
                            r2_list_zipf = []
                            gamma_list = []
                            iter_count_current = 0
                            iter_count = iterations
                            iter_failed = 0
                            while iter_count > 0 and iter_failed < fail_threshold:
                                # NOTE(review): always uses sample_sizes[0]
                                # (the 50% sample) regardless of k -- the loop
                                # variable `percent` is never used here; confirm
                                # whether sample_sizes[k] was intended.
                                sample_size_k = sample_sizes[0]
                                sample_k = np.random.multinomial(sample_size_k, line_ra, size = None)
                                sample_k_sorted = -np.sort( -sample_k[sample_k != 0] )
                                N_0_k = sum(sample_k_sorted)
                                S_0_k = sample_k_sorted.size
                                if S_0_k < 10 or N_0_k <= S_0_k:
                                    continue
                                N_max_k = max(sample_k_sorted)
                                iter_count_current += 1
                                # Start the timer. Once 1 second is over, a SIGALRM signal is sent.
                                signal.alarm(2)
                                # This try/except loop ensures that
                                #   you'll catch TimeoutException when it's sent.
                                #start_time = time.time()
                                try:
                                    # Whatever your function that might hang
                                    zipf_class = gf.zipf(sample_k_sorted, estimator)
                                    pred_tuple = zipf_class.from_cdf()
                                    Zipf_solve_line = zipf_class.zipf_solver(sample_k_sorted)
                                    rv = stats.zipf(Zipf_solve_line)
                                    pred_zipf = pred_tuple[0]
                                    gamma = pred_tuple[1]
                                    r2_zipf = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_zipf))
                                    # NOTE(review): 'r2_zipf == float('Nan')' is
                                    # always False (NaN != NaN); NaN values are
                                    # NOT filtered -- consider np.isnan.
                                    if (r2_zipf == -float('inf') ) or (r2_zipf == float('inf') ) or (r2_zipf == float('Nan') ):
                                        continue
                                    else:
                                        r2_list_zipf.append(r2_zipf)
                                        gamma_list.append(gamma)
                                        N_max_list_zipf.append(N_max_k)
                                        N_0_list_zipf.append(N_0_k)
                                        S_0_list_zipf.append(S_0_k)
                                except gf.TimeoutException:
                                    print "Line " + str(j) + ": " + str(estimator) + " timed out"
                                    iter_count -= 1
                                    if iter_failed >= fail_threshold:
                                        failed_percents += 1
                                    iter_failed += 1
                                    continue # continue the for loop if function takes more than x seconds
                                else:
                                    iter_count -= 1
                                    #print("--- %s seconds ---" % (time.time() - start_time))
                                # Reset the alarm
                                signal.alarm(0)
                            if len(N_0_list_zipf) != iterations:
                                test_lines += 1
                                continue
                            # Append mean statistics for this percent.
                            N_0_zipf_mean = np.mean(N_0_list_zipf)
                            zipf_means.append(N_0_zipf_mean)
                            S_0_zipf_mean = np.mean(S_0_list_zipf)
                            zipf_means.append(S_0_zipf_mean)
                            N_max_zipf_mean = np.mean(N_max_list_zipf)
                            zipf_means.append(N_max_zipf_mean)
                            r2_zipf_mean = np.mean(r2_list_zipf)
                            zipf_means.append(r2_zipf_mean)
                            gamma_zipf_mean = np.mean(gamma_list)
                            zipf_means.append(gamma_zipf_mean)
                        '''Now we check if the lists are the right length
                        there are 6 iterations for the percentage
                        mete/ geom, append four items each iteration.
                        4*6 = 24, add three original = 27
                        likewise, for zipf, (5*6) + 3 = 33 '''
                        if len(zipf_means) == 33:
                            test_lines += 1
                            succeess_lines -= 1
                            zipf_means_str = ' '.join(map(str, zipf_means))
                            #OUT1.write(','.join(map(repr, geom_means_str[i]))
                            print>> OUT, j, zipf_means_str
                            print "Line " + str(j) + ": " + str(succeess_lines) + " SADs to go!"
                        else:
                            test_lines += 1
            #print estimator
        print dataset
print 'Number of RADs:', len(RADs) mete_r2s = [] pln_r2s = [] zipf_r2s = [] ct = 0 shuffle(RADs) for obs in RADs: N = int(sum(obs)) S = int(len(obs)) s = obs.count(1) if S > 9 and N > 9: ct += 1 pred = mete.get_mete_rad(S, N)[0] mete_r2 = mct.obs_pred_rsquare(obs, np.array(pred)) mete_r2s.append(mete_r2) pred = get_pln_from_obs(obs, 'pln') pred = np.log10(pred) obs1 = np.log10(obs) pln_r2 = mct.obs_pred_rsquare(obs1, pred) pln_r2s.append(pln_r2) print ct, 'N:', N, ' S:', S, ' n:', len( pln_r2s), ' | mete:', mete_r2, ' pln:', pln_r2 if len(pln_r2s) > minct: break kernel = 0.5 D = get_kdens_choose_kernel(pln_r2s, kernel) plt.plot(D[0],
shuffle(RADs) for i, obs in enumerate(RADs): N = int(sum(obs)) S = int(len(obs)) print i, N, S, len(pln_r2s) if S >= 10 and N > 50: if N < 10000: result = mete.get_mete_rad(S, N) predRAD = result[0] mete_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD)) mete_r2s.append(mete_r2) #zipf_pred = dist.zipf(obs) #predRAD = zipf_pred.from_cdf() #zipf_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD)) #zipf_r2s.append(zipf_r2) predRAD = get_rad_from_obs(obs, 'pln') pln_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD)) pln_r2s.append(pln_r2) if len(pln_r2s) > 200: break fig = plt.figure(111)