def get_envpred(envpred_data, predtype='sad'):
    """Return METE predictions derived from environmentally predicted S and N.

    Inputs:
    envpred_data - DataFrame with one row per site, containing at least the
        columns 'site_id', 'S', 'logSpred', and 'logNpred'.
    predtype - kind of prediction to generate: 'sad' (abundances binned into
        octaves), 'rad' (rank-abundance distribution), or 'rare' (count of
        species predicted to have <= 10 individuals).

    Fix vs original: branches used `predtype is 'sad'` etc.; identity
    comparison against string literals is unreliable, and the old mutable
    list default (['sad', 'rad']) matched no branch, leaving `envpred`
    undefined. Equality checks and a plain string default are used instead.
    """
    if predtype == 'sad':
        envpred = DataFrame(columns=['site_id', 'octave', 'env_pred'])
    if predtype == 'rad':
        envpred = DataFrame(columns=['site_id', 'rank', 'env_pred'])
    if predtype == 'rare':
        envpred = DataFrame(columns=['site_id', 'env_pred'])
    for index, site in envpred_data.iterrows():
        obs_S = site['S']
        # Environmental models predict log10(S) and log10(N); back-transform.
        envpred_S = 10 ** site['logSpred']
        envpred_N = 10 ** site['logNpred']
        if predtype == 'sad':
            sad_bins = get_log_bins([envpred_N])
            octave = range(0, len(sad_bins) - 1)
            site_pred = get_mete_sad(envpred_S, envpred_N, bin_edges=sad_bins)
            site_ids = [site['site_id'] for i in range(0, len(site_pred))]
            site_pred_with_id = DataFrame(np.column_stack([site_ids, octave, site_pred]),
                                          columns=['site_id', 'octave', 'env_pred'])
        if predtype == 'rad':
            # note using observed S here for time being
            rank = range(1, int(obs_S + 1))
            site_beta = get_beta(envpred_S, envpred_N)
            site_pred, p = get_mete_rad(obs_S, envpred_N, beta=site_beta)
            site_ids = [site['site_id'] for i in range(0, len(site_pred))]
            site_pred_with_id = DataFrame(np.column_stack([site_ids, rank, site_pred]),
                                          columns=['site_id', 'rank', 'env_pred'])
        if predtype == 'rare':
            pred_rad = get_mete_rad(int(envpred_S), envpred_N)[0]
            # Rare species: predicted abundance of 10 or fewer individuals.
            site_pred = sum([i <= 10 for i in pred_rad])
            site_pred_with_id = DataFrame(np.column_stack([site['site_id'], site_pred]),
                                          columns=['site_id', 'env_pred'])
        envpred = envpred.append(site_pred_with_id, ignore_index=True)
    return envpred
def get_obs_pred_sad(raw_data_site, dataset_name, model, out_dir = './out_files/'):
    """Write the observed and predicted RAD to file for a given model.

    Inputs:
    raw_data_site - data in the same format as obtained by clean_data_genera(),
        with four columns site, sp, dbh, and genus, and only for one site.
    dataset_name - name of the dataset for raw_data_site.
    model - can take one of three values 'ssnt', 'asne', or 'agsne'. Note that
        the predicted SAD for SSNT does not change with alternative scaling of D.
    out_dir - directory for output file.

    Fix vs original: the 'ssnt' branch opened the '_ssnt_0' output file but
    never wrote to or closed it (resource leak, empty file). Since the SSNT
    prediction is identical under both scalings of D, the same rows are now
    appended to both files, and all files are managed with `with`.
    """
    G, S, N, E = get_GSNE(raw_data_site)
    if model == 'ssnt':
        pred = mete.get_mete_rad(S, N, version = 'untruncated')[0]
    elif model == 'asne':
        pred = mete.get_mete_rad(S, N)[0]
    elif model == 'agsne':
        pred = agsne.get_mete_agsne_rad(G, S, N, E)
    # Observed abundances: per-species counts, sorted descending (rank order)
    obs = np.sort([len(raw_data_site[raw_data_site['sp'] == sp])
                   for sp in np.unique(raw_data_site['sp'])])[::-1]
    results = np.zeros((S, ), dtype = ('S15, i8, i8'))
    results['f0'] = np.array([raw_data_site['site'][0]] * S)
    results['f1'] = obs
    results['f2'] = pred
    if model == 'ssnt':
        # Same prediction for both SSNT scalings, so append to both files.
        for suffix in ['_obs_pred_rad_ssnt_0.csv', '_obs_pred_rad_ssnt_1.csv']:
            with open(out_dir + dataset_name + suffix, 'ab') as f_out:
                csv.writer(f_out).writerows(results)
    else:
        with open(out_dir + dataset_name + '_obs_pred_rad_' + model + '.csv', 'ab') as f_out:
            csv.writer(f_out).writerows(results)
def plot_RADs_canonical(N, S):
    """Plot the geometric-series and METE log-series RAD predictions for a
    community of N individuals and S species on a log-scaled y axis, then
    display the figure."""
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ranks = range(1, S + 1)
    # Geometric series prediction (False means no zeros allowed)
    geom_rad = predRADs.get_GeomSeries(N, S, False)
    plt.plot(ranks, geom_rad, lw=1, c='m')
    # METE log-series prediction (first element of the returned tuple)
    logseries_rad = mete.get_mete_rad(S, N)[0]
    plt.plot(ranks, logseries_rad, lw=1, c='c')
    # Predicted PLN (disabled)
    #predRAD = pln.get_rad_from_obs(predRAD, 'pln')
    #ranks = range(1, len(predRAD)+1)
    #plt.plot(ranks, predRAD, lw = 1, c='gray')
    plt.yscale('log')
    plt.show()
    return
def getPredRADs(N, S, Nmax):
    """Return a list of predicted rank-abundance distributions (RADs).

    Inputs:
    N - total number of individuals.
    S - number of species.
    Nmax - maximum abundance, passed through to the fraction-based models.

    Returns predictions in order: geometric series, log-series, Poisson
    lognormal, compositions, power fraction, random fraction (non-preemption),
    random fraction (preemption).

    Fixes vs original: the PLN step referenced the undefined name `RAD`
    (now fits to the log-series prediction, matching the companion plotting
    code), and the fraction models used the undefined name `maxn` instead of
    the `Nmax` parameter; both raised NameError.
    """
    PRED = []
    # Predicted geometric series (False means no zeros allowed)
    predRAD = predRADs.get_GeomSeries(N, S, False)
    PRED.append(predRAD)
    # Predicted log-series
    logSeries = mete.get_mete_rad(S, N)
    predRAD = logSeries[0]
    PRED.append(predRAD)
    # Predicted PLN, fit to the log-series prediction
    predRAD = pln.get_rad_from_obs(predRAD, 'pln')
    PRED.append(predRAD)
    sample_size = 10
    # Predicted from compositions (basically geometric series)
    predRAD = getPred(N, S, Nmax, 'compositions', sample_size)
    PRED.append(predRAD)
    # Predicted from Fraction 1: Power Fraction
    predRAD = getPred(N, S, Nmax, 'power fraction', sample_size)
    PRED.append(predRAD)
    # Predicted from Fraction 2: Random non-preemption
    predRAD = getPred(N, S, Nmax, 'random fraction non-preemption', sample_size)
    PRED.append(predRAD)
    # Predicted from Fraction 3: Random preemption
    predRAD = getPred(N, S, Nmax, 'random fraction', sample_size)
    PRED.append(predRAD)
    return PRED
def get_pred_geom_logser(dataset):
    """Write observed and predicted abundances under the truncated geometric
    and log-series models, one tab-delimited output file per model.

    Sites with fewer than 5 species, or with N not exceeding S, are skipped.
    Observed abundances are written in descending rank order alongside each
    model's prediction.
    """
    out_write_geom = open('./data/' + dataset + '/' + dataset + '-obs-pred-geom.txt', 'w')
    out_write_logser = open('./data/' + dataset + '/' + dataset + '-obs-pred-logser.txt', 'w')
    out_geom = csv.writer(out_write_geom, delimiter = '\t')
    out_logser = csv.writer(out_write_logser, delimiter = '\t')
    data = get_SADs(dataset)
    data = data[data['obs'] != 0]  # Remove rows with zeros
    for site in np.sort(list(set(data['site']))):
        data_site = data[data['site'] == site]
        S = len(data_site)
        N = sum(data_site['obs'])
        if S > 4 and round(N) > S:
            # Mid-rank quantiles for the truncated geometric prediction
            cdf = [(S - i + 0.5) / S for i in range(1, S + 1)]
            pred_geom = trunc_geom.ppf(np.array(cdf), S / N, N)
            pred_logser = get_mete_rad(int(S), int(round(N)))[0]
            obs_sorted = np.array(sorted(data_site['obs'], reverse = True))
            # Same record layout for both models: site id, observed, predicted
            for preds, writer in ((pred_geom, out_geom), (pred_logser, out_logser)):
                records = np.zeros((len(data_site), ),
                                   dtype = [('f0', 'S25'), ('f1', float), ('f2', int)])
                records['f0'] = np.array([site] * len(data_site))
                records['f1'] = obs_sorted
                records['f2'] = np.array(preds)
                writer.writerows(records)
    out_write_geom.close()
    out_write_logser.close()
def get_obs_pred_sad(raw_data_site, dataset_name, model, out_dir='./out_files/'):
    """Write the observed and predicted RAD to file for a given model.

    Inputs:
    raw_data_site - data in the same format as obtained by clean_data_genera(),
        with four columns site, sp, dbh, and genus, and only for one site.
    dataset_name - name of the dataset for raw_data_site.
    model - can take one of three values 'ssnt', 'asne', or 'agsne'. Note that
        the predicted SAD for SSNT does not change with alternative scaling of D.
    out_dir - directory for output file.

    Fix vs original: the 'ssnt' branch opened the '_ssnt_0' output file but
    never wrote to or closed it (resource leak, empty file). Since the SSNT
    prediction is identical under both scalings of D, the same rows are now
    appended to both files, and all files are managed with `with`.
    """
    G, S, N, E = get_GSNE(raw_data_site)
    if model == 'ssnt':
        pred = mete.get_mete_rad(S, N, version='untruncated')[0]
    elif model == 'asne':
        pred = mete.get_mete_rad(S, N)[0]
    elif model == 'agsne':
        pred = agsne.get_mete_agsne_rad(G, S, N, E)
    # Observed abundances: per-species counts, sorted descending (rank order)
    obs = np.sort([
        len(raw_data_site[raw_data_site['sp'] == sp])
        for sp in np.unique(raw_data_site['sp'])
    ])[::-1]
    results = np.zeros((S, ), dtype=('S15, i8, i8'))
    results['f0'] = np.array([raw_data_site['site'][0]] * S)
    results['f1'] = obs
    results['f2'] = pred
    if model == 'ssnt':
        # Same prediction for both SSNT scalings, so append to both files.
        for suffix in ['_obs_pred_rad_ssnt_0.csv', '_obs_pred_rad_ssnt_1.csv']:
            with open(out_dir + dataset_name + suffix, 'ab') as f_out:
                csv.writer(f_out).writerows(results)
    else:
        with open(out_dir + dataset_name + '_obs_pred_rad_' + model + '.csv',
                  'ab') as f_out:
            csv.writer(f_out).writerows(results)
def fig1(figname = 'Fig1', data_dir= mydir, saveAs = 'eps'):
    """Plot an example observed SAD against broken-stick, lognormal,
    log-series, and Zipf predictions, and save the figure to
    '<mydir>/figures/<figname>_RGB.<saveAs>'.

    NOTE(review): data_dir is accepted but never used; the module-level
    `mydir` is used when building the output path -- confirm intended.
    """
    # Example species-abundance distribution (unsorted abundances)
    SAD = [10000, 8000, 6000, 5000, 1000, 200, 100, 20, 18, 16, 14, 12, 10, 4, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    SAD.sort()
    SAD.reverse()  # rank order: most abundant first
    x = range(1, len(SAD) +1)
    N = sum(SAD)
    S = len(SAD)
    # Model predictions, log10-transformed for plotting
    geom = np.log10(mo.get_Geom(N, S, False))
    logSeries = np.log10(mete.get_mete_rad(S, N)[0])
    lognorm_pred = mo.lognorm(SAD, 'pln')
    lognorm_SAD = np.log10(lognorm_pred.get_rad_from_obs()[0])
    zipf_class = mo.zipf(SAD, 'fmin')
    pred_tuple = zipf_class.from_cdf()
    zipf_SAD = np.log10(pred_tuple[0])
    gamma = pred_tuple[1]  # Zipf exponent (unused below)
    SAD = np.log10(SAD)
    fig = plt.figure()
    plt.plot()
    max_y = max(max(SAD), max(zipf_SAD))
    plt.plot(x, SAD,color = '#A9A9A9', linestyle = '-', linewidth=2, label="Observed")
    plt.plot(x, geom,color = '#00008B', linestyle = '-', linewidth=2, label="Broken-stick")
    plt.plot(x, lognorm_SAD, color = '#0000CD',linestyle = '--', linewidth=2, label="Lognormal")
    plt.plot(x, logSeries, color = '#FF4500',linestyle = '-.', linewidth=2, label="Log-series")
    plt.plot(x, zipf_SAD, color = 'red',linestyle = '-',linewidth=2, label="Zipf")
    plt.tight_layout()
    plt.xlabel('Rank Abundance', fontsize = 22)
    plt.ylabel('Abundance, ' +r'$log_{10}$', fontsize = 22)
    output = "dorm_fix_prob.png"  # NOTE(review): assigned but never used
    plt.legend(loc='upper right')
    #plt.yscale('log')
    #plt.yscale('log')
    plt.xlim(1, len(SAD))
    plt.ylim(-0.25 , max_y)
    plt.tick_params(axis='both', which='major', labelsize=14)
    plt.legend(frameon=False, fontsize= 18)
    fig_name = str(mydir + 'figures/' + figname + '_RGB.' + saveAs)
    plt.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600, \
        format = saveAs)
    plt.close()
def run_test(raw_data, dataset_name, data_dir='./data/', cutoff = 9):
    """Use data to compare the predicted and empirical SADs and get results in csv files

    Keyword arguments:
    raw_data : numpy structured array with 4 columns: 'site','year','sp','ab'
    dataset_name : short code that will indicate the name of the dataset in the output file names
    data_dir : directory in which to store output
    cutoff : minimum number of species required to run - 1.

    """
    usites = np.sort(list(set(raw_data["site"])))
    # NOTE(review): output files are opened here and never explicitly closed
    # (Python 2 idiom); data is flushed when the objects are garbage-collected.
    f1 = csv.writer(open(data_dir + dataset_name + '_obs_pred.csv','wb'))
    f2 = csv.writer(open(data_dir + dataset_name + '_dist_test.csv','wb'))
    for i in range(0, len(usites)):
        # Rows belonging to the current site
        subsites = raw_data["site"][raw_data["site"] == usites[i]]
        subab = raw_data["ab"][raw_data["site"] == usites[i]]
        N = sum(subab)
        S = len(subsites)
        if S > cutoff:
            print("%s, Site %s, S=%s, N=%s" % (dataset_name, i, S, N))
            # Generate predicted values and p (e ** -beta) based on METE:
            mete_pred = mete.get_mete_rad(int(S), int(N))
            pred = np.array(mete_pred[0])
            p = mete_pred[1]
            p_untruncated = exp(-mete.get_beta(S, N, version='untruncated'))
            obsab = np.sort(subab)[::-1]  # observed abundances, rank order
            # Calculate Akaike weight of log-series:
            L_logser = md.logser_ll(obsab, p)
            L_logser_untruncated = md.logser_ll(obsab, p_untruncated)
            mu, sigma = md.pln_solver(obsab)
            L_pln = md.pln_ll(mu,sigma,obsab)
            k1 = 1  # number of parameters: log-series
            k2 = 2  # number of parameters: Poisson lognormal
            AICc_logser = macroecotools.AICc(k1, L_logser, S)
            AICc_logser_untruncated = macroecotools.AICc(k1, L_logser_untruncated, S)
            AICc_pln = macroecotools.AICc(k2, L_pln, S)
            # Akaike weight of the log-series relative to the PLN
            weight = macroecotools.aic_weight(AICc_logser, AICc_pln, S, cutoff = 4)
            weight_untruncated = macroecotools.aic_weight(AICc_logser_untruncated, AICc_pln, S, cutoff = 4)
            #save results to a csv file:
            results = ((np.column_stack((subsites, obsab, pred))))
            results2 = ((np.column_stack((np.array(usites[i], dtype='S20'), S, N, p, weight, p_untruncated, weight_untruncated))))
            f1.writerows(results)
            f2.writerows(results2)
def get_envpred_sads(envpred_data):
    """Build METE SAD predictions for each site from environmentally
    predicted S and N.

    envpred_data must provide the columns 'SiteID', 'S', 'predlogS', and
    'predlogN'. Returns a DataFrame with one row per predicted abundance and
    the columns 'SiteID' and 'EnvPred'.
    """
    envpred_sads = DataFrame(columns=['SiteID', 'EnvPred'])
    for _, site_row in envpred_data.iterrows():
        obs_S = site_row['S']
        # Back-transform the log10 predictions of S and N
        envpred_S = 10 ** site_row['predlogS']
        envpred_N = 10 ** site_row['predlogN']
        beta = get_beta(envpred_S, envpred_N)
        #To produce a comparable number of species use obs_S; IS THIS RIGHT?
        site_sad, p = get_mete_rad(obs_S, envpred_N, beta=beta)
        id_column = [site_row['SiteID']] * len(site_sad)
        site_sad_with_id = DataFrame(np.column_stack([id_column, site_sad]),
                                     columns=['SiteID', 'EnvPred'])
        envpred_sads = envpred_sads.append(site_sad_with_id, ignore_index=True)
    return envpred_sads
def generate_obs_pred_data(datasets, methods):
    """For each (method, dataset) pair, read one SAD per line of the dataset
    file, drop singletons and species-poor sites (S < 10), fit the requested
    model ('geom' or 'mete'), and print the r^2 between log10(observed) and
    log10(predicted) abundances. File output is currently disabled
    (commented out); only summary statistics are printed.
    """
    for method in methods:
        for dataset in datasets:
            gN = 0  # running total of individuals across all lines read
            #OUT = open(mydir+'/data/'+method+'_'+dataset+'_obs_pred.txt','w+')
            IN = mydir+'/MicroMETE/data/'+dataset+'_SADs.txt'
            num_lines = sum(1 for line in open(IN))
            for line in open(IN):
                line = line.split()
                obs = map(int, line)
                # Drop singletons before fitting
                obs = list([x for x in obs if x > 1])
                N = sum(obs)
                gN += N
                print N
                S = len(obs)
                if S < 10: continue  # skip species-poor sites
                obs.sort()
                obs.reverse()  # rank order: most abundant first
                print method, dataset, N, S, 'countdown:', num_lines,
                if method == 'geom': # Predicted geometric series
                    pred = predRADs.get_GeomSeries(N, S, False) # False mean no zeros allowed
                elif method == 'mete': # Predicted log-series
                    logSeries = mete.get_mete_rad(S, N)
                    pred = logSeries[0]
                r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
                print " r2:", r2
                # write to file, by cite, observed and expected ranked abundances
                #for i, sp in enumerate(pred):
                #    print>> OUT, obs[i], pred[i]
                num_lines -= 1
            print 'N(HMP): ',gN
            #OUT.close()
            print dataset
def sim_null(S0, N0, dic_beta):
    """Abundances simulated from a discrete uniform and associated METE predictions

    S0 - number of species; N0 - total abundance of the community being mimicked.
    dic_beta - cache mapping (S0, N0) -> beta, shared across calls so METE's
        numerical solution for beta is only computed once per (S0, N0) pair.
    Returns (N_sim, N_pred): simulated abundances sorted descending, and the
    METE rank-abundance prediction for the simulated community.

    NOTE(review): beta is cached under the key (S0, N0) but computed from
    sum(N_sim), which varies between random draws; later calls with the same
    (S0, N0) reuse the first draw's beta as an approximation -- confirm that
    this is intentional.
    """
    # Draw S0 abundances uniformly from [1, (2*N0 - S0)/S0] so the expected
    # total is close to N0.
    N_sim = sorted(np.random.random_integers(1, (2 * N0 - S0) / S0, S0), reverse = True)
    N_tot = sum(N_sim)
    #In cases where N and S are nearly equal it is possible for random draws to
    #yield all singletons which breaks the numerical solutions for Beta.
    #If this is the case make one species a doubleton.
    if N_tot == S0:
        N_sim[0] = 2
    if (S0, N0) not in dic_beta:
        dic_beta[(S0, N0)] = mete.get_beta(S0, sum(N_sim))
    N_pred = mete.get_mete_rad(S0, sum(N_sim), dic_beta[(S0, N0)])[0]
    # Re-seed from OS entropy; presumably so parallel workers diverge -- confirm
    np.random.seed()
    return N_sim, N_pred
def plot_RADs_canonical(N, S): fig = plt.figure() ax = fig.add_subplot(1,1,1) # Predicted geometric series print 'generating geometric series' t0 = time.time() predRAD = get_GeomSeries(N, S, False) # False mean no zeros allowed t = time.time() - t0 print 'time for geometric series:',t ranks = range(1, S+1) plt.plot(ranks, predRAD, lw = 1, c='m') # Predicted log-series print 'generating log-series' t0 = time.time() logSeries = mete.get_mete_rad(S, N) t = time.time() - t0 print 'time for log-series:',t predRAD = logSeries[0] ranks = range(1, S+1) plt.plot(ranks, predRAD, lw = 1, c='c') # Predicted PLN print 'generating Poisson log-normal' t0 = time.time() predRAD = pln.get_rad_from_obs(predRAD, 'pln') t = time.time() - t0 print 'time for log-normal:',t ranks = range(1, len(predRAD)+1) plt.plot(ranks, predRAD, lw = 1, c='gray') plt.yscale('log') plt.savefig('/Users/lisalocey/Desktop/RareBio/figs/GlobalRADs_N='+str(int(N))+'_S='+str(int(S))+'.png',dpi=600) plt.show() return
def fig1(figname="Fig1", data_dir=mydir, saveAs="eps"):
    """Plot an example observed SAD against broken-stick, lognormal,
    log-series, and Zipf predictions, and save the figure to
    '<mydir>/figures/<figname>_RGB.<saveAs>'.

    NOTE(review): data_dir is accepted but never used; the module-level
    `mydir` is used when building the output path -- confirm intended.
    """
    # Example species-abundance distribution (unsorted abundances)
    SAD = [
        10000, 8000, 6000, 5000, 1000, 200, 100, 20, 18, 16, 14, 12, 10, 4,
        5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    ]
    SAD.sort()
    SAD.reverse()  # rank order: most abundant first
    x = range(1, len(SAD) + 1)
    N = sum(SAD)
    S = len(SAD)
    # Model predictions, log10-transformed for plotting
    geom = np.log10(mo.get_Geom(N, S, False))
    logSeries = np.log10(mete.get_mete_rad(S, N)[0])
    lognorm_pred = mo.lognorm(SAD, "pln")
    lognorm_SAD = np.log10(lognorm_pred.get_rad_from_obs()[0])
    zipf_class = mo.zipf(SAD, "fmin")
    pred_tuple = zipf_class.from_cdf()
    zipf_SAD = np.log10(pred_tuple[0])
    gamma = pred_tuple[1]  # Zipf exponent (unused below)
    SAD = np.log10(SAD)
    fig = plt.figure()
    plt.plot()
    max_y = max(max(SAD), max(zipf_SAD))
    plt.plot(x, SAD, color="#A9A9A9", linestyle="-", linewidth=2, label="Observed")
    plt.plot(x, geom, color="#00008B", linestyle="-", linewidth=2, label="Broken-stick")
    plt.plot(x, lognorm_SAD, color="#0000CD", linestyle="--", linewidth=2, label="Lognormal")
    plt.plot(x, logSeries, color="#FF4500", linestyle="-.", linewidth=2, label="Log-series")
    plt.plot(x, zipf_SAD, color="red", linestyle="-", linewidth=2, label="Zipf")
    plt.tight_layout()
    plt.xlabel("Rank Abundance", fontsize=22)
    plt.ylabel("Abundance, " + r"$log_{10}$", fontsize=22)
    output = "dorm_fix_prob.png"  # NOTE(review): assigned but never used
    plt.legend(loc="upper right")
    # plt.yscale('log')
    # plt.yscale('log')
    plt.xlim(1, len(SAD))
    plt.ylim(-0.25, max_y)
    plt.tick_params(axis="both", which="major", labelsize=14)
    plt.legend(frameon=False, fontsize=18)
    fig_name = str(mydir + "figures/" + figname + "_RGB." + saveAs)
    plt.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600, format=saveAs)
    plt.close()
def sample_lines_mete_geom_test(datasets, SAD_number, iterations, percents):
    """For the SAD_number largest-N sites of each dataset, repeatedly subsample
    each SAD at the given percents via multinomial draws, fit METE log-series
    and geometric-series (broken-stick) models to each subsample, and write
    mean N, S, Nmax, and r^2 per percent to per-dataset output files.

    NOTE(review): this block was reconstructed from whitespace-mangled source;
    the nesting of the trailing bookkeeping (length-27 checks) is inferred --
    confirm against the original repository before relying on it.
    """
    #percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    SAD_number = int(SAD_number)
    iterations = int(iterations)
    methods = ['geom', 'mete']
    for i, dataset in enumerate(datasets):
        signal.signal(signal.SIGALRM, timeout_handler)
        # Choose the input SAD file and the matching NSR2 summary per dataset
        if dataset == 'MGRAST':
            # fix subset l8r
            IN = mydir + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST'+dataset+'_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs_NAP.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_'+dataset+'_NSR2.txt')
        else:
            IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_'+dataset+'_NSR2.txt')
        # Rank sites by N (descending) and keep the SAD_number largest
        nsr2_data_mete_geom_N_site = np.column_stack((nsr2_data_mete_geom["site"], nsr2_data_mete_geom["N"]))
        nsr2_data_mete_geom_sorted = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:,1].argsort()[::-1]]
        nsr2_data_mete_geom_top100 = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:,1].argsort()[::-1]][:SAD_number,]
        # Get the SAD numbers
        mete_geom_numbers = nsr2_data_mete_geom_top100[:,0]
        mete_geom_numbers = mete_geom_numbers.astype(int)
        OUT1 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_geom_SubSampled_Data.txt', 'w+')
        OUT2 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_mete_SubSampled_Data.txt', 'w+')
        num_lines = sum(1 for line in open(IN))
        test_lines = 0
        succeess_lines_geom = SAD_number
        succeess_lines_mete = SAD_number
        # Keep sampling sites (largest N first) until enough succeed
        while (succeess_lines_geom > 0) and (succeess_lines_mete > 0):
            site = nsr2_data_mete_geom_sorted[test_lines,0]
            for j,line in enumerate(open(IN)):
                if (j != site):
                    continue
                else:
                    # Parse one SAD line; HMP files are comma-delimited with a
                    # leading site name, others are Python-literal lists
                    if dataset == "HMP":
                        line = line.strip().split(',')
                        line = [x.strip(' ') for x in line]
                        line = [x.strip('[]') for x in line]
                        site_name = line[0]
                        line.pop(0)
                    else:
                        line = eval(line)
                    obs = map(int, line)
                    # Calculate relative abundance of each OTU
                    # Use that as weights
                    N_0 = float(sum(obs))
                    S_0 = len(obs)
                    N_max = max(obs)
                    if S_0 < 10 or N_0 <= S_0:
                        test_lines += 1
                        continue
                    line_ra = map(lambda x: x/N_0, obs)
                    # Calculate relative abundance of each OTU
                    # Use that as weights
                    sample_sizes = map(lambda x: round(x*N_0), percents)
                    if any(sample_size <= 10 for sample_size in sample_sizes) == True:
                        test_lines += 1
                        continue
                    gm_lines = SAD_number
                    geom_means = [N_0, S_0, N_max]
                    mete_means = [N_0, S_0, N_max]
                    print dataset, N_0, S_0, ' countdown: ', succeess_lines_geom
                    # separate this. get percents for Zipf and mete/geom
                    # then go on with the sampling
                    failed_percents = 0
                    for k, percent in enumerate(percents):
                        sample_size = round(percent * N_0)
                        if sample_size <= 10 or failed_percents > 0:
                            continue
                        mg_iter = iterations
                        N_max_list_mg = []
                        N_0_list_mg = []
                        S_0_list_mg = []
                        r2_list_BS = []
                        r2_list_METE = []
                        iter_count_current = 0
                        iter_count = iterations
                        fail_threshold = 20
                        iter_failed = 0
                        # Repeat multinomial subsampling until `iterations`
                        # successful draws or too many failures
                        while (mg_iter > 0) and (iter_failed < fail_threshold):
                            sample_k = np.random.multinomial(sample_size, line_ra, size = None)
                            sample_k_sorted = -np.sort( -sample_k[sample_k != 0] )
                            N_k = sum(sample_k_sorted)
                            S_k = sample_k_sorted.size
                            if S_k < 10 or N_k <= S_k:
                                iter_failed += 1
                                continue
                            N_max_k = max(sample_k_sorted)
                            # METE log-series fit
                            logSeries = mete.get_mete_rad(S_k, N_k)
                            pred_mete = logSeries[0]
                            r2_mete = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_mete))
                            # Geometric series fit (False means no zeros allowed)
                            pred_BS = get_GeomSeries(N_k, S_k, False)
                            r2_BS = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_BS))
                            r2_list = [r2_mete, r2_BS]
                            if any( (r2 == -float('inf') ) or (r2 == float('inf') ) or (r2 == float('Nan') ) for r2 in r2_list):
                                #mg_iter += 1
                                iter_failed += 1
                                continue
                            N_max_list_mg.append(N_max_k)
                            N_0_list_mg.append(N_k)
                            S_0_list_mg.append(S_k)
                            r2_list_BS.append(r2_BS)
                            r2_list_METE.append(r2_mete)
                            mg_iter -= 1
                        if len(N_max_list_mg) != iterations:
                            test_lines += 1
                            continue
                        # Append per-percent means: N, S, Nmax, and model r^2
                        N_0_mg_mean = np.mean(N_0_list_mg)
                        geom_means.append(N_0_mg_mean)
                        mete_means.append(N_0_mg_mean)
                        S_0_mean = np.mean(S_0_list_mg)
                        geom_means.append(S_0_mean)
                        mete_means.append(S_0_mean)
                        N_max_mg_mean = np.mean(N_max_list_mg)
                        geom_means.append(N_max_mg_mean)
                        mete_means.append(N_max_mg_mean)
                        r2_BS_mg_mean = np.mean(r2_list_BS)
                        geom_means.append(r2_BS_mg_mean)
                        r2_METE_mg_mean = np.mean(r2_list_METE)
                        mete_means.append(r2_METE_mg_mean)
                    '''Now we check if the lists are the right length
                    there are 6 iterations for the percentage
                    mete/ geom, append four items each iteration.
                    4*6 = 24, add three original = 27
                    likewise, for zipf, (5*6) + 3 = 33 '''
                    test_lines += 1
                    if (len(geom_means) == 27):
                        succeess_lines_geom -= 1
                        geom_means_str = ' '.join(map(str, geom_means))
                        #OUT1.write(','.join(map(repr, geom_means_str[i])))
                        print>> OUT1, j, geom_means_str
                    if (len(mete_means) == 27):
                        succeess_lines_mete -= 1
                        mete_means_str = ' '.join(map(str, mete_means))
                        print>> OUT2, j, mete_means_str
            print dataset, percent
# Command-line demo: read S0 and N0 from argv, simulate a community from the
# truncated log-series, and compute METE downscaled species-area predictions.
if len(sys.argv) > 1:
    S0 = int(sys.argv[1])
    N0 = int(sys.argv[2])
    if os.path.exists('../demo') is False:
        os.mkdir('../demo')
    beta = mete.get_beta(S0, N0)
    # Random abundances drawn from the truncated log-series with parameter e^-beta
    n0 = mete.trunc_logser_rvs(exp(-beta), N0, S0)
    n0 = list(n0)
    n0 = [int(x) for x in n0]
    n0.sort(reverse=True)  # rank order: most abundant first
    rad = mete.get_mete_rad(S0, N0)[0]
    Amax = 4  # largest area in the SAR
    Amin = 1  # smallest area in the SAR
    # Downscaled SARs: recursive and non-recursive, each with both the
    # analytical inputs (S0, N0) and the fixed simulated abundances n0
    recur = mete.downscale_sar(Amax, S0, N0, Amin)
    recur_obsSAD = mete.downscale_sar_fixed_abu(Amax, n0, Amin)
    Avals = recur_obsSAD[0][ : ]
    nonrecur = mete.sar_noniterative(Avals, Amax, S0, N0, 'precise')
    nonrecur_obsSAD = mete.sar_noniterative_fixed_abu(Avals, Amax, n0)
    sad_out = np.empty((S0, 2))
    sad_out[ : , 0] = n0  # NOTE(review): block appears to continue beyond this view
# Score METE log-series and Poisson-lognormal fits over a shuffled sample of RADs.
# NOTE(review): `RADs` and `mete_r2s` appear to be defined earlier in the file,
# outside this view; `mete_r2s` in particular is appended to but not initialized
# here -- confirm it exists upstream.
zipf_r2s = []
pln_r2s = []
shuffle(RADs)
for i, obs in enumerate(RADs):
    N = int(sum(obs))
    S = int(len(obs))
    print i, N, S, len(pln_r2s)
    if S >= 10 and N > 50:
        if N < 10000:
            # METE log-series prediction and fit
            result = mete.get_mete_rad(S, N)
            predRAD = result[0]
            mete_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            mete_r2s.append(mete_r2)
            #zipf_pred = dist.zipf(obs)
            #predRAD = zipf_pred.from_cdf()
            #zipf_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            #zipf_r2s.append(zipf_r2)
            # Poisson lognormal fit to the observed RAD
            predRAD = get_rad_from_obs(obs, 'pln')
            pln_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            pln_r2s.append(pln_r2)
            # Stop after enough PLN fits have been collected
            if len(pln_r2s) > 200:
                break
def generate_obs_pred_data(datasets, methods, size):
    """For each (method, dataset) pair, read one SAD per line, fit the model
    ('geom' or 'mete'), and write observed vs predicted abundances (OUT1) and
    per-site N, S, r^2 summaries (OUT2). For the EMP datasets a random subset
    of `size` sites is used.

    NOTE(review): several apparent defects left as-is in this documentation
    pass -- `print r2 + " is Nan or inf..."` concatenates a float to a str
    (TypeError if reached); `r2 == float('Nan')` is always False (NaN != NaN);
    OUT2 is never closed; `num_lines` is both a countdown and incremented on
    skipped sites.
    """
    for method in methods:
        for dataset in datasets:
            #OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred.txt','w+')
            #OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2.txt','w+')
            #OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred_subset.txt','w+')
            #OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2_subset.txt','w+')
            if dataset == "HMP":
                IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt'
                num_lines = sum(1 for line in open(IN))
                OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred.txt','w+')
                OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2.txt','w+')
            elif dataset == 'EMPclosed' or dataset == 'EMPpen':
                IN = mydir + dataset + '-Data' + '/' + dataset +'-SADs.txt'
                num_lines = sum(1 for line in open(IN))
                # Random subset of `size` sites for the large EMP datasets
                random_sites = np.random.randint(num_lines,size=size)
                num_lines = size
                OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred_subset.txt','w+')
                OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2_subset.txt','w+')
                num_lines = sum(1 for line in open(IN))
            else:
                IN = mydir + 'MGRAST-Data/' + dataset + '/' + 'MGRAST-' + dataset + '-SADs.txt'
                num_lines = sum(1 for line in open(IN))
                OUT1 = open(mydir + "ObsPred/" + method +'_'+ 'MGRAST' + dataset+'_obs_pred.txt','w+')
                OUT2 = open(mydir + "NSR2/" + method +'_'+ 'MGRAST' + dataset+'_NSR2.txt','w+')
            for j,line in enumerate(open(IN)):
                # HMP lines are whitespace-delimited; others are Python literals
                if dataset == "HMP":
                    line = line.split()
                elif size == 0:
                    line = eval(line)
                else:
                    line = eval(line)
                    if j not in random_sites:
                        continue
                #line.strip("[]")
                #line.split()
                obs = map(int, line)
                N = sum(obs)
                S = len(obs)
                if S < 10 or N <= S:
                    num_lines += 1
                    continue
                obs.sort()
                obs.reverse()  # rank order: most abundant first
                print method, dataset, N, S, 'countdown:', num_lines,
                if method == 'geom': # Predicted geometric series
                    pred = get_GeomSeries(N, S, False) # False mean no zeros allowed
                elif method == 'mete': # Predicted log-series
                    logSeries = mete.get_mete_rad(S, N)
                    pred = logSeries[0]
                r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
                print " r2:", r2
                if r2 == -float('inf') or r2 == float('inf') or r2 == float('Nan'):
                    print r2 + " is Nan or inf, removing..."
                    continue
                print>> OUT2, j, N, S, r2
                # write to file, by cite, observed and expected ranked abundances
                for i, sp in enumerate(pred):
                    print>> OUT1, j, obs[i], pred[i]
                num_lines -= 1
            OUT1.close()
            print dataset
# Compare METE log-series and Poisson-lognormal fits across a shuffled set of
# RADs, then estimate a kernel density of the PLN r^2 values.
# NOTE(review): `RADs`, `minct`, `get_pln_from_obs`, and
# `get_kdens_choose_kernel` are defined elsewhere in the file, outside this view.
print 'Number of RADs:', len(RADs)
mete_r2s = []
pln_r2s = []
zipf_r2s = []
ct = 0
shuffle(RADs)
for obs in RADs:
    N = int(sum(obs))
    S = int(len(obs))
    s = obs.count(1)  # number of singletons (unused below)
    if S > 9 and N > 9:
        ct += 1
        # METE log-series fit (r^2 on untransformed abundances)
        pred = mete.get_mete_rad(S, N)[0]
        mete_r2 = mct.obs_pred_rsquare(obs, np.array(pred))
        mete_r2s.append(mete_r2)
        # Poisson lognormal fit (r^2 on log10-transformed abundances)
        pred = get_pln_from_obs(obs, 'pln')
        pred = np.log10(pred)
        obs1 = np.log10(obs)
        pln_r2 = mct.obs_pred_rsquare(obs1, pred)
        pln_r2s.append(pln_r2)
        print ct, 'N:', N, ' S:', S, ' n:', len( pln_r2s), ' | mete:', mete_r2, ' pln:', pln_r2
        # Stop once enough PLN fits have been collected
        if len(pln_r2s) > minct:
            break
kernel = 0.5  # kernel bandwidth for the density estimate
D = get_kdens_choose_kernel(pln_r2s, kernel)