def generate_kde_to_file(datasets, methods):
    # For each dataset/method pair, fit a cross-validated KDE to the r^2
    # values in the NSR2 results and write the grid/PDF pairs to file.
    for dataset in datasets:
        for method in methods:
            IN = gf.import_NSR2_data(mydir + 'NSR2/' + method + '_' + dataset + '_NSR2.txt')
            r2s = IN["R2"]
            r2_kde = gf.CV_KDE(r2s)
            r2_kde_table = pd.DataFrame({'Grid': r2_kde[0], 'PDF': r2_kde[1]})
            OUT_name = method + '_' + dataset + '_KDEs.txt'
            OUT_dir = mydir + 'KDEs/'
            r2_kde_table.to_csv(os.path.join(OUT_dir, OUT_name), sep='\t')
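# A minimal usage sketch for generate_kde_to_file(), assuming this module's
# globals (mydir, gf, pd, os) are set up as above and that the
# NSR2/<method>_<dataset>_NSR2.txt result files already exist. The name lists
# are illustrative, inferred from the file-name patterns in this module.
def example_generate_kdes():
    datasets = ['HMP', 'EMP', 'MGRAST']   # assumed dataset labels
    methods = ['geom', 'mete', 'zipf']    # assumed method labels
    # Writes one tab-delimited KDEs/<method>_<dataset>_KDEs.txt per pair,
    # with a 'Grid' column (r^2 values) and a 'PDF' column (density).
    generate_kde_to_file(datasets, methods)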
def r2_KDE(datasets, methods):
    # Plot the KDE of r^2 values for each method, one figure per dataset.
    labels = ['Broken-stick', 'METE', 'Zipf']
    titles = ['HMP', 'EMP', 'MG-RAST']
    for i, dataset in enumerate(datasets):
        fig, ax = plt.subplots()
        for j, method in enumerate(methods):
            path = mydir + 'KDEs/' + method + '_' + dataset + '_KDEs.txt'
            IN1 = pd.read_csv(path, sep='\t')
            ax.plot(IN1.Grid, IN1.PDF, linewidth=2, alpha=0.5,
                    color=colors[j], label=labels[j])
        figure_name = '../figures/KDEs/' + str(dataset) + '_KDE.png'
        plt.grid(True)
        ax.legend(loc='upper left', shadow=False)
        plt.xlabel(r'$r^{2}_{m}$', fontsize=14)
        plt.ylabel(r'$probability$', fontsize=14)
        plt.title(titles[i], fontsize=18)
        plt.xlim(-7, 1)
        plt.savefig(figure_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
        plt.close()
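# Companion sketch: r2_KDE() consumes the files written by
# generate_kde_to_file(), so the two calls run in that order. Note that the
# figure title is chosen by dataset index (0 = HMP, 1 = EMP, 2 = MG-RAST),
# so the order of the (assumed) dataset list matters.
def example_plot_kdes():
    datasets = ['HMP', 'EMP', 'MGRAST']
    methods = ['geom', 'mete', 'zipf']
    generate_kde_to_file(datasets, methods)
    r2_KDE(datasets, methods)  # saves ../figures/KDEs/<dataset>_KDE.png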
def sample_lines_mete_geom_test(datasets, SAD_number, iterations, percents):
    # Subsample the SAD_number largest SADs (by N) and test how well the
    # broken-stick (geometric series) and METE predictions hold up when each
    # site is subsampled down to the given percents of its total abundance.
    SAD_number = int(SAD_number)
    iterations = int(iterations)
    methods = ['geom', 'mete']
    for i, dataset in enumerate(datasets):
        signal.signal(signal.SIGALRM, gf.timeout_handler)
        if dataset == 'MGRAST':  # fix subset later
            IN = mydir + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST' + dataset + '_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir + dataset + '-Data' + '/' + dataset + '-SADs_NAP.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_' + dataset + '_NSR2.txt')
        else:
            IN = mydir + dataset + '-Data' + '/' + dataset + '-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_' + dataset + '_NSR2.txt')
        # Sort the sites by N, descending, and keep the SAD_number largest.
        nsr2_data_mete_geom_N_site = np.column_stack((nsr2_data_mete_geom["site"], nsr2_data_mete_geom["N"]))
        nsr2_data_mete_geom_sorted = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:, 1].argsort()[::-1]]
        nsr2_data_mete_geom_top100 = nsr2_data_mete_geom_sorted[:SAD_number, ]
        # Get the SAD numbers
        mete_geom_numbers = nsr2_data_mete_geom_top100[:, 0].astype(int)
        OUT1 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_geom_SubSampled_Data.txt', 'w+')
        OUT2 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_mete_SubSampled_Data.txt', 'w+')
        test_lines = 0
        success_lines_geom = SAD_number
        success_lines_mete = SAD_number
        while (success_lines_geom > 0) and (success_lines_mete > 0):
            site = nsr2_data_mete_geom_sorted[test_lines, 0]
            for j, line in enumerate(open(IN)):
                if j != site:
                    continue
                if dataset == "HMP":
                    line = line.strip().split(',')
                    line = [x.strip(' ') for x in line]
                    line = [x.strip('[]') for x in line]
                    site_name = line[0]
                    line.pop(0)
                else:
                    line = eval(line)
                obs = map(int, line)
                N_0 = float(sum(obs))
                S_0 = len(obs)
                N_max = max(obs)
                if S_0 < 10 or N_0 <= S_0:
                    test_lines += 1
                    continue
                # Relative abundance of each OTU, used as multinomial weights.
                line_ra = map(lambda x: x / N_0, obs)
                sample_sizes = map(lambda x: round(x * N_0), percents)
                if any(sample_size <= 10 for sample_size in sample_sizes):
                    test_lines += 1
                    continue
                geom_means = [N_0, S_0, N_max]
                mete_means = [N_0, S_0, N_max]
                print dataset, N_0, S_0, ' countdown: ', success_lines_geom
                # TODO: separate this: get the percents for Zipf and mete/geom,
                # then go on with the sampling.
                failed_percents = 0
                for k, percent in enumerate(percents):
                    sample_size = round(percent * N_0)
                    if sample_size <= 10 or failed_percents > 0:
                        continue
                    mg_iter = iterations
                    N_max_list_mg = []
                    N_0_list_mg = []
                    S_0_list_mg = []
                    r2_list_BS = []
                    r2_list_METE = []
                    fail_threshold = 20
                    iter_failed = 0
                    while (mg_iter > 0) and (iter_failed < fail_threshold):
                        # Draw a multinomial subsample and drop empty OTUs.
                        sample_k = np.random.multinomial(sample_size, line_ra, size=None)
                        sample_k_sorted = -np.sort(-sample_k[sample_k != 0])
                        N_k = sum(sample_k_sorted)
                        S_k = sample_k_sorted.size
                        if S_k < 10 or N_k <= S_k:
                            iter_failed += 1
                            continue
                        N_max_k = max(sample_k_sorted)
                        logSeries = mete.get_mete_rad(S_k, N_k)
                        pred_mete = logSeries[0]
                        r2_mete = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_mete))
                        pred_BS = get_GeomSeries(N_k, S_k, False)  # False means no zeros allowed
                        r2_BS = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_BS))
                        r2_list = [r2_mete, r2_BS]
                        # Skip draws whose r^2 is infinite or undefined.
                        if any(np.isinf(r2) or np.isnan(r2) for r2 in r2_list):
                            iter_failed += 1
                            continue
                        N_max_list_mg.append(N_max_k)
                        N_0_list_mg.append(N_k)
                        S_0_list_mg.append(S_k)
                        r2_list_BS.append(r2_BS)
                        r2_list_METE.append(r2_mete)
                        mg_iter -= 1
                    if len(N_max_list_mg) != iterations:
                        test_lines += 1
                        continue
                    N_0_mg_mean = np.mean(N_0_list_mg)
                    geom_means.append(N_0_mg_mean)
                    mete_means.append(N_0_mg_mean)
                    S_0_mg_mean = np.mean(S_0_list_mg)
                    geom_means.append(S_0_mg_mean)
                    mete_means.append(S_0_mg_mean)
                    N_max_mg_mean = np.mean(N_max_list_mg)
                    geom_means.append(N_max_mg_mean)
                    mete_means.append(N_max_mg_mean)
                    geom_means.append(np.mean(r2_list_BS))
                    mete_means.append(np.mean(r2_list_METE))
                # Check that the lists are the right length: six percents, four
                # items appended per percent, plus the three original values,
                # gives 4*6 + 3 = 27 (for Zipf it is 5*6 + 3 = 33).
                test_lines += 1
                if len(geom_means) == 27:
                    success_lines_geom -= 1
                    geom_means_str = ' '.join(map(str, geom_means))
                    print>> OUT1, j, geom_means_str
                if len(mete_means) == 27:
                    success_lines_mete -= 1
                    mete_means_str = ' '.join(map(str, mete_means))
                    print>> OUT2, j, mete_means_str
                print dataset, percent
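# A sketch of one call to the mete/geom subsampling test, assuming the
# mete_<dataset>_NSR2.txt result files exist. The percents are the halving
# series hard-coded in test_zipf_num_est() below; SAD_number = 100 and
# iterations = 10 are illustrative values, not values fixed by this module.
def example_subsample_mete_geom():
    percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    # Averages 10 multinomial subsamples per percent over the 100 largest SADs
    # and writes SubSampled-Data/<dataset>_geom_SubSampled_Data.txt plus the
    # matching _mete_ file for each dataset.
    sample_lines_mete_geom_test(['HMP', 'EMP', 'MGRAST'], 100, 10, percents)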
def test_zipf_num_est(datasets, estimators, SAD_number, iterations, fail_threshold):
    # Subsample the SAD_number largest SADs (by N) and test the Zipf fit under
    # each parameter estimator at a fixed, halving series of sampling percents.
    percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    for dataset in datasets:
        signal.signal(signal.SIGALRM, gf.timeout_handler)
        if dataset == 'MGRAST':  # fix subset later
            IN = mydir + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_MGRAST' + dataset + '_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir + dataset + '-Data' + '/' + dataset + '-SADs_NAP.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_' + dataset + '_NSR2.txt')
        else:
            IN = mydir + dataset + '-Data' + '/' + dataset + '-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_' + dataset + '_NSR2.txt')
        # Sort the sites by N, descending, and keep the SAD_number largest.
        nsr2_data_zipf_N_site = np.column_stack((nsr2_data_zipf["site"], nsr2_data_zipf["N"]))
        nsr2_data_zipf_sorted = nsr2_data_zipf_N_site[nsr2_data_zipf_N_site[:, 1].argsort()[::-1]]
        nsr2_data_zipf_top100 = nsr2_data_zipf_sorted[:SAD_number, ]
        # Get the SAD numbers
        zipf_numbers = nsr2_data_zipf_top100[:, 0].astype(int)
        for estimator in estimators:
            OUT = open(mydir + 'SubSampled-Data' + '/' + dataset + '_zipf_' +
                       str(estimator) + '_SubSampled_Data.txt', 'w+')
            test_lines = 0
            success_lines = SAD_number
            while success_lines > 0:
                site = nsr2_data_zipf_sorted[test_lines, 0]
                for j, line in enumerate(open(IN)):
                    if j != site:
                        continue
                    if dataset == "HMP":
                        line = line.strip().split(',')
                        line = [x.strip(' ') for x in line]
                        line = [x.strip('[]') for x in line]
                        site_name = line[0]
                        line.pop(0)
                    else:
                        line = eval(line)
                    obs = map(int, line)
                    N_0 = float(sum(obs))
                    S_0 = len(obs)
                    N_max = max(obs)
                    if S_0 < 10 or N_0 <= S_0:
                        test_lines += 1
                        continue
                    # Relative abundance of each OTU, used as multinomial weights.
                    line_ra = map(lambda x: x / N_0, obs)
                    sample_sizes = map(lambda x: round(x * N_0), percents)
                    if any(sample_size <= 10 for sample_size in sample_sizes):
                        test_lines += 1
                        continue
                    zipf_means = [N_0, S_0, N_max]
                    failed_percents = 0
                    for k, percent in enumerate(percents):
                        if failed_percents > 0:
                            continue
                        N_max_list_zipf = []
                        N_0_list_zipf = []
                        S_0_list_zipf = []
                        r2_list_zipf = []
                        gamma_list = []
                        iter_count = iterations
                        iter_failed = 0
                        while iter_count > 0 and iter_failed < fail_threshold:
                            # Draw a multinomial subsample at this percent and
                            # drop empty OTUs.
                            sample_size_k = sample_sizes[k]
                            sample_k = np.random.multinomial(sample_size_k, line_ra, size=None)
                            sample_k_sorted = -np.sort(-sample_k[sample_k != 0])
                            N_0_k = sum(sample_k_sorted)
                            S_0_k = sample_k_sorted.size
                            if S_0_k < 10 or N_0_k <= S_0_k:
                                iter_failed += 1
                                continue
                            N_max_k = max(sample_k_sorted)
                            # Start the timer; after 2 seconds a SIGALRM is sent,
                            # and the try/except below catches the resulting
                            # gf.TimeoutException.
                            signal.alarm(2)
                            try:
                                # The Zipf fit can hang, hence the alarm above.
                                zipf_class = gf.zipf(sample_k_sorted, estimator)
                                pred_tuple = zipf_class.from_cdf()
                                pred_zipf = pred_tuple[0]
                                gamma = pred_tuple[1]
                                r2_zipf = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_zipf))
                                # Skip draws whose r^2 is infinite or undefined.
                                if np.isinf(r2_zipf) or np.isnan(r2_zipf):
                                    continue
                                r2_list_zipf.append(r2_zipf)
                                gamma_list.append(gamma)
                                N_max_list_zipf.append(N_max_k)
                                N_0_list_zipf.append(N_0_k)
                                S_0_list_zipf.append(S_0_k)
                            except gf.TimeoutException:
                                print "Line " + str(j) + ": " + str(estimator) + " timed out"
                                iter_count -= 1
                                if iter_failed >= fail_threshold:
                                    failed_percents += 1
                                iter_failed += 1
                                continue  # move on if the fit takes more than 2 seconds
                            else:
                                iter_count -= 1
                                # Reset the alarm
                                signal.alarm(0)
                        if len(N_0_list_zipf) != iterations:
                            test_lines += 1
                            continue
                        zipf_means.append(np.mean(N_0_list_zipf))
                        zipf_means.append(np.mean(S_0_list_zipf))
                        zipf_means.append(np.mean(N_max_list_zipf))
                        zipf_means.append(np.mean(r2_list_zipf))
                        zipf_means.append(np.mean(gamma_list))
                    # Check that the list is the right length: six percents, five
                    # items appended per percent, plus the three original values,
                    # gives 5*6 + 3 = 33 (for mete/geom it is 4*6 + 3 = 27).
                    test_lines += 1
                    if len(zipf_means) == 33:
                        success_lines -= 1
                        zipf_means_str = ' '.join(map(str, zipf_means))
                        print>> OUT, j, zipf_means_str
                        print "Line " + str(j) + ": " + str(success_lines) + " SADs to go!"
            print dataset
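# A sketch of one call to the Zipf subsampling test. The estimator names that
# gf.zipf() accepts are not documented in this file, so the value below is a
# hypothetical placeholder; SAD_number = 100, iterations = 10, and
# fail_threshold = 20 are likewise illustrative.
def example_subsample_zipf():
    estimators = ['mle']  # hypothetical; must be a name gf.zipf() understands
    # Writes SubSampled-Data/<dataset>_zipf_<estimator>_SubSampled_Data.txt
    # for each dataset/estimator pair.
    test_zipf_num_est(['HMP', 'EMP', 'MGRAST'], estimators, 100, 10, 20)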