# Example #1
def get_envpred(envpred_data, predtype='sad'):
    """Return METE predictions built from environmentally predicted S and N.

    Inputs:
    envpred_data - DataFrame with one row per site, providing columns
        'site_id', 'S', 'logSpred', and 'logNpred'.
    predtype - which prediction to build: 'sad' (binned SAD), 'rad'
        (rank-abundance distribution), or 'rare' (number of predicted
        species with abundance <= 10).

    Returns a DataFrame with one labelled row per octave/rank/site.
    """
    # Bug fix: the original compared with `predtype is 'sad'` (identity, not
    # equality) and used a list default ['sad', 'rad'], so under the default
    # no branch matched and `envpred` was never bound.  Fixed with `==`, a
    # plain-string default, and an explicit error for unknown predtypes.
    if predtype == 'sad':
        envpred = DataFrame(columns=['site_id', 'octave', 'env_pred'])
    elif predtype == 'rad':
        envpred = DataFrame(columns=['site_id', 'rank', 'env_pred'])
    elif predtype == 'rare':
        envpred = DataFrame(columns=['site_id', 'env_pred'])
    else:
        raise ValueError("predtype must be 'sad', 'rad', or 'rare'")
    for index, site in envpred_data.iterrows():
        obs_S = site['S']
        # The environmental regressions predict log10(S) and log10(N).
        envpred_S = 10 ** site['logSpred']
        envpred_N = 10 ** site['logNpred']
        if predtype == 'sad':
            sad_bins = get_log_bins([envpred_N])
            octave = range(0, len(sad_bins) - 1)
            site_pred = get_mete_sad(envpred_S, envpred_N, bin_edges=sad_bins)
            site_ids = [site['site_id'] for i in range(0, len(site_pred))]
            site_pred_with_id = DataFrame(np.column_stack([site_ids, octave, site_pred]),
                                          columns=['site_id', 'octave', 'env_pred'])
        elif predtype == 'rad':
            # note using observed S here for time being
            rank = range(1, int(obs_S + 1))
            site_beta = get_beta(envpred_S, envpred_N)
            site_pred, p = get_mete_rad(obs_S, envpred_N, beta=site_beta)
            site_ids = [site['site_id'] for i in range(0, len(site_pred))]
            site_pred_with_id = DataFrame(np.column_stack([site_ids, rank, site_pred]),
                                          columns=['site_id', 'rank', 'env_pred'])
        elif predtype == 'rare':
            pred_rad = get_mete_rad(int(envpred_S), envpred_N)[0]
            # Rarity: count of predicted species with <= 10 individuals.
            site_pred = sum([i <= 10 for i in pred_rad])
            site_pred_with_id = DataFrame(np.column_stack([site['site_id'], site_pred]),
                                          columns=['site_id', 'env_pred'])
        # NOTE(review): DataFrame.append was removed in pandas 2.0; this code
        # presumably runs against an older pandas -- confirm before upgrading.
        envpred = envpred.append(site_pred_with_id, ignore_index=True)
    return envpred
def get_obs_pred_sad(raw_data_site, dataset_name, model, out_dir = './out_files/'):
    """Write the observed and predicted RAD to file for a given model.

    Inputs:
     raw_data_site - data in the same format as obtained by clean_data_genera(),
        with four columns site, sp, dbh, and genus, and only for one site.
    dataset_name - name of the dataset for raw_data_site.
    model - one of 'ssnt', 'asne', or 'agsne'.  The predicted SAD for SSNT
        is the same under either scaling of D.
    out_dir - directory for output file.

    """
    G, S, N, E = get_GSNE(raw_data_site)
    # Model-specific predicted rank-abundance distribution.
    if model == 'ssnt':
        pred = mete.get_mete_rad(S, N, version = 'untruncated')[0]
    elif model == 'asne':
        pred = mete.get_mete_rad(S, N)[0]
    elif model == 'agsne':
        pred = agsne.get_mete_agsne_rad(G, S, N, E)
    # Observed RAD: per-species record counts, largest first.
    species = np.unique(raw_data_site['sp'])
    counts = [len(raw_data_site[raw_data_site['sp'] == sp]) for sp in species]
    obs = np.sort(counts)[::-1]
    results = np.zeros((S, ), dtype = ('S15, i8, i8'))
    results['f0'] = np.array([raw_data_site['site'][0]] * S)
    results['f1'] = obs
    results['f2'] = pred

    if model == 'ssnt':
        # SSNT: the same rows are appended to both the "_0" and "_1" files.
        f1_write = open(out_dir + dataset_name + '_obs_pred_rad_ssnt_0.csv', 'ab')
        f2_write = open(out_dir + dataset_name + '_obs_pred_rad_ssnt_1.csv', 'ab')
        secondary_writer = csv.writer(f2_write)
        secondary_writer.writerows(results)
        f2_write.close()
    else:
        f1_write = open(out_dir + dataset_name + '_obs_pred_rad_' + model + '.csv', 'ab')
    primary_writer = csv.writer(f1_write)
    primary_writer.writerows(results)
    f1_write.close()
# Example #3
def plot_RADs_canonical(N, S):
    """Plot predicted geometric-series and log-series RADs on a log y-axis."""
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)

    rank_axis = range(1, S+1)

    # Geometric-series prediction (False = no zeros allowed).
    geom_pred = predRADs.get_GeomSeries(N, S, False)
    plt.plot(rank_axis, geom_pred, lw = 1, c='m')

    # Log-series (METE) prediction.
    mete_pred = mete.get_mete_rad(S, N)[0]
    plt.plot(rank_axis, mete_pred, lw = 1, c='c')

    # Poisson log-normal prediction (currently disabled).
    #predRAD = pln.get_rad_from_obs(predRAD, 'pln')
    #ranks = range(1, len(predRAD)+1)
    #plt.plot(ranks, predRAD, lw = 1, c='gray')

    plt.yscale('log')
    plt.show()

    return
# Example #4
# File: RADfits.py  Project: klocey/MicroMETE
def getPredRADs(N, S, Nmax):
    """Return a list of predicted RADs for N individuals and S species.

    Inputs:
    N - total abundance.
    S - species richness.
    Nmax - maximum (most abundant species') abundance, passed to the
        composition/fraction predictions.

    Returns the predictions in a fixed order: geometric series, log-series,
    Poisson log-normal, compositions, power fraction, random
    non-preemption fraction, random preemption fraction.
    """
    PRED = []

    # Predicted geometric series
    predRAD = predRADs.get_GeomSeries(N, S, False) # False mean no zeros allowed
    PRED.append(predRAD)

    # Predicted log-series
    logSeries = mete.get_mete_rad(S, N)
    predRAD = logSeries[0]
    PRED.append(predRAD)

    # Predicted PLN.  Bug fix: the original referenced an undefined name
    # `RAD`; the RAD computed above is the only candidate input in scope.
    predRAD = pln.get_rad_from_obs(predRAD, 'pln')
    PRED.append(predRAD)

    sample_size = 10
    # Bug fix: the original passed an undefined name `maxn` to getPred in
    # all four calls below; the `Nmax` parameter is the intended value.
    # Predicted from compositions (basically geometric series)
    predRAD = getPred(N, S, Nmax, 'compositions', sample_size)
    PRED.append(predRAD)

    # Predicted from Fraction 1: Power Fraction
    predRAD = getPred(N, S, Nmax, 'power fraction', sample_size)
    PRED.append(predRAD)

    # Predicted from Fraction 2: Random non-preemption
    predRAD = getPred(N, S, Nmax, 'random fraction non-preemption', sample_size)
    PRED.append(predRAD)

    # Predicted from Fraction 3: Random preemption
    predRAD = getPred(N, S, Nmax, 'random fraction', sample_size)
    PRED.append(predRAD)

    return PRED
def get_pred_geom_logser(dataset):
    """Write observed vs. predicted abundances for the truncated geometric
    and the log-series models, one tab-delimited file per model."""
    out_write_geom = open('./data/' + dataset + '/' + dataset + '-obs-pred-geom.txt', 'w')
    out_write_logser = open('./data/' + dataset + '/' + dataset + '-obs-pred-logser.txt', 'w')
    out_geom = csv.writer(out_write_geom, delimiter = '\t')
    out_logser = csv.writer(out_write_logser, delimiter = '\t')

    data = get_SADs(dataset)
    data = data[data['obs'] != 0]  # drop zero-abundance rows
    for site in np.sort(list(set(data['site']))):
        data_site = data[data['site'] == site]
        S = len(data_site)
        N = sum(data_site['obs'])
        # Only fit communities with more than 4 species and N > S.
        if S > 4 and round(N) > S:
            cdf = [(S - i + 0.5) / S for i in range(1, S + 1)]
            pred_geom = trunc_geom.ppf(np.array(cdf), S / N, N)
            pred_logser = get_mete_rad(int(S), int(round(N)))[0]
            obs_sorted = np.array(sorted(data_site['obs'], reverse = True))
            site_col = np.array([site] * len(data_site))
            # Same row layout for both models: site, observed, predicted.
            for writer, pred in ((out_geom, pred_geom), (out_logser, pred_logser)):
                rows = np.zeros((len(data_site), ), dtype = [('f0', 'S25'), ('f1', float), ('f2', int)])
                rows['f0'] = site_col
                rows['f1'] = obs_sorted
                rows['f2'] = np.array(pred)
                writer.writerows(rows)

    out_write_geom.close()
    out_write_logser.close()
def get_obs_pred_sad(raw_data_site,
                     dataset_name,
                     model,
                     out_dir='./out_files/'):
    """Write the observed and predicted RAD to file for a given model.

    Inputs:
     raw_data_site - data in the same format as obtained by clean_data_genera(),
        with four columns site, sp, dbh, and genus, and only for one site.
    dataset_name - name of the dataset for raw_data_site.
    model - 'ssnt', 'asne', or 'agsne'; the SSNT prediction is identical
        under either scaling of D.
    out_dir - directory for output file.

    """
    G, S, N, E = get_GSNE(raw_data_site)
    # Choose the model-specific RAD prediction.
    if model == 'ssnt':
        pred = mete.get_mete_rad(S, N, version='untruncated')[0]
    elif model == 'asne':
        pred = mete.get_mete_rad(S, N)[0]
    elif model == 'agsne':
        pred = agsne.get_mete_agsne_rad(G, S, N, E)
    # Observed RAD: per-species record counts in descending order.
    per_sp = [len(raw_data_site[raw_data_site['sp'] == sp])
              for sp in np.unique(raw_data_site['sp'])]
    obs = np.sort(per_sp)[::-1]
    results = np.zeros((S, ), dtype=('S15, i8, i8'))
    results['f0'] = np.array([raw_data_site['site'][0]] * S)
    results['f1'] = obs
    results['f2'] = pred

    if model == 'ssnt':
        # SSNT rows are appended to both the _0 and _1 output files.
        f1_write = open(out_dir + dataset_name + '_obs_pred_rad_ssnt_0.csv',
                        'ab')
        f2_write = open(out_dir + dataset_name + '_obs_pred_rad_ssnt_1.csv',
                        'ab')
        csv.writer(f2_write).writerows(results)
        f2_write.close()
    else:
        f1_write = open(
            out_dir + dataset_name + '_obs_pred_rad_' + model + '.csv', 'ab')
    csv.writer(f1_write).writerows(results)
    f1_write.close()
def fig1(figname = 'Fig1', data_dir= mydir, saveAs = 'eps'):
    """Plot an example SAD against broken-stick, lognormal, log-series and
    Zipf predictions and save the figure under mydir/figures/.

    Inputs:
    figname - base name of the saved figure file.
    data_dir - unused; kept for interface compatibility.
    saveAs - file format passed to plt.savefig.
    """
    SAD = [10000, 8000, 6000, 5000, 1000, 200, 100,  20, 18, 16, 14, 12, 10, 4,5,
        4, 4, 3, 3, 2, 2, 2, 2, 2,2, 1, 1, 1, 1, 1,1,1,1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
    SAD.sort()
    SAD.reverse()
    x = range(1, len(SAD) +1)
    N = sum(SAD)
    S = len(SAD)

    geom = np.log10(mo.get_Geom(N, S, False))

    logSeries = np.log10(mete.get_mete_rad(S, N)[0])

    lognorm_pred = mo.lognorm(SAD, 'pln')
    lognorm_SAD = np.log10(lognorm_pred.get_rad_from_obs()[0])
    zipf_class = mo.zipf(SAD, 'fmin')
    pred_tuple = zipf_class.from_cdf()
    zipf_SAD = np.log10(pred_tuple[0])
    gamma = pred_tuple[1]  # fitted Zipf exponent (not used in the plot)

    SAD = np.log10(SAD)
    fig = plt.figure()
    plt.plot()

    max_y = max(max(SAD),  max(zipf_SAD))

    plt.plot(x, SAD,color = '#A9A9A9', linestyle = '-', linewidth=2, label="Observed")
    plt.plot(x, geom,color = '#00008B', linestyle = '-', linewidth=2, label="Broken-stick")
    plt.plot(x, lognorm_SAD, color = '#0000CD',linestyle = '--', linewidth=2, label="Lognormal")
    plt.plot(x, logSeries, color = '#FF4500',linestyle = '-.', linewidth=2, label="Log-series")
    plt.plot(x, zipf_SAD, color = 'red',linestyle = '-',linewidth=2,  label="Zipf")

    plt.tight_layout()
    plt.xlabel('Rank Abundance', fontsize = 22)
    plt.ylabel('Abundance, ' +r'$log_{10}$', fontsize = 22)

    plt.xlim(1, len(SAD))
    plt.ylim(-0.25 , max_y)

    plt.tick_params(axis='both', which='major', labelsize=14)
    # Single legend call: the original also called plt.legend(loc='upper
    # right') earlier, but that legend was replaced by this one before the
    # figure was saved, so it had no effect.  The unused local `output`
    # ("dorm_fix_prob.png") has also been removed.
    plt.legend(frameon=False, fontsize= 18)

    fig_name = str(mydir + 'figures/' + figname + '_RGB.' + saveAs)
    plt.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600, \
        format = saveAs)
    plt.close()
# Example #8
def run_test(raw_data, dataset_name, data_dir='./data/', cutoff = 9):
    """Use data to compare the predicted and empirical SADs and get results in csv files

    Keyword arguments:
    raw_data : numpy structured array with 4 columns: 'site','year','sp','ab'
    dataset_name : short code that will indicate the name of the dataset in
                    the output file names
    data_dir : directory in which to store output
    cutoff : minimum number of species required to run - 1.

    """
    usites = np.sort(list(set(raw_data["site"])))
    f1 = csv.writer(open(data_dir + dataset_name + '_obs_pred.csv','wb'))
    f2 = csv.writer(open(data_dir + dataset_name + '_dist_test.csv','wb'))

    for i, usite in enumerate(usites):
        in_site = raw_data["site"] == usite
        subsites = raw_data["site"][in_site]
        subab = raw_data["ab"][in_site]
        N = sum(subab)
        S = len(subsites)
        if S <= cutoff:
            continue
        print("%s, Site %s, S=%s, N=%s" % (dataset_name, i, S, N))
        # Generate predicted values and p (e ** -beta) based on METE:
        mete_pred = mete.get_mete_rad(int(S), int(N))
        pred = np.array(mete_pred[0])
        p = mete_pred[1]
        p_untruncated = exp(-mete.get_beta(S, N, version='untruncated'))
        obsab = np.sort(subab)[::-1]
        # Log-likelihoods of the candidate distributions:
        L_logser = md.logser_ll(obsab, p)
        L_logser_untruncated = md.logser_ll(obsab, p_untruncated)
        mu, sigma = md.pln_solver(obsab)
        L_pln = md.pln_ll(mu,sigma,obsab)
        # AICc with k = 1 for log-series, k = 2 for Poisson lognormal:
        AICc_logser = macroecotools.AICc(1, L_logser, S)
        AICc_logser_untruncated = macroecotools.AICc(1, L_logser_untruncated, S)
        AICc_pln = macroecotools.AICc(2, L_pln, S)
        weight = macroecotools.aic_weight(AICc_logser, AICc_pln, S, cutoff = 4)
        weight_untruncated = macroecotools.aic_weight(AICc_logser_untruncated,
                                                 AICc_pln, S, cutoff = 4)
        # save results to the two csv files:
        f1.writerows(np.column_stack((subsites, obsab, pred)))
        f2.writerows(np.column_stack((np.array(usites[i], dtype='S20'),
                                      S, N, p, weight,
                                      p_untruncated,
                                      weight_untruncated)))
# Example #9
def get_envpred_sads(envpred_data):
    """Build METE SAD predictions from environmentally predicted S and N."""
    envpred_sads = DataFrame(columns=['SiteID', 'EnvPred'])
    for _, site in envpred_data.iterrows():
        obs_S = site['S']
        # The regressions predict log10 values; back-transform them.
        envpred_S = 10 ** site['predlogS']
        envpred_N = 10 ** site['predlogN']
        beta = get_beta(envpred_S, envpred_N)
        #To produce a comparable number of species use obs_S; IS THIS RIGHT?
        site_sad, p = get_mete_rad(obs_S, envpred_N, beta=beta)
        labelled = DataFrame(np.column_stack([[site['SiteID']] * len(site_sad), site_sad]),
                             columns=['SiteID', 'EnvPred'])
        envpred_sads = envpred_sads.append(labelled, ignore_index=True)
    return envpred_sads
# Example #10
def generate_obs_pred_data(datasets, methods):
    """For each (method, dataset) pair read site SADs from file, build the
    predicted RAD, and print the observed-vs-predicted r^2 per site.

    Python 2 code (print statements).  File output is commented out, so
    this currently reports to stdout only.
    """
    for method in methods:
        for dataset in datasets:

            gN = 0  # running total of individuals across all sites
            #OUT = open(mydir+'/data/'+method+'_'+dataset+'_obs_pred.txt','w+')
            IN = mydir+'/MicroMETE/data/'+dataset+'_SADs.txt'
            num_lines = sum(1 for line in open(IN))

            for line in open(IN):

                line = line.split()
                obs = map(int, line)
                # Drop singletons (abundance 1) before fitting.
                obs = list([x for x in obs if x > 1])

                N = sum(obs)
                gN += N
                print N
                S = len(obs)

                # Skip species-poor sites.
                if S < 10:
                    continue

                obs.sort()
                obs.reverse()
                # Trailing comma: Python 2 print without a newline.
                print method, dataset, N, S, 'countdown:', num_lines,

                if method == 'geom': # Predicted geometric series
                    pred = predRADs.get_GeomSeries(N, S, False) # False mean no zeros allowed

                elif method == 'mete': # Predicted log-series
                    logSeries = mete.get_mete_rad(S, N)
                    pred = logSeries[0]

                # r^2 on log10 abundances.
                r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
                print " r2:", r2

                # write to file, by cite, observed and expected ranked abundances
                #for i, sp in enumerate(pred):
                #    print>> OUT, obs[i], pred[i]

                num_lines -= 1

            print 'N(HMP): ',gN
            #OUT.close()

        print dataset
# Example #11
def sim_null(S0, N0, dic_beta):
    """Abundances simulated from a discrete uniform and associated METE predictions"""
    upper = (2 * N0 - S0) / S0
    N_sim = sorted(np.random.random_integers(1, upper, S0), reverse = True)

    # When N and S are nearly equal a random draw can be all singletons,
    # which breaks the numerical solution for beta; promote one species to
    # a doubleton in that case.
    if sum(N_sim) == S0:
        N_sim[0] = 2

    # Cache beta per (S0, N0) pair.  NOTE(review): the key ignores the
    # simulated total sum(N_sim) that beta is computed from -- confirm this
    # approximation is intended.
    if (S0, N0) not in dic_beta:
        dic_beta[(S0, N0)] = mete.get_beta(S0, sum(N_sim))
    N_pred = mete.get_mete_rad(S0, sum(N_sim), dic_beta[(S0, N0)])[0]
    # Re-seed the RNG (presumably so parallel workers diverge -- confirm).
    np.random.seed()
    return N_sim, N_pred
# Example #12
# File: Global.py  Project: klocey/MicroMETE
def plot_RADs_canonical(N, S):
    """Plot geometric-series, log-series and Poisson log-normal predicted
    RADs for N individuals and S species, timing each prediction, and save
    the figure to a hard-coded path.  Python 2 code (print statements).
    """
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)

    # Predicted geometric series
    print 'generating geometric series'
    t0 = time.time()
    predRAD = get_GeomSeries(N, S, False) # False mean no zeros allowed
    t = time.time() - t0
    print 'time for geometric series:',t
    ranks = range(1, S+1)
    plt.plot(ranks, predRAD, lw = 1, c='m')

    # Predicted log-series
    print 'generating log-series'
    t0 = time.time()
    logSeries = mete.get_mete_rad(S, N)
    t = time.time() - t0
    print 'time for log-series:',t
    predRAD = logSeries[0]
    ranks = range(1, S+1)
    plt.plot(ranks, predRAD, lw = 1, c='c')

    # Predicted PLN
    print 'generating Poisson log-normal'
    t0 = time.time()
    # NOTE(review): the PLN fit takes the log-series prediction as its
    # "observed" input -- confirm this is intended.
    predRAD = pln.get_rad_from_obs(predRAD, 'pln')
    t = time.time() - t0
    print 'time for log-normal:',t
    ranks = range(1, len(predRAD)+1)
    plt.plot(ranks, predRAD, lw = 1, c='gray')

    plt.yscale('log')
    plt.savefig('/Users/lisalocey/Desktop/RareBio/figs/GlobalRADs_N='+str(int(N))+'_S='+str(int(S))+'.png',dpi=600)

    plt.show()

    return
# Example #13
def fig1(figname="Fig1", data_dir=mydir, saveAs="eps"):
    """Plot an example SAD against broken-stick, lognormal, log-series and
    Zipf predictions and save the figure under mydir/figures/.

    Inputs:
    figname - base name of the saved figure file.
    data_dir - unused; kept for interface compatibility.
    saveAs - file format passed to plt.savefig.
    """
    SAD = [
        10000,
        8000,
        6000,
        5000,
        1000,
        200,
        100,
        20,
        18,
        16,
        14,
        12,
        10,
        4,
        5,
        4,
        4,
        3,
        3,
        2,
        2,
        2,
        2,
        2,
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
    ]
    SAD.sort()
    SAD.reverse()
    x = range(1, len(SAD) + 1)
    N = sum(SAD)
    S = len(SAD)

    geom = np.log10(mo.get_Geom(N, S, False))

    logSeries = np.log10(mete.get_mete_rad(S, N)[0])

    lognorm_pred = mo.lognorm(SAD, "pln")
    lognorm_SAD = np.log10(lognorm_pred.get_rad_from_obs()[0])
    zipf_class = mo.zipf(SAD, "fmin")
    pred_tuple = zipf_class.from_cdf()
    zipf_SAD = np.log10(pred_tuple[0])
    gamma = pred_tuple[1]  # fitted Zipf exponent (not used in the plot)

    SAD = np.log10(SAD)
    fig = plt.figure()
    plt.plot()

    max_y = max(max(SAD), max(zipf_SAD))

    plt.plot(x, SAD, color="#A9A9A9", linestyle="-", linewidth=2, label="Observed")
    plt.plot(x, geom, color="#00008B", linestyle="-", linewidth=2, label="Broken-stick")
    plt.plot(x, lognorm_SAD, color="#0000CD", linestyle="--", linewidth=2, label="Lognormal")
    plt.plot(x, logSeries, color="#FF4500", linestyle="-.", linewidth=2, label="Log-series")
    plt.plot(x, zipf_SAD, color="red", linestyle="-", linewidth=2, label="Zipf")

    plt.tight_layout()
    plt.xlabel("Rank Abundance", fontsize=22)
    plt.ylabel("Abundance, " + r"$log_{10}$", fontsize=22)

    # plt.yscale('log')
    plt.xlim(1, len(SAD))
    plt.ylim(-0.25, max_y)

    plt.tick_params(axis="both", which="major", labelsize=14)
    # Single legend call: the original also called plt.legend(loc="upper
    # right") earlier, but that legend was replaced by this one before the
    # figure was saved, so it had no effect.  The unused local `output`
    # ("dorm_fix_prob.png") has also been removed.
    plt.legend(frameon=False, fontsize=18)

    fig_name = str(mydir + "figures/" + figname + "_RGB." + saveAs)
    plt.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600, format=saveAs)
    plt.close()
# Example #14
def sample_lines_mete_geom_test(datasets, SAD_number, iterations, percents):
    """Subsample the largest-N SADs of each dataset at several percentages
    and record mean METE and geometric-series fit statistics per percentage.

    Python 2 code (print statements and print>> redirection).

    Inputs:
    datasets - list of dataset codes ('MGRAST', '95'/'97'/'99', 'HMP', ...).
    SAD_number - number of top-N SADs to process per dataset.
    iterations - number of multinomial subsamples per percentage.
    percents - fractions of N to subsample at; the length checks at the
        bottom assume 6 values (4 appended stats * 6 + 3 header values = 27).
    """
    #percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    SAD_number = int(SAD_number)
    iterations = int(iterations)
    methods = ['geom', 'mete']
    for i, dataset in enumerate(datasets):
        signal.signal(signal.SIGALRM, timeout_handler)
        # Resolve the SAD input file and previously computed NSR2 results
        # for this dataset.
        if dataset == 'MGRAST':
            # fix subset l8r
            IN = mydir  + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir  + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_MGRAST'+dataset+'_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir  + dataset + '-Data' + '/' + dataset +'-SADs_NAP.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_'+dataset+'_NSR2.txt')
        else:
            IN = mydir  + dataset + '-Data' + '/' + dataset +'-SADs.txt'
            nsr2_data_mete_geom = gf.import_NSR2_data(mydir + 'NSR2/' + 'mete_'+dataset+'_NSR2.txt')

        # Sort sites by N (descending); keep the top SAD_number of them.
        nsr2_data_mete_geom_N_site = np.column_stack((nsr2_data_mete_geom["site"], nsr2_data_mete_geom["N"]))
        nsr2_data_mete_geom_sorted = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:,1].argsort()[::-1]]
        nsr2_data_mete_geom_top100 = nsr2_data_mete_geom_N_site[nsr2_data_mete_geom_N_site[:,1].argsort()[::-1]][:SAD_number,]
        # Get the SAD numbers
        mete_geom_numbers = nsr2_data_mete_geom_top100[:,0]
        mete_geom_numbers = mete_geom_numbers.astype(int)

        OUT1 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_geom_SubSampled_Data.txt', 'w+')
        OUT2 = open(mydir + 'SubSampled-Data' + '/' + dataset + '_mete_SubSampled_Data.txt', 'w+')
        num_lines = sum(1 for line in open(IN))
        test_lines = 0
        succeess_lines_geom = SAD_number
        succeess_lines_mete = SAD_number
        # Walk sites in descending-N order until enough lines succeed.
        while (succeess_lines_geom > 0) and (succeess_lines_mete > 0):
            site = nsr2_data_mete_geom_sorted[test_lines,0]
            # Scan the SAD file for the line matching this site index.
            for j,line in enumerate(open(IN)):
                if (j != site):
                    continue
                else:
                    if dataset == "HMP":
                        # HMP lines are "name, [a, b, ...]" style CSV.
                        line = line.strip().split(',')
                        line = [x.strip(' ') for x in line]
                        line = [x.strip('[]') for x in line]
                        site_name = line[0]
                        line.pop(0)
                    else:
                        line = eval(line)
                obs = map(int, line)
                # Calculate relative abundance of each OTU
                # Use that as weights
                N_0 = float(sum(obs))
                S_0 = len(obs)
                N_max = max(obs)
                if S_0 < 10 or N_0 <= S_0:
                    test_lines += 1
                    continue
                line_ra = map(lambda x: x/N_0, obs)
                # Calculate relative abundance of each OTU
                # Use that as weights
                sample_sizes = map(lambda x: round(x*N_0), percents)
                if any(sample_size <= 10 for sample_size in sample_sizes)  == True:
                    test_lines += 1
                    continue
                gm_lines = SAD_number
                geom_means = [N_0, S_0, N_max]
                mete_means = [N_0, S_0, N_max]
                print dataset, N_0, S_0, ' countdown: ', succeess_lines_geom
                # separate this. get percents for Zipf and mete/geom
                # then go on with the sampling
                failed_percents = 0
                for k, percent in enumerate(percents):
                    sample_size = round(percent * N_0)
                    if sample_size <= 10 or failed_percents > 0:
                        continue
                    mg_iter = iterations

                    N_max_list_mg = []
                    N_0_list_mg = []
                    S_0_list_mg = []
                    r2_list_BS = []
                    r2_list_METE = []
                    iter_count_current = 0
                    iter_count = iterations
                    fail_threshold = 20
                    iter_failed = 0
                    # Repeat multinomial subsampling until `iterations`
                    # successes or `fail_threshold` failures.
                    while (mg_iter > 0) and (iter_failed < fail_threshold):
                        sample_k = np.random.multinomial(sample_size, line_ra, size = None)
                        sample_k_sorted = -np.sort( -sample_k[sample_k != 0] )
                        N_k = sum(sample_k_sorted)
                        S_k = sample_k_sorted.size
                        if S_k < 10 or N_k <= S_k:
                            iter_failed += 1
                            continue
                        N_max_k = max(sample_k_sorted)
                        logSeries = mete.get_mete_rad(S_k, N_k)
                        pred_mete = logSeries[0]
                        r2_mete = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_mete))
                        pred_BS = get_GeomSeries(N_k, S_k, False) # False mean no zeros allowed
                        r2_BS = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_BS))
                        r2_list = [r2_mete, r2_BS]
                        # NOTE(review): `r2 == float('Nan')` is always False
                        # (NaN never compares equal), so a NaN r2 is NOT
                        # caught here -- confirm and consider np.isnan.
                        if any( (r2 == -float('inf') ) or (r2 == float('inf') ) or (r2 == float('Nan') ) for r2 in r2_list):
                            #mg_iter += 1
                            iter_failed += 1
                            continue
                        N_max_list_mg.append(N_max_k)
                        N_0_list_mg.append(N_k)
                        S_0_list_mg.append(S_k)
                        r2_list_BS.append(r2_BS)
                        r2_list_METE.append(r2_mete)
                        mg_iter -= 1

                    if len(N_max_list_mg) != iterations:
                        test_lines += 1
                        continue
                    # Append per-percentage means: N, S, Nmax, then r2.
                    N_0_mg_mean = np.mean(N_0_list_mg)
                    geom_means.append(N_0_mg_mean)
                    mete_means.append(N_0_mg_mean)

                    S_0_mean = np.mean(S_0_list_mg)
                    geom_means.append(S_0_mean)
                    mete_means.append(S_0_mean)

                    N_max_mg_mean = np.mean(N_max_list_mg)
                    geom_means.append(N_max_mg_mean)
                    mete_means.append(N_max_mg_mean)

                    r2_BS_mg_mean = np.mean(r2_list_BS)
                    geom_means.append(r2_BS_mg_mean)
                    r2_METE_mg_mean = np.mean(r2_list_METE)
                    mete_means.append(r2_METE_mg_mean)

                '''Now we check if the lists are the right length
                there are 6 iterations for the percentage
                mete/ geom, append four items each iteration.
                4*6 = 24, add three original = 27
                likewise, for zipf, (5*6) + 3 = 33 '''
                test_lines += 1
                if (len(geom_means) == 27):
                    succeess_lines_geom -= 1
                    geom_means_str = ' '.join(map(str, geom_means))
                    #OUT1.write(','.join(map(repr, geom_means_str[i]))
                    print>> OUT1, j, geom_means_str
                if (len(mete_means) == 27):
                    succeess_lines_mete -= 1
                    mete_means_str = ' '.join(map(str, mete_means))
                    print>> OUT2, j, mete_means_str
                print dataset, percent
# Example #15
# Demo script: draw a METE community and compute SAR up/down-scaling.
if len(sys.argv) > 1:
    S0 = int(sys.argv[1])
    N0 = int(sys.argv[2])
# NOTE(review): S0/N0 are undefined (NameError below) when no command-line
# arguments are given -- confirm whether defaults are set elsewhere.

if os.path.exists('../demo') is False:
    os.mkdir('../demo')

# beta from METE's state-variable constraints.
beta = mete.get_beta(S0, N0)

# Random draw of S0 abundances from the truncated log-series, sorted
# largest-first.
n0 = mete.trunc_logser_rvs(exp(-beta), N0, S0)
n0 = list(n0)
n0 = [int(x) for x in n0]
n0.sort(reverse=True)

# Analytical METE rank-abundance distribution.
rad = mete.get_mete_rad(S0, N0)[0]

Amax = 4
Amin = 1

# Species-area relationships: iterative (recursive) downscaling...
recur = mete.downscale_sar(Amax, S0, N0, Amin)
recur_obsSAD = mete.downscale_sar_fixed_abu(Amax, n0, Amin)

Avals = recur_obsSAD[0][ : ]

# ...and the non-iterative versions over the same areas.
nonrecur = mete.sar_noniterative(Avals, Amax, S0, N0, 'precise')
nonrecur_obsSAD = mete.sar_noniterative_fixed_abu(Avals, Amax, n0)

# Two-column array; first column holds the sampled SAD.  The second column
# is left uninitialized here -- presumably filled later in the file.
sad_out = np.empty((S0, 2))

sad_out[ : , 0] = n0
# Example #16
# Script chunk: collect r^2 values for METE and Poisson log-normal fits over
# a shuffled collection of RADs (RADs, mete_r2s, mct and get_rad_from_obs
# are defined elsewhere in the file).  Python 2 code (print statements).
zipf_r2s = []
pln_r2s = []

shuffle(RADs)
for i, obs in enumerate(RADs):

    N = int(sum(obs))
    S = int(len(obs))

    # Progress report.
    print i, N, S, len(pln_r2s)

    # Only fit communities with S >= 10 and 50 < N < 10000 (large N skipped,
    # presumably for speed -- confirm).
    if S >= 10 and N > 50:

        if N < 10000:

            result = mete.get_mete_rad(S, N)
            predRAD = result[0]
            mete_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            # NOTE(review): mete_r2s is not initialized in this chunk --
            # presumably defined earlier in the file.
            mete_r2s.append(mete_r2)

            #zipf_pred = dist.zipf(obs)
            #predRAD = zipf_pred.from_cdf()
            #zipf_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            #zipf_r2s.append(zipf_r2)

            predRAD = get_rad_from_obs(obs, 'pln')
            pln_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            pln_r2s.append(pln_r2)

    # Stop after collecting just over 200 PLN fits.
    if len(pln_r2s) > 200: break
# Example #17
def generate_obs_pred_data(datasets, methods, size):

    for method in methods:
        for dataset in datasets:
            #OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred.txt','w+')
            #OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2.txt','w+')
            #OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred_subset.txt','w+')
            #OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2_subset.txt','w+')

            if dataset == "HMP":
                IN = mydir  + dataset + '-Data' + '/' + dataset +'-SADs.txt'
                num_lines = sum(1 for line in open(IN))
                OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred.txt','w+')
                OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2.txt','w+')
            elif dataset == 'EMPclosed' or dataset == 'EMPpen':
                IN = mydir  + dataset + '-Data' + '/' + dataset +'-SADs.txt'
                num_lines = sum(1 for line in open(IN))
                random_sites = np.random.randint(num_lines,size=size)
                num_lines = size
                OUT1 = open(mydir + "ObsPred/" + method +'_'+dataset+'_obs_pred_subset.txt','w+')
                OUT2 = open(mydir + "NSR2/" + method +'_'+dataset+'_NSR2_subset.txt','w+')
                num_lines = sum(1 for line in open(IN))
            else:
                IN = mydir + 'MGRAST-Data/' + dataset +  '/' + 'MGRAST-' + dataset + '-SADs.txt'
                num_lines = sum(1 for line in open(IN))
                OUT1 = open(mydir + "ObsPred/" + method +'_'+ 'MGRAST' + dataset+'_obs_pred.txt','w+')
                OUT2 = open(mydir + "NSR2/" + method +'_'+ 'MGRAST' + dataset+'_NSR2.txt','w+')

            for j,line in enumerate(open(IN)):
                if dataset == "HMP":
                    line = line.split()
                elif size == 0:
                    line = eval(line)
                else:
                    line = eval(line)
                    if j not in random_sites:
                        continue
                #line.strip("[]")
                #line.split()
                obs = map(int, line)

                N = sum(obs)
                S = len(obs)

                if S < 10 or N <= S:
                    num_lines += 1
                    continue

                obs.sort()
                obs.reverse()
                print method, dataset, N, S, 'countdown:', num_lines,

                if method == 'geom': # Predicted geometric series
                    pred = get_GeomSeries(N, S, False) # False mean no zeros allowed

                elif method == 'mete': # Predicted log-series
                    logSeries = mete.get_mete_rad(S, N)
                    pred = logSeries[0]

                r2 = macroecotools.obs_pred_rsquare(np.log10(obs), np.log10(pred))
                print " r2:", r2
                if r2 == -float('inf') or r2 == float('inf') or r2 == float('Nan'):
                    print r2 + " is Nan or inf, removing..."
                    continue
                print>> OUT2, j, N, S, r2
                # write to file, by cite, observed and expected ranked abundances
                for i, sp in enumerate(pred):
                    print>> OUT1, j, obs[i], pred[i]


                num_lines -= 1

            OUT1.close()

        print dataset
# Example #18
# Script chunk: compare METE log-series and Poisson log-normal fits across a
# shuffled collection of RADs.  Python 2 code; RADs, minct, mct,
# get_pln_from_obs and get_kdens_choose_kernel are defined elsewhere.
print 'Number of RADs:', len(RADs)
mete_r2s = []
pln_r2s = []
zipf_r2s = []

ct = 0
shuffle(RADs)
for obs in RADs:
    N = int(sum(obs))
    S = int(len(obs))
    s = obs.count(1)  # number of singletons (not used below)

    # Only fit communities with at least 10 species and 10 individuals.
    if S > 9 and N > 9:
        ct += 1
        pred = mete.get_mete_rad(S, N)[0]
        mete_r2 = mct.obs_pred_rsquare(obs, np.array(pred))
        mete_r2s.append(mete_r2)

        pred = get_pln_from_obs(obs, 'pln')
        pred = np.log10(pred)
        obs1 = np.log10(obs)
        # NOTE(review): the PLN r^2 is computed on log10 values while the
        # METE r^2 above uses raw abundances -- confirm the asymmetry.
        pln_r2 = mct.obs_pred_rsquare(obs1, pred)
        pln_r2s.append(pln_r2)

        print ct, 'N:', N, ' S:', S, ' n:', len(
            pln_r2s), ' |  mete:', mete_r2, '  pln:', pln_r2
    # Stop once more than minct PLN fits have been collected.
    if len(pln_r2s) > minct: break

# Kernel-density estimate of the PLN r^2 distribution.
kernel = 0.5
D = get_kdens_choose_kernel(pln_r2s, kernel)
# Example #19
# Script chunk (duplicate of Example #16): collect r^2 values for METE and
# Poisson log-normal fits over a shuffled collection of RADs (RADs,
# mete_r2s, mct and get_rad_from_obs are defined elsewhere in the file).
# Python 2 code (print statements).
zipf_r2s = []
pln_r2s = []

shuffle(RADs)
for i, obs in enumerate(RADs):

    N = int(sum(obs))
    S = int(len(obs))

    # Progress report.
    print i, N, S, len(pln_r2s)

    # Only fit communities with S >= 10 and 50 < N < 10000 (large N skipped,
    # presumably for speed -- confirm).
    if S >= 10 and N > 50:

        if N < 10000:

            result = mete.get_mete_rad(S, N)
            predRAD = result[0]
            mete_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            # NOTE(review): mete_r2s is not initialized in this chunk --
            # presumably defined earlier in the file.
            mete_r2s.append(mete_r2)

            #zipf_pred = dist.zipf(obs)
            #predRAD = zipf_pred.from_cdf()
            #zipf_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            #zipf_r2s.append(zipf_r2)

            predRAD = get_rad_from_obs(obs, 'pln')
            pln_r2 = mct.obs_pred_rsquare(np.array(obs), np.array(predRAD))
            pln_r2s.append(pln_r2)

    # Stop after collecting just over 200 PLN fits.
    if len(pln_r2s) > 200: break