Example #1
def test_subsample(self):
    """subsample should return a random subsample of a vector"""
    # assumes a numpy-aware TestCase (e.g. PyCogent's), whose
    # assertEqual compares arrays elementwise
    a = array([0, 5, 0])
    self.assertEqual(subsample(a, 5), array([0, 5, 0]))
    self.assertEqual(subsample(a, 2), array([0, 2, 0]))
    b = array([2, 0, 1])
    obs = subsample(b, 2)
    assert (obs == array([1, 0, 1])).all() or (obs == array([2, 0, 0])).all()
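For context, subsample draws a fixed number of observations without replacement from a vector of per-species counts. The sketch below is a minimal, hypothetical stand-in showing the idea in plain numpy; subsample_sketch is an illustrative name, not QIIME's actual implementation.

from numpy import arange, array, bincount, repeat
from numpy.random import permutation

def subsample_sketch(counts, n):
    """Randomly keep n observations from a count vector."""
    counts = array(counts)
    if counts.sum() <= n:  # already small enough; return unchanged
        return counts
    # expand counts into one index per observation: [2,0,1] -> [0,0,2]
    expanded = repeat(arange(len(counts)), counts)
    # shuffle, keep the first n observations, and re-tally per species
    kept = permutation(expanded)[:n]
    return bincount(kept, minlength=len(counts))

subsample_sketch(array([0, 5, 0]), 2)  # always array([0, 2, 0])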
Example #2
File: parse.py Project: binma/qiime
def filter_otus_by_lineage(sample_ids, otu_ids, otu_table, lineages, \
    wanted_lineage, max_seqs_per_sample, min_seqs_per_sample):
    """Filter OTU table to keep only desired lineages and sample sizes."""
    #first step: figure out which OTUs we want to keep
    if wanted_lineage is not None:  #None = keep all
        if '&&' in wanted_lineage:
            wanted_lineage = set(wanted_lineage.split('&&'))
        else:
            wanted_lineage = set([wanted_lineage])
        good_indices = []
        for i,l in enumerate(lineages):
            if set(l).intersection(wanted_lineage):
                good_indices.append(i)
        otu_table = otu_table[good_indices]
        otu_ids = map(otu_ids.__getitem__, good_indices)
        lineages = map(lineages.__getitem__, good_indices)
    #now have reduced collection of OTUs filtered by lineage.
    #figure out which samples will be dropped because too small
    big_enough_samples = (otu_table.sum(0)>=min_seqs_per_sample).nonzero()
    otu_table = otu_table[:,big_enough_samples[0]]
    sample_ids = map(sample_ids.__getitem__, big_enough_samples[0])
    #figure out which samples will be reduced because too big
    too_big_samples = (otu_table.sum(0)>max_seqs_per_sample).nonzero()[0]
    if too_big_samples.shape[0]:    #means that there were some
        for i in too_big_samples:
            otu_table[:,i] = subsample(otu_table[:,i].ravel(), \
                max_seqs_per_sample)
    return sample_ids, otu_ids, otu_table, lineages
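A small usage sketch with illustrative data (Python 2 era, like the snippet itself, where map returns a list; assumes subsample is in scope as in the project): keep Firmicutes OTUs, drop samples with fewer than 2 sequences, and rarefy samples with more than 8 sequences down to 8.

from numpy import array

otu_table = array([[10, 0],
                   [ 5, 3]])   # rows = OTUs, cols = samples
sample_ids = ['S1', 'S2']
otu_ids = ['otu1', 'otu2']
lineages = [['Bacteria', 'Firmicutes'],
            ['Bacteria', 'Proteobacteria']]

sample_ids, otu_ids, otu_table, lineages = filter_otus_by_lineage(
    sample_ids, otu_ids, otu_table, lineages,
    'Firmicutes', max_seqs_per_sample=8, min_seqs_per_sample=2)
# -> only otu1 survives the lineage filter; S2 (0 seqs left) is
#    dropped, and S1 is subsampled from 10 down to 8 sequences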
Example #3
def get_rare_data(sample_ids,
                  otu_table,
                  seqs_per_sample,
                  include_small_samples=False):
    """Filter OTU table to keep only desired sample sizes.

    - include_small_samples=False => do not write samples with < seqs_per_sample
    total sequences
    - otu_table (input and out) is otus(rows) by samples (cols)
    - no otus are removed, even if they are absent in the rarefied table"""
    res_otu_table = otu_table.copy()
    res_sample_ids = sample_ids
    #figure out which samples will be reduced because too big
    too_big_samples = (otu_table.sum(0) > seqs_per_sample).nonzero()[0]
    if too_big_samples.shape[0]:  #means that there were some
        for i in too_big_samples:
            res_otu_table[:, i] = subsample(otu_table[:, i].ravel(),
                                            seqs_per_sample)
    if not include_small_samples:
        #drop samples that are still too small after rarefaction
        big_enough_samples = (res_otu_table.sum(0) >=
                              seqs_per_sample).nonzero()
        res_otu_table = res_otu_table[:, big_enough_samples[0]]
        res_sample_ids = map(sample_ids.__getitem__, big_enough_samples[0])
    return res_sample_ids, res_otu_table
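A usage sketch with illustrative data: rarefy each sample to 4 sequences; the sample with only 1 sequence total is dropped because include_small_samples defaults to False.

from numpy import array

otu_table = array([[3, 1],
                   [7, 0]])   # rows = OTUs, cols = samples
sample_ids = ['S1', 'S2']

rare_ids, rare_table = get_rare_data(sample_ids, otu_table, 4)
# -> rare_ids == ['S1']; S1's column now sums to 4, S2 was dropped
#    (1 < 4); both OTU rows are kept even if one ends up all zeros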
Example #4
def test_subsample(self):
    """subsample should return a random subsample of a vector"""
    # assumes a numpy-aware TestCase whose assertEqual compares
    # arrays elementwise
    a = array([0, 5, 0])
    self.assertEqual(subsample(a, 5), array([0, 5, 0]))
    self.assertEqual(subsample(a, 2), array([0, 2, 0]))

    # selecting 2 counts from the vector 1000 times yields each of the
    # two possible results at least once each
    b = array([2, 0, 1])
    actual = {}
    for i in range(1000):
        e = subsample(b, 2)
        actual[tuple(e)] = None
    self.assertEqual(actual, {(1, 0, 1): None, (2, 0, 0): None})

    obs = subsample(b, 2)
    assert (obs == array([1, 0, 1])).all() or (obs == array([2, 0, 0])).all()
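The two outcomes follow from choosing 2 of the 3 underlying observations (two of species 0, one of species 2): either both draws hit species 0, or one hits each. A standalone enumeration confirming this:

from itertools import combinations
from numpy import arange, bincount, repeat

obs = repeat(arange(3), [2, 0, 1])   # one index per observation -> [0, 0, 2]
outcomes = set(tuple(int(c) for c in bincount(pair, minlength=3))
               for pair in combinations(obs, 2))
print(outcomes)   # {(2, 0, 0), (1, 0, 1)}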
Example #5
def michaelis_menten_fit(counts,
                         num_repeats=1,
                         params_guess=None,
                         return_b=False):
    """Michaelis-Menten fit to rarefaction curve of observed species

    Note: there is some controversy about how to do the fitting. The ML model
    givem by Raaijmakers 1987 is based on the assumption that error is roughly
    proportional to magnitude of observation, reasonable for enzyme kinetics
    but not reasonable for rarefaction data. Here we just do a nonlinear
    curve fit for the parameters using least-squares.


    S = Smax*n/(B + n) . n: number of individuals, S: # of species
    returns Smax

    inputs:
    num_repeats: will perform rarefaction (subsampling without replacement)
    this many times at each value of n
    params_guess: intial guess of Smax, B (None => default)
    return_b: if True will return the estimate for Smax, B. Default is just Smax

    the fit is made to datapoints where n = 1,2,...counts.sum(),
    S = species represented in random sample of n individuals

    """
    counts = asarray(counts)
    if params_guess is None:
        params_guess = array([100, 500])

    # observed # of species vs # of individuals sampled, S vs n
    xvals = arange(1, counts.sum() + 1)
    ymtx = []
    for i in range(num_repeats):
        ymtx.append(
            array([
                observed_species(rarefaction.subsample(counts, n))
                for n in xvals
            ]))
    ymtx = asarray(ymtx)
    yvals = ymtx.mean(0)

    # fit to obs_sp = max_sp * num_indiv / (num_indiv + B)
    # return max_sp
    def fitfn(p, n):  # works with vectors of n, returns vector of S
        return p[0] * n / (p[1] + n)

    def errfn(p, n, y):  # vectors of actual vals y and number of individuals n
        return ((fitfn(p, n) - y)**2).sum()

    p1 = fmin_powell(errfn, params_guess, args=(xvals, yvals), disp=0)
    if return_b:
        return p1
    else:
        return p1[0]  # return only S_max, not the K_m (B) param
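The least-squares step in isolation, on synthetic data generated from the model itself (self-contained sketch; the parameter values 40 and 10 are arbitrary): fmin_powell recovers Smax and B.

from numpy import arange, asarray
from scipy.optimize import fmin_powell

n = arange(1.0, 101.0)
s = 40.0 * n / (10.0 + n)   # synthetic curve with Smax=40, B=10

def errfn(p, n, y):
    return ((p[0] * n / (p[1] + n) - y) ** 2).sum()

p = fmin_powell(errfn, asarray([100.0, 500.0]), args=(n, s), disp=0)
print(p)   # approximately [40., 10.]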