Exemplo n.º 1
0
def cog_info(candidates, sp2hits):
    """Summarize per-species coverage over a set of candidate COGs.

    The coverage of a species is its value in ``sp2hits`` divided by the
    number of candidates.

    Args:
        candidates: sized collection of candidate COGs; only its length
            is used.
        sp2hits: mapping from species to a hit count.

    Returns:
        Tuple ``(min_cov, max_cov, median_cov)`` computed over the
        coverage of every species present in ``sp2hits``.
    """
    n_candidates = float(len(candidates))
    sp_coverages = [hits / n_candidates for hits in sp2hits.values()]
    min_cov = _min(sp_coverages)
    # Must be _max: computing this with _min makes max_cov a copy of min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    return min_cov, max_cov, median_cov
Exemplo n.º 2
0
def cog_info(candidates, sp2hits):
    """Return the min, max and median species coverage of a COG selection.

    Every hit count in ``sp2hits`` is normalized by ``len(candidates)``
    to obtain the coverage of that species.

    Args:
        candidates: sized collection of candidate COGs; only its length
            is used.
        sp2hits: mapping from species to a hit count.

    Returns:
        Tuple ``(min_cov, max_cov, median_cov)``.
    """
    total = float(len(candidates))
    sp_coverages = [hits / total for hits in sp2hits.values()]
    min_cov = _min(sp_coverages)
    # Largest coverage; using _min here would just duplicate min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    return min_cov, max_cov, median_cov
Exemplo n.º 3
0
 def sort_cogs_by_sp_repr(c1, c2):
     """cmp-style comparator for two cogs.

     Each cog is a collection of ``(species, sequence)`` pairs. Cogs are
     ordered by, in decreasing priority:

     1. the smallest ``sp2cogs`` value among their species, ascending
        (presumably how well represented the species is -- confirm
        against where ``sp2cogs`` is built);
     2. their size, descending;
     3. their sorted members, ascending.

     Returns a negative, zero or positive number, as ``cmp`` does.
     """
     lowest_repr_1 = _min([sp2cogs[species] for species, _ in c1])
     lowest_repr_2 = _min([sp2cogs[species] for species, _ in c2])

     by_repr = cmp(lowest_repr_1, lowest_repr_2)
     if by_repr != 0:
         return by_repr

     # Tie on representation: the bigger cog goes first.
     by_size = -1 * cmp(len(c1), len(c2))
     if by_size != 0:
         return by_size

     # Still tied: fall back to comparing the sorted members.
     return cmp(sorted(c1), sorted(c2))
Exemplo n.º 4
0
 def _sort_cogs(cogs1, cogs2):
     """cmp-style comparator that ranks the better set of cogs first.

     Each argument is indexable; item ``[1]`` holds the collection of
     cogs and item ``[0]`` (seed info) is ignored. Two sets are compared
     by, in decreasing priority, the size of their largest cog, their
     rounded mean cog size and the number of cogs. Larger values sort
     first.

     Returns a negative number when ``cogs1`` ranks first, a positive
     number when ``cogs2`` does, and 0 when they tie on every criterion.
     """
     cogs1 = cogs1[1]  # discard seed info
     cogs2 = cogs2[1]  # discard seed info
     cog_sizes1 = [len(cog) for cog in cogs1]
     cog_sizes2 = [len(cog) for cog in cogs2]

     # We want to maximize all these values, in the following order.
     criteria = (
         (_max(cog_sizes1), _max(cog_sizes2)),
         (round(_mean(cog_sizes1)), round(_mean(cog_sizes2))),
         (len(cogs1), len(cogs2)),
     )
     for first, second in criteria:
         # Negated so that the larger value sorts first.
         v = -1 * cmp(first, second)
         if v != 0:
             return v
     return 0
Exemplo n.º 5
0
    def _sort_cogs(cogs1, cogs2):
        """cmp-style comparator that ranks the better set of cogs first.

        Each argument is indexable; item ``[1]`` holds the collection of
        cogs and item ``[0]`` (seed info) is ignored. Two sets are
        compared by, in decreasing priority, the size of their largest
        cog, their rounded mean cog size and the number of cogs. Larger
        values sort first.

        Returns a negative number when ``cogs1`` ranks first, a positive
        number when ``cogs2`` does, and 0 when they tie on every
        criterion.
        """
        cogs1 = cogs1[1]  # discard seed info
        cogs2 = cogs2[1]  # discard seed info
        cog_sizes1 = [len(cog) for cog in cogs1]
        cog_sizes2 = [len(cog) for cog in cogs2]

        # We want to maximize all these values, in the following order.
        criteria = (
            (_max(cog_sizes1), _max(cog_sizes2)),
            (round(_mean(cog_sizes1)), round(_mean(cog_sizes2))),
            (len(cogs1), len(cogs2)),
        )
        for first, second in criteria:
            # Negated so that the larger value sorts first.
            v = -1 * cmp(first, second)
            if v != 0:
                return v
        return 0
Exemplo n.º 6
0
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    """Score a selection of candidate COGs by size and species coverage.

    Args:
        candidates: list of COGs; each COG is a sized collection.
        sp2hits: mapping from species to a hit count. Species missing
            from the mapping count as 0 hits.
        max_cogs: value used to normalize the number of candidates.
        all_species: iterable of the species whose coverage is measured.

    Returns:
        Tuple ``(score, min_cov, max_cov, median_cov, cov_std, cog_cov)``
        where ``score`` is the smallest of the normalized number of COGs,
        the mean per-COG coverage and the coverage of the worst species.
    """
    n_candidates = float(len(candidates))
    cog_sizes = [len(cogs) for cogs in candidates]

    # NOTE(review): this denominator adds 1 to the species count while
    # cog_mean_cov below does not -- presumably to count a seed species;
    # confirm against the caller.
    cog_cov = _mean(cog_sizes) / float(len(sp2hits) + 1)
    # Mean number of species in each COG.
    cog_mean_cov = _mean([size / float(len(sp2hits)) for size in cog_sizes])

    sp_coverages = [sp2hits.get(sp, 0) / n_candidates for sp in all_species]

    nfactor = n_candidates / float(max_cogs)  # number of COGs
    min_cov = _min(sp_coverages)  # coverage of the worst species
    # Must be _max: computing this with _min makes max_cov a copy of min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)

    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
Exemplo n.º 7
0
 def sort_cogs_by_size(c1, c2):
     '''
     cmp-style comparator that orders cogs from largest to smallest.

     Cogs of equal size are ordered so that the one containing the
     least represented species (lowest sp2cogs value) comes first. If
     that also ties, the sorted (species, sequence) members decide.'''

     by_size = -1 * cmp(len(c1), len(c2))
     if by_size != 0:
         return by_size

     # Same size: look for the cog holding the least represented species.
     rarest_1 = _min([sp2cogs[species] for species, _ in c1])
     rarest_2 = _min([sp2cogs[species] for species, _ in c2])
     by_repr = cmp(rarest_1, rarest_2)
     if by_repr != 0:
         return by_repr

     return cmp(sorted(c1), sorted(c2))
Exemplo n.º 8
0
def get_identity(fname):
    """Return identity statistics for the sequences loaded from *fname*.

    The file is loaded with ``SeqGroup``. For every column (the column
    count is taken from the first sequence) the gap character "-" is
    ignored, and the identity is the share of the remaining characters
    that equal the most frequent one. Columns holding only gaps
    contribute no value.

    Returns:
        Tuple ``(max, min, mean, std)`` of the per-column identities.
    """
    alignment = SeqGroup(fname)
    n_columns = len(alignment.id2seq.itervalues().next())
    column_identity = []
    for col in xrange(n_columns):
        residue_counts = {}
        for sequence in alignment.id2seq.itervalues():
            residue = sequence[col]
            if residue == "-":
                continue
            residue_counts[residue] = residue_counts.get(residue, 0) + 1
        counts = residue_counts.values()
        if counts:
            # Share of the non-gap characters taken by the commonest one.
            column_identity.append(float(max(counts)) / sum(counts))
    return (
        _max(column_identity),
        _min(column_identity),
        _mean(column_identity),
        _std(column_identity),
    )
Exemplo n.º 9
0
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    """Score a selection of candidate COGs by size and species coverage.

    Args:
        candidates: list of COGs; each COG is a sized collection.
        sp2hits: mapping from species to a hit count. Species missing
            from the mapping count as 0 hits.
        max_cogs: value used to normalize the number of candidates.
        all_species: iterable of the species whose coverage is measured.

    Returns:
        Tuple ``(score, min_cov, max_cov, median_cov, cov_std, cog_cov)``
        where ``score`` is the smallest of the normalized number of COGs,
        the mean per-COG coverage and the coverage of the worst species.
    """
    n_candidates = float(len(candidates))
    cog_sizes = [len(cogs) for cogs in candidates]

    # NOTE(review): this denominator adds 1 to the species count while
    # cog_mean_cov below does not -- presumably to count a seed species;
    # confirm against the caller.
    cog_cov = _mean(cog_sizes) / float(len(sp2hits) + 1)
    # Mean number of species in each COG.
    cog_mean_cov = _mean([
        size / float(len(sp2hits)) for size in cog_sizes
    ])

    sp_coverages = [
        sp2hits.get(sp, 0) / n_candidates for sp in all_species
    ]

    nfactor = n_candidates / float(max_cogs)  # number of COGs
    min_cov = _min(sp_coverages)  # coverage of the worst species
    # Must be _max: computing this with _min makes max_cov a copy of min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)

    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
Exemplo n.º 10
0
def get_seqs_identity(alg, seqs):
    """Return identity statistics of an alignment for a set of sequences.

    For every column the gap character "-" is ignored, and the identity
    is the share of the remaining characters that equal the most
    frequent one. Columns holding only gaps contribute no value.

    Args:
        alg: object exposing ``get_seq(seq_id)``, which returns an
            indexable sequence.
        seqs: non-empty collection of sequence ids. The number of columns
            is taken from the first one.

    Returns:
        Tuple ``(max, min, mean, std)`` of the per-column identities.
    """
    # Fetch every sequence once up front: looking them up inside the
    # column loop repeats the same get_seq() call for every column.
    sequences = [alg.get_seq(seq_id) for seq_id in seqs]
    seqlen = len(sequences[0])
    ident = list()
    for i in xrange(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            residue = seq[i]
            if residue != "-":
                states[residue] += 1
        values = states.values()
        if values:
            # Share of the non-gap characters taken by the commonest one.
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))