def cog_info(candidates, sp2hits):
    """Return the minimum, maximum and median per-species coverage.

    The coverage of each entry in *sp2hits* is its hit count divided by the
    number of candidates.

    candidates -- sized collection; only its length is used.
    sp2hits    -- mapping whose values are hit counts (keys are presumably
                  species names -- confirm against the caller).

    Returns a ``(min_cov, max_cov, median_cov)`` tuple.
    """
    total = float(len(candidates))
    sp_coverages = [hits / total for hits in sp2hits.values()]
    min_cov = _min(sp_coverages)
    # The maximum was previously computed with _min, so it always equalled
    # min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    return min_cov, max_cov, median_cov
def cog_info(candidates, sp2hits):
    """Summarise per-species coverage as (minimum, maximum, median).

    Coverage is computed for every value in *sp2hits* as
    ``hits / len(candidates)``.

    candidates -- sized collection; only its length is used.
    sp2hits    -- mapping whose values are hit counts (keys are presumably
                  species names -- confirm against the caller).

    Returns a ``(min_cov, max_cov, median_cov)`` tuple.
    """
    n_candidates = float(len(candidates))
    sp_coverages = [hits / n_candidates for hits in sp2hits.values()]
    # max_cov used to be computed with _min, which made it a duplicate of
    # min_cov; it now reports the real maximum.
    return (
        _min(sp_coverages),
        _max(sp_coverages),
        _median(sp_coverages),
    )
def sort_cogs_by_sp_repr(c1, c2):
    """Comparator ordering cogs by their least represented species.

    Each cog is an iterable of ``(species, sequence)`` pairs.  ``sp2cogs`` is
    not a parameter: it is looked up in the enclosing scope and indexed by
    species.  Cogs are ordered by, in priority order:

      1. ascending minimum ``sp2cogs`` value among the cog's species,
      2. descending cog size,
      3. ascending sorted member list.

    Returns -1, 0 or 1, following the classic comparator convention.
    """
    c1_repr = _min([sp2cogs[_sp] for _sp, _seq in c1])
    c2_repr = _min([sp2cogs[_sp] for _sp, _seq in c2])
    # Explicit three-way comparisons are used instead of ``cmp``, which does
    # not exist on Python 3.
    if c1_repr != c2_repr:
        return -1 if c1_repr < c2_repr else 1

    size1, size2 = len(c1), len(c2)
    if size1 != size2:
        # Larger cogs sort first.
        return -1 if size1 > size2 else 1

    members1, members2 = sorted(c1), sorted(c2)
    return (members1 > members2) - (members1 < members2)
def _sort_cogs(cogs1, cogs2):
    """Comparator that ranks two cog sets so that the larger one sorts first.

    Each argument is an indexable pair whose item ``[1]`` is the list of cogs
    (item ``[0]``, the seed info, is ignored).  The sets are compared by, in
    priority order:

      1. size of their largest cog,
      2. rounded mean cog size,
      3. number of cogs.

    Larger values are preferred: the result is -1 when *cogs1* should come
    first, 1 when *cogs2* should, and 0 when all three criteria tie.
    """
    cogs1 = cogs1[1]  # discard seed info
    cogs2 = cogs2[1]  # discard seed info
    sizes1 = [len(cog) for cog in cogs1]
    sizes2 = [len(cog) for cog in cogs2]

    # We want to maximize all these values, in the following order.
    criteria = (
        (_max(sizes1), _max(sizes2)),
        (round(_mean(sizes1)), round(_mean(sizes2))),
        (len(cogs1), len(cogs2)),
    )
    for first, second in criteria:
        if first != second:
            # Explicit three-way comparison: the ``cmp`` builtin does not
            # exist on Python 3.
            return -1 if first > second else 1
    return 0
def _sort_cogs(cogs1, cogs2):
    """Comparator placing the "bigger" of two cog sets first.

    Both arguments are indexable pairs; only item ``[1]`` (the list of cogs)
    is used, item ``[0]`` (seed info) is discarded.  Criteria, from most to
    least significant, all to be maximized:

      1. size of the largest cog,
      2. rounded mean cog size,
      3. number of cogs.

    Returns -1 if *cogs1* ranks first, 1 if *cogs2* ranks first, 0 on a
    complete tie.
    """
    cogs1 = cogs1[1]  # discard seed info
    cogs2 = cogs2[1]  # discard seed info
    cog_sizes1 = [len(cog) for cog in cogs1]
    cog_sizes2 = [len(cog) for cog in cogs2]

    mx1, avg1 = _max(cog_sizes1), round(_mean(cog_sizes1))
    mx2, avg2 = _max(cog_sizes2), round(_mean(cog_sizes2))

    for i, j in ((mx1, mx2), (avg1, avg2), (len(cogs1), len(cogs2))):
        if i == j:
            continue
        # ``cmp`` is unavailable on Python 3, so compare explicitly;
        # the sign is inverted because larger values must sort first.
        return -1 if i > j else 1
    return 0
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    """Score a set of candidate cogs and report species-coverage statistics.

    candidates  -- sequence of cogs (each cog is a sized collection).
    sp2hits     -- mapping species -> number of hits; species missing from
                   the mapping count as 0 hits.
    max_cogs    -- value used to normalise the number of candidates.
    all_species -- iterable of every species to compute coverage for.

    Returns ``(score, min_cov, max_cov, median_cov, cov_std, cog_cov)`` where

      * the per-species coverage is ``hits / len(candidates)``;
      * ``min_cov``, ``max_cov``, ``median_cov`` and ``cov_std`` summarise
        those coverages;
      * ``cog_cov`` is the mean cog size divided by ``len(sp2hits) + 1``;
      * ``score`` is the smallest of ``len(candidates) / max_cogs``, the mean
        of ``len(cog) / len(sp2hits)`` over all cogs, and ``min_cov``.
    """
    n_candidates = float(len(candidates))
    cog_sizes = [len(cogs) for cogs in candidates]

    cog_cov = _mean(cog_sizes) / float(len(sp2hits) + 1)
    # Average number of species in each cog.
    cog_mean_cov = _mean([size / float(len(sp2hits)) for size in cog_sizes])

    sp_coverages = [sp2hits.get(sp, 0) / n_candidates for sp in all_species]
    nfactor = len(candidates) / float(max_cogs)  # number of cogs

    min_cov = _min(sp_coverages)  # coverage of the worst species
    # Previously computed with _min, which made max_cov equal to min_cov.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)

    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
def sort_cogs_by_size(c1, c2):
    '''Comparator: sort cogs by descending size.

    If two cogs are the same size, the one including the less represented
    species (lowest ``sp2cogs`` value among its members) is kept first.
    Otherwise cogs are ordered by their sorted (species, sequence) members.

    ``sp2cogs`` is not a parameter: it is looked up in the enclosing scope
    and indexed by species.

    Returns -1, 0 or 1, following the classic comparator convention.
    '''
    size1, size2 = len(c1), len(c2)
    # Explicit three-way comparisons are used instead of ``cmp``, which does
    # not exist on Python 3.
    if size1 != size2:
        return -1 if size1 > size2 else 1

    # Find the least represented species of each cog.
    c1_repr = _min([sp2cogs[_sp] for _sp, _seq in c1])
    c2_repr = _min([sp2cogs[_sp] for _sp, _seq in c2])
    if c1_repr != c2_repr:
        return -1 if c1_repr < c2_repr else 1

    members1, members2 = sorted(c1), sorted(c2)
    return (members1 > members2) - (members1 < members2)
def get_identity(fname):
    """Summarise the per-column identity of the sequences loaded from *fname*.

    The file is loaded with ``SeqGroup``.  Columns are indexed over the
    length of the first sequence.  For each column, identity is the count of
    the most frequent non-gap character divided by the number of non-gap
    characters; columns made only of gaps (``"-"``) are skipped.

    Returns ``(max, min, mean, std)`` of the column identities, computed with
    ``_max``, ``_min``, ``_mean`` and ``_std``.

    Raises StopIteration if the group holds no sequences.
    """
    s = SeqGroup(fname)
    # ``values()``, ``next(iter(...))`` and ``range`` work on both Python 2
    # and 3, unlike ``itervalues().next()`` and ``xrange``.  The sequences
    # are materialised once so the column loop does not rebuild the view.
    sequences = list(s.id2seq.values())
    seqlen = len(next(iter(sequences)))
    ident = []
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            if seq[i] != "-":
                states[seq[i]] += 1
        if states:
            counts = states.values()
            ident.append(float(max(counts)) / sum(counts))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    """Compute a quality score and coverage statistics for candidate cogs.

    candidates  -- sequence of cogs (each cog is a sized collection).
    sp2hits     -- mapping species -> number of hits; species absent from the
                   mapping are treated as having 0 hits.
    max_cogs    -- value used to normalise the number of candidates.
    all_species -- iterable of every species to compute coverage for.

    Returns ``(score, min_cov, max_cov, median_cov, cov_std, cog_cov)``:

      * species coverage is ``hits / len(candidates)`` for each species in
        *all_species*; ``min_cov``, ``max_cov``, ``median_cov`` and
        ``cov_std`` describe that distribution;
      * ``cog_cov`` is the mean cog size divided by ``len(sp2hits) + 1``;
      * ``score`` is the minimum of ``len(candidates) / max_cogs``, the mean
        of ``len(cog) / len(sp2hits)`` over the cogs, and ``min_cov``.
    """
    n_species_hit = len(sp2hits)
    n_cogs = len(candidates)

    cog_cov = (
        _mean([len(cogs) for cogs in candidates]) / float(n_species_hit + 1)
    )
    # Average number of species in each cog.
    cog_mean_cov = _mean([
        len(cogs) / float(n_species_hit) for cogs in candidates
    ])
    sp_coverages = [
        sp2hits.get(sp, 0) / float(n_cogs) for sp in all_species
    ]
    nfactor = n_cogs / float(max_cogs)  # number of cogs

    min_cov = _min(sp_coverages)  # coverage of the worst species
    # Bug fix: this used _min, so the reported maximum was the minimum.
    max_cov = _max(sp_coverages)
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)

    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
def get_seqs_identity(alg, seqs):
    """Return identity statistics of *alg* restricted to a set of sequences.

    alg  -- object exposing ``get_seq(seq_id)``, returning an indexable
            sequence of characters.
    seqs -- non-empty sequence of sequence ids to look up in *alg*.

    Columns are indexed over the length of the first listed sequence.  For
    each column, identity is the count of the most frequent non-gap
    character divided by the number of non-gap characters; columns made only
    of gaps (``"-"``) are skipped.

    Returns ``(max, min, mean, std)`` of the column identities, computed with
    ``_max``, ``_min``, ``_mean`` and ``_std``.

    Raises IndexError if *seqs* is empty.
    """
    # Fetch every sequence once up front: get_seq() used to be called again
    # for each sequence at every column.
    sequences = [alg.get_seq(seq_id) for seq_id in seqs]
    seqlen = len(sequences[0])
    ident = []
    # ``range`` (rather than ``xrange``) keeps this working on Python 3.
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            if seq[i] != "-":
                states[seq[i]] += 1
        if states:
            counts = states.values()
            ident.append(float(max(counts)) / sum(counts))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))