# Helper statistics (_mean, _std, _median, _min, _max) and the names SeqGroup,
# GLOBALS, log, db, print_as_table, DataError, TaskError and CogSelectorTask
# are assumed to be provided by the surrounding module.
import subprocess
from collections import defaultdict
from functools import cmp_to_key

import six
from six import StringIO


def cmp(a, b):
    """Three-way comparison, removed in Python 3 but required by the
    old-style comparators below."""
    return (a > b) - (a < b)


def get_trimal_conservation(alg_file, trimal_bin):
    # The Python 2 'commands' module is gone in Python 3;
    # subprocess.getoutput is the direct replacement.
    output = subprocess.getoutput("%s -ssc -in %s" % (trimal_bin, alg_file))
    conservation = []
    # Skip the three header lines of trimal's per-column score output.
    for line in output.split("\n")[3:]:
        a, b = list(map(float, line.split()))
        conservation.append(b)
    mean = _mean(conservation)
    std = _std(conservation)
    return mean, std
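# Usage sketch (hypothetical paths; assumes trimAl is installed and the
# alignment file exists -- neither name comes from the original source):
#
#     mean_cons, std_cons = get_trimal_conservation("family01.aln.fasta",
#                                                   "/usr/local/bin/trimal")
#     print("column conservation: %0.3f +- %0.3f" % (mean_cons, std_cons))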
def get_identity(fname):
    s = SeqGroup(fname)
    # len() over an iterator raises TypeError; the alignment length is the
    # length of any single aligned sequence, so take the first one.
    seqlen = len(next(six.itervalues(s.id2seq)))
    ident = list()
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in six.itervalues(s.id2seq):
            if seq[i] != "-":
                states[seq[i]] += 1
        values = list(states.values())
        if values:
            # Column identity: frequency of the most common residue among
            # the non-gap characters in this column.
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
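# The per-column identity above reduces to max(counts)/sum(counts) over the
# non-gap residues of each column. A minimal self-contained check (toy
# sequences, no SeqGroup involved):
#
#     >>> from collections import Counter
#     >>> column = [seq[0] for seq in ("ATG-", "ATG-", "TTGA") if seq[0] != "-"]
#     >>> counts = Counter(column)          # {'A': 2, 'T': 1}
#     >>> max(counts.values()) / sum(counts.values())
#     0.6666666666666666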
def get_seqs_identity(alg, seqs):
    ''' Returns alg statistics regarding a set of sequences'''
    seqlen = len(alg.get_seq(seqs[0]))
    # Fetch each sequence once instead of once per column.
    sequences = [alg.get_seq(seq_id) for seq_id in seqs]
    ident = list()
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            if seq[i] != "-":
                states[seq[i]] += 1
        values = list(states.values())
        if values:
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))
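# Usage sketch (hypothetical alignment object and sequence ids; 'alg' is
# expected to expose get_seq() as used above):
#
#     max_i, min_i, mean_i, std_i = get_seqs_identity(
#         alg, ["Hsa_P53", "Mmu_P53", "Dme_P53"])
#     print("subset identity: %0.2f (mean) +- %0.2f" % (mean_i, std_i))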
def get_cog_score(candidates, sp2hits, max_cogs, all_species):
    cog_cov = _mean([len(cogs) for cogs in candidates]) / float(len(sp2hits) + 1)
    # Mean number of species per COG.
    cog_mean_cov = _mean([len(cogs) / float(len(sp2hits))
                          for cogs in candidates])
    cog_min_sp = _min([len(cogs) for cogs in candidates])
    sp_coverages = [sp2hits.get(sp, 0) / float(len(candidates))
                    for sp in all_species]
    species_covered = len(set(sp2hits.keys())) + 1
    nfactor = len(candidates) / float(max_cogs)  # number of COGs
    min_cov = _min(sp_coverages)  # coverage of the worst-represented species
    max_cov = _max(sp_coverages)  # coverage of the best-represented species
    median_cov = _median(sp_coverages)
    cov_std = _std(sp_coverages)
    score = _min([nfactor, cog_mean_cov, min_cov])
    return score, min_cov, max_cov, median_cov, cov_std, cog_cov
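# The final score is the weakest of three normalized factors: how many COGs
# were recovered relative to the best seed (nfactor), the mean species
# coverage per COG, and the coverage of the worst-represented species. A toy
# run with made-up numbers (two COGs, three species, a best case of 4 COGs):
#
#     >>> candidates = [{("sp1", "a"), ("sp2", "b"), ("sp3", "c")},
#     ...               {("sp1", "d"), ("sp2", "e")}]
#     >>> sp2hits = {"sp1": 2, "sp2": 2, "sp3": 1}
#     >>> score, *_ = get_cog_score(candidates, sp2hits, max_cogs=4,
#     ...                           all_species={"sp1", "sp2", "sp3"})
#     >>> round(score, 3)   # min(2/4, mean(3/3, 2/3), 1/2) == 0.5
#     0.5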
def get_best_selection(cogs_selections, species):
    ALL_SPECIES = set(species)

    def _compare_cog_selection(cs1, cs2):
        seed_1, missing_sp_allowed_1, candidates_1, sp2hits_1 = cs1
        seed_2, missing_sp_allowed_2, candidates_2, sp2hits_2 = cs2
        score_1, min_cov_1, max_cov_1, median_cov_1, cov_std_1, cog_cov_1 = \
            get_cog_score(candidates_1, sp2hits_1, median_cogs,
                          ALL_SPECIES - set([seed_1]))
        score_2, min_cov_2, max_cov_2, median_cov_2, cov_std_2, cog_cov_2 = \
            get_cog_score(candidates_2, sp2hits_2, median_cogs,
                          ALL_SPECIES - set([seed_2]))
        sp_represented_1 = len(sp2hits_1)
        sp_represented_2 = len(sp2hits_2)
        # Compare criterion by criterion, falling through to the next one on
        # ties ('or' returns the first nonzero comparison).
        return (cmp(sp_represented_1, sp_represented_2)
                or cmp(score_1, score_2)
                or cmp(min_cov_1, min_cov_2)
                or cmp(max_cov_1, max_cov_2)
                or cmp(len(candidates_1), len(candidates_2)))

    min_score = 0.5
    max_cogs = _max([len(data[2]) for data in cogs_selections])
    median_cogs = _median([len(data[2]) for data in cogs_selections])

    # list.sort() lost its cmp argument in Python 3; wrap the comparator.
    cogs_selections.sort(key=cmp_to_key(_compare_cog_selection))
    cogs_selections.reverse()

    header = ['seed', 'missing sp allowed', 'spcs covered', '#COGs',
              'mean sp coverage', '#COGs for worst sp.', '#COGs for best sp.',
              'sp. in COGS(avg)', 'SCORE']
    print_header = True
    best_cog_selection = None
    cog_analysis = StringIO()
    for i, cogs in enumerate(cogs_selections):
        seed, missing_sp_allowed, candidates, sp2hits = cogs
        sp_percent_coverages = [(100 * sp2hits.get(sp, 0)) / float(len(candidates))
                                for sp in species]
        sp_coverages = [sp2hits.get(sp, 0) for sp in species]
        score, min_cov, max_cov, median_cov, cov_std, cog_cov = get_cog_score(
            candidates, sp2hits, median_cogs, ALL_SPECIES - set([seed]))
        if best_cog_selection is None:
            best_cog_selection = i
            flag = "*"
        else:
            flag = " "
        data = (candidates,
                flag + "%10s" % seed,
                missing_sp_allowed,
                "%d (%0.1f%%)" % (len(set(sp2hits.keys())) + 1,
                                  100 * float(len(ALL_SPECIES)) / (len(sp2hits) + 1)),
                len(candidates),
                "%0.1f%% +- %0.1f" % (_mean(sp_percent_coverages),
                                      _std(sp_percent_coverages)),
                "% 3d (%0.1f%%)" % (min(sp_coverages),
                                    100 * min(sp_coverages) / float(len(candidates))),
                "% 3d (%0.1f%%)" % (max(sp_coverages),
                                    100 * max(sp_coverages) / float(len(candidates))),
                cog_cov, score)
        print_as_table([data[1:]], header=header, print_header=print_header,
                       stdout=cog_analysis)
        print_header = False
    print(cog_analysis.getvalue())
    return cogs_selections[best_cog_selection], cog_analysis
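# Note on the sort above: functools.cmp_to_key turns an old-style three-way
# comparator into a key object, so multi-criteria comparisons like
# _compare_cog_selection survive the removal of the cmp argument in Python 3.
# A minimal self-contained illustration (toy pairs, not COG data):
#
#     >>> from functools import cmp_to_key
#     >>> pairs = [(1, "b"), (2, "a"), (1, "a")]
#     >>> three_way = lambda x, y: cmp(x[0], y[0]) or cmp(x[1], y[1])
#     >>> sorted(pairs, key=cmp_to_key(three_way))
#     [(1, 'a'), (1, 'b'), (2, 'a')]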
def finish(self):

    def sort_cogs_by_size(c1, c2):
        '''Sort COGs by descending size. If two COGs have the same size,
        keep first the one including the least represented species.
        Otherwise sort by sequence name sp_seqid.'''
        r = -1 * cmp(len(c1), len(c2))
        if r == 0:
            # find the COG including the least represented species
            c1_repr = _min([sp2cogs[_sp] for _sp, _seq in c1])
            c2_repr = _min([sp2cogs[_sp] for _sp, _seq in c2])
            r = cmp(c1_repr, c2_repr)
            if r == 0:
                return cmp(sorted(c1), sorted(c2))
        return r

    def sort_cogs_by_sp_repr(c1, c2):
        c1_repr = _min([sp2cogs[_sp] for _sp, _seq in c1])
        c2_repr = _min([sp2cogs[_sp] for _sp, _seq in c2])
        r = cmp(c1_repr, c2_repr)
        if r == 0:
            r = -1 * cmp(len(c1), len(c2))
            if r == 0:
                return cmp(sorted(c1), sorted(c2))
        return r

    all_species = self.targets | self.outgroups
    # strict threshold
    #min_species = len(all_species) - int(round(self.missing_factor * len(all_species)))

    # Relax the threshold for COG selection to ensure the same genes are
    # always included.
    min_species = len(all_species) - int(
        round(self.missing_factor * len(GLOBALS["target_species"])))
    min_species = max(min_species,
                      (1 - self.max_missing_factor) * len(all_species))

    smallest_cog, largest_cog = len(all_species), 0
    all_singletons = []
    sp2cogs = defaultdict(int)
    for cognumber, cog in enumerate(open(GLOBALS["cogs_file"])):
        sp2seqs = defaultdict(list)
        for sp, seqid in [list(map(str.strip,
                                   seq.split(GLOBALS["spname_delimiter"], 1)))
                          for seq in cog.split("\t")]:
            sp2seqs[sp].append(seqid)
        one2one_cog = set()
        for sp, seqs in six.iteritems(sp2seqs):
            # Keep only species represented by exactly one sequence
            # (single-copy within this COG).
            if sp in all_species and len(seqs) == 1:
                sp2cogs[sp] += 1
                one2one_cog.add((sp, seqs[0]))
        smallest_cog = min(smallest_cog, len(one2one_cog))
        largest_cog = max(largest_cog, len(one2one_cog))
        all_singletons.append(one2one_cog)

    cognumber += 1  # sets the amount of COGs in the file

    for sp, ncogs in sorted(list(sp2cogs.items()), key=lambda x: x[1],
                            reverse=True):
        log.log(28, "% 20s found in single copy in % 6d (%0.1f%%) COGs " %
                (sp, ncogs, 100 * ncogs / float(cognumber)))

    # sorted() lost its cmp argument in Python 3; wrap the comparator.
    valid_cogs = sorted([sing for sing in all_singletons
                         if len(sing) >= min_species],
                        key=cmp_to_key(sort_cogs_by_size))

    log.log(28, "Largest cog size: %s. Smallest cog size: %s" %
            (largest_cog, smallest_cog))
    self.cog_analysis = ""

    # save original cog names hitting the hard limit
    if len(valid_cogs) > self.cog_hard_limit:
        log.warning("Applying hard limit number of COGs: %d out of %d available" %
                    (self.cog_hard_limit, len(valid_cogs)))
    self.raw_cogs = valid_cogs[:self.cog_hard_limit]
    self.cogs = []
    # Translate sequence names into the internal DB names
    sp_repr = defaultdict(int)
    sizes = []
    for co in self.raw_cogs:
        sizes.append(len(co))
        for sp, seq in co:
            sp_repr[sp] += 1
        co_names = ["%s%s%s" % (sp, GLOBALS["spname_delimiter"], seq)
                    for sp, seq in co]
        encoded_names = db.translate_names(co_names)
        if len(encoded_names) != len(co):
            print(set(co) - set(encoded_names.keys()))
            raise DataError("Some sequence ids could not be translated")
        self.cogs.append(list(encoded_names.values()))

    # ERROR! COGs selected are not the priority COGs sorted out before!!!
    # Sort COGs according to the md5 hash of their content: random sorting,
    # but kept among runs.
    #map(lambda x: x.sort(), self.cogs)
    #self.cogs.sort(lambda x, y: cmp(md5(','.join(x)), md5(','.join(y))))

    log.log(28, "Analysis of current COG selection:")
    for sp, ncogs in sorted(list(sp_repr.items()), key=lambda x: x[1],
                            reverse=True):
        log.log(28, " % 30s species present in % 6d COGs (%0.1f%%)" %
                (sp, ncogs, 100 * ncogs / float(len(self.cogs))))
    log.log(28, " %d COGs selected with at least %d species out of %d" %
            (len(self.cogs), min_species, len(all_species)))
    log.log(28, " Average COG size %0.1f/%0.1f +- %0.1f" %
            (_mean(sizes), _median(sizes), _std(sizes)))

    # Some consistency checks
    missing_sp = all_species - set(sp_repr.keys())
    if missing_sp:
        log.error("%d missing species or not present in single-copy in any cog:\n%s" %
                  (len(missing_sp), '\n'.join(missing_sp)))
        with open('etebuild.valid_species_names.tmp', 'w') as handle:
            handle.write('\n'.join(list(sp_repr.keys())) + '\n')
        log.error("All %d valid species have been dumped into "
                  "etebuild.valid_species_names.tmp. You can use --spfile to "
                  "restrict the analysis to those species." % len(sp_repr))
        raise TaskError('missing or not single-copy species under current '
                        'cog selection')

    CogSelectorTask.store_data(self, self.cogs, self.cog_analysis)
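# The parser in finish() expects one COG per line in GLOBALS["cogs_file"],
# each line holding tab-separated "species<delimiter>seqid" entries. A
# sketch of the expected input, assuming "_" as spname_delimiter (both the
# delimiter and the names are illustrative, not from the original source):
#
#     Hsa_0001\tMmu_0034\tDme_0221
#     Hsa_0002\tMmu_0078\tDme_0410\tDme_0411
#
# In the second line, Dme appears twice, so the single-copy filter above
# would exclude it from that COG's one-to-one set.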