def run_ptp(self, jp, fout=None):
    """Run EPA-PTP species delimitation and report the resulting clusters.

    Builds a SeqGroup from ``self.epa_alignment``, passes it together with
    the placements ``jp`` and ``self.refjson`` to ``epa_2_ptp`` and emits
    one comma-separated line of query names per species cluster: always to
    the debug log, and additionally to ``<fout>.species`` when an output
    base name is available.

    Parameters:
        jp:   EPA placement result handed to ``epa_2_ptp`` as ``epa_jp``.
        fout: optional base name of the output file; ".species" is
              appended to it.

    The original body read ``fout`` without it being defined in this scope,
    which raises NameError on every call.  It is now an optional parameter.
    """
    if fout is None:
        # NOTE(review): assumes the species file should sit next to the
        # assignment output when the caller gives no explicit name --
        # confirm this is the intended location.
        fout = getattr(self, "out_assign_fname", None)

    full_aln = SeqGroup(self.epa_alignment)
    species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson,
                             full_alignment=full_aln, min_lw=0.5,
                             debug=self.cfg.debug)

    self.cfg.log.debug("Species clusters:")

    fo2 = open(fout + ".species", "w") if fout else None
    try:
        for sp_cluster in species_list:
            # Report taxa under their original names (query prefix removed).
            s = ",".join([EpacConfig.strip_query_prefix(taxon)
                          for taxon in sp_cluster])
            if fo2:
                fo2.write(s + "\n")
            self.cfg.log.debug(s)
    finally:
        # Close the file even if writing or logging fails part-way.
        if fo2:
            fo2.close()
def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws):
    """Compare the original and the inferred lineage of one sequence.

    Scans the rank levels both lineages have in common and looks for the
    first level whose inferred rank is not Taxonomy.EMPTY_RANK and differs
    from the original rank.

    Returns None when no such level exists.  Otherwise a mislabel record
    (dict) is built, appended to ``self.mislabels`` and returned.
    """
    shared_depth = min(len(orig_ranks), len(ranks))
    mislabel_lvl = next(
        (lvl for lvl in range(shared_depth)
         if ranks[lvl] != Taxonomy.EMPTY_RANK and ranks[lvl] != orig_ranks[lvl]),
        -1)

    if mislabel_lvl < 0:
        return None

    real_lvl = self.guess_rank_level(orig_ranks, mislabel_lvl)
    mis_rec = {
        'name': EpacConfig.strip_ref_prefix(seq_name),
        'orig_level': mislabel_lvl,
        'real_level': real_lvl,
        'level_name': self.rank_level_name(real_lvl)[1],
        'inv_level': -1 * real_lvl,  # just for sorting
        'orig_ranks': orig_ranks,
        'ranks': ranks,
        'lws': lws,
        'conf': lws[mislabel_lvl],
    }
    self.mislabels.append(mis_rec)
    return mis_rec
def get_noalign_list(self):
    """Return the names listed in the file ``self.noalign``.

    Each line is stripped of surrounding whitespace, loses its first
    character (presumably a leading marker such as ">" -- verify against
    the code that writes the file) and is passed through
    EpacConfig.strip_query_prefix.  An empty list is returned when the
    file does not exist.
    """
    if not os.path.exists(self.noalign):
        return []

    with open(self.noalign) as fnoa:
        raw_lines = fnoa.readlines()

    return [EpacConfig.strip_query_prefix(raw.strip()[1:])
            for raw in raw_lines]
def setUp(self):
    """Prepare the fixtures: taxonomy, tax-tree helper and expected outgroup.

    All input files are read from the "testfiles" directory located next
    to this module.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.testfile_dir = os.path.join(module_dir, "testfiles")

    self.tax_fname = os.path.join(self.testfile_dir, "test_clean.tax")
    self.taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)

    ref_map = self.taxonomy.get_map()
    self.taxtree_helper = TaxTreeHelper(EpacConfig(), ref_map)

    self.expected_outgr = Tree(
        os.path.join(self.testfile_dir, "outgroup.nw"))
def test_taxtree_builder(self):
    """Build a taxonomy tree from test.tax and compare it with taxtree.nw.

    Checks both the returned sequence IDs (against the keys of the
    taxonomy map) and the serialized tree (format=8).
    """
    cfg = EpacConfig()
    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "testfiles")

    tax = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                   os.path.join(data_dir, "test.tax"))
    expected_tree = Tree(os.path.join(data_dir, "taxtree.nw"), format=8)

    builder = TaxTreeBuilder(cfg, tax)
    tax_tree, seq_ids = builder.build()

    self.assertEqual(seq_ids, tax.get_map().keys())
    self.assertEqual(tax_tree.write(format=8),
                     expected_tree.write(format=8))
def classify(self, query_fname, minp=0.9, ptp=False):
    """Assign taxonomic labels to query sequences from EPA placements.

    Placements are read from ``self.jplace_fname`` when it is set;
    otherwise the input is checked with ``self.checkinput`` and
    ``self.run_epa`` is called to produce them.  For every placement one
    result line is handed to ``self.print_result_line`` together with the
    output file (``self.out_assign_fname``, or None when it is unset).
    Queries for which ``print_ranks`` returns None, plus everything
    returned by ``self.get_noalign_list()``, are reported at the end as
    "<name>\\t\\t\\t?".

    Parameters:
        query_fname: query file name, forwarded to ``checkinput``.
        minp:        forwarded to ``checkinput``.
        ptp:         when true, ``self.run_ptp(jp)`` is run afterwards.
    """
    if self.jplace_fname:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)
        jp = self.run_epa()

    self.cfg.log.info(
        "Assigning taxonomic labels based on EPA placements...\n")

    placements = jp.get_placement()

    fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None
    try:
        noassign_list = []
        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            ranks, lws = self.classify_helper.classify_seq(edges)
            rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

            if rankout is None:
                noassign_list.append(origin_taxon_name)
                continue

            output = "%s\t%s\t" % (origin_taxon_name, rankout)
            if self.cfg.check_novelty:
                isnovo = self.novelty_check(place_edge=str(edges[0][0]),
                                            ranks=ranks, lws=lws)
                output += "*" if isnovo else "o"
            self.print_result_line(fo, output)

        noassign_list += self.get_noalign_list()

        for taxon_name in noassign_list:
            # Bug fix: the original formatted the stale `origin_taxon_name`
            # left over from the placement loop, so every line repeated the
            # last query (or raised NameError with no placements at all).
            self.print_result_line(fo, "%s\t\t\t?" % taxon_name)
    finally:
        # Close the output even if classification fails part-way.
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        self.run_ptp(jp)
def check_seq_ids(self):
    """Check that seq IDs in taxonomy and alignment correspond.

    Every key of ``self.taxonomy.seq_ranks_map`` is stripped of its
    reference prefix and looked up in ``self.alignment``.  IDs that are not
    found are collected in ``self.mis_ids``, which is also returned.  When
    at least one ID is missing and ``self.verbose`` is set, an error
    message is passed to ``self.cfg.exit_user_error``.
    """
    missing = []
    self.mis_ids = missing

    for prefixed_id in self.taxonomy.seq_ranks_map.iterkeys():
        bare_id = EpacConfig.strip_ref_prefix(prefixed_id)
        if self.alignment.has_seq(bare_id):
            continue
        missing.append(bare_id)

    if missing and self.verbose:
        errmsg = ("ERROR: Following %d sequence(s) are missing in your alignment file:\n%s\n\n"
                  % (len(missing), "\n".join(missing)))
        errmsg = errmsg + "Please make sure sequence IDs in taxonomic annotation file and in alignment are identical!\n"
        self.cfg.exit_user_error(errmsg)

    return self.mis_ids
def check_seq_ids(self):
    """Verify that every taxonomy sequence ID is present in the alignment.

    Walks the keys of ``self.taxonomy.seq_ranks_map``, removes the
    reference prefix from each and asks ``self.alignment.has_seq`` whether
    it exists.  Missing IDs end up in ``self.mis_ids`` (also the return
    value).  If any are missing and ``self.verbose`` is set, the list is
    reported through ``self.cfg.exit_user_error``.
    """
    self.mis_ids = []
    record_missing = self.mis_ids.append

    for sid in self.taxonomy.seq_ranks_map.iterkeys():
        plain_sid = EpacConfig.strip_ref_prefix(sid)
        found = self.alignment.has_seq(plain_sid)
        if not found:
            record_missing(plain_sid)

    n_missing = len(self.mis_ids)
    if n_missing > 0 and self.verbose:
        listing = "\n".join(self.mis_ids)
        errmsg = "ERROR: Following %d sequence(s) are missing in your alignment file:\n%s\n\n" % (
            n_missing, listing)
        errmsg += "Please make sure sequence IDs in taxonomic annotation file and in alignment are identical!\n"
        self.cfg.exit_user_error(errmsg)

    return self.mis_ids
def mis_rec_to_string(self, mis_rec):
    """Format one mislabel record as a tab-separated line.

    Columns: sequence name, level name, original rank, proposed rank,
    confidence at the mislabel level, original lineage, proposed lineage,
    semicolon-separated per-level confidences and, if the record has a
    'rank_conf' entry, that value as a final column.  Names and ranks are
    first mapped through the ``self.refjson.get_uncorr_*`` helpers.  When
    'orig_level' is negative, "NA" is used for both rank columns and the
    first confidence value is reported.
    """
    lvl = mis_rec['orig_level']
    uncorr_name = EpacConfig.strip_ref_prefix(
        self.refjson.get_uncorr_seqid(mis_rec['name']))
    uncorr_orig_ranks = self.refjson.get_uncorr_ranks(mis_rec['orig_ranks'])
    uncorr_ranks = self.refjson.get_uncorr_ranks(mis_rec['ranks'])

    if lvl >= 0:
        rank_fields = (mis_rec['level_name'], uncorr_orig_ranks[lvl],
                       uncorr_ranks[lvl], mis_rec['lws'][lvl])
    else:
        rank_fields = (mis_rec['level_name'], "NA", "NA", mis_rec['lws'][0])

    pieces = [
        uncorr_name + "\t",
        "%s\t%s\t%s\t%.3f\t" % rank_fields,
        Taxonomy.lineage_str(uncorr_orig_ranks) + "\t",
        Taxonomy.lineage_str(uncorr_ranks) + "\t",
        ";".join(["%.3f" % conf for conf in mis_rec['lws']]),
    ]
    if 'rank_conf' in mis_rec:
        pieces.append("\t%.3f" % mis_rec['rank_conf'])

    return "".join(pieces)
def mis_rec_to_string(self, mis_rec):
    """Render a mislabel record as a single tab-delimited text line.

    The line holds the sequence name, the level name, the original and
    proposed rank at the mislabel level with its confidence, both full
    lineages, all per-level confidences joined by ";" and, when present
    in the record, the 'rank_conf' value.  The name and both rank lists
    are translated with ``self.refjson.get_uncorr_seqid`` /
    ``get_uncorr_ranks`` before formatting.  A negative 'orig_level'
    yields "NA" in both rank columns and the first confidence value.
    """
    level = mis_rec['orig_level']
    seq_label = EpacConfig.strip_ref_prefix(
        self.refjson.get_uncorr_seqid(mis_rec['name']))
    old_lineage = self.refjson.get_uncorr_ranks(mis_rec['orig_ranks'])
    new_lineage = self.refjson.get_uncorr_ranks(mis_rec['ranks'])

    has_level = level >= 0
    old_rank = old_lineage[level] if has_level else "NA"
    new_rank = new_lineage[level] if has_level else "NA"
    level_conf = mis_rec['lws'][level] if has_level else mis_rec['lws'][0]

    chunks = []
    chunks.append(seq_label + "\t")
    chunks.append("%s\t%s\t%s\t%.3f\t" % (
        mis_rec['level_name'], old_rank, new_rank, level_conf))
    chunks.append(Taxonomy.lineage_str(old_lineage) + "\t")
    chunks.append(Taxonomy.lineage_str(new_lineage) + "\t")
    chunks.append(";".join(["%.3f" % conf for conf in mis_rec['lws']]))
    if 'rank_conf' in mis_rec:
        chunks.append("\t%.3f" % mis_rec['rank_conf'])

    return "".join(chunks)
def test_epac_config(self):
    """Build an EpacConfig from the default namespace and run the common checks."""
    default_args = self.get_default_namespace()
    self.check_common_config(EpacConfig(default_args))
def classify(self, query_fname, fout=None, method="1", minlw=0.0, pv=0.02, minp=0.9, ptp=False):
    """Place query sequences with EPA and assign taxonomic labels to them.

    Placements come from ``self.jplace_fname`` when it is set; otherwise
    RAxML-EPA is run on ``self.epa_alignment`` against the reference tree
    and model stored in ``self.refjson``.  One result line per query is
    printed (when ``self.cfg.verbose``) and written to ``fout`` (when
    given).  Queries without placements, without a printable rank string,
    or listed in the ``self.noalign`` file are reported as
    "<name>\\t\\t\\t?".

    Parameters:
        query_fname: query file name, forwarded to ``checkinput``.
        fout:        optional output file name for the assignments; the
                     PTP clusters go to ``fout + ".species"``.
        method:      forwarded to ``classify_helper.classify_seq``.
        minlw:       forwarded to ``classify_seq``, ``novelty_check`` and
                     ``print_ranks``.
        pv:          unused here; kept for interface compatibility.
        minp:        forwarded to ``checkinput``.
        ptp:         when true, EPA-PTP species delimitation is run too.
    """
    if self.jplace_fname:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)

        # Bug fix: the original passed an undefined name `config`.  The
        # wrapper must share self.cfg because raxml_model is adjusted on
        # self.cfg below before EPA is started.
        raxml = RaxmlWrapper(self.cfg)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        reftree_str = self.refjson.get_raxml_readable_tree()
        reftree = Tree(reftree_str)

        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same
        # rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

    placements = jp.get_placement()

    fo = open(fout, "w") if fout else None
    try:
        unassigned = []
        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            if len(edges) == 0:
                unassigned.append(origin_taxon_name)
                continue

            ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
            isnovo = self.novelty_check(place_edge=str(edges[0][0]), ranks=ranks, lws=lws, minlw=minlw)
            rankout = self.print_ranks(ranks, lws, minlw)

            if rankout is None:
                unassigned.append(origin_taxon_name)
                continue

            # Reuse rankout instead of calling print_ranks a second time.
            output = "%s\t%s\t" % (origin_taxon_name, rankout)
            output += "*" if isnovo else "o"
            if self.cfg.verbose:
                print(output)
            if fo:
                fo.write(output + "\n")

        if os.path.exists(self.noalign):
            with open(self.noalign) as fnoa:
                for line in fnoa:
                    taxon_name = line.strip()[1:]
                    origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                    output = "%s\t\t\t?" % origin_taxon_name
                    if self.cfg.verbose:
                        print(output)
                    if fo:
                        fo.write(output + "\n")

        # Unassigned queries are reported in one batch after all others.
        output2 = "".join([name + "\t\t\t?\n" for name in unassigned])
        if self.cfg.verbose:
            print(output2)
        if fo:
            fo.write(output2)
    finally:
        # Close the output even if classification fails part-way.
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson, full_alignment=full_aln, min_lw=0.5, debug=self.cfg.debug)

        if self.cfg.verbose:
            print("Species clusters:")

        fo2 = open(fout + ".species", "w") if fout else None
        try:
            for sp_cluster in species_list:
                s = ",".join([EpacConfig.strip_query_prefix(taxon)
                              for taxon in sp_cluster])
                if fo2:
                    fo2.write(s + "\n")
                if self.cfg.verbose:
                    print(s)
        finally:
            if fo2:
                fo2.close()
    #############################################

    # Temporary EPA files exist only when EPA was run here; debug mode
    # leaves them in place.
    if not self.jplace_fname:
        if not self.cfg.debug:
            raxml.cleanup(job_name)
            FileUtils.remove_if_exists(reduced_align_fname)
            FileUtils.remove_if_exists(reftree_fname)
            FileUtils.remove_if_exists(optmod_fname)
def setUp(self):
    """Create a default EpacConfig and locate the "testfiles" directory next to this module."""
    self.cfg = EpacConfig()
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.testfile_dir = os.path.join(module_dir, "testfiles")
def classify(self, query_fname, minp=0.9, ptp=False):
    """Place query sequences with EPA and assign taxonomic labels to them.

    Placements come from ``self.jplace_fname`` when it is set; otherwise
    RAxML-EPA is run on ``self.epa_alignment`` against the reference tree
    and model stored in ``self.refjson``, and the resulting jplace file is
    moved to ``self.out_jplace_fname``.  One result line per query is
    printed (when ``self.cfg.verbose``) and written to
    ``self.out_assign_fname`` (when set).  Queries without placements,
    without a printable rank string, or listed in the ``self.noalign``
    file are reported at the end as "<name>\\t\\t\\t?".

    Parameters:
        query_fname: query file name, forwarded to ``checkinput``.
        minp:        forwarded to ``checkinput``.
        ptp:         when true, EPA-PTP species delimitation is run too.
    """
    if self.jplace_fname:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)

        self.cfg.log.info("Running RAxML-EPA to place %d query sequences...\n" % self.query_count)
        # Bug fix: the original passed an undefined name `config`.  The
        # wrapper must share self.cfg because raxml_model is adjusted on
        # self.cfg below before EPA is started.
        raxml = RaxmlWrapper(self.cfg)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        reftree_str = self.refjson.get_raxml_readable_tree()
        reftree = Tree(reftree_str)

        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same
        # rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

        raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True)

    self.cfg.log.info("Assigning taxonomic labels based on EPA placements...\n")

    placements = jp.get_placement()

    fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None
    try:
        noassign_list = []
        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            if len(edges) == 0:
                noassign_list.append(origin_taxon_name)
                continue

            ranks, lws = self.classify_helper.classify_seq(edges)
            isnovo = self.novelty_check(place_edge=str(edges[0][0]), ranks=ranks, lws=lws)
            rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

            if rankout is None:
                noassign_list.append(origin_taxon_name)
                continue

            output = "%s\t%s\t" % (origin_taxon_name, rankout)
            output += "*" if isnovo else "o"
            if self.cfg.verbose:
                print(output)
            if fo:
                fo.write(output + "\n")

        if os.path.exists(self.noalign):
            with open(self.noalign) as fnoa:
                for line in fnoa:
                    taxon_name = line.strip()[1:]
                    noassign_list.append(EpacConfig.strip_query_prefix(taxon_name))

        for taxon_name in noassign_list:
            # Bug fix: the original formatted the stale `origin_taxon_name`
            # left over from the loops above, so every line repeated the
            # same name instead of the current one.
            output = "%s\t\t\t?" % taxon_name
            if self.cfg.verbose:
                print(output)
            if fo:
                fo.write(output + "\n")
    finally:
        # Close the output even if classification fails part-way.
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson, full_alignment=full_aln, min_lw=0.5, debug=self.cfg.debug)

        self.cfg.log.debug("Species clusters:")

        # Bug fix: the original tested an undefined name `fout` here.
        # NOTE(review): assumes the species file belongs next to the
        # assignment output (self.out_assign_fname) -- confirm.
        species_base = self.out_assign_fname
        fo2 = open(species_base + ".species", "w") if species_base else None
        try:
            for sp_cluster in species_list:
                s = ",".join([EpacConfig.strip_query_prefix(taxon)
                              for taxon in sp_cluster])
                if fo2:
                    fo2.write(s + "\n")
                self.cfg.log.debug(s)
        finally:
            if fo2:
                fo2.close()