def run_epa_once(self, reftree, th):
    """Run one EPA placement against `reftree` and check each placed sequence.

    The tree is written to a temporary file and passed to `self.raxml.run_epa`
    together with the reference alignment. Every placement returned by EPA is
    classified and handed, along with the sequence's original ranks, to
    `check_seq_tax_labels`.

    Arguments:
        reftree: tree object; only its `write(outfile=...)` method is used.
        th: helper object; it is pointed at the EPA-labelled tree via
            `set_bf_unrooted_tree` and then queried for the branch-id to
            taxonomy map.

    Unless `self.cfg.debug` is set, the RAxML job files and the temporary
    tree file are removed before returning.
    """
    reftree_fname = self.cfg.tmp_fname("final_ref_%NAME%.tre")
    job_name = self.cfg.subst_name("final_epa_%NAME%")

    reftree.write(outfile=reftree_fname)

    # IMPORTANT: don't load the model, since it's invalid for the pruned tree!
    optmod_fname = ""
    epa_result = self.raxml.run_epa(job_name, self.refalign_fname, reftree_fname, optmod_fname)

    reftree_epalbl_str = epa_result.get_std_newick_tree()
    placements = epa_result.get_placement()

    # update branchid-taxonomy mapping to account for possible changes in branch numbering
    reftree_tax = Tree(reftree_epalbl_str)
    th.set_bf_unrooted_tree(reftree_tax)
    bid_tax_map = th.get_bid_taxonomy_map()

    cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.brlen_pv, self.rate, self.node_height)

    for place in placements:
        seq_name = place["n"][0]

        # original taxonomic label vs. the label derived from the EPA placement
        orig_ranks = self.get_orig_ranks(seq_name)
        ranks, lws = cl.classify_seq(place["p"])

        # the return value is not needed here, so it is no longer bound to a local
        self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

    if not self.cfg.debug:
        self.raxml.cleanup(job_name)
        FileUtils.remove_if_exists(reftree_fname)
def build_hmm_profile(self, json_builder):
    """Build an HMMER profile for the reduced reference alignment.

    The resulting profile file name is passed to
    `json_builder.set_hmm_profile`. Unless `self.cfg.debug` is set, both the
    temporary alignment file and the profile file are removed afterwards.
    """
    print("Building the HMMER profile...\n")

    # Workaround: RAxML outputs the reduced alignment in relaxed PHYLIP
    # format, which HMMER does not support, so the reduced sequences are
    # written out again to a temporary .fa file.
    fasta_path = self.cfg.tmp_fname("%NAME%_ref_reduced.fa")
    self.reduced_refalign_seqs.write(outfile=fasta_path)

    profile_path = hmmer(self.cfg, fasta_path).build_hmm_profile()
    json_builder.set_hmm_profile(profile_path)

    if self.cfg.debug:
        # keep the intermediate files for inspection
        return

    for leftover in (fasta_path, profile_path):
        FileUtils.remove_if_exists(leftover)
def run_test(self):
    """Run the leave-one-out mislabel tests and report the results.

    Steps, in order:
      1. create the RAxML wrapper and extract the tree, alignment and binary
         model from `self.refjson` into their temporary files;
      2. if `self.ranktest` is set, run the leave-one-rank-out test;
      3. run the leave-one-sequence-out test;
      4. if any mislabels were recorded, write the preliminary list and run
         the final EPA test;
      5. sort and write the mislabels and print their percentage relative to
         `self.reftree_size`.

    Unless `self.cfg.debug` is set, the three temporary files are removed at
    the end.
    """
    self.raxml = RaxmlWrapper(self.cfg)

    print("Number of sequences in the reference: %d\n" % self.reftree_size)

    self.refjson.get_raxml_readable_tree(self.reftree_fname)
    self.refalign_fname = self.refjson.get_alignment(self.tmp_refaln)
    self.refjson.get_binary_model(self.optmod_fname)

    if self.ranktest:
        print("Running the leave-one-rank-out test...\n")
        # the returned value was never used, so it is not stored
        self.run_leave_subtree_out_test()

    print("Running the leave-one-sequence-out test...\n")
    self.run_leave_seq_out_test()

    if self.mislabels:
        print("Leave-one-out test identified %d suspicious sequences; running final EPA test to check them...\n" % len(self.mislabels))
        self.write_mislabels(final=False)
        self.run_final_epa_test()

    self.sort_mislabels()
    self.write_mislabels()

    # the count is re-read here, after the optional final EPA test has run
    print("\nPercentage of mislabeled sequences: %.2f %%" % (float(len(self.mislabels)) / self.reftree_size * 100))

    if not self.cfg.debug:
        FileUtils.remove_if_exists(self.reftree_fname)
        FileUtils.remove_if_exists(self.optmod_fname)
        FileUtils.remove_if_exists(self.refalign_fname)
def cleanup(self):
    """Remove the temporary files this object keeps track of.

    Each path is passed to `FileUtils.remove_if_exists`. `outgr_fname` used to
    be listed twice; the redundant second call has been dropped.
    """
    FileUtils.remove_if_exists(self.outgr_fname)
    FileUtils.remove_if_exists(self.reftree_mfu_fname)
    FileUtils.remove_if_exists(self.reftree_bfu_fname)
    FileUtils.remove_if_exists(self.optmod_fname)
    FileUtils.remove_if_exists(self.lblalign_fname)
    FileUtils.remove_if_exists(self.reduced_refalign_fname)
    FileUtils.remove_if_exists(self.refalign_fname)
def cleanup(self):
    """Remove the file at `self.tmp_refaln` via `FileUtils.remove_if_exists`."""
    leftover = self.tmp_refaln
    FileUtils.remove_if_exists(leftover)
def cleanup(self):
    """Remove the temporary files this object keeps track of.

    The paths are read from the attributes listed below, one at a time and in
    this order, and each is passed to `FileUtils.remove_if_exists`.
    """
    tmp_attrs = (
        "tmp_refaln",
        "epa_alignment",
        "hmmprofile",
        "tmpquery",
        "noalign",
    )
    for attr_name in tmp_attrs:
        # attribute is looked up right before its removal, as in a plain call sequence
        FileUtils.remove_if_exists(getattr(self, attr_name))
def classify(self, query_fname, fout = None, method = "1", minlw = 0.0, pv = 0.02, minp = 0.9, ptp = False):
    """Assign taxonomy to the placed query sequences and report the result.

    Placements are read from `self.jplace_fname` when it is set. Otherwise the
    input is checked with `checkinput`, the tree and model are extracted from
    `self.refjson`, and RAxML-EPA is run on the reduced alignment.

    Output lines (printed when `self.cfg.verbose`, written to `fout` when given):
      * "<name>\\t<ranks>\\t*" or "...\\to" for classified sequences, where the
        marker is "*" when `novelty_check` returns a true value;
      * "<name>\\t\\t\\t?" for sequences without placement edges, without a
        printable rank string, or listed in the `self.noalign` file.
    Classified sequences are emitted first, then the `noalign` entries, then
    the remaining unresolved ones in a single block.

    Arguments:
        query_fname: forwarded to `checkinput` (not used with a jplace file).
        fout: output file name, or a false value to write no file. With
            `ptp`, species clusters go to `fout + ".species"`.
        method: forwarded to `classify_helper.classify_seq`.
        minlw: forwarded to `classify_seq`, `novelty_check` and `print_ranks`.
        pv: currently unused by this method.
        minp: forwarded to `checkinput`.
        ptp: when true, run `epa_2_ptp` and report the species clusters.

    Unless a jplace file was used or `self.cfg.debug` is set, the RAxML job
    files and the temporary alignment, tree and model files are removed at
    the end.
    """
    use_jplace = bool(self.jplace_fname)

    if use_jplace:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)

        # Use the instance configuration, like every other call in this
        # method, instead of a bare `config` name that is not defined here.
        raxml = RaxmlWrapper(self.cfg)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        reftree_str = self.refjson.get_raxml_readable_tree()
        reftree = Tree(reftree_str)

        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

    placements = jp.get_placement()

    fo = open(fout, "w") if fout else None
    try:
        # lines for sequences that could not be assigned; emitted in one block at the end
        unresolved = []

        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            rankout = None
            if len(edges) > 0:
                ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
                isnovo = self.novelty_check(place_edge = str(edges[0][0]), ranks = ranks, lws = lws, minlw = minlw)
                rankout = self.print_ranks(ranks, lws, minlw)

            if rankout is None:
                unresolved.append(origin_taxon_name + "\t\t\t?\n")
                continue

            # reuse the rank string formatted above instead of formatting it a second time
            output = "%s\t%s\t%s" % (origin_taxon_name, rankout, "*" if isnovo else "o")
            if self.cfg.verbose:
                print(output)
            if fo:
                fo.write(output + "\n")

        if os.path.exists(self.noalign):
            with open(self.noalign) as fnoa:
                for line in fnoa:
                    # the first character of each stripped line is dropped
                    taxon_name = line.strip()[1:]
                    origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                    output = "%s\t\t\t?" % origin_taxon_name
                    if self.cfg.verbose:
                        print(output)
                    if fo:
                        fo.write(output + "\n")

        output2 = "".join(unresolved)
        if self.cfg.verbose:
            print(output2)
        if fo:
            fo.write(output2)
    finally:
        # close the output file even if classification fails half-way
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp = jp, ref_jp = self.refjson, full_alignment = full_aln, min_lw = 0.5, debug = self.cfg.debug)

        if self.cfg.verbose:
            print("Species clusters:")

        fo2 = open(fout + ".species", "w") if fout else None
        try:
            for sp_cluster in species_list:
                translated_taxa = [EpacConfig.strip_query_prefix(taxon) for taxon in sp_cluster]
                s = ",".join(translated_taxa)
                if fo2:
                    fo2.write(s + "\n")
                if self.cfg.verbose:
                    print(s)
        finally:
            if fo2:
                fo2.close()
    #############################################

    if not use_jplace and not self.cfg.debug:
        raxml.cleanup(job_name)
        FileUtils.remove_if_exists(reduced_align_fname)
        FileUtils.remove_if_exists(reftree_fname)
        FileUtils.remove_if_exists(optmod_fname)