Exemplo n.º 1
0
    def run_epa_once(self, reftree, th):
        """Run EPA once on the given reference tree and check the labels.

        The tree is written to a temporary file and EPA is run on it through
        self.raxml without loading a pre-optimized model.  The taxonomy
        helper is then re-synchronized with the EPA-labelled tree, and for
        every placement the original ranks of the sequence are compared with
        the ranks derived from its placement via check_seq_tax_labels().

        reftree -- tree object; only its write(outfile=...) method is used
        th      -- taxonomy helper providing set_bf_unrooted_tree() and
                   get_bid_taxonomy_map()

        The RAxML job output and the temporary tree file are removed
        afterwards unless self.cfg.debug is set.
        """
        reftree_fname = self.cfg.tmp_fname("final_ref_%NAME%.tre")
        job_name = self.cfg.subst_name("final_epa_%NAME%")

        reftree.write(outfile=reftree_fname)

        # IMPORTANT: don't load the model, since it's invalid for the pruned tree!
        optmod_fname = ""
        epa_result = self.raxml.run_epa(job_name, self.refalign_fname, reftree_fname, optmod_fname)
        reftree_epalbl_str = epa_result.get_std_newick_tree()
        placements = epa_result.get_placement()

        # update branchid-taxonomy mapping to account for possible changes in branch numbering
        reftree_tax = Tree(reftree_epalbl_str)
        th.set_bf_unrooted_tree(reftree_tax)
        bid_tax_map = th.get_bid_taxonomy_map()

        cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.brlen_pv, self.rate, self.node_height)

        for place in placements:
            seq_name = place["n"][0]

            # original taxonomic label vs. the label derived from the EPA placement
            orig_ranks = self.get_orig_ranks(seq_name)
            ranks, lws = cl.classify_seq(place["p"])

            # The return value was previously bound to a local that was never
            # read; the call is kept for whatever bookkeeping it performs.
            self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        if not self.cfg.debug:
            self.raxml.cleanup(job_name)
            FileUtils.remove_if_exists(reftree_fname)
Exemplo n.º 2
0
    def build_hmm_profile(self, json_builder):
        """Build an HMMER profile and register it with json_builder.

        The reduced reference alignment sequences are written to a temporary
        file, the profile is built from that file and the resulting profile
        file name is passed to json_builder.set_hmm_profile().  Both files
        are removed afterwards unless self.cfg.debug is set.
        """
        print("Building the HMMER profile...\n")

        # RAxML outputs the reduced alignment in relaxed PHYLIP format, which
        # HMMER does not support, so the sequences are written out again to
        # a separate file that HMMER is pointed at.
        fasta_path = self.cfg.tmp_fname("%NAME%_ref_reduced.fa")
        self.reduced_refalign_seqs.write(outfile=fasta_path)

        profile_path = hmmer(self.cfg, fasta_path).build_hmm_profile()
        json_builder.set_hmm_profile(profile_path)

        # Keep the intermediate files around when debugging.
        if self.cfg.debug:
            return
        for leftover in (fasta_path, profile_path):
            FileUtils.remove_if_exists(leftover)
Exemplo n.º 3
0
    def run_test(self):
        """Run the leave-one-out mislabel tests and report the outcome.

        Steps, in order:
          1. create the RAxML wrapper and ask self.refjson for the reference
             tree, alignment and binary model;
          2. run the leave-one-rank-out test when self.ranktest is set;
          3. run the leave-one-sequence-out test;
          4. if any mislabels were collected, write the preliminary list and
             run the final EPA test;
          5. sort and write the mislabels and print their percentage
             relative to self.reftree_size.

        The tree, model and alignment files are removed at the end unless
        self.cfg.debug is set.
        """
        self.raxml = RaxmlWrapper(self.cfg)

        print("Number of sequences in the reference: %d\n" % self.reftree_size)

        self.refjson.get_raxml_readable_tree(self.reftree_fname)
        self.refalign_fname = self.refjson.get_alignment(self.tmp_refaln)
        self.refjson.get_binary_model(self.optmod_fname)

        if self.ranktest:
            print("Running the leave-one-rank-out test...\n")
            # The result used to be stored in a local that was never read.
            self.run_leave_subtree_out_test()

        print("Running the leave-one-sequence-out test...\n")
        self.run_leave_seq_out_test()

        suspicious_count = len(self.mislabels)
        if suspicious_count > 0:
            print("Leave-one-out test identified %d suspicious sequences; running final EPA test to check them...\n" % suspicious_count)
            self.write_mislabels(final=False)
            self.run_final_epa_test()

        self.sort_mislabels()
        self.write_mislabels()

        # Re-read the length: the final EPA test above may have changed the list.
        mislabel_pct = float(len(self.mislabels)) / self.reftree_size * 100
        print("\nPercentage of mislabeled sequences: %.2f %%" % mislabel_pct)

        if not self.cfg.debug:
            for fname in (self.reftree_fname, self.optmod_fname, self.refalign_fname):
                FileUtils.remove_if_exists(fname)
Exemplo n.º 4
0
 def cleanup(self):
     """Remove the intermediate files whose paths are stored on this object.

     Each path is passed to FileUtils.remove_if_exists() in the order
     listed below.
     """
     # self.outgr_fname used to appear twice in this list; the second
     # removal of the same path was redundant and has been dropped.
     path_attrs = (
         "outgr_fname",
         "reftree_mfu_fname",
         "reftree_bfu_fname",
         "optmod_fname",
         "lblalign_fname",
         "reduced_refalign_fname",
         "refalign_fname",
     )
     for attr_name in path_attrs:
         FileUtils.remove_if_exists(getattr(self, attr_name))
Exemplo n.º 5
0
 def cleanup(self):
     """Remove the file referenced by self.tmp_refaln via FileUtils.remove_if_exists()."""
     tmp_path = self.tmp_refaln
     FileUtils.remove_if_exists(tmp_path)
Exemplo n.º 6
0
 def cleanup(self):
     """Remove the intermediate files whose paths are stored on this object.

     The paths held in tmp_refaln, epa_alignment, hmmprofile, tmpquery and
     noalign are passed to FileUtils.remove_if_exists() one after another.
     """
     for attr_name in ("tmp_refaln", "epa_alignment", "hmmprofile", "tmpquery", "noalign"):
         # Each attribute is looked up right before its removal, so the
         # read/remove order matches a plain sequence of calls.
         FileUtils.remove_if_exists(getattr(self, attr_name))
Exemplo n.º 7
0
    def classify(self, query_fname, fout = None, method = "1", minlw = 0.0, pv = 0.02, minp = 0.9, ptp = False):
        """Assign taxonomic labels to query sequences from their EPA placements.

        Placements are parsed from self.jplace_fname when it is set.
        Otherwise the input is checked with checkinput(), the reference tree
        and binary model are obtained from self.refjson, and EPA is run
        through RAxML on the reduced self.epa_alignment.

        Output is one tab-separated line per query: the query name (with the
        query prefix stripped), the ranks as formatted by print_ranks(), and
        a marker that is "*" when novelty_check() is true and "o" otherwise.
        Queries with no placement edges, queries for which print_ranks()
        returns None, and names listed in the self.noalign file get a line
        of the form "<name><TAB><TAB><TAB>?".  Lines go to stdout when
        self.cfg.verbose is set and to the file `fout` when it is given.

        query_fname -- passed to checkinput() (only when EPA is run here)
        fout        -- output file name, or None for no file output; the
                       species clusters go to fout + ".species"
        method      -- passed through to classify_helper.classify_seq()
        minlw       -- passed to classify_seq(), novelty_check() and
                       print_ranks()
        pv          -- currently unused (the Erlang filter that took it is
                       disabled)
        minp        -- passed to checkinput()
        ptp         -- when true, additionally run EPA-PTP species
                       delimitation via epa_2_ptp()

        Temporary files created for the EPA run are removed at the end
        unless self.cfg.debug is set.
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)
            # Use the instance configuration: a bare `config` is not defined
            # in this method and only resolves if a module-level global of
            # that name happens to exist.
            raxml = RaxmlWrapper(self.cfg)
            reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
            self.refjson.get_raxml_readable_tree(reftree_fname)
            optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
            self.refjson.get_binary_model(optmod_fname)
            job_name = self.cfg.subst_name("epa_%NAME%")

            reftree_str = self.refjson.get_raxml_readable_tree()
            reftree = Tree(reftree_str)

            self.reftree_size = len(reftree.get_leaves())

            # IMPORTANT: set EPA heuristic rate based on tree size!
            self.cfg.resolve_auto_settings(self.reftree_size)
            # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file
            if self.cfg.epa_load_optmod:
                self.cfg.raxml_model = self.refjson.get_ratehet_model()

            reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

            jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

        placements = jp.get_placement()

        fo = open(fout, "w") if fout else None

        def emit(line):
            # Report one result line on stdout (verbose mode) and/or in the
            # output file.
            if self.cfg.verbose:
                print(line)
            if fo:
                fo.write(line + "\n")

        # Lines for queries that could not be classified; they are reported
        # in one batch after all other lines.
        unclassified = []
        try:
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]
                # NOTE: filtering by `pv` is disabled:
                #   edges = self.erlang_filter(edges, p = pv)
                rankout = None
                if len(edges) > 0:
                    ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
                    isnovo = self.novelty_check(place_edge = str(edges[0][0]), ranks = ranks, lws = lws, minlw = minlw)
                    rankout = self.print_ranks(ranks, lws, minlw)

                if rankout is None:
                    unclassified.append(origin_taxon_name + "\t\t\t?\n")
                    continue

                # Reuse rankout instead of calling print_ranks() a second
                # time with the same arguments.
                emit("%s\t%s\t%s" % (origin_taxon_name, rankout, "*" if isnovo else "o"))

            if os.path.exists(self.noalign):
                with open(self.noalign) as fnoa:
                    for line in fnoa:
                        # drop the first character of the stripped line
                        # (presumably a FASTA-style ">" marker -- TODO confirm)
                        taxon_name = line.strip()[1:]
                        origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                        emit("%s\t\t\t?" % origin_taxon_name)

            output2 = "".join(unclassified)
            if self.cfg.verbose:
                print(output2)
            if fo:
                fo.write(output2)
        finally:
            # Close the output file even if classification fails midway.
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            full_aln = SeqGroup(self.epa_alignment)
            species_list = epa_2_ptp(epa_jp = jp, ref_jp = self.refjson, full_alignment = full_aln, min_lw = 0.5, debug = self.cfg.debug)

            if self.cfg.verbose:
                print("Species clusters:")

            fo2 = open(fout + ".species", "w") if fout else None
            try:
                for sp_cluster in species_list:
                    s = ",".join(EpacConfig.strip_query_prefix(taxon) for taxon in sp_cluster)
                    if fo2:
                        fo2.write(s + "\n")
                    if self.cfg.verbose:
                        print(s)
            finally:
                if fo2:
                    fo2.close()
        #############################################

        # Temporary files exist only when EPA was run here (no jplace given).
        if not self.jplace_fname and not self.cfg.debug:
            raxml.cleanup(job_name)
            FileUtils.remove_if_exists(reduced_align_fname)
            FileUtils.remove_if_exists(reftree_fname)
            FileUtils.remove_if_exists(optmod_fname)