def get_parent_tip_ranks(self, tax_tree): rank_tips = {} rank_parent = {} for node in tax_tree.traverse("postorder"): if node.is_leaf() or node.is_root(): continue tax_path = node.name ranks = Taxonomy.split_rank_uid(tax_path) rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks) if rank_lvl < 2: continue parent_ranks = Taxonomy.split_rank_uid(node.up.name) parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks) if parent_lvl < 1: continue rank_seqs = node.get_leaf_names() rank_size = len(rank_seqs) if rank_size < 2 or rank_size > self.reftree_size-4: continue # print rank_lvl, "\t", tax_path, "\t", rank_seqs, "\n" rank_tips[tax_path] = node.get_leaf_names() rank_parent[tax_path] = parent_ranks return rank_parent, rank_tips
def get_parent_tip_ranks(self, tax_tree): rank_tips = {} rank_parent = {} for node in tax_tree.traverse("postorder"): if node.is_leaf() or node.is_root(): continue tax_path = node.name ranks = Taxonomy.split_rank_uid(tax_path) rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks) if rank_lvl < 2: continue parent_ranks = Taxonomy.split_rank_uid(node.up.name) parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks) if parent_lvl < 1: continue rank_seqs = node.get_leaf_names() rank_size = len(rank_seqs) if rank_size < 2 or rank_size > self.reftree_size - 4: continue # print rank_lvl, "\t", tax_path, "\t", rank_seqs, "\n" rank_tips[tax_path] = node.get_leaf_names() rank_parent[tax_path] = parent_ranks return rank_parent, rank_tips
def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws): mis_rec = None num_common_ranks = len(self.tax_common_ranks) orig_rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks) new_rank_level = Taxonomy.lowest_assigned_rank_level(ranks) #if new_rank_level < 0 or (new_rank_level < num_common_ranks and orig_rank_level >= num_common_ranks): # if new_rank_level < 0: if len(ranks) == 0: mis_rec = {} mis_rec['name'] = seq_name mis_rec['orig_level'] = -1 mis_rec['real_level'] = 0 mis_rec['level_name'] = "[NotIngroup]" mis_rec['inv_level'] = -1 * mis_rec[ 'real_level'] # just for sorting mis_rec['orig_ranks'] = orig_ranks mis_rec['ranks'] = [] mis_rec['lws'] = [1.0] mis_rec['conf'] = mis_rec['lws'][0] else: mislabel_lvl = -1 min_len = min(len(orig_ranks), len(ranks)) for rank_lvl in range(min_len): if ranks[rank_lvl] != Taxonomy.EMPTY_RANK and ranks[ rank_lvl] != orig_ranks[rank_lvl]: mislabel_lvl = rank_lvl break if mislabel_lvl >= 0: real_lvl = self.tax_code.guess_rank_level( orig_ranks, mislabel_lvl) mis_rec = {} mis_rec['name'] = seq_name mis_rec['orig_level'] = mislabel_lvl mis_rec['real_level'] = real_lvl mis_rec['level_name'] = self.tax_code.rank_level_name( real_lvl)[0] mis_rec['inv_level'] = -1 * mis_rec[ 'real_level'] # just for sorting mis_rec['orig_ranks'] = orig_ranks mis_rec['ranks'] = ranks mis_rec['lws'] = lws mis_rec['conf'] = lws[mislabel_lvl] if mis_rec: self.mislabels.append(mis_rec) return mis_rec
def label_bf_tree_with_ranks(self): """labeling inner tree nodes with taxonomic ranks""" if not self.bf_rooted_tree: raise AssertionError( "self.bf_rooted_tree is not set: TaxTreeHelper.set_bf_unrooted_tree() must be called before!" ) for node in self.bf_rooted_tree.traverse("postorder"): if node.is_leaf(): seq_ranks = self.origin_taxonomy[node.name] rank_level = Taxonomy.lowest_assigned_rank_level(seq_ranks) node.add_feature("rank_level", rank_level) node.add_feature("ranks", seq_ranks) node.name += "__" + seq_ranks[rank_level] else: if len(node.children) != 2: raise AssertionError("FATAL ERROR: tree is not bifurcating!") lchild = node.children[0] rchild = node.children[1] rank_level = min(lchild.rank_level, rchild.rank_level) while rank_level >= 0 and lchild.ranks[rank_level] != rchild.ranks[rank_level]: rank_level -= 1 node.add_feature("rank_level", rank_level) node_ranks = [Taxonomy.EMPTY_RANK] * 7 if rank_level >= 0: node_ranks[0 : rank_level + 1] = lchild.ranks[0 : rank_level + 1] node.name = lchild.ranks[rank_level] else: node.name = "Undefined" if hasattr(node, "B") and self.cfg.verbose: print "INFO: no taxonomic annotation for branch %s (reason: children belong to different kingdoms)" % node.B node.add_feature("ranks", node_ranks) self.tax_tree = self.bf_rooted_tree
def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws): mis_rec = None num_common_ranks = len(self.tax_common_ranks) orig_rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks) new_rank_level = Taxonomy.lowest_assigned_rank_level(ranks) #if new_rank_level < 0 or (new_rank_level < num_common_ranks and orig_rank_level >= num_common_ranks): # if new_rank_level < 0: if len(ranks) == 0: mis_rec = {} mis_rec['name'] = seq_name mis_rec['orig_level'] = -1 mis_rec['real_level'] = 0 mis_rec['level_name'] = "[NotIngroup]" mis_rec['inv_level'] = -1 * mis_rec['real_level'] # just for sorting mis_rec['orig_ranks'] = orig_ranks mis_rec['ranks'] = [] mis_rec['lws'] = [1.0] mis_rec['conf'] = mis_rec['lws'][0] else: mislabel_lvl = -1 min_len = min(len(orig_ranks),len(ranks)) for rank_lvl in range(min_len): if ranks[rank_lvl] != Taxonomy.EMPTY_RANK and ranks[rank_lvl] != orig_ranks[rank_lvl]: mislabel_lvl = rank_lvl break if mislabel_lvl >= 0: real_lvl = self.tax_code.guess_rank_level(orig_ranks, mislabel_lvl) mis_rec = {} mis_rec['name'] = seq_name mis_rec['orig_level'] = mislabel_lvl mis_rec['real_level'] = real_lvl mis_rec['level_name'] = self.tax_code.rank_level_name(real_lvl)[0] mis_rec['inv_level'] = -1 * mis_rec['real_level'] # just for sorting mis_rec['orig_ranks'] = orig_ranks mis_rec['ranks'] = ranks mis_rec['lws'] = lws mis_rec['conf'] = lws[mislabel_lvl] if mis_rec: self.mislabels.append(mis_rec) return mis_rec
def run_leave_subtree_out_test(self): job_name = self.cfg.subst_name("l1out_rank_%NAME%") # if self.jplace_fname: # jp = EpaJsonParser(self.jplace_fname) # else: #create file with subtrees rank_parent, rank_tips = self.get_parent_tip_ranks(self.tax_tree) subtree_list = list(rank_tips.items()) if len(subtree_list) == 0: return 0 subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt") with open(subtree_list_file, "w") as fout: for rank_name, tips in subtree_list: fout.write("%s\n" % " ".join(tips)) jp_list = self.raxml.run_epa(job_name, self.refalign_fname, self.reftree_fname, self.optmod_fname, mode="l1o_subtree", subtree_fname=subtree_list_file) subtree_count = 0 for jp in jp_list: placements = jp.get_placement() for place in placements: ranks, lws = self.classify_seq(place) tax_path = subtree_list[subtree_count][0] orig_ranks = Taxonomy.split_rank_uid(tax_path) rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks) rank_prefix = self.tax_code.guess_rank_level_name( orig_ranks, rank_level)[0] rank_name = orig_ranks[rank_level] if not rank_name.startswith(rank_prefix): rank_name = rank_prefix + rank_name parent_ranks = rank_parent[tax_path] # print orig_ranks, "\n", parent_ranks, "\n", ranks, "\n" mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws) if mis_rec: self.misrank_conf_map[tax_path] = mis_rec['conf'] subtree_count += 1 return subtree_count
def run_leave_subtree_out_test(self): job_name = self.cfg.subst_name("l1out_rank_%NAME%") # if self.jplace_fname: # jp = EpaJsonParser(self.jplace_fname) # else: #create file with subtrees rank_parent, rank_tips = get_parent_tip_ranks(self.tax_tree) subtree_list = rank_tips.items() if len(subtree_list) == 0: return 0 subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt") with open(subtree_list_file, "w") as fout: for rank_name, tips in subtree_list: fout.write("%s\n" % " ".join(tips)) jp_list = self.raxml.run_epa(job_name, self.refalign_fname, self.reftree_fname, self.optmod_fname, mode="l1o_subtree", subtree_fname=subtree_list_file) subtree_count = 0 for jp in jp_list: placements = jp.get_placement() for place in placements: ranks, lws = self.classify_seq(place) tax_path = subtree_list[subtree_count][0] orig_ranks = Taxonomy.split_rank_uid(tax_path) rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks) rank_prefix = self.guess_rank_level_name(orig_ranks, rank_level)[0] rank_name = orig_ranks[rank_level] if not rank_name.startswith(rank_prefix): rank_name = rank_prefix + rank_name parent_ranks = rank_parent[tax_path] # print orig_ranks, "\n", parent_ranks, "\n", ranks, "\n" mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws) if mis_rec: self.misrank_conf_map[tax_path] = mis_rec['conf'] subtree_count += 1 return subtree_count
def run_leave_subtree_out_test(self): job_name = self.cfg.subst_name("l1out_rank_%NAME%") # if self.jplace_fname: # jp = EpaJsonParser(self.jplace_fname) # else: #create file with subtrees rank_tips = {} rank_parent = {} for node in self.tax_tree.traverse("postorder"): if node.is_leaf() or node.is_root(): continue tax_path = node.name ranks = Taxonomy.split_rank_uid(tax_path) rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks) if rank_lvl < 2: continue parent_ranks = Taxonomy.split_rank_uid(node.up.name) parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks) if parent_lvl < 1: continue rank_seqs = node.get_leaf_names() rank_size = len(rank_seqs) if rank_size < 2 or rank_size > self.reftree_size-4: continue # print rank_lvl, "\t", tax_path, "\t", rank_seqs, "\n" rank_tips[tax_path] = node.get_leaf_names() rank_parent[tax_path] = parent_ranks subtree_list = rank_tips.items() if len(subtree_list) == 0: return 0 subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt") with open(subtree_list_file, "w") as fout: for rank_name, tips in subtree_list: fout.write("%s\n" % " ".join(tips)) jp_list = self.raxml.run_epa(job_name, self.refalign_fname, self.reftree_fname, self.optmod_fname, mode="l1o_subtree", subtree_fname=subtree_list_file) subtree_count = 0 for jp in jp_list: placements = jp.get_placement() for place in placements: ranks, lws = self.classify_seq(place) tax_path = subtree_list[subtree_count][0] orig_ranks = Taxonomy.split_rank_uid(tax_path) rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks) rank_prefix = self.guess_rank_level_name(orig_ranks, rank_level)[0] rank_name = orig_ranks[rank_level] if not rank_name.startswith(rank_prefix): rank_name = rank_prefix + rank_name parent_ranks = rank_parent[tax_path] # print orig_ranks, "\n", parent_ranks, "\n", ranks, "\n" mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws) if mis_rec: self.misrank_conf_map[tax_path] = mis_rec['conf'] subtree_count += 1 return subtree_count
def run_leave_subtree_out_test(self): job_name = self.cfg.subst_name("l1out_rank_%NAME%") # if self.jplace_fname: # jp = EpaJsonParser(self.jplace_fname) # else: #create file with subtrees rank_tips = {} rank_parent = {} for node in self.tax_tree.traverse("postorder"): if node.is_leaf() or node.is_root(): continue tax_path = node.name ranks = Taxonomy.split_rank_uid(tax_path) rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks) if rank_lvl < 2: continue parent_ranks = Taxonomy.split_rank_uid(node.up.name) parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks) if parent_lvl < 1: continue rank_seqs = node.get_leaf_names() rank_size = len(rank_seqs) if rank_size < 2 or rank_size > self.reftree_size - 4: continue # print rank_lvl, "\t", tax_path, "\t", rank_seqs, "\n" rank_tips[tax_path] = node.get_leaf_names() rank_parent[tax_path] = parent_ranks subtree_list = rank_tips.items() if len(subtree_list) == 0: return 0 subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt") with open(subtree_list_file, "w") as fout: for rank_name, tips in subtree_list: fout.write("%s\n" % " ".join(tips)) jp_list = self.raxml.run_epa(job_name, self.refalign_fname, self.reftree_fname, self.optmod_fname, mode="l1o_subtree", subtree_fname=subtree_list_file) subtree_count = 0 for jp in jp_list: placements = jp.get_placement() for place in placements: ranks, lws = self.classify_seq(place) tax_path = subtree_list[subtree_count][0] orig_ranks = Taxonomy.split_rank_uid(tax_path) rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks) rank_prefix = self.guess_rank_level_name( orig_ranks, rank_level)[0] rank_name = orig_ranks[rank_level] if not rank_name.startswith(rank_prefix): rank_name = rank_prefix + rank_name parent_ranks = rank_parent[tax_path] # print orig_ranks, "\n", parent_ranks, "\n", ranks, "\n" mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws) if mis_rec: self.misrank_conf_map[tax_path] = mis_rec['conf'] subtree_count += 1 return subtree_count