def build_bid_taxonomy_map(self):
    """Rebuild the branch-to-taxonomy lookup from ``self.tax_tree``.

    Resets ``self.bid_taxonomy_map`` and ``self.ranks_set``, then walks the
    tree in postorder. Every non-root node that carries a ``B`` attribute
    (presumably the branch id -- confirm against the tree loader) yields:

    * ``self.bid_taxonomy_map[node.B] = (rank_uid, rank_diff, branch_length)``
      where ``rank_uid`` is ``Taxonomy.get_rank_uid(node.ranks)``,
      ``rank_diff`` is the node's lowest assigned rank level minus its
      parent's, and ``branch_length`` is ``node.dist``;
    * ``rank_uid`` added to ``self.ranks_set``.
    """
    self.bid_taxonomy_map = {}
    self.ranks_set = set()

    for node in self.tax_tree.traverse("postorder"):
        # The root has no parent branch; nodes without "B" are not recorded.
        if node.is_root() or not hasattr(node, "B"):
            continue

        node_level = Taxonomy.lowest_assigned_rank_level(node.ranks)
        parent_level = Taxonomy.lowest_assigned_rank_level(node.up.ranks)
        rank_uid = Taxonomy.get_rank_uid(node.ranks)

        self.bid_taxonomy_map[node.B] = (rank_uid, node_level - parent_level, node.dist)
        self.ranks_set.add(rank_uid)
def build_bid_taxonomy_map(self):
    """Populate ``self.bid_taxonomy_map`` and ``self.ranks_set`` from the tree.

    Both attributes are reset first. For each node of ``self.tax_tree``
    (postorder) that is not the root and has a ``B`` attribute, the map gets
    ``node.B -> (rank uid, rank-level difference to the parent, node.dist)``
    and the rank uid is also collected in ``self.ranks_set``.
    """
    bid_map = {}
    known_rank_uids = set()
    self.bid_taxonomy_map = bid_map
    self.ranks_set = known_rank_uids

    for node in self.tax_tree.traverse("postorder"):
        if node.is_root():
            continue
        if not hasattr(node, "B"):
            continue

        parent = node.up
        own_level = Taxonomy.lowest_assigned_rank_level(node.ranks)
        parent_level = Taxonomy.lowest_assigned_rank_level(parent.ranks)
        rank_uid = Taxonomy.get_rank_uid(node.ranks)

        # Record: (rank uid, how many levels below the parent, branch length).
        bid_map[node.B] = (rank_uid, own_level - parent_level, node.dist)
        known_rank_uids.add(rank_uid)
def strip_missing_ranks(self, ranks):
    """Return the longest prefix of ``ranks`` whose UID is in ``self.ranks_set``.

    Ranks are dropped from the end one at a time until the UID of the
    remaining prefix (``Taxonomy.get_rank_uid``) is found in
    ``self.ranks_set``. If no prefix matches, the empty prefix is returned.
    """
    depth = len(ranks)
    while True:
        prefix_uid = Taxonomy.get_rank_uid(ranks[:depth])
        # Stop on the first known prefix, or when nothing is left to strip.
        if prefix_uid in self.ranks_set or depth <= 0:
            break
        depth -= 1
    return ranks[:depth]
def get_seq_ranks_from_tree(self, seq_name):
    """Return the ranks stored in the name of the parent of ``seq_name``'s node.

    The node is looked up in ``self.name2taxnode``; the name of its parent
    (``node.up.name``) is split into ranks with ``Taxonomy.split_rank_uid``.

    An unknown ``seq_name`` is reported through
    ``self.cfg.exit_fatal_error`` (presumably this terminates the program --
    if it returns, the lookup below raises ``KeyError``).
    """
    if seq_name not in self.name2taxnode:
        self.cfg.exit_fatal_error(
            "FATAL ERROR: Sequence %s is not found in the taxonomic tree!" % seq_name
        )

    parent_name = self.name2taxnode[seq_name].up.name
    return Taxonomy.split_rank_uid(parent_name)
def label_bf_tree_with_ranks(self):
    """Label every node of the rooted bifurcating tree with taxonomic ranks.

    Leaves take their ranks from ``self.origin_taxonomy`` and get the lowest
    assigned rank appended to their name. Inner nodes receive the ranks both
    children share, found by walking up from the shallower child's level
    until the children agree. Every node gets the features ``rank_level``
    and ``ranks``.

    Afterwards the labelled tree becomes ``self.tax_tree`` and
    ``self.init_taxnode_map()`` is called.

    Raises:
        AssertionError: if ``self.bf_rooted_tree`` is not set, or an inner
            node does not have exactly two children.
    """
    tree = self.bf_rooted_tree
    if not tree:
        raise AssertionError(
            "self.bf_rooted_tree is not set: TaxTreeHelper.set_bf_unrooted_tree() must be called before!"
        )

    # Postorder guarantees both children are labelled before their parent.
    for node in tree.traverse("postorder"):
        if node.is_leaf():
            leaf_ranks = self.origin_taxonomy[node.name]
            leaf_level = Taxonomy.lowest_assigned_rank_level(leaf_ranks)
            node.add_feature("rank_level", leaf_level)
            node.add_feature("ranks", leaf_ranks)
            node.name += "__" + leaf_ranks[leaf_level]
            continue

        if len(node.children) != 2:
            raise AssertionError("FATAL ERROR: tree is not bifurcating!")
        left, right = node.children

        # Deepest level at which both children carry the same rank (-1 if none).
        shared_level = min(left.rank_level, right.rank_level)
        while shared_level >= 0:
            if left.ranks[shared_level] == right.ranks[shared_level]:
                break
            shared_level -= 1
        node.add_feature("rank_level", shared_level)

        width = max(len(left.ranks), len(right.ranks))
        inner_ranks = [Taxonomy.EMPTY_RANK] * width
        if shared_level < 0:
            node.name = "Undefined"
            if hasattr(node, "B"):
                self.cfg.log.debug(
                    "INFO: empty taxonomic annotation for branch %s (child nodes have no common ranks)",
                    node.B,
                )
        else:
            inner_ranks[: shared_level + 1] = left.ranks[: shared_level + 1]
            node.name = left.ranks[shared_level]
        node.add_feature("ranks", inner_ranks)

    self.tax_tree = tree
    self.init_taxnode_map()
def label_bf_tree_with_ranks(self):
    """Attach ``rank_level`` and ``ranks`` features to all tree nodes.

    Works on ``self.bf_rooted_tree`` in postorder:

    * leaf: ranks come from ``self.origin_taxonomy[node.name]``; the lowest
      assigned rank is appended to the leaf name after ``"__"``;
    * inner node: ranks are the common prefix of both children's ranks
      (padded with ``Taxonomy.EMPTY_RANK``); the node is named after the
      deepest common rank, or ``"Undefined"`` if the children share none.

    Finally ``self.tax_tree`` is set to the labelled tree and
    ``self.init_taxnode_map()`` is invoked.

    Raises:
        AssertionError: when ``self.bf_rooted_tree`` is unset or the tree is
            not strictly bifurcating.
    """
    if not self.bf_rooted_tree:
        raise AssertionError(
            "self.bf_rooted_tree is not set: TaxTreeHelper.set_bf_unrooted_tree() must be called before!"
        )

    for node in self.bf_rooted_tree.traverse("postorder"):
        if not node.is_leaf():
            if len(node.children) != 2:
                raise AssertionError("FATAL ERROR: tree is not bifurcating!")
            first, second = node.children[0], node.children[1]

            # Start at the shallower child's level and climb until they agree.
            common = min(first.rank_level, second.rank_level)
            while common >= 0 and first.ranks[common] != second.ranks[common]:
                common -= 1
            node.add_feature("rank_level", common)

            padded = [Taxonomy.EMPTY_RANK] * max(len(first.ranks), len(second.ranks))
            if common >= 0:
                prefix_end = common + 1
                padded[0:prefix_end] = first.ranks[0:prefix_end]
                node.name = first.ranks[common]
            else:
                node.name = "Undefined"
                if hasattr(node, "B"):
                    self.cfg.log.debug(
                        "INFO: empty taxonomic annotation for branch %s (child nodes have no common ranks)",
                        node.B,
                    )
            node.add_feature("ranks", padded)
        else:
            assigned = self.origin_taxonomy[node.name]
            lowest = Taxonomy.lowest_assigned_rank_level(assigned)
            node.add_feature("rank_level", lowest)
            node.add_feature("ranks", assigned)
            node.name += "__" + assigned[lowest]

    self.tax_tree = self.bf_rooted_tree
    self.init_taxnode_map()
def assign_taxonomy_maxsum(self, edges, minlw):
    """Sum up all LH-weights for each rank and take the rank with the max. sum.

    In the EPA result, each placement (= branch) has a "weight". Since we are
    interested in taxonomic placement, we do not care about branch vs. branch
    comparisons, but only consider rank vs. rank (e.g. G1 S1 vs. G1 S2 vs.
    G1). Thus weights are accumulated per rank, using two measures:

    * "own" weight   = sum of weights of all placements EXACTLY to this rank
      (e.g. for G1: G1 only);
    * "total" weight = own weight + own weight of all children
      (for G1: G1 or G1 S1 or G1 S2).

    Args:
        edges: iterable of placement records; ``edge[0]`` is the branch id
            (converted with ``str`` before lookup in
            ``self.bid_taxonomy_map``) and ``edge[2]`` is its weight.
        minlw: minimal "total" weight a rank needs to be eligible.

    Returns:
        Tuple ``(ranks, confidences)``: the assigned ranks and, for each
        non-empty rank, its "total" weight as the confidence value. If no
        branch carries any non-empty rank, the ranks of the last processed
        branch are returned with confidence 1.0 each.
    """
    rw_own = {}
    rw_total = {}
    ranks = [Taxonomy.EMPTY_RANK]

    for edge in edges:
        br_id = str(edge[0])
        lweight = edge[2]
        if lweight == 0.0:
            continue

        # accumulate weight for the current sequence
        br_rank_id, rdiff, _ = self.bid_taxonomy_map[br_id]
        ranks = Taxonomy.split_rank_uid(br_rank_id)

        lowest_rank = None
        lowest_rank_lvl = None
        for i, rank in enumerate(ranks):
            if rank == Taxonomy.EMPTY_RANK:
                break
            rank_id = Taxonomy.get_rank_uid(ranks, i)
            rw_total[rank_id] = rw_total.get(rank_id, 0) + lweight
            lowest_rank_lvl = i
            lowest_rank = rank_id

        if not lowest_rank:
            # branch has no taxonomic annotation: it contributes nothing
            continue

        if rdiff > 0:
            # if ranks of 'upper' and 'lower' adjacent nodes of a branch are
            # non-equal, split LHW among them
            parent_share = lweight * self.parent_lhw_coeff
            parent_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - rdiff)
            rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight * (1 - self.parent_lhw_coeff)
            rw_own[parent_rank] = rw_own.get(parent_rank, 0) + parent_share
            # correct total lhw for all levels between "parent" and "lowest"
            # NOTE: all those intermediate ranks are in fact indistinguishable,
            # e.g. a family which contains a single genus
            for r in range(rdiff):
                interim_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - r)
                rw_total[interim_rank] = rw_total.get(interim_rank, 0) - parent_share
        else:
            rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight

    # if all branches have empty ranks only, just return this placement
    if not rw_total:
        return ranks, [1.0] * len(ranks)

    # we assign the sequence to a rank, which has the max "own" weight AND
    # whose "total" weight is greater than a confidence threshold
    max_rw = 0.0
    ass_rank_id = None
    # Plain dict iteration works on both Python 2 and 3 (iterkeys() is
    # Python 2 only and raises AttributeError on Python 3).
    for rank_id in rw_own:
        if rw_own[rank_id] > max_rw and rw_total[rank_id] >= minlw:
            ass_rank_id = rank_id
            max_rw = rw_own[rank_id]
    if not ass_rank_id:
        # no rank passed the threshold: fall back to the max "total" weight
        ass_rank_id = max(rw_total, key=rw_total.get)

    a_ranks = Taxonomy.split_rank_uid(ass_rank_id)

    # "total" weight is considered as confidence value for now
    a_conf = [0.0] * len(a_ranks)
    for i, rank in enumerate(a_ranks):
        if rank != Taxonomy.EMPTY_RANK:
            a_conf[i] = rw_total[Taxonomy.get_rank_uid(a_ranks, i)]

    return a_ranks, a_conf
def get_branch_ranks(self, br_id):
    """Return the ranks recorded for branch ``br_id``.

    Takes the first element of the ``self.bid_taxonomy_map[br_id]`` record
    (the rank UID) and splits it with ``Taxonomy.split_rank_uid``. A missing
    ``br_id`` propagates the mapping's lookup error.
    """
    rank_uid = self.bid_taxonomy_map[br_id][0]
    return Taxonomy.split_rank_uid(rank_uid)
def assign_taxonomy_maxsum(self, edges, minlw):
    """Sum up all LH-weights for each rank and take the rank with the max. sum.

    In the EPA result, each placement (= branch) has a "weight". Since we are
    interested in taxonomic placement, we do not care about branch vs. branch
    comparisons, but only consider rank vs. rank (e.g. G1 S1 vs. G1 S2 vs.
    G1). Thus weights are accumulated per rank, using two measures:

    * "own" weight   = sum of weights of all placements EXACTLY to this rank
      (e.g. for G1: G1 only);
    * "total" weight = own weight + own weight of all children
      (for G1: G1 or G1 S1 or G1 S2).

    Args:
        edges: iterable of placement records; ``edge[0]`` is the branch id
            (converted with ``str`` before lookup in
            ``self.bid_taxonomy_map``) and ``edge[2]`` is its weight.
        minlw: minimal "total" weight a rank needs to be eligible.

    Returns:
        Tuple ``(ranks, confidences)``: the assigned ranks and, for each
        non-empty rank, its "total" weight as the confidence value. If no
        branch carries any non-empty rank, the ranks of the last processed
        branch are returned with confidence 1.0 each.
    """
    rw_own = {}
    rw_total = {}
    ranks = [Taxonomy.EMPTY_RANK]

    for edge in edges:
        br_id = str(edge[0])
        lweight = edge[2]
        if lweight == 0.0:
            continue

        # accumulate weight for the current sequence
        br_rank_id, rdiff, _ = self.bid_taxonomy_map[br_id]
        ranks = Taxonomy.split_rank_uid(br_rank_id)

        lowest_rank = None
        lowest_rank_lvl = None
        for i, rank in enumerate(ranks):
            if rank == Taxonomy.EMPTY_RANK:
                break
            rank_id = Taxonomy.get_rank_uid(ranks, i)
            rw_total[rank_id] = rw_total.get(rank_id, 0) + lweight
            lowest_rank_lvl = i
            lowest_rank = rank_id

        if not lowest_rank:
            # branch has no taxonomic annotation: it contributes nothing
            continue

        if rdiff > 0:
            # if ranks of 'upper' and 'lower' adjacent nodes of a branch are
            # non-equal, split LHW among them
            parent_share = lweight * self.parent_lhw_coeff
            parent_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - rdiff)
            rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight * (1 - self.parent_lhw_coeff)
            rw_own[parent_rank] = rw_own.get(parent_rank, 0) + parent_share
            # correct total lhw for all levels between "parent" and "lowest"
            # NOTE: all those intermediate ranks are in fact indistinguishable,
            # e.g. a family which contains a single genus
            for r in range(rdiff):
                interim_rank = Taxonomy.get_rank_uid(ranks, lowest_rank_lvl - r)
                rw_total[interim_rank] = rw_total.get(interim_rank, 0) - parent_share
        else:
            rw_own[lowest_rank] = rw_own.get(lowest_rank, 0) + lweight

    # if all branches have empty ranks only, just return this placement
    if not rw_total:
        return ranks, [1.0] * len(ranks)

    # we assign the sequence to a rank, which has the max "own" weight AND
    # whose "total" weight is greater than a confidence threshold
    max_rw = 0.0
    ass_rank_id = None
    # Plain dict iteration works on both Python 2 and 3 (iterkeys() is
    # Python 2 only and raises AttributeError on Python 3).
    for rank_id in rw_own:
        if rw_own[rank_id] > max_rw and rw_total[rank_id] >= minlw:
            ass_rank_id = rank_id
            max_rw = rw_own[rank_id]
    if not ass_rank_id:
        # no rank passed the threshold: fall back to the max "total" weight
        ass_rank_id = max(rw_total, key=rw_total.get)

    a_ranks = Taxonomy.split_rank_uid(ass_rank_id)

    # "total" weight is considered as confidence value for now
    a_conf = [0.0] * len(a_ranks)
    for i, rank in enumerate(a_ranks):
        if rank != Taxonomy.EMPTY_RANK:
            a_conf[i] = rw_total[Taxonomy.get_rank_uid(a_ranks, i)]

    return a_ranks, a_conf
def strip_missing_ranks(self, ranks):
    """Cut ``ranks`` back to its longest prefix known to ``self.ranks_set``.

    The UID of each candidate prefix is computed with
    ``Taxonomy.get_rank_uid``; trailing ranks are removed until that UID is
    a member of ``self.ranks_set`` or the prefix is empty.
    """
    keep = len(ranks)
    while True:
        uid = Taxonomy.get_rank_uid(ranks[0:keep])
        still_missing = uid not in self.ranks_set
        # Same order as a short-circuit "missing and keep > 0" test.
        if not (still_missing and keep > 0):
            return ranks[0:keep]
        keep -= 1