def parse(handle):
    """Iterate over the trees of a Nexus file as Newick trees.

    The handle is handed to the existing ``Nexus.Nexus`` parser. Every tree
    it exposes through ``.trees`` is then rebuilt, node by node, as a
    ``Newick.Tree`` made of ``Newick.Clade`` objects, so the Nexus module
    itself does not need to change.

    This is a generator: nothing is read or converted until iteration starts.
    """
    def _to_clade(source_tree, source_node):
        """Rebuild ``source_node`` and everything below it as a Newick.Clade."""
        # ``succ`` holds the identifiers of the child nodes; each one is
        # resolved through the owning tree and converted depth-first.
        children = []
        for child_id in source_node.succ:
            children.append(_to_clade(source_tree, source_tree.node(child_id)))
        return Newick.Clade(
            branch_length=source_node.data.branchlength,
            name=source_node.data.taxon,
            clades=children,
            confidence=source_node.data.support,
            comment=source_node.data.comment,
        )

    nexus_data = Nexus.Nexus(handle)
    # NB: if Nexus.Trees ever produces Newick tree objects itself, this whole
    # conversion can be replaced by ``return iter(nexus_data.trees)``.
    for source_tree in nexus_data.trees:
        root_clade = _to_clade(source_tree, source_tree.node(source_tree.root))
        yield Newick.Tree(
            root=root_clade,
            rooted=source_tree.rooted,
            name=source_tree.name,
            weight=source_tree.weight,
        )
def taxid2tree(self, taxid_list, out_fmt="newick"):
    """Build a tree from the lineage paths of several taxids and return it as text.

    For every entry of ``taxid_list`` the lineage is obtained from
    ``self.get_path`` (root first). The lineages are merged into a single
    tree whose nodes are named after the stringified path elements, and the
    tree is serialised with ``bp.write``.

    Args:
        taxid_list: iterable of identifiers accepted by ``self.get_path``.
        out_fmt: format name passed on to ``bp.write`` (e.g. "newick" or
            "phyloxml").

    Returns:
        The serialised tree as a string.

    Raises:
        AssertionError: if a path starts at a different root than the first
            path.
        ValueError: if no root node could be determined, e.g. because
            ``taxid_list`` is empty.
    """
    # Paths are flattened to "a;b;c" and split again below; this round trip
    # is kept so node names are derived exactly as before.
    path_list = [";".join([str(item) for item in self.get_path(taxid)])
                 for taxid in taxid_list]

    nodes = {}   # {"node_name": Clade object}
    root = None  # first element of the first path; all paths must share it
    for i, path in enumerate(path_list):
        line = path.strip().split(";")
        if root is None:
            root = line[0]
        else:
            assert root == line[0], "The %d-th line is from a different root"%(i+1)

        # Walk from leaf to root so each node can record its parent's name.
        leaf2root = line[::-1]
        last = len(leaf2root) - 1
        for j, child_node in enumerate(leaf2root):
            if child_node in nodes:
                continue
            # The root is marked by being its own parent.
            parent_node = child_node if j == last else leaf2root[j + 1]
            nodes[child_node] = Newick.Clade(name=child_node)
            nodes[child_node].parent = parent_node

    # Link every clade into its parent's child list.
    root_node = None
    for node_name, node_clade in nodes.items():
        if node_name == node_clade.parent:
            root_node = node_clade
            print("root node is %s, constructing tree ..." % (str(node_name)))
        else:
            nodes[node_clade.parent].clades.append(node_clade)
        # ``parent`` was only temporary bookkeeping; drop it from every
        # clade so it does not linger on the finished tree.
        del node_clade.parent

    if root_node is None:
        raise ValueError("No root node found: taxid_list produced no usable paths")

    tree = Newick.Tree(root=root_node)
    treeFile = StringIO()
    bp.write(tree, treeFile, out_fmt)
    return treeFile.getvalue()
def _parse_tree(self, text):
    """Return a Newick.Tree whose root clade is parsed from ``text``.

    The text is delegated to ``self._parse_subtree``; its result becomes the
    root of the new tree. No other tree attribute is set here.
    """
    # Open question carried over from the original: is there any tree-wide
    # information available at this point that should be forwarded (perhaps
    # through **kwargs)?
    root_clade = self._parse_subtree(text)
    return Newick.Tree(root=root_clade)
def _parse_tree(self, text):
    """Parse the text representation of one Newick tree into a Tree object.

    The stripped text is split into tokens with the module-level
    ``tokenizer`` pattern and consumed in a single pass. Clades are created
    with ``self.new_clade`` and closed with ``self.process_clade``.

    Args:
        text: Newick text of a single tree.

    Returns:
        ``Newick.Tree`` rooted at the parsed clade, with ``rooted`` taken
        from ``self.rooted``.

    Raises:
        NewickError: on a closing parenthesis without a parent clade, when
            the numbers of opening and closing parentheses differ, or when
            any token follows the terminating semicolon.
    """
    tokens = re.finditer(tokenizer, text.strip())

    new_clade = self.new_clade
    root_clade = new_clade()

    current_clade = root_clade
    lp_count = 0
    rp_count = 0
    for match in tokens:
        token = match.group()

        if token.startswith("'"):
            # Quoted label: the name is the token without its quotes.
            current_clade.name = token[1:-1]

        elif token.startswith('['):
            # Comment: stored without the enclosing brackets.
            current_clade.comment = token[1:-1]
            if self.comments_are_confidence:
                # Try to use this comment as a numeric support value.
                current_clade.confidence = _parse_confidence(current_clade.comment)

        elif token == '(':
            # Start a new clade, which is a child of the current clade.
            current_clade = new_clade(current_clade)
            lp_count += 1

        elif token == ',':
            # If the current clade is the root, the outer parentheses are
            # missing and a new root has to be created above it.
            if current_clade is root_clade:
                root_clade = new_clade()
                current_clade.parent = root_clade

            # Close the current clade and open a sibling under its parent.
            # NOTE(review): process_clade is assumed to return the clade's
            # parent (falsy when there is none) - confirm against its
            # definition.
            parent = self.process_clade(current_clade)
            current_clade = new_clade(parent)

        elif token == ')':
            # Done adding children for this parent clade.
            parent = self.process_clade(current_clade)
            if not parent:
                raise NewickError('Parenthesis mismatch.')
            current_clade = parent
            rp_count += 1

        elif token == ';':
            break

        elif token.startswith(':'):
            # Branch length or confidence, depending on parser settings.
            value = float(token[1:])
            if self.values_are_confidence:
                current_clade.confidence = value
            else:
                current_clade.branch_length = value

        elif token == '\n':
            pass

        else:
            # Unquoted node label.
            current_clade.name = token

    if lp_count != rp_count:
        raise NewickError('Number of open/close parentheses do not match.')

    # If a ';' ended the loop early, nothing may follow it. When the loop
    # ran to completion the iterator is exhausted and this yields None.
    leftover = next(tokens, None)
    if leftover is not None:
        raise NewickError('Text after semicolon in Newick tree: %s'
                          % leftover.group())

    self.process_clade(current_clade)
    self.process_clade(root_clade)
    return Newick.Tree(root=root_clade, rooted=self.rooted)
def _parse_tree(self, text, rooted):
    """Return a Newick.Tree whose root clade is parsed from ``text``.

    The root clade comes from ``self._parse_subtree(text)``; the tree's
    ``rooted`` flag is taken from ``self.rooted``.

    NOTE(review): the ``rooted`` argument is accepted but never consulted;
    only the instance attribute is used. Confirm whether the two are always
    meant to be equal.
    """
    # Open question carried over from the original: should **kwargs from
    # Parser.parse be passed along to this point?
    root_clade = self._parse_subtree(text)
    return Newick.Tree(root=root_clade, rooted=self.rooted)
def path2newick(self, path2pathFile, node_fmt="taxid", out_fmt="newick"):
    """Convert a file of taxonomic paths into a tree file.

    Every non-empty line of the input file is a ';'-separated path of
    taxonomic ids, root first. Ids are used rather than scientific names
    because names can repeat across ranks while ids are unique. The paths
    are merged into one tree, which is written next to the input file as
    ``<basename>2tree_<node_fmt>.<out_fmt>``.

    Args:
        path2pathFile: path of the input file.
        node_fmt: "taxid" keeps the ids as node names; "sciName" replaces
            every node name with ``self.get_sciName(name)``.
        out_fmt: format name passed on to ``bp.write`` (e.g. "newick" or
            "phyloxml").

    Raises:
        AssertionError: if a line does not share the root of the first
            line, or if ``node_fmt`` is neither "taxid" nor "sciName".
        IndexError: if a non-empty line has fewer than two fields.
        ValueError: if no root node could be determined, e.g. because the
            file contains no paths.
    """
    path, fileName = os.path.split(path2pathFile)
    basename = os.path.splitext(fileName)[0]
    outFile = os.path.join(path, basename + "2tree_" + node_fmt + "." + out_fmt)

    nodes = {}   # {"node_name": Clade object}
    root = None  # reference value every line is checked against
    with open(path2pathFile, "r") as pathFile:
        for i, raw_line in enumerate(pathFile):
            # Drop surrounding whitespace and any trailing ';' separators.
            stripped = raw_line.strip().rstrip(";").strip()
            if not stripped:
                # Blank lines carry no path; skip them instead of failing.
                continue
            line = stripped.split(";")

            # NOTE(review): the shared-root check compares the SECOND
            # field, while the node that becomes its own parent below is
            # the FIRST field - confirm against the path file format.
            if root is None:
                root = line[1]
            else:
                assert root == line[
                    1], "The %d-th line is from a different root" % (i + 1)

            # Walk from leaf to root so each node can record its parent.
            leaf2root = line[::-1]
            last = len(leaf2root) - 1
            for j, child_node in enumerate(leaf2root):
                if child_node in nodes:
                    continue
                # The root is marked by being its own parent.
                parent_node = child_node if j == last else leaf2root[j + 1]
                nodes[child_node] = Newick.Clade(name=child_node)
                nodes[child_node].parent = parent_node

    # Link every clade into its parent's child list.
    root_node = None
    for node_name, node_clade in nodes.items():
        if node_name == node_clade.parent:
            root_node = node_clade
            print(node_clade)
            print("root node found!! \n")
        else:
            nodes[node_clade.parent].clades.append(node_clade)
        # ``parent`` was only temporary bookkeeping; drop it from every
        # clade so it does not linger on the finished tree.
        del node_clade.parent

    if root_node is None:
        raise ValueError("No root node found in %s" % path2pathFile)

    if node_fmt != "taxid":
        assert node_fmt == "sciName", "The node_fmt should be taxid or sciName"
        # Every clade of the tree (root included) lives in ``nodes``, so
        # each name is translated exactly once.
        for clade in nodes.values():
            clade.name = self.get_sciName(clade.name)
    tree = Newick.Tree(root=root_node)

    print('Writing %s tree to %s...' % (out_fmt, outFile))
    bp.write(tree, outFile, out_fmt)