data_outs.addStream(sys.stdout)

# Write out parameters: a commented header recording when and how the run
# was started, followed by one line per parsed option.
data_outs.write("# Run started {}\n".format(util.timestamp()))
data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
data_outs.write("# Parameters:\n")
optdict = vars(options)
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read input
fname = os.path.expanduser(options.in_fname)
if not os.path.isfile(fname):
    raise IOError("# Error: file {} does not exist".format(fname))

# Root clade of the tree being assembled; it has no parent.
tree_root = Newick.Clade()
tree_root.parent = None
tree_root.name = "cellular organisms"

# Get directory of guide file.  os.path.dirname() returns '' when the guide
# file is given as a bare filename, and os.chdir('') raises, so fall back to
# the current directory in that case.
path = os.path.dirname(fname) or os.curdir
curwd = os.getcwd()  # remembered before the working directory is changed
species_names = []
with open(fname, 'r') as inf:
    # The file is opened before changing directory, so a relative `fname`
    # still resolves against the original working directory.
    os.chdir(path)
    tab = util.readTable(inf, header=True)
    rows = tab.dictrows
    if options.debug:
        # Debug runs only look at the first two rows of the guide table.
        rows = list(tab.dictrows)[:2]
def taxid2tree(self, taxid_list, out_fmt="newick"):
    """Build a tree from the paths of the given taxids and return it as text.

    Every taxid is resolved with ``self.get_path``; the first element of a
    path is treated as the root and the last as the leaf.  All paths are
    merged into one tree of ``Newick.Clade`` objects, which is serialised
    with ``bp.write``.

    Args:
        taxid_list: iterable of taxonomic ids accepted by ``self.get_path``.
        out_fmt: format name handed to ``bp.write`` (newick / phyloxml ...).

    Returns:
        The serialised tree as a string.

    Raises:
        AssertionError: if the paths do not all start with the same root.
        ValueError: if no root node could be determined, e.g. because
            ``taxid_list`` is empty.
    """
    # Paths are round-tripped through their ';'-joined text form so they are
    # parsed exactly like the lines of a path file (see path2newick).
    path_list = [
        ";".join([str(item) for item in self.get_path(taxid)])
        for taxid in taxid_list
    ]

    nodes = {}  # data format {"node_name": Clade_object}
    root = None
    for i, path in enumerate(path_list):
        line = path.strip().split(";")
        if root is None:
            root = line[0]
        elif root != line[0]:
            # Raised explicitly (not via `assert`) so the check survives
            # `python -O`; the exception type is unchanged for callers.
            raise AssertionError(
                "The %d-th line is from a different root" % (i + 1))

        # Pair every node with its parent (the root is its own parent) and
        # walk from leaf to root so that every node gets a parent recorded.
        parents = [line[0]] + line[:-1]
        for child_node, parent_node in zip(reversed(line), reversed(parents)):
            if child_node in nodes:
                continue
            clade = Newick.Clade(name=child_node)
            # Temporary attribute holding the parent's *name*; removed below.
            clade.parent = parent_node
            nodes[child_node] = clade

    root_node = None
    for node_name, node_clade in nodes.items():
        if node_name == node_clade.parent:
            # The root is the node whose parent is itself.
            root_node = node_clade
            print("root node is %s, constructing tree ..." % (
                str(node_name)))
        else:
            # Any other node is attached to its parent's list of children.
            nodes[node_clade.parent].clades.append(node_clade)
        del node_clade.parent

    if root_node is None:
        raise ValueError("no root node found; is taxid_list empty?")

    tree = Newick.Tree(root=root_node)
    treeFile = StringIO()
    bp.write(tree, treeFile, out_fmt)
    return treeFile.getvalue()
def new_clade(self, parent=None):
    """Create an empty Newick.Clade.

    If a truthy ``parent`` is supplied it is stored on the new clade's
    ``parent`` attribute as a temporary back-reference; otherwise that
    attribute is not set at all.
    """
    fresh = Newick.Clade()
    if not parent:
        return fresh
    fresh.parent = parent
    return fresh
def path2newick(self, path2pathFile, node_fmt="taxid", out_fmt="newick"):
    """Convert a file of taxonomic paths into a tree file.

    Each non-blank line of ``path2pathFile`` holds one ';'-separated path
    (trailing ';' allowed), ordered from root to leaf.  Paths should consist
    of taxonomic ids, not scientific names, because some scientific names
    are shared between ranks while ids are unique.  The merged tree is
    written with ``bp.write`` to
    ``<dir>/<basename>2tree_<node_fmt>.<out_fmt>`` next to the input file.

    Args:
        path2pathFile: path of the input path file.
        node_fmt: "taxid" to keep ids as node names, or "sciName" to
            replace them via ``self.get_sciName``.
        out_fmt: format name handed to ``bp.write`` (newick / phyloxml ...).

    Raises:
        AssertionError: if lines disagree on the root field, or if
            ``node_fmt`` is neither "taxid" nor "sciName".
        ValueError: if the file yields no root node (e.g. it is empty).
    """
    path, fileName = os.path.split(path2pathFile)
    basename = os.path.splitext(fileName)[0]
    outFile = os.path.join(
        path, basename + "2tree_" + node_fmt + "." + out_fmt)

    nodes = {}  # data format {"node_name": Clade_object}
    root = None
    with open(path2pathFile, "r") as pathFile:
        for i, raw_line in enumerate(pathFile):
            text = raw_line.strip().rstrip(";").strip()
            if not text:
                # Blank lines (e.g. a trailing newline at end of file)
                # carry no path; skip them instead of failing below.
                continue
            line = text.split(";")

            # NOTE(review): the root check compares the *second* field,
            # while the node treated as root below is the first field
            # (taxid2tree compares line[0]) -- confirm against the path
            # file format.  A one-field line raises IndexError here.
            if root is None:
                root = line[1]
            elif root != line[1]:
                # Raised explicitly (not via `assert`) so the check
                # survives `python -O`; the exception type is unchanged.
                raise AssertionError(
                    "The %d-th line is from a different root" % (i + 1))

            # Pair every node with its parent (the first field is its own
            # parent) and walk from leaf to root so that every node gets
            # a parent recorded.
            parents = [line[0]] + line[:-1]
            for child_node, parent_node in zip(reversed(line),
                                               reversed(parents)):
                if child_node in nodes:
                    continue
                clade = Newick.Clade(name=child_node)
                # Temporary attribute holding the parent's *name*.
                clade.parent = parent_node
                nodes[child_node] = clade

    root_node = None
    for node_name, node_clade in nodes.items():
        if node_name == node_clade.parent:
            # The root is the node whose parent is itself.
            root_node = node_clade
            print(node_clade)
            print("root node found!! \n")
        else:
            # Any other node is attached to its parent's list of children.
            nodes[node_clade.parent].clades.append(node_clade)
        del node_clade.parent

    if root_node is None:
        raise ValueError(
            "no root node found in %s; is the file empty?" % path2pathFile)

    # transform between output node format
    if node_fmt != "taxid":
        if node_fmt != "sciName":
            raise AssertionError("The node_fmt should be taxid or sciName")
        # Every non-root node is the child of exactly one parent, so
        # renaming children plus the root converts each name once.
        for node in nodes.values():
            for child in node.clades:
                if child:
                    child.name = self.get_sciName(child.name)
        root_node.name = self.get_sciName(root_node.name)

    tree = Newick.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (out_fmt, outFile))
    bp.write(tree, outFile, out_fmt)