for nodeid in taxa: parentid = node2parent[nodeid] while nodeid != parentid: #costruiamo un nuovo dizionario per i soli taxa che abbiamo identificato nel campione ass_node2parent[nodeid] = parentid nodeid = parentid parentid = node2parent[nodeid] node2parentid = {} for nodeid in ass_node2parent.keys(): parentid = ass_node2parent[nodeid] # Stores node connections all_ids.update([nodeid, parentid]) # Creates a new TreeNode instance for each new node in file n = TreeNode() # Sets some TreeNode attributes n.add_feature("name", node2name[nodeid]) n.add_feature("taxid", nodeid) n.add_feature("Order", node2order[nodeid]) # updates node list and connections node2parentid[n] = parentid id2node[nodeid] = n print len(id2node) # Reconstruct tree topology from previously stored tree connections print 'Reconstructing tree topology...' for node in id2node.itervalues(): parentid = node2parentid[node] parent = id2node[parentid] # node with taxid=1 is the root of the tree if node.taxid == "1": t = node
def build_tax_tree():
    """Build the NCBI taxonomy tree from dump files in the current directory.

    Reads ``names_scientific.dmp`` (``taxid | name | ...``) and ``nodes.dmp``
    (``taxid | parent taxid | rank | ...``).  For either file, a ``.bz2``
    compressed copy is used when the plain file does not exist.  One
    ``ete2.TreeNode`` is created per line of the nodes file, carrying the
    features ``name``, ``taxid`` and ``rank``; every node except the one
    with taxid ``"1"`` is then attached to its parent node.

    Side effect: ``sys.stdout`` is replaced by an unbuffered file object.

    Returns:
        A tuple ``(id2node, id2name)``: a dict mapping taxid -> TreeNode and
        a dict mapping taxid -> name.  Taxids are kept as strings.

    Raises:
        IOError: if a dump file is missing (neither plain nor ``.bz2``).
        KeyError: if a node has no entry in the names file, or if its
            parent taxid does not appear in the nodes file.
    """
    import os
    import sys

    def locate(filename):
        """Return (path, is_compressed) for *filename* or raise IOError."""
        if os.path.exists(filename):
            return filename, False
        if os.path.exists(filename + ".bz2"):
            return filename + ".bz2", True
        raise IOError(
            '"%s" file is missing (also looked for "%s.bz2").'
            % (filename, filename)
        )

    def open_dump(location):
        """Open a located dump file, transparently handling bzip2."""
        path, compressed = location
        if compressed:
            import bz2
            return bz2.BZ2File(path)
        return open(path)

    def split_fields(line):
        """Split a '|'-separated dump line into whitespace-stripped fields."""
        return [field.strip() for field in line.strip().split("|")]

    # Fail fast: check both inputs before touching stdout or loading ete2,
    # instead of printing a message and crashing later on an unbound name.
    nodes_location = locate("nodes.dmp")
    names_location = locate("names_scientific.dmp")

    from ete2 import TreeNode

    # This sets unbuffered stdout / auto-flush (Python 2 semantics).
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    # Read the taxid -> name translation.
    id2name = {}
    names_handle = open_dump(names_location)
    try:
        for line in names_handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = split_fields(line)
            id2name[fields[0]] = fields[1]
    finally:
        names_handle.close()

    # Read the node connections and create one TreeNode per taxid.
    id2node = {}
    id2parentid = {}
    nodes_handle = open_dump(nodes_location)
    try:
        for line in nodes_handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = split_fields(line)
            nodeid, parentid, rankid = fields[0], fields[1], fields[2]
            if nodeid == "" or parentid == "":
                # NOTE(review): interactive pause kept from the original
                # code; processing continues once the user presses Enter.
                raw_input("Wrong nodeid!")

            node = TreeNode()
            node.add_feature("name", id2name[nodeid])
            node.add_feature("taxid", nodeid)
            node.add_feature("rank", rankid)

            id2parentid[nodeid] = parentid
            id2node[nodeid] = node
    finally:
        nodes_handle.close()

    # Reconstruct the topology from the stored child -> parent links.
    for nodeid in id2node:
        node = id2node[nodeid]
        parent = id2node[id2parentid[nodeid]]
        # The node with taxid "1" is the root and is not attached anywhere.
        if node.taxid != "1":
            parent.add_child(node)

    return id2node, id2name
def load_NCBI(species_file, names_file, nodes_file):
    """Load the NCBI taxonomy and resolve wanted species names to taxids.

    Args:
        species_file: text file with one species name per line.
        names_file: path to the NCBI ``names.dmp`` file; when it does not
            exist, ``names_file + ".bz2"`` is read instead.
        nodes_file: path to the NCBI ``nodes.dmp`` file, with the same
            ``.bz2`` fallback.

    Side effect: ``sys.stdout`` is replaced by an unbuffered file object,
    and progress messages are printed.

    Returns:
        A tuple ``(t, id2node, all_wanted_species)``: the root TreeNode
        (taxid ``"1"``), a dict taxid -> TreeNode, and a dict mapping every
        wanted species name to its taxid.  Taxids are kept as strings.

    Raises:
        SystemExit: status 8 if the species file or one of the dump files
            is missing; status 9 if any wanted species name is not present
            in the names file.
        KeyError: if a node has no "scientific name" entry, or if its
            parent taxid does not appear in the nodes file.
        ValueError: if the nodes file contains no node with taxid "1".
    """
    def resolve(path):
        """Return (existing_path, is_compressed) or exit with status 8."""
        if os.path.exists(path):
            return path, False
        # The original code tested path + "bz2" (no dot) here, so the
        # compressed fallback could never be selected.
        if os.path.exists(path + ".bz2"):
            return path + ".bz2", True
        print(path + ' file is missing. ')
        sys.exit(8)

    def open_dump(location):
        """Open a resolved dump file, transparently handling bzip2."""
        path, compressed = location
        if compressed:
            import bz2
            return bz2.BZ2File(path)
        return open(path)

    def split_fields(line):
        """Split a '|'-separated dump line into whitespace-stripped fields."""
        return [field.strip() for field in line.strip().split("|")]

    if not os.path.isfile(species_file):
        print("ERROR " + species_file + " can't be read. Exiting... ")
        sys.exit(8)

    # species name -> taxid (string); -1 marks "not found yet".
    all_wanted_species = {}
    print("Reading wanted species from file: " + species_file)
    with open(species_file, 'r') as species_handle:
        for raw_line in species_handle:
            species_name = raw_line.strip()
            if species_name:  # ignore blank lines
                all_wanted_species[species_name] = -1

    # This sets unbuffered stdout / auto-flush (Python 2 semantics).
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    nodes_location = resolve(nodes_file)
    names_location = resolve(names_file)

    # Read the taxid/name translation.  Lines are redundant: synonyms sit
    # on separate lines sharing one taxid.  Any name class may match a
    # wanted species, but only the "scientific name" is kept for the node.
    sys.stdout.write('Loading species names from "names.dmp" file... ')
    id2name = {}
    names_handle = open_dump(names_location)
    try:
        for line in names_handle:
            if not line.strip():
                continue
            fields = split_fields(line)
            nodeid, name = fields[0], fields[1]
            if name in all_wanted_species:
                all_wanted_species[name] = nodeid
            if fields[3] == 'scientific name':
                id2name[nodeid] = name
    finally:
        names_handle.close()
    print(len(id2name))

    # Report every unresolved species before giving up.
    missing_species = [
        name for name in all_wanted_species
        if all_wanted_species[name] == -1
    ]
    for species_name in missing_species:
        print('ERROR the species name "' + species_name + '" was not found!')
    if missing_species:
        sys.exit(9)

    # Read the node connections and create one TreeNode per taxid.
    sys.stdout.write('Loading node connections from "nodes.dmp" file... ')
    id2node = {}
    id2parentid = {}
    nodes_handle = open_dump(nodes_location)
    try:
        for line in nodes_handle:
            if not line.strip():
                continue
            fields = split_fields(line)
            nodeid, parentid = fields[0], fields[1]
            if nodeid == "" or parentid == "":
                # NOTE(review): interactive pause kept from the original
                # code; processing continues once the user presses Enter.
                raw_input("Wrong nodeid!")

            node = TreeNode()
            node.add_feature("name", id2name[nodeid])
            node.add_feature("taxid", nodeid)

            id2parentid[nodeid] = parentid
            id2node[nodeid] = node
    finally:
        nodes_handle.close()
    print(len(id2node))

    # Reconstruct the topology from the stored child -> parent links.
    print('Reconstructing tree topology...')
    t = None
    for nodeid in id2node:
        node = id2node[nodeid]
        parent = id2node[id2parentid[nodeid]]
        # The node with taxid "1" is the root of the tree.
        if node.taxid == "1":
            t = node
        else:
            parent.add_child(node)

    if t is None:
        raise ValueError('no root node (taxid "1") found in ' + nodes_file)

    return t, id2node, all_wanted_species