# Restrict the taxonomy to the taxa identified in the sample: starting from each
# taxon, record every child -> parent link until a node that is its own parent
# is reached.
for nodeid in taxa:
    parentid = node2parent[nodeid]
    while parentid != nodeid:
        ass_node2parent[nodeid] = parentid
        # Move one level up the lineage.
        nodeid, parentid = parentid, node2parent[parentid]

# TreeNode -> taxid of its parent, consumed when the topology is rebuilt below.
node2parentid = {}
for nodeid, parentid in ass_node2parent.items():
    # Remember every taxid seen, as child or as parent.
    all_ids.update((nodeid, parentid))
    # One TreeNode per retained taxon, annotated with its name, taxid and order.
    n = TreeNode()
    n.add_feature("name", node2name[nodeid])
    n.add_feature("taxid", nodeid)
    n.add_feature("Order", node2order[nodeid])

    # Register the node and the link to its parent.
    id2node[nodeid] = n
    node2parentid[n] = parentid
print(len(id2node))
# Rebuild the tree topology from the links stored above.
print('Reconstructing tree topology...')
for node in id2node.values():
    parentid = node2parentid[node]
    # NOTE(review): a self-parented node never becomes a key of ass_node2parent
    # above, so this lookup presumably relies on id2node already holding it.
    parent = id2node[parentid]
    # The node with taxid "1" is kept as the root of the tree.
    # NOTE(review): no add_child() call is visible for the other nodes; confirm
    # whether the original script attaches them further down.
    if node.taxid == "1":
        t = node
# Example #2
0
def build_tax_tree():
    """Load the NCBI taxonomy dumps of the working directory into ete2 nodes.

    Reads ``names_scientific.dmp`` and ``nodes.dmp`` (or the ``.bz2`` twin of
    either file when the plain one is absent) from the current directory,
    creates one ``TreeNode`` per row of the nodes dump carrying the ``name``,
    ``taxid`` and ``rank`` features, and attaches every node except the one
    with taxid ``"1"`` to its parent.

    As a side effect ``sys.stdout`` is rebound to an unbuffered file object.

    Returns:
        A tuple ``(id2node, id2name)``: taxid -> TreeNode and taxid -> name,
        both keyed by the taxid strings read from the dumps.

    Raises:
        IOError: a dump is missing in both its plain and ``.bz2`` form.
        ValueError: a row of the nodes dump has a blank taxid or parent taxid.
        KeyError: a node has no entry in the names dump, or its parent taxid
            does not appear in the nodes dump.
    """
    import os
    import sys

    def find_dump(basename):
        # Prefer the plain dump and fall back to its bzip2-compressed twin.
        if os.path.exists(basename):
            return basename, False
        if os.path.exists(basename + ".bz2"):
            return basename + ".bz2", True
        raise IOError('"%s" file is missing (also looked for "%s.bz2")'
                      % (basename, basename))

    def open_dump(path, compressed):
        if compressed:
            import bz2
            return bz2.BZ2File(path)
        return open(path)

    # Fail fast, before any global state is touched, when an input is absent.
    nodes_source = find_dump("nodes.dmp")
    names_source = find_dump("names_scientific.dmp")

    from ete2 import TreeNode

    # This sets Unbuffered stdout/auto-flush
    # NOTE(review): every call replaces the previous sys.stdout wrapper;
    # confirm that calling this function more than once is safe.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    id2node = {}
    id2name = {}
    parent_of = {}  # taxid -> taxid of its parent

    # Reads taxid/names translation
    names_handle = open_dump(*names_source)
    try:
        for line in names_handle:
            fields = [field.strip() for field in line.strip().split("|")]
            id2name[fields[0]] = fields[1]
    finally:
        names_handle.close()

    # Reads node connections in nodes.dmp
    nodes_handle = open_dump(*nodes_source)
    try:
        for line in nodes_handle:
            fields = [field.strip() for field in line.strip().split("|")]
            nodeid, parentid, rank = fields[0], fields[1], fields[2]
            if nodeid == "" or parentid == "":
                # A loader must not block on stdin; report the bad row instead.
                raise ValueError("Wrong nodeid! Offending line: %r" % (line,))

            # Creates a new TreeNode instance for each node in the file
            node = TreeNode()
            node.add_feature("name", id2name[nodeid])
            node.add_feature("taxid", nodeid)
            node.add_feature("rank", rank)

            parent_of[nodeid] = parentid
            id2node[nodeid] = node
    finally:
        nodes_handle.close()

    # Reconstruct tree topology from the stored connections; the node with
    # taxid "1" is the root and is therefore not attached to anything.
    for nodeid in id2node:
        if nodeid != "1":
            id2node[parent_of[nodeid]].add_child(id2node[nodeid])

    return id2node, id2name
def load_NCBI(species_file, names_file, nodes_file):
    """Build the NCBI taxonomy tree and look up the taxids of wanted species.

    Args:
        species_file: path of a text file listing one species name per line.
        names_file: path of the names dump; ``names_file + ".bz2"`` is read
            instead when the plain file does not exist.
        nodes_file: path of the nodes dump, with the same ``.bz2`` fallback.

    Returns:
        A tuple ``(t, id2node, all_wanted_species)``: the node whose taxid is
        ``"1"``, the taxid -> TreeNode mapping and the species name -> taxid
        mapping (taxids are the strings read from the names dump).

    The process exits with status 8 when an input file is missing and with
    status 9 when a wanted species name does not occur in the names dump.
    As a side effect ``sys.stdout`` is rebound to an unbuffered file object.

    Raises:
        ValueError: a row of the nodes dump has a blank taxid or parent taxid,
            or the nodes dump contains no node with taxid ``"1"``.
        KeyError: a node has no "scientific name" row, or its parent taxid
            does not appear in the nodes dump.
    """
    if not os.path.isfile(species_file):
        print("ERROR " + species_file + ' can\'t be read. Exiting... ')
        sys.exit(8)

    def locate(path):
        # Prefer the plain dump and fall back to "<path>.bz2"; the same suffix
        # is used for the existence test and for opening the file.
        if os.path.exists(path):
            return path, False
        if os.path.exists(path + '.bz2'):
            return path + '.bz2', True
        print(path + ' file is missing. ')
        sys.exit(8)

    def open_dump(path, compressed):
        if compressed:
            import bz2
            return bz2.BZ2File(path)
        return open(path)

    all_wanted_species = {}  # species_name:   taxid (string), -1 until found

    print("Reading wanted species from file: " + species_file)
    with open(species_file, 'r') as ifile:
        for iline in ifile:
            all_wanted_species[iline.strip()] = -1

    # Resolve both dumps before any of them is opened, so a missing file
    # exits without leaving a handle open.
    nodes_source = locate(nodes_file)
    names_source = locate(names_file)

    # This sets Unbuffered stdout/auto-flush
    # NOTE(review): every call replaces the previous sys.stdout wrapper;
    # confirm that calling this function more than once is safe.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    id2node = {}
    id2name = {}
    parent_of = {}  # taxid -> taxid of its parent

    # Reads taxid/names translation. The message is written without a newline
    # so that the count printed after the loop lands on the same line.
    sys.stdout.write('Loading species names from "names.dmp" file... ')
    names_handle = open_dump(*names_source)
    try:
        for line in names_handle:
            # Synonyms of one taxid sit on separate lines: any of them may
            # match a wanted species, but only the "scientific name" row
            # provides the name shown on the tree node.
            fields = [field.strip() for field in line.strip().split("|")]
            nodeid, name = fields[0], fields[1]

            if name in all_wanted_species:
                all_wanted_species[name] = nodeid

            if fields[3] == 'scientific name':
                id2name[nodeid] = name
    finally:
        names_handle.close()

    print(len(id2name))

    # Report every unresolved species before giving up.
    missing = [species_name for species_name in all_wanted_species
               if all_wanted_species[species_name] == -1]
    for species_name in missing:
        print("ERROR the species name \"" + species_name + "\" was not found!")
    if missing:
        sys.exit(9)

    # Reads node connections in nodes.dmp
    sys.stdout.write('Loading node connections from "nodes.dmp" file... ')
    nodes_handle = open_dump(*nodes_source)
    try:
        for line in nodes_handle:
            fields = [field.strip() for field in line.strip().split("|")]
            nodeid, parentid = fields[0], fields[1]
            if nodeid == "" or parentid == "":
                # A loader must not block on stdin; report the bad row instead.
                raise ValueError("Wrong nodeid! Offending line: %r" % (line,))

            # Creates a new TreeNode instance for each node in the file
            n = TreeNode()
            n.add_feature("name", id2name[nodeid])
            n.add_feature("taxid", nodeid)

            parent_of[nodeid] = parentid
            id2node[nodeid] = n
    finally:
        nodes_handle.close()

    print(len(id2node))

    # Reconstruct tree topology from previously stored tree connections
    print('Reconstructing tree topology...')
    t = None
    for nodeid in id2node:
        node = id2node[nodeid]
        # node with taxid=1 is the root of the tree
        if nodeid == "1":
            t = node
        else:
            id2node[parent_of[nodeid]].add_child(node)
    if t is None:
        raise ValueError('no node with taxid "1" found in ' + nodes_file)
    return t, id2node, all_wanted_species
# Restrict the taxonomy to the taxa identified in the sample: starting from each
# taxon, record every child -> parent link until a node that is its own parent
# is reached.
for nodeid in taxa:
    parentid = node2parent[nodeid]
    while nodeid != parentid:  # build a new dictionary holding only the taxa identified in the sample
        ass_node2parent[nodeid] = parentid
        # Move one level up the lineage.
        nodeid = parentid
        parentid = node2parent[nodeid]

# TreeNode -> taxid of its parent, consumed when the topology is rebuilt below.
node2parentid = {}
for nodeid in ass_node2parent.keys():
    parentid = ass_node2parent[nodeid]
    # Stores node connections
    all_ids.update([nodeid, parentid])
    # Creates a new TreeNode instance for each retained taxon
    n = TreeNode()
    # Sets some TreeNode attributes
    n.add_feature("name", node2name[nodeid])
    n.add_feature("taxid", nodeid)
    n.add_feature("Order", node2order[nodeid])

    # updates node list and connections
    node2parentid[n] = parentid
    id2node[nodeid] = n
print len(id2node)
# Reconstruct tree topology from previously stored tree connections
print 'Reconstructing tree topology...'
for node in id2node.itervalues():
    parentid = node2parentid[node]
    # NOTE(review): a self-parented node never becomes a key of ass_node2parent
    # above, so this lookup presumably relies on id2node already holding it.
    parent = id2node[parentid]
    # node with taxid=1 is the root of the tree
    # NOTE(review): no add_child() call is visible for the other nodes; the
    # loop may continue past this excerpt -- confirm.
    if node.taxid == "1":
        t = node