示例#1
0
文件: ncbiquery.py 项目: Ward9250/ete
def load_ncbi_tree_from_dump(tar):
    """Build the NCBI taxonomy tree from an opened taxdump archive.

    The expected archive is the one published at
    ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz; only its
    ``names.dmp`` and ``nodes.dmp`` members are read.

    Args:
        tar: object whose ``extractfile(member_name)`` returns an iterable
            of byte lines (for instance an opened ``tarfile.TarFile``).

    Returns:
        A ``(tree, synonyms)`` tuple. ``tree`` is the ete3 node named
        ``"1"``; every node carries ``name`` (the taxid string),
        ``taxname``, ``rank`` and, when a GenBank common name exists,
        ``common_name``. ``synonyms`` is a set of ``(taxid, name)`` tuples.

    Raises:
        KeyError: if a node of ``nodes.dmp`` has no scientific name in
            ``names.dmp``, or refers to a parent that is not listed.
        ValueError: if ``nodes.dmp`` contains no node ``"1"``.
    """
    from ete3 import Tree

    # Name classes of names.dmp that are collected as synonyms. Built once
    # here instead of once per input line.
    synonym_types = frozenset([
        "synonym", "equivalent name", "genbank equivalent name",
        "anamorph", "genbank synonym", "genbank anamorph", "teleomorph",
    ])

    child2parent = {}
    name2node = {}
    node2taxname = {}
    node2common = {}
    synonyms = set()

    print("Loading node names...")
    for line in tar.extractfile("names.dmp"):
        fields = [field.strip() for field in str(line.decode()).split("|")]
        nodename = fields[0]
        taxname = fields[1]
        name_type = fields[3].lower()
        # A line has exactly one name type, so the branches are exclusive.
        if name_type == "scientific name":
            node2taxname[nodename] = taxname
        elif name_type == "genbank common name":
            node2common[nodename] = taxname
        elif name_type in synonym_types:
            synonyms.add((nodename, taxname))
    print(len(node2taxname), "names loaded.")
    print(len(synonyms), "synonyms loaded.")

    print("Loading nodes...")
    for line in tar.extractfile("nodes.dmp"):
        fields = str(line.decode()).split("|")
        nodename = fields[0].strip()
        n = Tree()
        n.name = nodename
        n.taxname = node2taxname[nodename]
        if nodename in node2common:
            n.common_name = node2common[nodename]
        n.rank = fields[2].strip()
        child2parent[nodename] = fields[1].strip()
        name2node[nodename] = n
    print(len(name2node), "nodes loaded.")

    print("Linking nodes...")
    root = None
    for nodename, node in name2node.items():
        if nodename == "1":
            # Node "1" is returned as the tree and is never attached to a
            # parent.
            root = node
        else:
            name2node[child2parent[nodename]].add_child(node)
    if root is None:
        raise ValueError('nodes.dmp does not contain the root node "1"')
    print("Tree is loaded.")
    return root, synonyms
示例#2
0
def getTheTrees():
    """Build one ete3 node per NCBI taxid, annotated with names and ranks.

    Prerequisites (nothing is downloaded here): the NCBI taxdump files must
    be stored in ``taxo/`` and the TAXREF table must be put there by hand.

    Files read, relative to the current working directory:

    * ``taxo/TAXREFv11.txt`` -- tab-separated; field 14 is used as the
      scientific name and field 19 as a French common name.
    * ``ranks.txt`` -- tab-separated; English rank name, then French name.
    * ``taxo/names.dmp`` -- ``|``-separated; taxid, name, (unused), type.
    * ``taxo/nodes.dmp`` -- ``|``-separated; taxid, parent taxid, rank.

    Returns:
        dict: taxid string -> ete3 ``Tree`` node, each node already attached
        to its parent. Every node carries ``name``, ``taxid``, ``sci_name``,
        ``common_name`` (list), ``synonym``, ``authority`` and
        ``common_name_FR`` (list). ``rank`` and ``rank_FR`` come from the
        node's own ``nodes.dmp`` line, so a taxid that is only ever seen as
        a parent (such as the one on the skipped first line) has neither.

    Raises:
        IndexError: if an input line has fewer fields than listed above.
        KeyError: if ``nodes.dmp`` mentions a taxid absent from
            ``names.dmp`` or a rank absent from ``ranks.txt``.
    """
    from ete3 import Tree

    class Taxid(object):
        """Names gathered from names.dmp for a single taxid."""

        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    def append_csv(current, value):
        """Return *current* extended with *value*, separated by ', '."""
        if current != "":
            return current + ", " + value
        return value

    print("Getting french translations...")
    TRANS = {}  # scientific name -> list of French common names
    with open("taxo/TAXREFv11.txt") as f:
        for line in f:
            cols = line.split("\t")
            sciname = cols[14]
            comnameFR = cols[19]
            if comnameFR != '':
                TRANS.setdefault(sciname, []).append(comnameFR)

    print("\nGetting rank names in french...")
    RANKS = {}  # English rank name -> French rank name
    with open("ranks.txt") as f:
        for line in f:
            cols = line.split("\t")
            RANKS[cols[0]] = cols[1].rstrip()  # rstrip drops the newline

    ATTR = {}  # taxid -> Taxid holding every name found for it
    print("Reading NCBI taxonomy...")
    with open("taxo/names.dmp") as f:
        for line in f:
            cols = line.split("|")
            taxid = cols[0].replace("\t", "")
            tid_val = cols[1].replace("\t", "")
            tid_type = cols[3].replace("\t", "")
            if taxid not in ATTR:
                ATTR[taxid] = Taxid()
            attr = ATTR[taxid]
            if tid_type == "scientific name":
                attr.sci_name = tid_val
                # Attach the French names (if any); the list object is
                # shared with TRANS, not copied.
                if tid_val in TRANS:
                    attr.common_name_FR = TRANS[tid_val]
            elif tid_type == "authority":
                attr.authority = append_csv(attr.authority, tid_val)
            elif tid_type == "synonym":
                attr.synonym = append_csv(attr.synonym, tid_val)
            elif tid_type == "common name":
                attr.common_name.append(tid_val)

    def make_node(taxid):
        """Create the node of *taxid* with every rank-independent field."""
        attr = ATTR[taxid]
        node = Tree()
        node.name = taxid
        node.taxid = taxid
        node.sci_name = attr.sci_name
        node.common_name = attr.common_name
        node.synonym = attr.synonym
        node.authority = attr.authority
        node.common_name_FR = attr.common_name_FR
        return node

    T = {}
    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as fp:
        fp.readline()  # skip the first line: the "1 | 1" self-edge
        for line in fp:
            cols = line.split("|")
            dad = cols[1].replace("\t", "")
            son = cols[0].replace("\t", "")
            rank = cols[2].replace("\t", "")  # rank of the son
            if dad not in T:
                # The dad's own rank is unknown until its own line is read.
                T[dad] = make_node(dad)
            if son not in T:
                T[son] = make_node(son)
                T[son].rank = rank
                T[son].rank_FR = RANKS[rank]
            elif not hasattr(T[son], 'rank'):
                # The son was created earlier as somebody's dad.
                T[son].rank = rank
                T[son].rank_FR = RANKS[rank]
            T[dad].add_child(T[son])
    return T
示例#3
0
def getTheTrees():
    """Download French names, then build one annotated ete3 node per taxid.

    Side effect: runs ``sudo wget`` through the shell to (re)write
    ``taxo/TAXONOMIC-VERNACULAR-FR.txt`` before anything is read.

    Files read, relative to the current working directory:

    * ``taxo/TAXONOMIC-VERNACULAR-FR.txt`` -- tab-separated; scientific
      name, then a French common name.
    * ``taxo/ranks.txt`` -- tab-separated; English rank name, then French.
    * ``taxo/names.dmp`` -- ``|``-separated; taxid, name, (unused), type.
    * ``taxo/nodes.dmp`` -- ``|``-separated; taxid, parent taxid, rank.

    Returns:
        dict: taxid string -> ete3 ``Tree`` node, each node already attached
        to its parent. Every node carries ``name``, ``taxid``, ``sci_name``,
        ``common_name`` (list, fed by both "common name" and "genbank common
        name" entries), ``synonym``, ``authority`` and ``common_name_FR``
        (list). ``rank`` and ``rank_FR`` come from the node's own
        ``nodes.dmp`` line, so a taxid that is only ever seen as a parent
        (such as the one on the skipped first line) has no ``rank``.

    Raises:
        IndexError: if an input line has fewer fields than listed above.
        KeyError: if ``nodes.dmp`` mentions a taxid absent from
            ``names.dmp`` or a rank absent from ``taxo/ranks.txt``, or if
            taxid ``'54972'`` is not part of the tree.
    """
    import os

    from ete3 import Tree

    class Taxid(object):
        """Names gathered from names.dmp for a single taxid."""

        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    def append_csv(current, value):
        """Return *current* extended with *value*, separated by ', '."""
        if current != "":
            return current + ", " + value
        return value

    print("Getting french translations...")
    # NOTE(review): the exit status is ignored, so a failed download is only
    # noticed (if at all) when the file is read below -- confirm this
    # best-effort behaviour is intended. The command also requires sudo.
    os.system("sudo wget -O taxo/TAXONOMIC-VERNACULAR-FR.txt https://github.com/damiendevienne/taxonomy-fr/blob/master/TAXONOMIC-VERNACULAR-FR.txt?raw=true")
    TRANS = {}  # scientific name -> list of French common names
    with open("taxo/TAXONOMIC-VERNACULAR-FR.txt") as f:
        for line in f:
            cols = line.split("\t")
            sciname = cols[0]
            comnameFR = cols[1].rstrip()
            TRANS.setdefault(sciname, []).append(comnameFR)

    print("\nGetting rank names in french...")
    RANKS = {}  # English rank name -> French rank name
    with open("taxo/ranks.txt") as f:
        for line in f:
            cols = line.split("\t")
            RANKS[cols[0]] = cols[1].rstrip()  # rstrip drops the newline

    ATTR = {}  # taxid -> Taxid holding every name found for it
    print("Reading NCBI taxonomy...")
    with open("taxo/names.dmp") as f:
        for line in f:
            cols = line.split("|")
            taxid = cols[0].replace("\t", "")
            tid_val = cols[1].replace("\t", "")
            tid_type = cols[3].replace("\t", "")
            if taxid not in ATTR:
                ATTR[taxid] = Taxid()
            attr = ATTR[taxid]
            if tid_type == "scientific name":
                attr.sci_name = tid_val
                # Attach the French names (if any); the list object is
                # shared with TRANS, not copied.
                if tid_val in TRANS:
                    attr.common_name_FR = TRANS[tid_val]
            elif tid_type == "authority":
                attr.authority = append_csv(attr.authority, tid_val)
            elif tid_type == "synonym":
                attr.synonym = append_csv(attr.synonym, tid_val)
            elif tid_type in ("common name", "genbank common name"):
                attr.common_name.append(tid_val)

    def make_node(taxid):
        """Create the node of *taxid* with every rank-independent field."""
        attr = ATTR[taxid]
        node = Tree()
        node.name = taxid
        node.taxid = taxid
        node.sci_name = attr.sci_name
        node.common_name = attr.common_name
        node.synonym = attr.synonym
        node.authority = attr.authority
        node.common_name_FR = attr.common_name_FR
        return node

    T = {}
    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as fp:
        fp.readline()  # skip the first line: the "1 | 1" self-edge
        for line in fp:
            cols = line.split("|")
            dad = cols[1].replace("\t", "")
            son = cols[0].replace("\t", "")
            rank = cols[2].replace("\t", "")  # rank of the son, not the dad
            if dad not in T:
                # The dad's own rank is unknown until its own line is read.
                T[dad] = make_node(dad)
            if son not in T:
                T[son] = make_node(son)
                T[son].rank = rank
                T[son].rank_FR = RANKS[rank]
            elif not hasattr(T[son], 'rank'):
                # The son was created earlier as somebody's dad.
                T[son].rank = rank
                T[son].rank_FR = RANKS[rank]
            T[dad].add_child(T[son])

    # Disambiguation: the original author blanks this French rank because
    # the taxon is a group of birds, not "sabline" flowers.
    T['54972'].rank_FR = ""

    return T
示例#4
0
# Build the taxonomy tree in the pre-existing dict T (taxid -> Tree node)
# from nodes.dmp, whose "|"-separated fields are: taxid, parent taxid, rank.
# ATTR (taxid -> names) and RANKS (English -> French rank) must already be
# filled in, and Tree must be in scope.
filepath = 'taxo/nodes.dmp'
print("Building the NCBI taxonomy tree...")


def _new_taxon_node(taxid):
    """Create the node of *taxid* with every rank-independent field."""
    attr = ATTR[taxid]
    node = Tree()
    node.name = taxid
    node.taxid = taxid
    node.sci_name = attr.sci_name
    node.common_name = attr.common_name
    node.synonym = attr.synonym
    node.authority = attr.authority
    node.common_name_FR = attr.common_name_FR
    return node


with open(filepath) as fp:
    for line in fp:
        fields = line.split("|")
        dad = fields[1].replace("\t", "")
        son = fields[0].replace("\t", "")
        # The rank on a line belongs to the son. It used to be copied onto a
        # newly created dad as well, which labelled parents with the rank of
        # whichever child happened to be listed first.
        rank = fields[2].replace("\t", "")
        if dad not in T:
            T[dad] = _new_taxon_node(dad)
        if son not in T:
            T[son] = _new_taxon_node(son)
        T[son].rank = rank
        T[son].rank_FR = RANKS[rank]
        # The root is listed as its own parent ("1 | 1"): keep its rank but
        # do not attach the node to itself.
        if son != dad:
            T[dad].add_child(T[son])
def getTheTrees():
    """Assemble the NCBI taxonomy as a dict of annotated ete3 nodes.

    Prerequisites (nothing is downloaded here): the NCBI taxdump files must
    be stored in ``taxo/`` and the TAXREF table must be put there by hand.

    Files read, relative to the current working directory:

    * ``taxo/TAXREFv11.txt`` -- tab-separated; field 14 is used as the
      scientific name and field 19 as a French common name.
    * ``taxo/ranks_FR.txt`` -- tab-separated; English rank, then French.
    * ``taxo/names.dmp`` -- ``|``-separated; taxid, name, (unused), type.
    * ``taxo/nodes.dmp`` -- ``|``-separated; taxid, parent taxid, rank.

    Returns:
        dict: taxid string -> ``Tree`` node, each node already attached to
        its parent. Every node carries ``name``, ``taxid``, ``sci_name``,
        ``common_name`` (list), ``synonym``, ``authority`` and
        ``common_name_FR`` (list). ``rank`` and ``rank_FR`` come from the
        node's own ``nodes.dmp`` line, so a taxid that is only ever seen as
        a parent (such as the one on the skipped first line) has neither.

    Raises:
        IndexError: if an input line has fewer fields than listed above.
        KeyError: if ``nodes.dmp`` mentions a taxid absent from
            ``names.dmp`` or a rank absent from ``taxo/ranks_FR.txt``.
    """

    class _Names:
        """Everything names.dmp says about one taxid."""

        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    # --- French common names, keyed by scientific name -------------------
    print("Getting french translations...")
    french_names = {}
    with open("taxo/TAXREFv11.txt") as taxref:
        for row in taxref:
            columns = row.split("\t")
            scientific = columns[14]
            vernacular = columns[19]
            if vernacular == '':
                continue
            french_names.setdefault(scientific, []).append(vernacular)

    # --- English -> French rank names -------------------------------------
    print("Getting rank names in french...")
    with open("taxo/ranks_FR.txt") as rank_file:
        rank_to_french = {
            # rstrip removes the trailing newline of the French name
            columns[0]: columns[1].rstrip()
            for columns in (row.split("\t") for row in rank_file)
        }

    # --- Names attached to each taxid --------------------------------------
    # TODO (note left by the original author): maybe add small per-node
    # filters at this point -- keep only the first (French) common name,
    # double the single quotes in names and ranks, and wrap a non-empty
    # common name in parentheses. To be decided.
    print("Reading NCBI taxonomy...")
    records = {}
    with open("taxo/names.dmp") as names_file:
        for row in names_file:
            parts = row.split("|")
            taxid = parts[0].replace("\t", "")
            value = parts[1].replace("\t", "")
            kind = parts[3].replace("\t", "")

            try:
                record = records[taxid]
            except KeyError:
                record = records[taxid] = _Names()

            if kind == "scientific name":
                record.sci_name = value
                # French translation, if any; the list is shared with
                # french_names rather than copied.
                translated = french_names.get(value)
                if translated is not None:
                    record.common_name_FR = translated
            elif kind == "authority":
                record.authority = (
                    value if record.authority == ""
                    else record.authority + ", " + value)
            elif kind == "synonym":
                record.synonym = (
                    value if record.synonym == ""
                    else record.synonym + ", " + value)
            elif kind == "common name":
                record.common_name.append(value)

    def annotated_node(taxid):
        """Return a new node for *taxid* with all rank-independent fields."""
        record = records[taxid]
        node = Tree()
        node.name = taxid
        node.taxid = taxid
        node.sci_name = record.sci_name
        node.common_name = record.common_name
        node.synonym = record.synonym
        node.authority = record.authority
        node.common_name_FR = record.common_name_FR
        return node

    def set_rank(node, rank):
        """Store the English rank and its French translation on *node*."""
        node.rank = rank
        node.rank_FR = rank_to_french[rank]

    # --- Tree construction --------------------------------------------------
    nodes = {}
    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as nodes_file:
        next(nodes_file, None)  # drop the first line: the "1 | 1" edge
        for row in nodes_file:
            parts = row.split("|")
            parent_id = parts[1].replace("\t", "")
            child_id = parts[0].replace("\t", "")
            child_rank = parts[2].replace("\t", "")

            if parent_id not in nodes:
                # Rank stays unset: this line only describes the child.
                nodes[parent_id] = annotated_node(parent_id)

            if child_id in nodes:
                # Created earlier as a parent; fill in the rank once.
                if not hasattr(nodes[child_id], 'rank'):
                    set_rank(nodes[child_id], child_rank)
            else:
                nodes[child_id] = annotated_node(child_id)
                set_rank(nodes[child_id], child_rank)

            nodes[parent_id].add_child(nodes[child_id])
    return nodes
示例#6
0
def getTheTrees():
    """Build one ete3 node per NCBI taxid, annotated with names and ranks.

    Prerequisites (nothing is downloaded here): the NCBI taxdump files must
    be stored in ``taxo/`` and the TAXREF table must be put there by hand.

    Files read, relative to the current working directory:

    * ``taxo/TAXREFv11.txt`` -- tab-separated; field 14 is used as the
      scientific name and field 19 as a French common name.
    * ``taxo/ranks.txt`` -- tab-separated; English rank name, then French.
    * ``taxo/names.dmp`` -- ``|``-separated; taxid, name, (unused), type.
    * ``taxo/nodes.dmp`` -- ``|``-separated; taxid, parent taxid, rank.

    Returns:
        dict: taxid string -> ete3 ``Tree`` node, each node already attached
        to its parent. Every node carries ``name``, ``taxid``, ``sci_name``,
        ``common_name`` (list, fed by both "common name" and "genbank common
        name" entries), ``synonym``, ``authority`` and ``common_name_FR``
        (list). A node first seen as a child also gets ``rank`` and
        ``rank_FR``; a node first seen as a parent gets ``rank`` only, once
        its own line is read.

    Raises:
        IndexError: if an input line has fewer fields than listed above.
        KeyError: if ``nodes.dmp`` mentions a taxid absent from
            ``names.dmp`` or a rank absent from ``taxo/ranks.txt``.
    """
    from ete3 import Tree

    class Taxid(object):
        """Names gathered from names.dmp for a single taxid."""

        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    def append_csv(current, value):
        """Return *current* extended with *value*, separated by ', '."""
        if current != "":
            return current + ", " + value
        return value

    print("Getting french translations...")
    TRANS = {}  # scientific name -> list of French common names
    with open("taxo/TAXREFv11.txt") as f:
        for line in f:
            cols = line.split("\t")
            sciname = cols[14]
            comnameFR = cols[19]
            if comnameFR != '':
                TRANS.setdefault(sciname, []).append(comnameFR)

    print("\nGetting rank names in french...")
    RANKS = {}  # English rank name -> French rank name
    with open("taxo/ranks.txt") as f:
        for line in f:
            cols = line.split("\t")
            RANKS[cols[0]] = cols[1].rstrip()  # rstrip drops the newline

    ATTR = {}  # taxid -> Taxid holding every name found for it
    print("Reading NCBI taxonomy...")
    with open("taxo/names.dmp") as f:
        for line in f:
            cols = line.split("|")
            taxid = cols[0].replace("\t", "")
            tid_val = cols[1].replace("\t", "")
            tid_type = cols[3].replace("\t", "")
            if taxid not in ATTR:
                ATTR[taxid] = Taxid()
            attr = ATTR[taxid]
            if tid_type == "scientific name":
                attr.sci_name = tid_val
                # Attach the French names (if any); the list object is
                # shared with TRANS, not copied.
                if tid_val in TRANS:
                    attr.common_name_FR = TRANS[tid_val]
            elif tid_type == "authority":
                attr.authority = append_csv(attr.authority, tid_val)
            elif tid_type == "synonym":
                attr.synonym = append_csv(attr.synonym, tid_val)
            elif tid_type in ("common name", "genbank common name"):
                attr.common_name.append(tid_val)

    def make_node(taxid):
        """Create the node of *taxid* with every rank-independent field."""
        attr = ATTR[taxid]
        node = Tree()
        node.name = taxid
        node.taxid = taxid
        node.sci_name = attr.sci_name
        node.common_name = attr.common_name
        node.synonym = attr.synonym
        node.authority = attr.authority
        node.common_name_FR = attr.common_name_FR
        return node

    T = {}
    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as fp:
        fp.readline()  # skip the first line: the "1 | 1" self-edge
        for line in fp:
            cols = line.split("|")
            dad = cols[1].replace("\t", "")
            son = cols[0].replace("\t", "")
            rank = cols[2].replace("\t", "")  # rank of the son
            if dad not in T:
                # The dad's own rank is unknown until its own line is read.
                T[dad] = make_node(dad)
            if son not in T:
                T[son] = make_node(son)
                T[son].rank = rank
                T[son].rank_FR = RANKS[rank]
            elif not hasattr(T[son], 'rank'):
                # The son was created earlier as somebody's dad.
                # NOTE(review): rank_FR is deliberately left unset here, as
                # in the original (the assignment was commented out), so
                # such nodes end up without a rank_FR attribute -- confirm
                # this is intended.
                T[son].rank = rank
            T[dad].add_child(T[son])
    return T
示例#7
0
# Build the taxonomy tree in the pre-existing dict T (taxid -> Tree node)
# from nodes.dmp, whose "|"-separated fields are: taxid, parent taxid, rank.
# ATTR (taxid -> names) and RANKS (English -> French rank) must already be
# filled in, and Tree must be in scope.
filepath = 'taxo/nodes.dmp'
print("Building the NCBI taxonomy tree...")


def _new_taxon_node(taxid):
    """Create the node of *taxid* with every rank-independent field."""
    attr = ATTR[taxid]
    node = Tree()
    node.name = taxid
    node.taxid = taxid
    node.sci_name = attr.sci_name
    node.common_name = attr.common_name
    node.synonym = attr.synonym
    node.authority = attr.authority
    node.common_name_FR = attr.common_name_FR
    return node


with open(filepath) as fp:
    for line in fp:
        fields = line.split("|")
        dad = fields[1].replace("\t", "")
        son = fields[0].replace("\t", "")
        # The rank on a line belongs to the son. It used to be copied onto a
        # newly created dad as well, which labelled parents with the rank of
        # whichever child happened to be listed first.
        rank = fields[2].replace("\t", "")
        if dad not in T:
            T[dad] = _new_taxon_node(dad)
        if son not in T:
            T[son] = _new_taxon_node(son)
        T[son].rank = rank
        T[son].rank_FR = RANKS[rank]
        # The root is listed as its own parent ("1 | 1"): keep its rank but
        # do not attach the node to itself.
        if son != dad:
            T[dad].add_child(T[son])