def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate the internal nodes of a time tree with NCBI taxonomy names.

    The tree is loaded from `timetreefile`, each leaf name (underscores
    replaced by spaces) is translated to an NCBI taxid, and leaves without a
    translation are pruned. Ancestors are then annotated, either with ete3's
    own `NCBITaxa.annotate_tree` (`ete3_algo=True`) or with `myannotate`.

    Output goes to stdout:
      * `to_table=False`: the annotated tree, written with `format=1`.
      * `to_table=True`: one line per internal node, `oldname<TAB>sci_name`
        (empty string when the node has no `sci_name`).

    `uniq` is forwarded unchanged to `matchrename_ncbitax`.
    """
    logger.info('Loading data')
    # NOTE: the `quoted_node_names` argument requires ete3 >= 3.1.1.
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    spaced_names = [label.replace('_', ' ')
                    for label in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(spaced_names)

    for leaf in timetree.get_leaves():
        lookup_key = leaf.name.replace('_', ' ')
        try:
            leaf.add_feature('taxid', name2taxid[lookup_key][0])
        except KeyError:
            # Species unknown to NCBI: drop the leaf, keeping the tree
            # dichotomic and the remaining branch lengths intact.
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True, preserve_branch_length=True)

    logger.info('Placing common ancestors')
    if not ete3_algo:
        myannotate(timetree, ncbi)
    else:
        ncbi.annotate_tree(timetree, 'taxid')

    # NOTE(review): run for both annotation paths; `oldname`, read below, is
    # presumably set here -- confirm against matchrename_ncbitax.
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
def ultrametricer(node_order, tree_file):
    """Return the tree in `tree_file` as Newick with ultrametric branch lengths.

    Every node is assigned a target depth (distance from the root):
      * the root, whose name is the empty string, gets 0;
      * the i-th name in `node_order` gets i + 1;
      * every leaf gets the number of leaves in the tree.
    Each branch length is then set to depth(child) - depth(parent), so every
    root-to-leaf path sums to the number of leaves.

    Parameters:
        node_order: iterable of internal node names, shallowest first. A
            node listed before its ancestor ends up with a negative branch.
        tree_file: path to a file whose FIRST line is a Newick string
            readable with ete3 `format=1`.

    Returns:
        The re-scaled tree as a Newick string (`format=1`).

    Raises:
        StopIteration: if `tree_file` is empty.
        KeyError: if a non-root node's name is neither a leaf name nor
            listed in `node_order`.
    """
    with open(tree_file) as f:
        # `next(f)` works on Python 2.6+ and 3; `f.next()` is Python 2 only.
        mytree = PhyloTree(next(f).strip(), format=1)

    leaves = mytree.get_leaves()
    # All leaves sit at this depth.
    total_depth = len(leaves)

    # Expected depth per node name. Assignment order matters: leaf names and
    # the root override any clashing entry coming from `node_order`.
    distances = {name: rank for rank, name in enumerate(node_order, start=1)}
    for leaf in leaves:
        distances[leaf.name] = total_depth
    # The root has no name.
    distances[""] = 0

    # One pass over all non-root nodes; each branch is set exactly once.
    for node in mytree.iter_descendants():
        node.dist = distances[node.name] - distances[node.up.name]

    return mytree.write(format=1)
tphy = PhyloTree("/home/xavi/Documents/scripts/ete-proves/cyps.newick") tsps = PhyloTree( "/home/xavi/Documents/scripts/ete-proves/cyps_sps_22mosquits.newick") # In[]: # assign species names to tree def get_species_name(node_name_string): # Species code is the first part of leaf name (separated by an # underscore character) spcode = node_name_string.split("_")[0] return spcode tphy.set_species_naming_function(get_species_name) for n in tphy.get_leaves(): print("node:", n.name, "Species name:", n.species) # In[]: # find evolutionary events using tree reconciliation tree_rec, evev_rec = tphy.reconcile(tsps) # In[]: print(tree_rec) tree_rec.show() # In[]: # find evolutionary events using species overlap evev = tphy.get_descendant_evol_events() for ev in evev:
seq2sp_dict[seq]=sp def get_species_name(node_name_string): return seq2sp_dict[node_name_string] def put_species_name(node_name_string): return node_name_string # read the gene tree genetree = PhyloTree(GeneTreeFilename, sp_naming_function=get_species_name) sptree = PhyloTree(SpeciesTreeFilename, sp_naming_function=put_species_name) logger.debug("Genetree") for n in genetree.get_leaves(): logger.debug("node: %s Species name: %s", n.name, n.species) logger.debug("SpeciesTree") for n in sptree.get_leaves(): logger.debug("node: %s Species name: %s", n.name, n.species) iS = 0 sp_dict = {} for n in sptree.traverse(): n.S=iS iS+=1 if not n.is_leaf(): n.name = n.S else: sp_dict[n.name] = n.S
# libraries from ete3 import PhyloTree # read tree from file phy = PhyloTree("adar_hol.01.iqt.contree.newick") # assign species names to tree phy.set_species_naming_function(lambda node: node.name.split("_")[0]) for n in phy.get_leaves(): print("node:", n.name, "Species name:", n.species) # root tree phy_outgroup = phy.get_midpoint_outgroup() phy.set_outgroup(phy_outgroup) # find evolutionary events evev = phy.get_descendant_evol_events(sos_thr=0.9) for ev in evev: if ev.etype == "S": print(ev.orthologs) # find evolutionary events evev = phy.get_descendant_evol_events(sos_thr=0.9) # all events for ev in evev: print(ev.etype, ','.join(ev.in_seqs), "<====>", ','.join(ev.out_seqs)) # all events involving either Hsap or Drer fseqs = lambda slist: [
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));") # /-Hsa_001 # /--------| # | \-Ptr_001 # /--------| # | | /-Cfa_001 # | \--------| #---------| \-Mms_001 # | # | /-Dme_001 # \--------| # \-Dme_002 # # Prints current leaf names and species codes print "Deafult mode:" for n in t.get_leaves(): print "node:", n.name, "Species name:", n.species # node: Dme_001 Species name: Dme # node: Dme_002 Species name: Dme # node: Hsa_001 Species name: Hsa # node: Ptr_001 Species name: Ptr # node: Cfa_001 Species name: Cfa # node: Mms_001 Species name: Mms # # We can also use our own leaf name parsing function to obtain species # names. All we need to do is create a python function that takes # node's name as argument and return its corresponding species name. def get_species_name(node_name_string): # Species code is the first part of leaf name (separated by an # underscore character) spcode = node_name_string.split("_")[0]
def process_tree(treepath):
    '''
    Extract orthology relationships between the target taxid and every other
    species in one gene tree, grouped by orthology type and species.

    Relies on module-level names defined outside this function:
    `get_species`, `conversion`, `names`, `target_taxid` and `collapse`.

    Leaf names are expected to start with a numeric taxid followed by a dot.
    Each leaf gets a `taxid` (str) and a `good_name`: the first entry of
    `conversion[leaf.name]` when available, otherwise the leaf name, with
    runs of separator characters replaced by "_".

    When `collapse == 'yes'`, every clade made up only of the target species
    is collapsed into a single node. The tree is then midpoint-rooted and
    every speciation event ("S") with the target species on one side is
    reported.

    Returns:
        A list of strings, one per (event, species) pair. Each is
        tab-joined: target sequences (comma separated), orthology type
        ("one-to-one", "one-to-many", "many-to-one", "many-to-many"),
        species taxid, `names[taxid]`, ortholog sequences (comma separated),
        then a final tab and newline.
        Returns None when rooting fails on a single-leaf tree.

    Raises:
        ValueError: if a leaf name does not start with an integer taxid.
        KeyError: if an ortholog's taxid is missing from `names`.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)

    for leaf in t:
        tax = int(leaf.name.split(".", 1)[0])
        leaf.taxid = str(tax)
        # Prefer the converted name; fall back to the raw leaf name.
        try:
            good_name = "%s" % (conversion[leaf.name][0])
        except (KeyError, IndexError, TypeError):
            good_name = leaf.name
        # Raw string: same pattern as before, without invalid "\]" / "\["
        # string escapes.
        leaf.good_name = re.sub(r"[ |\t,:)(;\n\]\[]+", "_", good_name)

    t.dist = 0

    # Collapse clades that contain only the target species.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            # NOTE(review): the relabelling is kept inside `if n.children`
            # (reconstructed from a flattened source) so that genuine leaves
            # keep their own names -- confirm against the original layout.
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                # Stored as str, like the per-leaf taxids compared below.
                n.taxid = str(target_taxid)
                n.name = '|'.join([_lf.name for _lf in node2content[n]])
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in node2content[n]]))

    # Midpoint rooting; a single-leaf tree cannot be rooted and is skipped.
    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
    except Exception:
        if len(t) == 1:
            return
        raise

    # The topology changed (collapse + rooting): refresh the leaf cache.
    node2content = t.get_cached_content()
    target = str(target_taxid)
    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = node2content[ev.node.children[0]]
        ortho_seqs = node2content[ev.node.children[1]]

        # Make `source_seqs` the side holding the target species; skip
        # events in which neither side does.
        if target not in set(leaf.taxid for leaf in source_seqs):
            if target not in set(leaf.taxid for leaf in ortho_seqs):
                continue
            source_seqs, ortho_seqs = ortho_seqs, source_seqs

        # Target-species sequences on the source side.
        co_orthologs = sorted(
            leaf.good_name for leaf in source_seqs if leaf.taxid == target)

        # Sequences on the other side, grouped by (integer) taxid.
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[int(leaf.taxid)].add(leaf.good_name)

        _otype = "one-to-" if len(co_orthologs) == 1 else "many-to-"

        # `.items()` works on both Python 2 and 3 (`iteritems` is 2-only).
        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), names[sp], ','.join(sorted(orth)), '\n'
            ]))

    return event_lines
return pickle.load(f) genedict = load_obj('genedict') speciescolors = load_obj('colors') red = Color('red') blue = Color('blue') colorvec = list(red.range_to(blue, len(genedict))) colormap = {} columnmap = {} for i,fasta in enumerate(genedict): columnmap[fasta] = i colormap[fasta] = colorvec[i].hex annotated = [] print speciescolors for fasta in genedict: for leaf in t.get_leaves(): nst = NodeStyle() nst["size"] = 0 nst["fgcolor"] = 'black' nst["hz_line_width"] = 2 nst["vt_line_width"]= 2 nst.show_name = True if leaf.name.split('/')[0] in genedict[fasta]: if 'HH' not in fasta and 'LOMETS' not in fasta: leaf.add_face( RectFace ( 10 , 10 , colormap[fasta], colormap[fasta] ), column = columnmap[fasta] ) if leaf not in annotated: try: face = leaf.add_face( TextFace ( text = genedict[fasta][leaf.name.split('/')[0]][2]) , column = 10 ) annotated.append(leaf) except:
def process_tree(treepath):
    '''
    Extract orthology relationships between the target taxid and every other
    species in one gene tree, grouped by orthology type and species.

    Relies on module-level names defined outside this function: `args`
    (`pairs_table`, `conv_table`), `get_species`, `ncbi`, `conversion`,
    `target_taxid` and `collapse`.

    The tree is midpoint-rooted and standardized. If that fails, a note is
    written to stderr and:
      * with `args.pairs_table`: a single-leaf tree yields `([], [])`; any
        other tree is re-rooted on its first leaf and processing continues;
      * without `args.pairs_table`: `[]` is returned.

    Leaf names are expected to start with a taxid followed by a dot; the
    scientific name of each taxid is looked up through `ncbi` ('' when the
    lookup fails). With `args.conv_table`, sequences are reported under
    `conversion[leaf.name][0]` when available.

    When `collapse == 'yes'`, clades made up only of the target species are
    collapsed into a single node named "{a|b|...}". If `target_taxid` is
    set, only speciation events with that taxid on one side are reported;
    otherwise every speciation event is.

    Returns:
        `event_lines`, or `(event_lines, all_ortholgs_pairs)` when
        `args.pairs_table` is set.
        `event_lines`: strings, tab-joined: source sequences, orthology
        type, species taxid, ortholog sequences, tree file name, scientific
        name, then a final tab and newline.
        `all_ortholgs_pairs`: every (source, ortholog) name pair over all
        reported events.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        single_leaf = len(t) == 1
        sys.stderr.write(
            treefile + ('len(t) == 1' if single_leaf else 'len(t) != 1') + '\n')
        if not args.pairs_table:
            return []
        if single_leaf:
            return ([], [])
        # Fall back to rooting on the first leaf, then keep going.
        t.set_outgroup(t.get_leaf_names()[0])

    names = {}
    for leaf in t:
        # Assigned outside the `try` so the fallback below can never see an
        # unbound (or stale) `sp`.
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        try:
            names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
        except Exception:
            # Best effort: unknown or non-numeric taxids get an empty name.
            names[sp] = ''
        if args.conv_table:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except (KeyError, IndexError, TypeError):
                good_name = leaf.name
            leaf.good_name = good_name

    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set([_leaf.species for _leaf in node2content[_node]])
        return not (_species - target_species)

    # Collapse clades that contain only the target species.
    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            # NOTE(review): the relabelling is kept inside `if n.children`
            # (reconstructed from a flattened source) so that genuine leaves
            # keep their own names -- confirm against the original layout.
            if n.children:
                for ch in n.get_children():
                    ch.detach()
                # Stored as str, like the per-leaf taxids compared below.
                n.taxid = str(target_taxid)
                n.name = "{%s}" % ('|'.join(
                    [_lf.name for _lf in node2content[n]]))
                if args.conv_table:
                    n.good_name = "{%s}" % ('|'.join(
                        [_lf.good_name for _lf in node2content[n]]))

    def _label(leaf):
        # Reported sequence name: converted when a conversion table is used.
        return leaf.good_name if args.conv_table else leaf.name

    target = str(target_taxid)
    all_ortholgs_pairs = []
    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        # Child nodes: iterating one yields its leaves, len() counts them.
        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            # Make `source_seqs` the side holding the target species; skip
            # events in which neither side does.
            if target not in set(leaf.taxid for leaf in source_seqs):
                if target not in set(leaf.taxid for leaf in ortho_seqs):
                    continue
                source_seqs, ortho_seqs = ortho_seqs, source_seqs

        co_orthologs = sorted(_label(leaf) for leaf in source_seqs)

        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            sp = str(leaf.name.split('.')[0])
            orthologs[sp].add(_label(leaf))

        _otype = "one-to-" if len(source_seqs) == 1 else "many-to-"

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile, names[sp], '\n'
            ]))

        if args.pairs_table:
            source_seqs_names = [_label(leaf) for leaf in source_seqs]
            ortho_seqs_names = [_label(leaf) for leaf in ortho_seqs]
            # Every source/ortholog combination of this event.
            all_ortholgs_pairs.extend(
                itertools.product(source_seqs_names, ortho_seqs_names))

    if args.pairs_table:
        return (event_lines, all_ortholgs_pairs)
    return event_lines
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));") # /-Hsa_001 # /--------| # | \-Ptr_001 # /--------| # | | /-Cfa_001 # | \--------| # ---------| \-Mms_001 # | # | /-Dme_001 # \--------| # \-Dme_002 # # Prints current leaf names and species codes print "Deafult mode:" for n in t.get_leaves(): print "node:", n.name, "Species name:", n.species # node: Dme_001 Species name: Dme # node: Dme_002 Species name: Dme # node: Hsa_001 Species name: Hsa # node: Ptr_001 Species name: Ptr # node: Cfa_001 Species name: Cfa # node: Mms_001 Species name: Mms # # We can also use our own leaf name parsing function to obtain species # names. All we need to do is create a python function that takes # node's name as argument and return its corresponding species name. def get_species_name(node_name_string): # Species code is the first part of leaf name (separated by an # underscore character) spcode = node_name_string.split("_")[0]