def phylogenetic_tree(extant_taxa, branch_lengths=(.01, .5)):
    """Build a random PhyloTree whose nodes all carry numeric string names.

    Leaves are named from the pool "0" .. str(extant_taxa - 1); internal
    nodes are numbered downwards from 2 * extant_taxa - 2 in preorder, so
    the root receives the highest label.

    :param extant_taxa: number of leaves handed to ``populate``.
    :param branch_lengths: (min, max) pair forwarded as ``branch_range``.
    :returns: the populated tree, with the root distance set to 0.
    """
    leaf_labels = list(map(str, range(extant_taxa)))

    tree = PhyloTree(format=1)
    tree.populate(
        extant_taxa,
        random_branches=True,
        branch_range=branch_lengths,
        names_library=leaf_labels,
    )
    tree.dist = 0

    # NOTE(review): starting at 2n - 2 yields labels n .. 2n - 2 only if the
    # populated tree is strictly bifurcating (n - 1 internal nodes) -- confirm.
    next_label = 2 * extant_taxa - 2
    for node in tree.traverse('preorder'):
        if node.is_leaf():
            continue
        node.name = str(next_label)
        next_label -= 1

    return tree
def process_tree(treepath):
    '''
    Extract orthology relationships between the target taxid and the rest
    of the species in a gene tree, organized by orthology type and species
    code.

    Reads the module-level names ``target_taxid``, ``collapse``,
    ``conversion``, ``names`` and ``get_species``.

    :param treepath: path of the tree file; it is converted to ``str`` and
        stripped of trailing whitespace before loading.
    :returns: a list of tab-separated lines, one per (speciation event,
        ortholog species) pair, with the fields: target-side sequence
        names, orthology type ("one-to-one", "one-to-many", "many-to-one"
        or "many-to-many"), species taxid, ``names[taxid]``, ortholog
        sequence names, and a trailing newline field.  Returns ``None``
        when rooting fails on a single-leaf tree.
    :raises: whatever ``set_outgroup`` raised when rooting fails on a tree
        with more than one leaf.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    target = str(target_taxid)

    # Raw string: "\]" and "\[" are invalid escapes in a normal literal.
    unsafe_chars = re.compile(r"[ |\t,:)(;\n\]\[]+")

    # Annotate every leaf with its taxid (as str) and a sanitized label.
    for leaf in t:
        # Leaf names are parsed as "<taxid>.<rest>"; int() rejects a
        # non-numeric prefix, exactly as before.
        leaf.taxid = str(int(leaf.name.split(".", 1)[0]))
        try:
            good_name = "%s" % (conversion[leaf.name][0])
        except Exception:
            # Best effort: no usable conversion entry, keep the raw name.
            good_name = leaf.name
        leaf.good_name = unsafe_chars.sub("_", good_name)

    t.dist = 0

    # Collapse clades that only contain sequences of the target species.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set(_leaf.species for _leaf in node2content[_node])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if not n.children:
                continue
            for ch in n.get_children():
                ch.detach()
            # Keep taxid a string, like the per-leaf taxids set above, so
            # the comparisons against str(target_taxid) below still match.
            n.taxid = target
            n.name = "%s" % ('|'.join(
                [_lf.name for _lf in node2content[n]]))
            n.good_name = "{%s}" % ('|'.join(
                [_lf.good_name for _lf in node2content[n]]))

    # Root the tree at its midpoint outgroup.
    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
    except Exception:
        if len(t) == 1:
            # A single-leaf tree cannot be rooted; nothing to report.
            return None
        raise

    # The topology changed (collapse + rooting): rebuild the leaf cache.
    node2content = t.get_cached_content()

    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        first_side = node2content[ev.node.children[0]]
        second_side = node2content[ev.node.children[1]]

        # The side holding the target species is the "source" side; events
        # that do not involve the target species are skipped.
        if any(leaf.taxid == target for leaf in first_side):
            source_seqs, ortho_seqs = first_side, second_side
        elif any(leaf.taxid == target for leaf in second_side):
            source_seqs, ortho_seqs = second_side, first_side
        else:
            continue

        # Target-species sequences on the source side.
        co_orthologs = sorted(
            leaf.good_name for leaf in source_seqs if leaf.taxid == target)

        # Ortholog sequence names grouped by (integer) species taxid.
        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            orthologs[int(leaf.taxid)].add(leaf.good_name)

        _otype = "one-to-" if len(co_orthologs) == 1 else "many-to-"

        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), names[sp], ','.join(sorted(orth)), '\n'
            ]))

    return event_lines
def process_tree(treepath):
    '''
    Extract orthology relationships between the target taxid and the rest
    of the species in a gene tree, organized by orthology type and species
    code.

    Reads the module-level names ``args`` (``pairs_table``, ``conv_table``),
    ``target_taxid``, ``collapse``, ``conversion``, ``ncbi`` and
    ``get_species``.

    :param treepath: path of the tree file; it is converted to ``str`` and
        stripped of trailing whitespace before loading.
    :returns: when ``args.pairs_table`` is falsy, a list of tab-separated
        event lines (source names, orthology type, species taxid, ortholog
        names, tree file name, scientific name, trailing newline field).
        When ``args.pairs_table`` is truthy, a tuple
        ``(event_lines, pairs)`` where ``pairs`` holds every
        (source name, ortholog name) combination of each reported event.
        If rooting fails, an empty result of the matching shape is
        returned, except for multi-leaf trees in pairs mode, which are
        re-rooted on their first leaf and processed normally.
    '''
    treepath = str(treepath).rstrip()
    t = PhyloTree(treepath, sp_naming_function=get_species)
    treefile = os.path.basename(treepath)
    t.dist = 0

    outgroup = t.get_midpoint_outgroup()
    try:
        t.set_outgroup(outgroup)
        t.standardize()
    except Exception:
        single_leaf = len(t) == 1
        if single_leaf:
            sys.stderr.write(treefile + 'len(t) == 1' + '\n')
        else:
            sys.stderr.write(treefile + 'len(t) != 1' + '\n')
        if not args.pairs_table:
            return []
        if single_leaf:
            return ([], [])
        # Pairs mode, multi-leaf tree: fall back to rooting on the first leaf.
        t.set_outgroup(t.get_leaf_names()[0])

    def _label(_leaf):
        # Converted names exist only when a conversion table was supplied.
        return _leaf.good_name if args.conv_table else _leaf.name

    # Annotate leaves and resolve one scientific name per species code.
    names = {}
    for leaf in t:
        sp = str(leaf.name.split('.')[0])
        leaf.taxid = sp
        if sp not in names:
            # One lookup per species rather than one per leaf.
            try:
                names[sp] = ncbi.get_taxid_translator([sp])[int(sp)]
            except Exception:
                # Unknown or non-numeric taxid: keep an empty name.
                names[sp] = ''
        if args.conv_table:
            try:
                good_name = "%s" % (conversion[leaf.name][0])
            except Exception:
                good_name = leaf.name
            leaf.good_name = good_name

    # Collapse clades that only contain sequences of the target species.
    node2content = t.get_cached_content()
    target_species = set([target_taxid])

    def is_sp_specific(_node):
        _species = set(_leaf.species for _leaf in node2content[_node])
        return not (_species - target_species)

    if collapse == 'yes':
        for n in t.get_leaves(is_leaf_fn=is_sp_specific):
            if not n.children:
                continue
            for ch in n.get_children():
                ch.detach()
            # Keep taxid a string, like the per-leaf taxids set above.
            n.taxid = str(target_taxid)
            n.name = "{%s}" % ('|'.join(
                [_lf.name for _lf in node2content[n]]))
            if args.conv_table:
                n.good_name = "{%s}" % ('|'.join(
                    [_lf.good_name for _lf in node2content[n]]))

    all_orthologs_pairs = []
    event_lines = []
    for ev in t.get_descendant_evol_events():
        if ev.etype != "S":
            continue

        source_seqs = ev.node.children[0]
        ortho_seqs = ev.node.children[1]

        if target_taxid:
            # Iterating a node yields its leaves.  The side holding the
            # target species becomes the source; other events are skipped.
            target = str(target_taxid)
            if any(leaf.taxid == target for leaf in source_seqs):
                pass
            elif any(leaf.taxid == target for leaf in ortho_seqs):
                source_seqs, ortho_seqs = ortho_seqs, source_seqs
            else:
                continue

        co_orthologs = sorted(_label(leaf) for leaf in source_seqs)

        orthologs = defaultdict(set)
        for leaf in ortho_seqs:
            # Use the stored taxid instead of re-parsing leaf.name: a
            # collapsed node is named "{a|b}", whose prefix is not a
            # species code and would miss in `names`.
            orthologs[str(leaf.taxid)].add(_label(leaf))

        _otype = "one-to-" if len(co_orthologs) == 1 else "many-to-"

        for sp, orth in orthologs.items():
            otype = _otype + ("one" if len(orth) == 1 else "many")
            event_lines.append('\t'.join([
                ','.join(co_orthologs), otype,
                str(sp), ','.join(sorted(orth)), treefile,
                names.get(sp, ''), '\n'
            ]))

        if args.pairs_table:
            source_seqs_names = [_label(leaf) for leaf in source_seqs]
            ortho_seqs_names = [_label(leaf) for leaf in ortho_seqs]
            all_orthologs_pairs.extend(
                itertools.product(source_seqs_names, ortho_seqs_names))

    if args.pairs_table:
        return (event_lines, all_orthologs_pairs)
    return event_lines
# Nucleotide example: a three-taxon tree linked to a protein alignment,
# with the matching nucleotide sequences attached to each leaf.
tree = PhyloTree('(Orangutan,Human,Chimp);')
# FASTA records, one header or sequence per line.
tree.link_to_alignment("""
>Chimp
HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
>Orangutan
DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
>Human
DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
""")
nt_sequences = {
    "Human": "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
    "Chimp": "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
    "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG",
}
for leaf_name, nt_sequence in nt_sequences.items():
    # `tree & name` looks a node up by name.
    (tree & leaf_name).nt_sequence = nt_sequence
tree.dist = 0
ts = TreeStyle()
ts.title.add_face(TextFace("Example for nucleotides...", fsize=15), column=0)
ts.layout_fn = test_layout_evol
tree.show(tree_style=ts)

# Show very large algs
# list(...) keeps the keys indexable on Python 3, where dict.keys() is a
# view; range() replaces the Python-2-only xrange().
_aa_codes = list(_aabgcolors.keys())


def _random_aa_sequence(length=5000):
    # Draw `length` residues uniformly from the keys of _aabgcolors.
    return ''.join(
        _aa_codes[int(random() * len(_aa_codes))] for _ in range(length))


tree = PhyloTree('(Orangutan,Human,Chimp);')
tree.link_to_alignment(
    ">Human\n" + _random_aa_sequence()
    + "\n>Chimp\n" + _random_aa_sequence()
    + "\n>Orangutan\n" + _random_aa_sequence())
tree.dist = 0
ts = TreeStyle()
ts.title.add_face(
    TextFace("better not set interactivity if alg is very large", fsize=15),
    column=0)
ts.layout_fn = test_layout_phylo_aa
>Orangutan DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP >Human DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA """) nt_sequences = { "Human": "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG", "Chimp": "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG", "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG" } for l in nt_sequences: (tree & l).nt_sequence = nt_sequences[l] tree.dist = 0 ts = TreeStyle() ts.title.add_face(TextFace("Example for nucleotides...", fsize=15), column=0) ts.layout_fn = test_layout_evol tree.show(tree_style=ts) # Show very large algs tree = PhyloTree('(Orangutan,Human,Chimp);') tree.link_to_alignment(">Human\n" + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]) + \ "\n>Chimp\n" + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)]) + \ "\n>Orangutan\n" + ''.join([_aabgcolors.keys()[int(random() * len (_aabgcolors))] for _ in xrange (5000)])) tree.dist = 0 ts = TreeStyle() ts.title.add_face(TextFace( "better not set interactivity if alg is very large", fsize=15),