def reconcile_etetoolkit(protein):
    """Reconcile a protein's gene tree with its species tree and render it.

    Both trees are read as ``PhyloTree`` objects with ``format=1`` from the
    paths built out of the ``SPECIES_TREE_FILE`` and ``GENE_TREE_FILE``
    templates. Leaf names are used verbatim as species names. The reconciled
    tree is rendered to ``phylotree.png``.

    Args:
        protein: Value substituted into both path templates.
    """
    def leaf_name_is_species(name):
        return name

    species_tree = PhyloTree(
        SPECIES_TREE_FILE.format(protein, 'nh'),
        format=1,
        sp_naming_function=leaf_name_is_species,
    )
    gene_tree = PhyloTree(
        GENE_TREE_FILE.format(protein, protein, 'nh'),
        format=1,
        sp_naming_function=leaf_name_is_species,
    )

    # reconcile() returns (tree, events); only the tree is used here.
    reconciliation = gene_tree.reconcile(species_tree)
    reconciled_tree = reconciliation[0]
    reconciled_tree.render("phylotree.png")
def build_tree(self, sample, rank_limit='None'):
    """Assemble a taxonomy tree from the taxids of a sample.

    For every taxid whose rank position is not beyond ``rank_limit`` (per
    ``self.rank_position``), one node is created per non-empty entry of its
    taxonomic path and the nodes along that path are chained parent to child.
    Nodes ranked ``superkingdom`` are attached under an artificial root.

    Args:
        sample: Passed to ``self.get_all_tax_ids``.
        rank_limit: Key into ``self.rank_position`` used as the cut-off.

    Returns:
        The artificial root, or its only child (detached) when the root has
        exactly one child.
    """
    # Gets taxids of sample. Gets all taxids if sample is None.
    sample_taxids = set(self.get_all_tax_ids(sample))
    lineage_nodes = {}
    nodes_by_id = {}
    superkingdoms = []

    for taxid in sample_taxids:
        lineage = []
        lineage_nodes[taxid] = lineage
        taxpath = self.get_taxpath(taxid)
        rank = self.get_rank(taxid)
        if not (self.rank_position[rank] <= self.rank_position[rank_limit]):
            continue
        for node_id in taxpath:
            if node_id == '':
                continue
            node = nodes_by_id.get(node_id)
            if node is None:
                # First time this taxon is seen: create and annotate it.
                node = PhyloTree()
                nodes_by_id[node_id] = node
                node.name = str(node_id)
                node.taxid = node_id
                node_rank = self.get_rank(node_id)
                node.add_feature("rank", node_rank)
                node.add_feature("sci_name", self.get_name(node_id))
                if node_rank == 'superkingdom':
                    superkingdoms.append(node)
            lineage.append(node)

    # generate parent child relationships
    for lineage in lineage_nodes.values():
        parent = None
        for node in lineage:
            if parent and node not in parent.children:
                parent.add_child(node)
            parent = node

    root = PhyloTree()
    root.name = 'root'
    root.taxid = '0'
    root.add_feature("rank", "root")
    root.add_feature("sci_name", "root")
    for child in superkingdoms:
        root.add_child(child)

    if len(root.children) == 1:
        return root.children[0].detach()
    return root
def get_example_tree():
    """Build the example reconciled tree together with a default style.

    A fixed gene tree is reconciled against a fixed species tree, and the
    result is linked to the module-level alignment ``alg``.

    Returns:
        Tuple ``(reconciled_tree, TreeStyle())``.
    """
    # Performs a tree reconciliation analysis
    species_newick = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
    gene_newick = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'

    gene_phylo = PhyloTree(gene_newick)
    species_phylo = PhyloTree(species_newick)

    # The reconciliation events are returned but not needed here.
    reconciled, _events = gene_phylo.reconcile(species_phylo)
    reconciled.link_to_alignment(alg)
    return reconciled, TreeStyle()
def test_lineages(self):
    r"""Search trees (naming format: NumericTaxid.SequenceName) for nodes
    containing branches that separate two groups of primate genes where,
    in one side, the human gene has been lost, and the branch support value
    of the matching node is higher than 0.9.

                      /-Any primate taxid (9443 in lineage)
    support >= 0.9--|
                      \-Any primate taxid except human
    """
    t1 = PhyloTree("(9601.ENSPPYP00000022176:1,9593.ENSGGOP00000009720:1);")
    t2 = PhyloTree("(9361.ENSDNOP00000016844:1,9258.ENSOANP00000032529:1);")
    t3 = PhyloTree(
        "(((((37347.ENSTBEP00000010698:0.120098,(9361.ENSDNOP00000000113:0.0697238,(9785.ENSLAFP00000009564:0.0297499,(9371.ENSETEP00000002412:0.0588324,9813.ENSPCAP00000006440:0.026638)0.985184:0.0242194)0.99985:0.0211882)0.99706:0.0161759)0.756:0.00666819,((132908.ENSPVAP00000002358:0.0439546,59463.ENSMLUP00000004598:0.0635161)0.994843:0.00885432,(9796.ENSECAP00000009809:0.0292517,((9685.ENSFCAP00000004938:0.056779,(9615.ENSCAFP00000008559:0.039179,(9823.ENSSSCP00000024070:0.126803,(9669.ENSMPUP00000010096:0.0341928,9646.ENSAMEP00000005906:0.0189746)0.995231:0.00951966)0.915476:0.0046099)0.949664:0.00417374)0.99985:0.0133593,(9739.ENSTTRP00000009464:0.0664336,9913.ENSBTAP00000001687:0.036632)0.99985:0.0236174)0.939309:0.00508062)0.991475:0.00823937)0.99985:0.0107263)0.99985:0.0100107,((9986.ENSOCUP00000014919:0.0830612,10141.ENSCPOP00000005291:0.12195)0.99985:0.0202639,((9483.ENSCJAP00000047968:0.0446865,(9544.ENSMMUP00000007168:0.0201746,((9593.ENSGGOP00000005929:0.00916494,(9606.ENSP00000294053:1.3e-07,9598.ENSPTRP00000006940:0.0068176)0.955193:0.00220905)0.99985:0.00778854,(9601.ENSPPYP00000004174:0.00495163,61853.ENSNLEP00000020892:0.179569)0.290072:0.00153447)0.998732:0.00889714)0.99985:0.0144864)0.99985:0.0344562,(9478.ENSTSYP00000006073:0.129349,(30608.ENSMICP00000010690:0.0852248,30611.ENSOGAP00000013738:0.0467206)0.99985:0.0188861)0.232709:0.00179852)0.99985:0.00929928)0.51042:0.00516905)0.367617:0.00813494,(43179.ENSSTOP00000004287:0.0599707,(10020.ENSDORP00000000618:0.138502,(10116.ENSRNOP00000026665:0.0528487,10090.ENSMUSP00000001884:0.0307781)0.99985:0.089983)0.99985:0.018366)0.698647:0.00414256)0.995833:0.06629,(9258.ENSOANP00000012946:0.33344,(13616.ENSMODP00000032549:0.0348012,(9315.ENSMEUP00000011030:0.0138664,9305.ENSSHAP00000003293:0.0185119)0.570293:0.0137766)0.99985:0.143897)0.995833:0.06629);")
    t4 = PhyloTree("(9593.ENSGGOP00000025542:1,9601.ENSPPYP00000004907:1);")
    t5 = PhyloTree(
        "(9371.ENSETEP00000005103:0.0955875,(9785.ENSLAFP00000014743:0.0214619,(9813.ENSPCAP00000005573:0.0376639,(9796.ENSECAP00000019319:0.0196571,(37347.ENSTBEP00000012329:0.0242927,((9361.ENSDNOP00000011716:0.0676669,(9606.ENSP00000374323:9e-07,(9593.ENSGGOP00000028731:0.00246332,(61853.ENSNLEP00000002377:0.0030064,(9601.ENSPPYP00000015233:0.0112606,(9598.ENSPTRP00000026129:0.00246268,9483.ENSCJAP00000015834:0.0290829)0:1.2e-07)0:6.5e-07)0.146278:0.00614181)0.146329:0.00485474)0.991187:0.014264)0.763764:0.00352544,((10020.ENSDORP00000008692:0.0259566,(30608.ENSMICP00000002718:0.0380742,9478.ENSTSYP00000009200:0.0174548)0.197348:0.00155005)0.99985:0.0110622,((((132908.ENSPVAP00000013183:0.0099908,59463.ENSMLUP00000014424:0.0115111)0.99985:0.00655941,(10141.ENSCPOP00000003417:0.0535498,((9669.ENSMPUP00000002651:0.0156675,(9646.ENSAMEP00000014393:0.0142536,9615.ENSCAFP00000013394:0.00243184)0.930921:0.00345947)0.99985:0.015828,(9913.ENSBTAP00000053531:0.0545233,9739.ENSTTRP00000001508:0.0344514)0.985783:0.00536759)0:1.1e-07)0:1.1e-07)0.99985:0.00795592,(10090.ENSMUSP00000066734:0.0572278,(43179.ENSSTOP00000020881:0.021661,30611.ENSOGAP00000000479:0.00876016)0.955042:0.00724791)0.992776:0.0044053)0:3.4e-07,(9258.ENSOANP00000012014:0.10692,(9315.ENSMEUP00000001901:0.0451997,13616.ENSMODP00000021214:0.00830289)0.994926:0.0229072)0.99985:0.0500253)0.981032:0.00621499)0:9e-08)0.723103:0.00185076)0.580248:0.00162611)0.99985:0.0167207)0.863552:0.00574499)1:0.0955875);")
    t6 = PhyloTree(
        "((9305.ENSSHAP00000010229:0.0607855,13616.ENSMODP00000009656:0.0615237)0.99985:0.0877765,(9785.ENSLAFP00000028174:0.0885004,(((9823.ENSSSCP00000002806:0.0860827,9823.ENSSSCP00000002780:0.0111508)0.99985:0.122086,((9913.ENSBTAP00000038896:0.050358,(9685.ENSFCAP00000017257:0.0778567,(9986.ENSOCUP00000017975:0.161424,(9615.ENSCAFP00000020783:0.056902,(9646.ENSAMEP00000019763:0.0857189,9669.ENSMPUP00000019474:0.0325693)0.99985:0.0314116)0.875671:0.00690881)0.942895:0.0136375)0.798192:0.00741364)0.967573:0.0100004,(59463.ENSMLUP00000020576:0.0755216,9796.ENSECAP00000004613:0.0777605)0.799782:0.00471384)0.911021:0.00832673)0.659845:0.00664335,((43179.ENSSTOP00000021465:0.123042,9593.ENSGGOP00000020601:0.0781752)0.987812:0.0311266,(30611.ENSOGAP00000021055:0.090792,(10116.ENSRNOP00000016702:0.0112116,10090.ENSMUSP00000050705:0.0330259)0.99985:0.134681)0.972881:0.0174783)0.998643:0.0179346)0.901179:0.017737)0.99985:0.0877765);")
    t7 = PhyloTree(
        "(9258.ENSOANP00000017269:0.144169,(((10090.ENSMUSP00000089169:0.0424834,10116.ENSRNOP00000026070:0.0151696)0.99985:0.0742333,(((((132908.ENSPVAP00000008558:0.0138473,(30608.ENSMICP00000004293:1.5e-07,((9986.ENSOCUP00000020707:0.0691049,37347.ENSTBEP00000002617:0.0138881)0:1.2e-07,(9371.ENSETEP00000012957:0.0515389,(9785.ENSLAFP00000009919:0.0260641,9813.ENSPCAP00000013834:0.0329521)0.741149:0.0041225)0.998768:0.00855745)0.99985:0.0111961)0.867255:0.00524663)0:4.3e-07,(9361.ENSDNOP00000010929:0.0359312,(9739.ENSTTRP00000015818:0.0267351,9796.ENSECAP00000009501:0.0168218)0.868862:0.00355516)0:8e-08)0.99985:0.0056594,(9913.ENSBTAP00000012912:0.0231165,(9669.ENSMPUP00000002012:0.00320767,9823.ENSSSCP00000023102:0.0629927)0.99134:0.00309237)0.988361:0.00284581)0:1.5e-07,((59463.ENSMLUP00000015155:0.0360776,9615.ENSCAFP00000002053:0.00579656)0.961397:0.00553059,(9685.ENSFCAP00000023114:0.0115974,9646.ENSAMEP00000004090:0.00575272)0.959045:0.00279601)0.988458:0.00279093)0.998008:0.00284847,(30611.ENSOGAP00000001383:0.00849776,((9483.ENSCJAP00000006698:0.0114709,(9544.ENSMMUP00000006654:0.00568623,(61853.ENSNLEP00000004122:0.00566385,(9601.ENSPPYP00000021653:0.00853215,(9593.ENSGGOP00000020462:1.8e-07,(9598.ENSPTRP00000035990:1e-08,9606.ENSP00000365550:1e-08)0.99985:0.00282071)0.996162:0.00281965)0:1.7e-07)0:8e-08)0.954037:0.0027827)0.99985:0.00818313,(43179.ENSSTOP00000012068:0.0109022,(9478.ENSTSYP00000008441:0.0132658,10141.ENSCPOP00000000986:0.0564111)0.314526:0.00294575)0:7e-08)0.980721:0.00309462)0.991529:0.00280168)0:1.6e-07)0.99985:0.0483405,(9315.ENSMEUP00000015273:0.00839008,(9305.ENSSHAP00000020642:0.00542335,13616.ENSMODP00000010568:0.101485)0:2.1e-07)0.99985:0.0336521)1:0.144169);")
    t8 = PhyloTree(
        "(((9371.ENSETEP00000003671:0.0131637,(9258.ENSOANP00000006745:0.117598,(132908.ENSPVAP00000001122:0.0159907,(30611.ENSOGAP00000013217:0.0071702,(((9823.ENSSSCP00000000042:0.0144457,(9646.ENSAMEP00000009872:0.0154876,9361.ENSDNOP00000012437:0.0817179)0:1e-06)0.998538:0.00765581,(9544.ENSMMUP00000001765:1e-08,(10116.ENSRNOP00000010491:0.0292686,(9669.ENSMPUP00000016236:0.340739,9615.ENSCAFP00000001415:4e-07)0.989009:0.00985882)0:8.7e-07)0:8.7e-07)0.99736:0.00973955,(((9606.ENSP00000379704:1e-08,(9601.ENSPPYP00000013264:0.00772278,9598.ENSPTRP00000024873:1e-08)0:2.3e-07)0.996569:0.00720502,(9913.ENSBTAP00000017531:0.0145949,9739.ENSTTRP00000016448:0.00723237)0.996503:0.00710774)0:4.2e-07,((9593.ENSGGOP00000008768:0.270021,(9785.ENSLAFP00000013194:0.00881524,9478.ENSTSYP00000011482:6.1e-07)0.482225:0.00675219)0.500314:0.00675139,(((59463.ENSMLUP00000002337:0.0319341,30608.ENSMICP00000003266:6.2e-07)0.987498:0.010619,(9796.ENSECAP00000021110:0.0073991,(9986.ENSOCUP00000007142:0.0196352,37347.ENSTBEP00000000333:0.0989537)0:9.5e-07)0:1.09e-06)0.873107:0.00951386,((9685.ENSFCAP00000000826:3e-07,(43179.ENSSTOP00000011619:0.00863897,10090.ENSMUSP00000023095:1e-08)0:1e-08)0.99985:0.132958,(10020.ENSDORP00000013215:0.0339132,10141.ENSCPOP00000011894:4.1e-07)0:4.1e-07)0.524756:0.00714334)0:8.1e-07)0.99985:0.00971634)0:7e-08)0:7e-08)0.772739:0.0177399)0.992096:0.0404786)0.817723:0.0310407)0.522416:0.072068,(9305.ENSSHAP00000014579:0.246289,9315.ENSMEUP00000008760:0.0666798)0.977479:0.195421)0.99985:1.2587,((((37347.ENSTBEP00000000946:0.0956163,(9483.ENSCJAP00000024301:0.0743892,(9593.ENSGGOP00000012469:0.00721405,(9606.ENSP00000391249:1e-08,9606.ENSP00000461549:1e-08)0:1.3e-07)0.993649:0.00856538)0.99985:0.0230549)0.975176:0.0143781,(30611.ENSOGAP00000003324:0.104251,30608.ENSMICP00000007369:0.0381575)0.990656:0.0183563)0.916137:0.00581305,(9823.ENSSSCP00000018191:0.0558998,((10020.ENSDORP00000010153:0.197695,((9796.ENSECAP00000018039:0.0363101,132908.ENSPVAP00000013461:0.0941126)0.892367:0.013635,((9739.ENSTTRP00000004783:0.0138565,9913.ENSBTAP00000003415:0.0166473)0.99985:0.0326524,((9371.ENSETEP00000006140:0.107709,(9785.ENSLAFP00000006435:0.170692,9813.ENSPCAP00000005503:0.0655274)0:2.68e-06)0.99985:0.0526328,(9258.ENSOANP00000002804:0.150016,(9315.ENSMEUP00000001056:0.0197146,(13616.ENSMODP00000002021:0.0382813,9305.ENSSHAP00000007534:0.0357616)0.99985:0.0843541)0.99985:0.115238)0.99985:0.133971)0.964252:0.0135998)0.99559:0.0163904)0.732303:0.00993157)0.99985:0.0470037,(9685.ENSFCAP00000008713:0.124988,(9615.ENSCAFP00000007771:0.0225216,(9646.ENSAMEP00000014479:0.0718956,9669.ENSMPUP00000013273:0.0487162)0.99985:0.0148769)0:9.2e-07)0.99985:0.0433867)0.99277:0.027679)0.99985:0.0134312)0:4.7e-07,(43179.ENSSTOP00000019919:0.152642,((10116.ENSRNOP00000003891:0.158016,10090.ENSMUSP00000091435:0.0102936)0.99985:0.0704992,(10141.ENSCPOP00000011436:0.130601,9986.ENSOCUP00000015843:0.529405)0:5.42e-06)0.909203:0.011833)0.428577:0.0186403)0.99985:1.2587);")
    t9 = PhyloTree("(9305.ENSSHAP00000009662:1,9305.ENSSHAP00000009620:1);")
    t10 = PhyloTree("((9315.ENSMEUP00000008285:0.899711,9258.ENSOANP00000027752:0.559777)0.99985:0.11989,((9739.ENSTTRP00000010720:0.164873,9913.ENSBTAP00000003500:0.298158)0.99985:0.109903,((9685.ENSFCAP00000006440:0.239731,(9615.ENSCAFP00000042310:0.122399,(9646.ENSAMEP00000002314:0.18278,9669.ENSMPUP00000005544:0.270727)0.6117:0.0396991)0.99985:0.0702148)0.99985:0.082488,(132908.ENSPVAP00000014833:0.488081,(9796.ENSECAP00000022144:0.310699,(((9785.ENSLAFP00000009512:0.187095,9813.ENSPCAP00000004417:0.493329)0.99985:0.359095,(30611.ENSOGAP00000016876:0.334272,(9483.ENSCJAP00000021314:0.178043,(9601.ENSPPYP00000003401:0.0415077,((61853.ENSNLEP00000003253:0.196659,9544.ENSMMUP00000037769:0.326984)0.835225:0.0989423,(9593.ENSGGOP00000004740:0.101826,9606.ENSP00000182290:0.0204981)0.997196:0.020731)0.307827:0.0046059)0.99985:0.0991112)0.99985:0.162323)0.972253:0.0380139)0.70642:0.0193389,((10141.ENSCPOP00000016274:0.272126,43179.ENSSTOP00000015376:0.458416)0.996119:0.0901785,(37347.ENSTBEP00000013312:0.328061,(10020.ENSDORP00000010739:0.398341,(10116.ENSRNOP00000051746:0.0455948,10090.ENSMUSP00000009396:0.0811741)0.99985:0.269525)0.791467:0.0577236)0.536676:0.0461933)0.99985:0.0620583)0.99985:0.0788824)0.969465:0.0395994)0.635969:0.0171601)0.702925:0.0283261)0.99985:0.11989);")

    # (tree, label, whether the pattern is expected to match)
    trees = [(t1, "t1", True), (t2, "t2", False), (t3, "t3", True),
             (t4, "t4", True), (t5, "t5", True), (t6, "t6", False),
             (t7, "t7", True), (t8, "t8", True), (t9, "t9", False),
             (t10, "t10", True)]

    for tree, tree_name, has_matches in trees:
        # Species code is the numeric taxid before the first dot.
        tree.set_species_naming_function(lambda n: n.name.split(".")[0] if "." in n.name else '')
        tree.annotate_ncbi_taxa()

        # Has support for two primates where at least one is not Homo sapiens
        pattern = """ ( ' 9443 in @.lineage ' , ' 9443 in @.lineage and @.name!=9606 ' )' @.support >= 0.9 '; """
        pattern = TreePattern(pattern)

        if not has_matches:
            self.assertEqual(list(pattern.find_match(tree)), [])
        else:
            # Built-in next() works on Python 2.6+ and 3; the generator
            # method .next() exists on Python 2 only.
            match = next(pattern.find_match(tree))
            self.assertEqual(match.support >= 0.9, True)
            test_status = (9443 in match.children[0].lineage and
                           9443 in match.children[1].lineage and
                           match.children[1].name != '9606')
            # permute children and check again
            test_status2 = (9443 in match.children[1].lineage and
                            9443 in match.children[0].lineage and
                            match.children[0].name != '9606')
            self.assertEqual(test_status, True)
            self.assertEqual(test_status2, True)
def extract_clades(newick_file, processed_newick_out=None):
    """Split a tree into clades (the outer logic for tree splitting).

    The tree is rooted on its midpoint outgroup, ladderized and converted to
    ultrametric before ``get_pruned_branch`` is asked for the branches. A
    node is discarded when it has fewer than 3 leaves and accepted when it
    has fewer than ``max(10, total_leaves / 50)`` leaves.

    Args:
        newick_file: Newick input handed to ``PhyloTree``.
        processed_newick_out: Optional path; when given, the preprocessed
            tree is written there with ``format=1``.

    Returns:
        dict mapping ``"1"``, ``"2"``, ... to the list of node names of each
        branch, largest branch first.
    """
    # preprocess tree
    print("Pre-processing tree ({})".format(newick_file))
    tree = PhyloTree(newick_file)
    tree.set_outgroup(tree.get_midpoint_outgroup())
    tree.ladderize()
    tree.convert_to_ultrametric()
    if processed_newick_out is not None:
        tree.write(format=1, outfile=processed_newick_out)

    # calculate clades
    print("Calling clades ({})".format(newick_file))
    len_tree = len(tree)

    def condition_discard(node, tree):
        return len(node) < 3

    def condition_ok(node, tree):
        return len(node) < max(10, len_tree / 50)

    branches = get_pruned_branch(tree, tree, condition_discard,
                                 condition_ok, [])

    # sorted() is stable even with reverse=True, so ties keep the order
    # produced by get_pruned_branch, as with the previous negative-length key.
    ranked = sorted(branches, key=len, reverse=True)
    return {
        str(index): [node.name for node in branch]
        for index, branch in enumerate(ranked, 1)
    }
def cut_stray_genes(gene, species_keep, species_list):
    """Interactively remove stray gene copies from a clade.

    Loads ``<gene>/<gene>.3.fa.tre``, prunes it to ``species_keep``, shows it
    and then repeatedly asks the user for genes to cut. A cut is applied to
    the returned list only when every entered gene is in ``species_keep`` and
    the tree could be pruned to the remaining genes; otherwise the list is
    left exactly as it was before the attempt.

    Args:
        gene: Gene family name, used to build the tree file path.
        species_keep: Gene copies to keep in the clade tree.
        species_list: Passed through to ``view_counts``.

    Returns:
        The list of gene copies remaining after all accepted cuts.
    """
    ######Showing the tree######
    clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    clade_tree.prune(species_keep, preserve_branch_length=True)
    if len(species_keep) > 1:
        view_rooted_tree(clade_tree)
        print("\nThis is the clade tree. There are " + str(len(species_keep)) +
              " total gene copies.\n")
    else:
        print(
            "\nSpecies tree only contains 1 species. Tree will not be shown.")
    cut_list = species_keep
    view_counts(cut_list, species_list)

    ######Removing stray within-clade gene copies from the clade######
    # startswith() treats an empty answer as "no" instead of raising
    # IndexError.
    cut_question = raw_input("\nAre there stray genes to cut? (y/n)")
    while cut_question.startswith("y"):
        cut_gene_str = raw_input(
            "\nEnter genes to cut, separated by a space: ")
        cut_gene_list = cut_gene_str.split()
        if set(cut_gene_list).issubset(species_keep):
            # Work on a candidate list so a failed cut leaves cut_list intact.
            remaining = [i for i in cut_list if i not in cut_gene_list]
            try:
                clade_tree.prune(remaining, preserve_branch_length=True)
            except ValueError:
                print(
                    "\nSomething is wrong with the way the genes were entered. You entered:\n"
                    + cut_gene_str + "\nCut abandoned.")
            else:
                cut_list = remaining
                view_rooted_tree(clade_tree)
                view_counts(cut_list, species_list)
        else:
            print(
                "\nAt least one gene is not found on the tree. You entered:\n"
                + cut_gene_str + "\nCut abandoned.")
        cut_question = raw_input("\nAre there more genes to cut? (y/n)")
    return (cut_list)
def make_other_groups(gene, species_keep, species_list):
    """Turn the genes that are outside every clade into one saved group.

    After stray copies are removed via ``cut_stray_other``, the user is
    asked to cut more genes until no two genes share the same first three
    characters (used here as the species key). The group is appended to
    ``<gene>/<gene>_noclade_prune.txt`` and its name to
    ``<gene>/<gene>_master_tree_list.txt``.

    Args:
        gene: Gene family name, used to build file paths.
        species_keep: Candidate gene copies for the group.
        species_list: Passed through to the viewing helpers.
    """
    # The tree object is not used below; the load is kept so that a missing
    # or unparsable tree file still raises here, as it did before.
    PhyloTree(gene + "/" + gene + ".3.fa.tre")

    ######Checking if the list is empty######
    if len(species_keep) == 0:
        print("\nThere are no other genes in this gene family.")
        return

    ######Removing stray within-clade gene copies from the clade######
    group_list = cut_stray_other(gene, species_keep, species_list)

    ######Checking that there is only one gene per species######
    check_set = {str(item[0:3]) for item in group_list}
    while len(group_list) != len(check_set):
        # Show the counts for the current group, not the initial list.
        view_counts(group_list, species_list)
        group_str = raw_input(
            "\nYou can only have one gene per species. Enter more genes to cut, separated by a space: "
        )
        # Compare against whole names; a substring test on the raw input
        # would also drop e.g. "abc1" when the user typed "abc10".
        genes_to_cut = set(group_str.split())
        group_list = [item for item in group_list
                      if item not in genes_to_cut]
        check_set = {str(item[0:3]) for item in group_list}

    print("\nThere are " + str(len(group_list)) +
          " genes in this group.\nGroup looks like:")
    print(group_list)
    print("\nMaking the group.")

    ######Saving gene group as a file######
    with open(gene + "/" + gene + "_noclade_prune.txt", "a") as group_file:
        for i in group_list:
            group_file.write(i + "\n")

    ######Saving name of group to a master list######
    with open(gene + "/" + gene + "_master_tree_list.txt", "a") as master:
        master.write(gene + "_noclade\n")
def main(args):
    """Detect edges for every gene of a clade and save them as a TSV.

    Loads genomes, CDSs and the orthology table of ``args.clade_name``,
    optionally applies a simulated segmentation (``args.split_fp``) and a
    phylogenetic tree (``args.tree_fp``), runs ``detect_edges_all`` for each
    gene name and writes the collected records to ``args.out_fp``.

    Args:
        args: Namespace providing ``clade_name``, ``split_fp``, ``tree_fp``,
            ``score_method`` and ``out_fp``.
    """
    output_columns = ["x", "y", "score", "score_naive", "total", "found",
                      "bls", "top_offset", "top_relationship", "top_ratio"]

    genome_names = load_genome_names_by_clade_name(args.clade_name)
    LOGGER.info("loaded {} {} genomes".format(len(genome_names), args.clade_name))

    cdss = load_cdss_by_genome_names(genome_names)
    LOGGER.info("loaded {} cdss".format(len(cdss)))

    clade_dir = pathlib.Path(build_clade_filepath(args.clade_name))
    ortho_fp = clade_dir.joinpath("./ortho/{}.ortho".format(args.clade_name))
    ortho_df = pd.read_csv(ortho_fp, sep='\t')
    cdss = set_gene_name_to_cdss(cdss, ortho_fp)
    LOGGER.info("loaded orthology from {}".format(ortho_fp))

    if args.split_fp:
        cdss = set_split_to_cdss(cdss, args.split_fp)
        LOGGER.info("loaded simulated segmentation from {}".format(args.split_fp))

    # The tree is optional; detect_edges_all receives None without it.
    tree = None
    if args.tree_fp:
        tree = PhyloTree(args.tree_fp, format=1)
        LOGGER.info("loaded phylogenetic tree from {}".format(args.tree_fp))

    cdsDAO = CdsDAO(cdss)
    gene_names = sorted(set(ortho_df["gene_name"]))
    LOGGER.info("found {} genes to search".format(len(gene_names)))

    records = []
    for origin_gene_name in gene_names:
        LOGGER.info("start {}".format(origin_gene_name))
        records.extend(
            detect_edges_all(origin_gene_name, args.score_method, cdsDAO, tree))

    out_df = pd.DataFrame(records, columns=output_columns)
    out_df.to_csv(args.out_fp, sep='\t', index=False)
    LOGGER.info("saved results to {}".format(args.out_fp))
def score_family_tree(self):
    """Score this family's rooted tree and write the score to a file.

    The rooted tree of ``self.fam_id`` is loaded; when
    ``check_if_tree_contains_outgroups`` returns 1 the method stops and
    returns 0. Otherwise the value returned by ``inspect_ingroup_pairs`` is
    written, together with the family id, to the family's tree score file.

    Returns:
        0 when the outgroup check returns 1, otherwise None.
    """
    outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(
        BasePath.species_profile_filename)
    fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    outgrp_re = re.compile(outgrp_regex_str)
    ingrp_re = re.compile(ingroup_regex_str)

    flag = self.check_if_tree_contains_outgroups(fam_tree, outgrp_re)
    if flag == 1:
        return 0

    ingroup_matches_arr = self.get_ingroup_sequence_list(fam_tree, ingrp_re)
    ingroup_pair_arr = self.get_ingroup_sequence_pairs(ingroup_matches_arr)
    precision_val = self.inspect_ingroup_pairs(fam_tree, ingroup_pair_arr,
                                               outgrp_re)

    tree_score_filename = BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id + \
        BasePath.tree_score_fileextension
    # The context manager closes the handle even if the write fails.
    with open(tree_score_filename, "w") as tree_score_file:
        tree_score_file.write(self.fam_id + " " + str(precision_val) + "\n")
def LoadTrees(treeFile, dlm):
    """Reads and stores phylogenetic trees from a file

    Lines starting with "NA" and blank lines are skipped.

    Parameters
    ------
    treeFile: str, path to a file of newick trees, 1 per line
    dlm: str, delimiter; the species of a leaf is the part of its name
        before the first occurrence of this delimiter

    Returns
    ------
    treelist: list, PhyloTree objects in file order
    """
    print("loading trees...")
    treelist = []
    pbar = tqdm(total=file_len(treeFile))
    try:
        with open(treeFile, 'r') as newick:
            for line in newick:
                pbar.update(1)
                if line.startswith("NA") or not line.strip():
                    continue
                t = PhyloTree(line)
                t.set_species_naming_function(
                    lambda node: node.name.split(dlm)[0])
                treelist.append(t)
    finally:
        # Close the progress bar even when a line fails to parse.
        pbar.close()
    return (treelist)
def do_alnntree(pref, ndf, fasta, refs, congen, targetids, gaps=0.9, cpus=-1):
    """Align sequences, build a tree from the alignment and annotate it.

    A FASTA text is assembled from ``congen``, one reference sequence per
    ``saccver`` group of ``ndf`` and the entries of the ``fasta`` shelf whose
    header (without its first character) is in ``targetids``. The text is
    aligned with ``mafft``, trimmed with ``trimaln`` and turned into a tree
    with ``fasttreeMP``. The trimmed alignment is written to ``<pref>.aln``,
    the tree to ``<pref>.tree`` and a dill pickle of it to
    ``<pref>.treepickle``.

    Args:
        pref: Prefix of the output files.
        ndf: DataFrame with ``saccver`` and ``staxid`` columns.
        fasta: Path of a shelve database mapping headers to sequences.
        refs: Mapping from ``'>name'`` to reference sequence text.
        congen: FASTA text placed at the start of the alignment input.
        targetids: Collection of header ids to take from ``fasta``.
        gaps: Passed to ``trimaln``.
        cpus: Thread count passed to ``mafft --thread``.

    Returns:
        Tuple of the tree and the result of ``annotate_ncbi_taxa()``.
    """
    # TODO: add checkpoint to avoid repeating
    # Collect the pieces and join once instead of repeated string +=.
    fasta_parts = [congen]
    for name, data in ndf.groupby('saccver'):
        tx = data.staxid.iloc[0]
        try:
            seq = refs['>%s' % name].replace('\n', '').strip()
        except KeyError:
            # Retry with only the part of the name before the first '|'.
            name = name.split('|')[0]
            seq = refs['>%s' % name].replace('\n', '').strip()
        fasta_parts.append('>%d.%s\n%s\n' % (tx, name, seq))

    with shelve.open(fasta) as dic:
        for h, s in dic.items():
            if h.strip()[1:] in targetids:
                print(h)
                fasta_parts.append('%s\n%s\n' % (h, s.strip().replace('\n', '')))
            else:
                print(h, 'not in')
    to_phy = ''.join(fasta_parts)

    aln, _ = stdin_run(['mafft', '--thread', str(cpus), '--auto', '-'], to_phy)
    trm = trimaln(aln.decode('utf-8'), targetids, gaps=gaps)
    tre, _ = stdin_run(['fasttreeMP', '-nt', '-gtr', '-gamma'], trm)
    # Drop the last character, replace any remaining ';' with '-' and
    # append a single terminating ';'.
    tre = tre.strip()[:-1].replace(b';', b'-').decode('utf-8') + ';'
    t = PhyloTree(tre, sp_naming_function=lambda name: name.split('.')[0])

    with open('%s.aln' % pref, 'w') as al, \
            open('%s.treepickle' % pref, 'wb') as tp:
        al.write(trm)
        t.write(outfile='%s.tree' % pref)
        dill.dump(t, tp)

    tax2 = t.annotate_ncbi_taxa()
    fix_species(t)
    print(t)
    return t, tax2
def __init__(self, newick, alg, taxid, tid, actions, style, predraw_fn=None):
    """Load the tree, attach its display settings and number its nodes.

    The newick is first parsed as a ``PhyloTree`` linked to the FASTA
    alignment ``alg``; on ``NewickError`` it is parsed as a plain ``Tree``
    with ``format=1`` instead.

    Args:
        newick: Newick input for the tree.
        alg: Alignment passed to ``PhyloTree``.
        taxid: Stored as ``self.taxid``.
        tid: Tree id; also used to derive the map/img/box element ids.
        actions: Stored on the tree as ``actions``.
        style: Stored on the tree as ``tree_style``.
        predraw_fn: Optional callable invoked with the loaded tree.
    """
    try:
        loaded = PhyloTree(newick=newick, alignment=alg, alg_format="fasta")
    except NewickError:
        loaded = Tree(newick, format=1)
    self.tree = loaded

    if predraw_fn:
        predraw_fn(self.tree)

    self.tree.actions = actions
    self.tree.tree_style = style

    self.taxid = taxid
    self.treeid = tid
    self.mapid = "map_" + tid
    self.imgid = "img_" + tid
    self.boxid = 'box_' + tid

    # Initialize node internal IDs in preorder
    position = 0
    for node in self.tree.traverse('preorder'):
        node._nid = position
        position += 1
def test_shortcut_functions(self):
    """Check the n_duplications, contains_leaves and n_speciations shortcuts."""
    tree = PhyloTree(
        """((((Human_1, Chimp_1), (Human_2, (Chimp_2, Chimp_3))), ((Fish_1, (Human_3, Fish_3)), Yeast_2)), Yeast_1);""")
    tree.set_species_naming_function(lambda node: node.name.split("_")[0])
    tree.get_descendant_evol_events()  # DDDSSSDDS

    root = tree.get_tree_root()

    # Detects two consecutive nodes with duplications
    duplication_pattern = TreePattern(
        """('n_duplications(@) > 0')'n_duplications(@) > 0 '; """)
    leaves_pattern = TreePattern(
        """( 'contains_leaves(@, ["Chimp_2", "Chimp_3"])'); """)
    speciation_pattern = TreePattern("""'n_speciations(@) > 3 '; """)

    duplication_hits = list(duplication_pattern.find_match(tree, maxhits=None))
    leaves_hits = list(leaves_pattern.find_match(tree, maxhits=None))
    speciation_hits = list(speciation_pattern.find_match(tree, maxhits=None))

    self.assertEqual(len(duplication_hits), 5)

    self.assertEqual(len(leaves_hits), 4)
    self.assertEqual(leaves_hits[0], root)

    self.assertEqual(len(speciation_hits), 2)
    self.assertEqual(speciation_hits[0], root)
    self.assertEqual(speciation_hits[1], root.children[0])
def get_ingroup_monoplyletic_clades(self):
    """Load this family's RAxML tree and hand it to ``process_family_tree``.

    The tree file is ``<outpath>/<fam_id>/<raxml prefix><fam_id>`` and is
    read with ``format=1``. The outgroup regex and species dictionary come
    from the species profile file.
    """
    profile = read_profile_file(BasePath.species_profile_filename)
    outgrp_regex_str, species_dict, ingroup_regex_str = profile

    tree_path = "/".join([
        BasePath.outpath,
        self.fam_id,
        BasePath.raxml_tree_fileprefix + self.fam_id,
    ])
    outgroup_pattern = re.compile(outgrp_regex_str)
    family_tree = PhyloTree(tree_path, format=1)

    self.process_family_tree(family_tree, outgroup_pattern, species_dict)
def load_json(fp):
    """Build a taxonomy tree from the bacteria counts of a JSON export.

    Each row of ``data['ubiome_bacteriacounts']`` is normalised and becomes
    one node keyed by its ``taxon``. The node with the smallest key is taken
    as the root. Every node gets a ``count_pct`` feature (its ``count_norm``
    as a percentage of the root's) and is attached to the node whose key
    equals its ``parent`` attribute, when there is one.

    Args:
        fp: Passed to ``clean_json``.

    Returns:
        The root node.
    """
    data = json.loads(clean_json(fp))

    nodes_by_taxon = {}
    normalised_counts = []
    for row in data['ubiome_bacteriacounts']:
        normalise_row(row)
        normalised_counts.append(row['count_norm'])
        node = PhyloTree()
        node.name = row['tax_name']
        node.add_features(**row)
        nodes_by_taxon[row['taxon']] = node

    root = nodes_by_taxon[min(nodes_by_taxon)]
    count_total = root.count_norm
    root.alpha = alpha_function(normalised_counts)

    for node in nodes_by_taxon.values():
        percentage = float(node.count_norm) / count_total * 100
        node.add_feature('count_pct', percentage)
        parent_node = nodes_by_taxon.get(node.parent)
        if parent_node is not None:
            parent_node.add_child(node)

    summary = 'loaded {} into tree depth {} diversity {:.2f}'.format(
        len(nodes_by_taxon), len(root), root.alpha)
    print(summary)
    return root
def get_ingroup_monoplyletic_clades(self):
    """Load this family's rooted FastTree tree and process it.

    The tree file is
    ``<outpath>/<fam_id>/<fam_id><rooted fasttree extension>`` and is read
    with ``format=1`` before being handed to ``process_family_tree`` with the
    compiled outgroup regex and the species dictionary from the profile file.
    """
    profile = read_profile_file(BasePath.species_profile_filename)
    outgrp_regex_str, species_dict, ingroup_regex_str, outgroup_id_arr = profile

    tree_path = "/".join([
        BasePath.outpath,
        self.fam_id,
        self.fam_id + BasePath.rooted_fasttree_fileextension,
    ])
    outgroup_pattern = re.compile(outgrp_regex_str)
    family_tree = PhyloTree(tree_path, format=1)

    self.process_family_tree(family_tree, outgroup_pattern, species_dict)
def run(args):
    """Print the evolutionary events of every source tree.

    For each newick yielded by ``args.src_tree_iterator``, and only when
    ``args.orthologs`` is not None, the tree's descendant evolutionary
    events are printed as their ``in_seqs`` and ``out_seqs``.

    Args:
        args: Namespace providing ``src_tree_iterator`` and ``orthologs``.
    """
    from ete3 import Tree, PhyloTree

    for newick in args.src_tree_iterator:
        # Checked per tree so the iterator is consumed either way.
        if args.orthologs is None:
            continue
        phylo = PhyloTree(newick)
        for event in phylo.get_descendant_evol_events():
            print(event.in_seqs, event.out_seqs)
def root_tree(self):
    """Root this family's FastTree tree on its outgroup and write it out.

    When the outgroup sequences found in the tree are monophyletic, the tree
    is rooted on their common ancestor. Otherwise it is rooted on the first
    sequence returned by ``arrange_outgroup_sequence_ids``. The rooted tree
    is then passed to ``write_rooted_tree``.
    """
    outgrp_regex_str, species_dict, ingroup_regex_str, outgroup_id_arr = read_profile_file(
        BasePath.species_profile_filename)
    fam_tree_filename = BasePath.outpath + "/" + self.fam_id + "/" + \
        self.fam_id + BasePath.fasttree_fileextension
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    outgrp_re = re.compile(outgrp_regex_str)

    outgroup_sequence_list = self.get_regex_matching_sequence_list_from_node(
        fam_tree, outgrp_re)
    outgroup_monophyly_check = fam_tree.check_monophyly(
        values=outgroup_sequence_list, target_attr="name")

    # print() with one parenthesised argument prints the same text on
    # Python 2 and Python 3.
    if outgroup_monophyly_check[0]:
        print("Outgroups are monophyletic")
        root_node = fam_tree.get_common_ancestor(outgroup_sequence_list)
    else:
        print("Outgroups are not monophyletic")
        outgroup_sequence_list_from_seqlist = \
            self.get_outgroup_sequences_from_seqlist()
        arranged_outgroup_sequence_list = self.arrange_outgroup_sequence_ids(
            outgroup_sequence_list_from_seqlist, outgroup_id_arr)
        root_node = arranged_outgroup_sequence_list[0]
        print("Rooting using sequence {0}".format(root_node))

    # Both cases finish the same way.
    fam_tree.set_outgroup(root_node)
    self.write_rooted_tree(fam_tree)
def prune_main(gene, speciesList, cladeDict):
    """Drive the pruning workflow for one gene family.

    Previous output files are erased and the family is summarised. Families
    of type "small" and "single" are handled by their dedicated functions.
    For any other type the gene tree is shown and the user chooses between
    splitting the family (``pre_prune``) and pruning it as a single family
    (``make_clade_groups`` followed by ``make_all_lists``).

    Args:
        gene: Gene family name; converted to ``str``.
        speciesList: Passed to the summarising and grouping helpers.
        cladeDict: Passed to the summarising and grouping helpers.
    """
    gene = str(gene)
    erase_previous_files(gene)
    copy_list = copies_in_group(gene)
    gene_type = count_summarize(gene, copy_list, speciesList, cladeDict)

    choice = "n"
    if gene_type == "small":
        small_family(gene)
    elif gene_type == "single":
        single_copy(gene, copy_list, cladeDict)
    else:
        print("\nShowing the gene tree.")
        clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
        view_rooted_tree(clade_tree)
        choice2 = raw_input(
            "\nWould you like to split this gene family into multiple families? (y/n)"
        )
        # startswith() treats an empty answer as "no" instead of raising
        # IndexError.
        if choice2.startswith("y"):
            pre_prune(gene)
        else:
            choice = raw_input(
                "\nContinue with pruning as single gene family? (y/n)")

    if choice.startswith("y"):
        make_clade_groups(gene, cladeDict, copy_list, speciesList)
        make_all_lists(gene, cladeDict)
def get_ingroup_monoplyletic_clades(self):
    """Load this family's rooted tree and hand it to ``process_family_tree``.

    Prints the configured clade species representation cutoff, then reads
    ``<rooted_famtrees_dir>/<fam_id>`` with ``format=1`` and processes it
    with the compiled outgroup regex and the species dictionary from the
    profile file.
    """
    # print() with one parenthesised argument prints the same text on
    # Python 2 and Python 3.
    print('Clade species representation cutoff {0}'.format(
        self.species_representaion_cutoff))
    outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(
        BasePath.species_profile_filename)
    fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
    outgroup_re = re.compile(outgrp_regex_str)
    fam_tree = PhyloTree(fam_tree_filename, format=1)
    self.process_family_tree(fam_tree, outgroup_re, species_dict)
def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate a time tree with NCBI taxonomy and print the result.

    Leaf names (underscores replaced by spaces) are translated to taxids;
    leaves without a translation are logged and deleted from the tree.
    Ancestors are then annotated either with ``NCBITaxa.annotate_tree`` or
    with ``myannotate``, and ``matchrename_ncbitax`` is applied.

    Args:
        timetreefile: Newick input, read with ``format=1`` and quoted node
            names.
        to_table: When true, print one ``oldname<TAB>sci_name`` line per
            internal node instead of the newick.
        ete3_algo: When true, use ``NCBITaxa.annotate_tree``.
        uniq: Passed to ``matchrename_ncbitax``.
    """
    logger.info('Loading data')
    ### /!\ quoted_node_names only from ete3 v3.1.1
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    species_names = [leaf_name.replace('_', ' ')
                     for leaf_name in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(species_names)

    for leaf in timetree.get_leaves():
        try:
            leaf.add_feature('taxid',
                             name2taxid[leaf.name.replace('_', ' ')][0])
        except KeyError:
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True,
                        preserve_branch_length=True)

    logger.info('Placing common ancestors')
    if ete3_algo:
        ncbi.annotate_tree(timetree, 'taxid')
    else:
        myannotate(timetree, ncbi)
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
def etealign(tree, MA):
    """Show a tree together with its multiple alignment.

    Only the first line of the tree file is read; single quotes are removed
    from it before it is parsed.

    Args:
        tree: Path of the newick tree file.
        MA: FASTA alignment passed to ``link_to_alignment``.
    """
    # The context manager closes the file, which was previously left open.
    with open(tree, "r") as tree_file:
        newick = tree_file.readline().replace("'", "")

    phylo_tree = PhyloTree(newick)
    print(phylo_tree)
    phylo_tree.link_to_alignment(alignment=MA, alg_format="fasta")
    phylo_tree.show()
def open_tree(tree_file_path):
    """Open a tree file, filling in node supports for standard tree files.

    A path containing ``contree`` is loaded as is. A path containing
    ``treefile`` is loaded with ``format=1``, and every internal descendant
    node gets its support from the second ``/``-separated field of its name,
    or 100.0 when that field is missing. Any other path ends the program
    with an error message.
    """
    is_consensus = 'contree' in tree_file_path
    if not is_consensus and 'treefile' not in tree_file_path:
        sys.exit('Error: tree format not recognised')

    if is_consensus:
        return PhyloTree(tree_file_path, sp_naming_function=None)

    # Branch supports in SH-aLRT support (%) / ultrafast bootstrap support (%)
    tree = PhyloTree(tree_file_path, sp_naming_function=None, format=1)
    for node in tree.iter_descendants():
        if node.is_leaf():
            continue
        fields = node.name.split('/')
        if len(fields) > 1:
            node.support = float(fields[1])
        else:
            # No support values when sequences were identical --> set
            # support artificially to 100.0
            node.support = 100.0
    return tree
def safe_phylo_read(filename) -> PhyloTree:
    """Load a tree, trying several newick formats in turn.

    A ``PhyloTree`` argument is returned unchanged. Otherwise the input is
    tried with ``format=3``, the default format, ``format=1`` and finally
    ``format=5``. Only a failure of the last attempt is reported.

    Args:
        filename: A ``PhyloTree`` or a newick input accepted by ``PhyloTree``.

    Returns:
        The loaded tree.

    Raises:
        NewickError: When the final attempt fails to parse; a hint is
            printed to stderr first. Any other exception from the final
            attempt propagates as is.
    """
    if isinstance(filename, PhyloTree):
        return filename

    # Earlier attempts are best-effort. ``Exception`` (not a bare except)
    # lets KeyboardInterrupt and SystemExit through.
    for options in ({"format": 3}, {}, {"format": 1}):
        try:
            return PhyloTree(filename, **options)
        except Exception:
            continue

    try:
        return PhyloTree(filename, format=5)
    except NewickError:
        print(f"Are you sure tree {filename} exists?",
              file=sys.stderr, flush=True)
        raise
def test_species(self):
    """ tests if node.species and ncbi_query are working """
    # test node.species
    species_tree = PhyloTree(
        """(Felis_catus_1:1, (Homo_sapiens_1:1, Pan_troglodytes_1:1), Saccharomyces_cerevisiae_1:1);""",
        format=1)
    # Species is the second underscore-separated field of the leaf name.
    species_tree.set_species_naming_function(
        lambda n: n.name.split("_")[1] if "_" in n.name else '')

    pattern0 = """('', (' len(set(["sapiens","pygmaeus"]) & species(@))>0', Pan_troglodytes_1) );"""
    pattern0 = TreePattern(pattern0)

    root = species_tree.get_tree_root()
    self.assertEqual(list(pattern0.find_match(species_tree)), [root])

    # test ncbi taxonomy
    # NOTE(review): ncbi is not used below; kept in case constructing
    # NCBITaxa is relied on for its side effects -- confirm.
    ncbi = NCBITaxa()
    taxonomy_tree = PhyloTree("((9598, 9606), 10090);",
                              sp_naming_function=lambda name: name)
    taxonomy_tree.annotate_ncbi_taxa()
    root = taxonomy_tree.get_tree_root()

    pattern1 = """ ' @.sci_name == "Euarchontoglires" ';"""
    # The species name must read "Homo sapiens"; the masked spelling
    # "H**o sapiens" could never equal an NCBI scientific name.
    pattern2 = """ (( '@.sci_name=="Homo sapiens"' , '9526 in @.lineage ' )' @.rank=="subfamily" and @.taxid == 207598 ') ' @.sci_name == "Euarchontoglires" and "cellular organisms" in @.named_lineage'; """

    pattern1 = TreePattern(pattern1)
    pattern2 = TreePattern(pattern2)
    match1 = pattern1.find_match(taxonomy_tree)
    match2 = pattern2.find_match(taxonomy_tree)

    self.assertEqual(list(match1), [root])
    self.assertEqual(list(match2), [root])
def make_species_list(path):
    """List the leaves of ``<path>.3.fa.tre`` with and without digits.

    Each leaf is converted with ``str()`` and leading newline and ``-``
    characters are stripped from the result.

    Args:
        path: Path prefix of the tree file.

    Returns:
        Tuple ``(names, names_without_digits)`` of two parallel lists.
    """
    t = PhyloTree("{}.3.fa.tre".format(path))
    names = [str(leaf).lstrip("\n--") for leaf in t]
    # Raw string: "\d" in a plain string literal is an invalid escape.
    names_without_digits = [re.sub(r"\d", "", name) for name in names]
    return (names, names_without_digits)
def process_family_tree(fam_tree_fileName, profile_fileName):
    """Find SO duplication events in one family tree.

    The tree is loaded with ``format=1``. When ``detect_multifurcation``
    returns a falsy value for it, processing stops and 0 is returned.
    Otherwise the ingroup monophyletic clade nodes are collected and passed
    to ``get_SO_duplication_events``.

    Args:
        fam_tree_fileName: Path of the family tree file.
        profile_fileName: Path of the profile file providing the outgroup
            regex and the species dictionary.

    Returns:
        0 when the multifurcation check is falsy, otherwise None.
    """
    profile = read_profile_file(profile_fileName)
    outgrp_regex_str, species_dict = profile
    outgroup_pattern = re.compile(outgrp_regex_str)
    family_tree = PhyloTree(fam_tree_fileName, format=1)

    passes_check = detect_multifurcation(family_tree)
    if not passes_check:
        return 0

    clade_nodes = get_ingroup_monophyletic_clade_nodes(family_tree,
                                                       outgroup_pattern)
    get_SO_duplication_events(family_tree, clade_nodes, species_dict,
                              fam_tree_fileName)
def build_tree(aln, tree, basename, show, output_format):
    """Build phylogenetic tree from files

    This function creates a file with the phylogenetic tree and a file with
    the tree next to the alignment, from the fasta multiple alignment and
    the tree in newick format.

    Parameters
    -------------
    aln: string
        Alignment string in fasta format
    tree: string
        Tree string in newick format; trailing whitespace is ignored and a
        terminating ";" is added when missing
    basename: string
        Basename of the original alignment file
    show: boolean
        Show ETE tree browser (yes/no)
    output_format: string
        Format of the output
    """
    # Strip first so a trailing newline does not hide the final ";", and use
    # endswith() so an empty string does not raise IndexError.
    newick = tree.rstrip()
    if not newick.endswith(";"):
        newick = "{};".format(newick)
    genetree = PhyloTree(newick)

    ts = TreeStyle()
    ts.show_leaf_name = False
    new_tree = "{BASENAME}_Tree.{FORMAT}".format(BASENAME=basename,
                                                 FORMAT=output_format)
    new_tree_aln = "{BASENAME}_Tree_aln.{FORMAT}".format(BASENAME=basename,
                                                         FORMAT=output_format)

    # The rendering steps are the same whether or not the browser is shown.
    genetree.render(new_tree, tree_style=ts)
    genetree.link_to_alignment(aln)
    genetree.render(new_tree_aln, tree_style=ts)
    if show:
        genetree.show(tree_style=ts)
def main():
    """Count gene copies per species for every gene tree in a file.

    Command-line options:
      --genetree      file with one gene tree per line
      --speciesorder  comma-separated species list; it sets the column order
                      and the first listed species present in a tree is
                      used as the reference species for that tree

    For each tree a ``collections.Counter`` of species occurrences is built
    from leaf names of the form ``gene_species``. The gene of the first leaf
    belonging to the reference species is stored under the key ``'gene'``.
    All counters are finally handed to ``printTSV`` together with the column
    list ``["gene"] + species``.

    Raises
    ------
    Exception
        If a leaf name does not split into exactly two "_"-separated parts,
        or if none of the listed species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree', required=True, help='GeneTree in nhx format')
    parser.add_argument('--speciesorder', required=True, help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [name.strip() for name in args.speciesorder.split(",")]

    table = []
    with open(args.genetree, "r") as f:
        # The file holds several gene trees, one per line.
        for line in f:
            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '')
            # A blank line carries no tree; passing it to PhyloTree would
            # abort the whole run, so skip it.
            if not line.strip():
                continue

            genetree = PhyloTree(line)
            leaves = genetree.get_leaf_names()
            leaves_parts = [leaf.split("_") for leaf in leaves]
            for leaf, leaf_parts in zip(leaves, leaves_parts):
                if len(leaf_parts) != 2:
                    raise Exception(
                        "Leaf node '%s' is not in gene_species format" % leaf)

            species_counter = collections.Counter(
                leaf_parts[1] for leaf_parts in leaves_parts)

            # Assign to ref_species the first element of species_list which
            # appears in a leaf node
            for ref_species in species_list:
                if ref_species in species_counter:
                    break
            else:
                raise Exception(
                    "None of the specified species was found in the GeneTree '%s'" % line)

            # Find the gene of the (first) leaf node for the ref_species
            for leaf_parts in leaves_parts:
                if leaf_parts[1] == ref_species:
                    species_counter['gene'] = leaf_parts[0]
                    break

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)
def main():
    """Split a gene tree at its duplication nodes and write each subtree.

    The gene tree given by ``--genetree`` is loaded. With ``--gene_node 0``
    (the default) ``parse_sp_name`` is installed as the species naming
    function. With ``--gainlose`` the gene tree is first reconciled with the
    species tree from ``--speciestree``, after stripping ``*`` characters
    from both ends of the species tree leaf names. Each subtree returned by
    ``split_by_dups()`` is written to ``<n>_genetree.nhx`` in the current
    directory, numbered from 1, using ``--output_format``.
    """
    usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('--genetree',
                      help='GeneTree in nhx format')
    parser.add_option('--speciestree',
                      help='Species Tree in nhx format')
    parser.add_option('--species_format', type='int', default=8,
                      help='Species Tree input format (0-9)')
    parser.add_option('--gene_node', type='int', default=0,
                      help='Gene node format 0=gene_species, 1=species_gene')
    parser.add_option('--gainlose', action='store_true', default=False,
                      help='Find out gene gain/lose')
    parser.add_option('--output_format', type='int', default=9,
                      help='GeneTree output format (0-9)')
    opts, _positional = parser.parse_args()

    if opts.genetree is None:
        parser.error("--genetree option must be specified, GeneTree in nhx format")

    # Load the single gene tree.
    gene_tree = PhyloTree(opts.genetree)

    # Install the species naming function for the gene_species leaf format.
    if opts.gene_node == 0:
        gene_tree.set_species_naming_function(parse_sp_name)

    # Reconcile the gene tree with the species tree when gain/lose analysis
    # is requested.
    if opts.gainlose:
        if opts.speciestree is None:
            parser.error("--speciestree option must be specified, species tree in nhx format")

        species_tree = PhyloTree(opts.speciestree, format=opts.species_format)

        # Species trees configured for TreeBest carry '*' on species names;
        # strip it from both ends of every leaf name.
        for sp_leaf in species_tree:
            sp_leaf.name = sp_leaf.name.strip('*')

        gene_tree, _events = gene_tree.reconcile(species_tree)

    # split_by_dups() yields the subtrees obtained by cutting the tree at
    # its duplication nodes; write each to its own numbered file.
    for index, subtree in enumerate(gene_tree.split_by_dups(), 1):
        out_path = str(index) + '_genetree.nhx'
        with open(out_path, 'w') as handle:
            handle.write(subtree.write(format=opts.output_format))