예제 #1
0
def do_alnntree(pref, ndf, fasta, refs, congen, targetids, gaps=0.9, cpus=-1):
    """Align reference and target sequences, trim the alignment and build a tree.

    A multi-FASTA text is assembled from ``congen``, one reference sequence
    per ``saccver`` group of ``ndf`` (looked up in ``refs``) and every entry
    of the ``fasta`` shelve whose header (minus its first character) is in
    ``targetids``. The text is piped through ``mafft``, trimmed with
    ``trimaln`` and piped through ``fasttreeMP``.

    Side effects: writes ``<pref>.aln``, ``<pref>.tree`` and
    ``<pref>.treepickle`` and prints progress information.

    Returns:
        tuple: ``(tree, annotation)`` where ``annotation`` is whatever
        ``PhyloTree.annotate_ncbi_taxa()`` returned.
    """
    # TODO: add checkpoint to avoid repeating
    fasta_chunks = [congen]
    for accession, hits in ndf.groupby('saccver'):
        taxid = hits.staxid.iloc[0]
        try:
            sequence = refs['>%s' % accession].replace('\n', '').strip()
        except KeyError:
            # Fall back to the part of the accession before the first '|'.
            accession = accession.split('|')[0]
            sequence = refs['>%s' % accession].replace('\n', '').strip()
        fasta_chunks.append('>%d.%s\n%s\n' % (taxid, accession, sequence))

    with shelve.open(fasta) as stored:
        for header, body in stored.items():
            if header.strip()[1:] not in targetids:
                print(header, 'not in')
                continue
            print(header)
            fasta_chunks.append(
                '%s\n%s\n' % (header, body.strip().replace('\n', '')))

    mafft_cmd = ['mafft', '--thread', str(cpus), '--auto', '-']
    raw_alignment, _ = stdin_run(mafft_cmd, ''.join(fasta_chunks))
    trimmed = trimaln(raw_alignment.decode('utf-8'), targetids, gaps=gaps)

    newick_bytes, _ = stdin_run(['fasttreeMP', '-nt', '-gtr', '-gamma'],
                                trimmed)
    # Drop the last character of the stripped output, replace any remaining
    # ';' with '-', then re-append a single terminating ';'.
    newick = newick_bytes.strip()[:-1].replace(b';', b'-').decode('utf-8')
    newick += ';'

    # Species code is the text before the first '.' of each leaf name.
    tree = PhyloTree(newick,
                     sp_naming_function=lambda name: name.split('.')[0])

    aln_path = '%s.aln' % pref
    pickle_path = '%s.treepickle' % pref
    with open(aln_path, 'w') as aln_handle, \
            open(pickle_path, 'wb') as pickle_handle:
        aln_handle.write(trimmed)
        tree.write(outfile='%s.tree' % pref)
        dill.dump(tree, pickle_handle)

    annotation = tree.annotate_ncbi_taxa()
    fix_species(tree)
    print(tree)
    return tree, annotation
예제 #2
0
def run(args):
    """Print the in/out sequence sets of every evolutionary event per tree.

    Iterates ``args.src_tree_iterator``; for each newick, and only when
    ``args.orthologs`` is not None, the tree is loaded as a ``PhyloTree``
    and ``in_seqs`` / ``out_seqs`` of each event returned by
    ``get_descendant_evol_events()`` are printed.
    """
    from ete3 import Tree, PhyloTree
    for newick in args.src_tree_iterator:
        if args.orthologs is None:
            # Nothing requested; the iterator is still consumed.
            continue
        phylo = PhyloTree(newick)
        for event in phylo.get_descendant_evol_events():
            print(event.in_seqs, event.out_seqs)
예제 #3
0
def LoadTrees(treeFile, dlm):
    """Read phylogenetic trees from a file of newick strings.

    Parameters
    ------
    treeFile: str, path to a file of newick trees, 1 per line; lines
        starting with "NA" and blank lines are skipped
    dlm: str, delimiter; the species of a node is taken as the part of the
        node name before the first occurrence of ``dlm``

    Returns
    ------
    treelist: list of PhyloTree objects, in file order

    """
    print("loading trees...")
    treelist = []
    pbar = tqdm(total=file_len(treeFile))
    try:
        with open(treeFile, 'r') as newick:
            for line in newick:
                pbar.update(1)
                # A blank line (e.g. a trailing newline at the end of the
                # file) is not a tree; skip it instead of handing it to the
                # newick parser.
                if not line.strip() or line.startswith("NA"):
                    continue
                t = PhyloTree(line)
                t.set_species_naming_function(
                    lambda node: node.name.split(dlm)[0])
                treelist.append(t)
    finally:
        # Close the progress bar even when a line fails to parse.
        pbar.close()
    return treelist
예제 #4
0
파일: ete_extract.py 프로젝트: Ward9250/ete
def run(args):
    """Print the in/out sequence sets of every evolutionary event per tree.

    Iterates ``args.src_tree_iterator``; for each newick, and only when
    ``args.orthologs`` is not None, the tree is loaded as a ``PhyloTree``
    and ``in_seqs`` / ``out_seqs`` of each event returned by
    ``get_descendant_evol_events()`` are printed.
    """
    from ete3 import Tree, PhyloTree
    for newick in args.src_tree_iterator:
        if args.orthologs is None:
            # Nothing requested; the iterator is still consumed.
            continue
        phylo = PhyloTree(newick)
        for event in phylo.get_descendant_evol_events():
            print(event.in_seqs, event.out_seqs)
예제 #5
0
def reconcile_etetoolkit(protein):
    """Reconcile the gene tree of ``protein`` with its species tree.

    Both trees are read (newick ``format=1``) from the paths produced by the
    ``SPECIES_TREE_FILE`` / ``GENE_TREE_FILE`` templates, using the full leaf
    name as the species name. The reconciled tree is rendered to
    ``phylotree.png``. Nothing is returned.
    """
    species_path = SPECIES_TREE_FILE.format(protein, 'nh')
    species_tree = PhyloTree(species_path, format=1,
                             sp_naming_function=lambda name: name)

    gene_path = GENE_TREE_FILE.format(protein, protein, 'nh')
    gene_tree = PhyloTree(gene_path, format=1,
                          sp_naming_function=lambda name: name)

    reconciled, _events = gene_tree.reconcile(species_tree)
    reconciled.render("phylotree.png")
def cut_stray_other(gene, species_keep, species_list):
	"""Interactively remove stray gene copies from a clade tree.

	Loads ``<gene>/<gene>.3.fa.tre``, prunes it to ``species_keep``, shows it
	(when more than one copy is kept) and then repeatedly asks the user for
	genes to cut, either by picking a clade (``choose_clade``) or by typing
	names separated by spaces.

	A cut is only committed when every entered gene is in ``species_keep``
	and the tree could be pruned; otherwise the list of kept genes is left
	exactly as it was before the attempt.

	Returns:
		list: the genes still kept after all accepted cuts.
	"""
	######Showing the tree######
	clade_tree=PhyloTree(gene+"/"+gene+".3.fa.tre")
	clade_tree.prune(species_keep,preserve_branch_length=True)
	if len(species_keep)>1:
		view_rooted_tree(clade_tree)
		print("\nThis is the clade tree. There are "+str(len(species_keep))+" total gene copies.\n")
	else:
		print("\nSpecies tree only contains 1 species. Tree will not be shown.")
	cut_list=species_keep
	view_counts(cut_list, species_list)
	######Removing stray within-clade gene copies from the clade######
	cut_question=raw_input("\nAre there stray genes to cut? (y/n)")
	# startswith() instead of [0]: an empty answer means "no", not IndexError.
	while cut_question.startswith("y"):
		choice4=raw_input("\nIf this group is a monophyletic clade, type c.\nOtherwise, type n.")
		if choice4.startswith("c"):
			cut_gene_list=choose_clade(clade_tree)
			# Keep a printable form so the error messages below also work on
			# this path (the name used to be undefined here).
			cut_gene_str=" ".join(str(item) for item in cut_gene_list)
		else:
			cut_gene_str=raw_input("\nEnter genes to cut, separated by a space: ")
			cut_gene_list=cut_gene_str.split()
		if set(cut_gene_list).issubset(species_keep):
			# Work on a candidate list; commit only once pruning succeeded so
			# that an abandoned cut really leaves cut_list untouched.
			remaining=[i for i in cut_list if i not in cut_gene_list]
			try:
				clade_tree.prune(remaining,preserve_branch_length=True)
				cut_list=remaining
				view_rooted_tree(clade_tree)
				view_counts(cut_list, species_list)
			except ValueError:
				print ("\nSomething is wrong with the way the genes were entered. You entered:\n"+cut_gene_str+"\nCut abandoned.")
		else:
			print ("\nAt least one gene is not found on the tree. You entered:\n"+cut_gene_str+"\nCut abandoned.")
		cut_question=raw_input("\nAre there more genes to cut? (y/n)")
	return (cut_list)
def define_groups(gene, cut_list, species_list, species_keep, clade_name):
	"""Interactively split the genes left on a clade tree into saved groups.

	Loads ``<gene>/<gene>.3.fa.tre``, prunes it to ``cut_list`` and then loops
	while the user answers "y": a candidate group is taken from all remaining
	genes, from a clade picked via ``choose_clade`` or from typed names; it is
	passed through ``check_single_group`` and each resulting sub-list is
	confirmed by the user and written with ``saving_group``.

	Args:
		gene: gene family name; used to build the tree path and passed to
			``saving_group``.
		cut_list: genes currently left on the tree.
		species_list: passed through to ``view_counts``.
		species_keep: names a group must be a subset of to be accepted.
		clade_name: prefix of the file name given to ``saving_group``.

	The function has no return statement (returns None).
	"""
	clade_tree=PhyloTree(gene+"/"+gene+".3.fa.tre")
	clade_tree.prune(cut_list,preserve_branch_length=True)
	# Running counter appended to every saved group file name.
	n=1
	######Designating whole clade duplications######
	choice=raw_input("\nWould you like to make a group? (y/n)")
	# NOTE(review): choice[0] raises IndexError on an empty answer.
	while choice[0] == "y":
		choice4=raw_input("\nIf this group includes all the genes left on the tree, type a.\nIf this group is a monophyletic clade, type c.\nOtherwise, type n.")
		if choice4[0]=="a":
			group_list=cut_list
		elif choice4[0]=="c":
			group_list=choose_clade(clade_tree)
		else:
			group_str=raw_input("\nEnter genes for the group, separated by a space: ")
			group_list=[item for item in group_str.split()]
		######Checking that there is only one gene per species######
		# check_single_group returns the list of groups to process plus a
		# sub-clade name; presumably it splits multi-copy groups — verify
		# against its definition.
		group_list2,subclade_name=check_single_group(group_list)
		######Checking for typos######
		for i in group_list2:
			if set(i).issubset(species_keep):
				######Allow a chance to back out, for example if user forgot to enter spaces######
				print("\nThere are "+str(len(i))+" genes in this group.\nGroup looks like:")
				print (i)
				choice3=raw_input("\nMake the group? (y/n)")
			else:	
				print ("\nAt least one gene is not found on the tree. You entered:\n")
				print (i)
				choice3=raw_input("\nEnter n to abandon this list and start again.")
			if choice3[0]=="y":
				######Saving group as file and add group to master list######
				# "ynyn" looks like a sentinel meaning "no sub-clade name" —
				# TODO confirm against check_single_group.
				if subclade_name=="ynyn":
					clade_filename="{}_{}".format(clade_name, n)
					saving_group(gene, i, clade_filename)
				else:
					clade_filename="{}_{}_{}".format(clade_name, subclade_name, n)
					saving_group(gene, i, clade_filename)
				n=n+1
			else: 
				print("\nGroup abandoned.")
				choice=raw_input("\nWould you like to make a group for this clade? (y/n)")
			# NOTE(review): this runs whether or not the group was confirmed,
			# so the genes of an abandoned group are also removed from
			# cut_list — confirm this is intended.
			cut_list=[j for j in cut_list if j not in i]
		######Checking to see if the tree is empty######
		if len(cut_list) == 0:
			print("\nThe tree is now empty. We will continue with the next clade.")
			choice="n"
			######Preparing for next group######
		else:
			choice2=raw_input("\nWould you like to view the tree with the group removed? (y/n)")
			if choice2[0] == "y":
				clade_tree.prune(cut_list,preserve_branch_length=True)
				view_rooted_tree(clade_tree)
				view_counts(cut_list, species_list)
				choice=raw_input("\nWould you like to make another group for this clade? (y/n)")
			else: 
				# NOTE(review): declining to view the tree prints
				# "Group abandoned." and skips the prune — looks unintended,
				# verify.
				print("\nGroup abandoned.")
				choice=raw_input("\nWould you like to make a group for this clade? (y/n)")
예제 #8
0
def get_example_tree():
    """Build a reconciled example tree from two hard-coded RAxML files.

    The gene tree is reconciled against the species tree and the result is
    linked to the alignment ``alg`` (a name that must exist in the enclosing
    scope).

    Returns:
        tuple: ``(reconciled_tree, TreeStyle())``.
    """
    # Performs a tree reconciliation analysis
    gene_tree_nw = '/home/issa/Documents/stage/raxml/clusters_Trimal/bestTree/RAxML_bestTree.cluster_9.fasta.aln'
    species_tree_nw = '/home/issa/Documents/stage/raxml/specie_tree_Trimal/RAxML_bestTree.specie_TREE_trimal.tree'

    reconciled, _events = PhyloTree(gene_tree_nw).reconcile(
        PhyloTree(species_tree_nw))
    reconciled.link_to_alignment(alg)

    style = TreeStyle()
    return reconciled, style
예제 #9
0
def get_example_tree():
    """Build a reconciled example tree from two inline newick strings.

    The gene tree is reconciled against the species tree and the result is
    linked to the alignment ``alg`` (a name that must exist in the enclosing
    scope).

    Returns:
        tuple: ``(reconciled_tree, TreeStyle())``.
    """
    # Performs a tree reconciliation analysis
    gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"

    reconciled, _events = PhyloTree(gene_tree_nw).reconcile(
        PhyloTree(species_tree_nw))
    reconciled.link_to_alignment(alg)

    style = TreeStyle()
    return reconciled, style
예제 #10
0
def main():
    """Count gene copies per species for every gene tree in a file.

    Command line:
        --genetree      file with one gene tree per line (nhx format)
        --speciesorder  comma-separated species list; also the column order

    For each tree the leaf names must have the form ``gene_species``. A
    ``collections.Counter`` of species is built, and its ``'gene'`` entry is
    set to the gene of the first leaf belonging to the first species of
    ``--speciesorder`` present in the tree. All counters are handed to
    ``printTSV`` with columns ``["gene"] + species_list``.

    Raises:
        Exception: if a leaf name is not in ``gene_species`` format, or none
            of the requested species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree',
                        required=True,
                        help='GeneTree in nhx format')
    parser.add_argument('--speciesorder',
                        required=True,
                        help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [name.strip() for name in args.speciesorder.split(",")]
    table = []

    with open(args.genetree, "r") as f:
        # reads multiple gene tree line by line gene tree
        for line in f:
            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '')

            # A blank line (e.g. a trailing newline) is not a tree; skip it
            # rather than letting the newick parser fail on it.
            if not line.strip():
                continue

            # reads single gene tree
            genetree = PhyloTree(line)
            leaves = genetree.get_leaf_names()

            leaves_parts = [leaf.split("_") for leaf in leaves]
            for leaf, leaf_parts in zip(leaves, leaves_parts):
                if len(leaf_parts) != 2:
                    raise Exception(
                        "Leaf node '%s' is not in gene_species format" %
                        leaf)

            species_counter = collections.Counter(
                parts[1] for parts in leaves_parts)

            # Assign to ref_species the first element of species_list which
            # appears in a leaf node
            for ref_species in species_list:
                if ref_species in species_counter:
                    break
            else:
                raise Exception(
                    "None of the specified species was found in the GeneTree '%s'"
                    % line)

            # Find the gene of the (first) leaf node for the ref_species
            for leaf_parts in leaves_parts:
                if leaf_parts[1] == ref_species:
                    species_counter['gene'] = leaf_parts[0]
                    break

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)
예제 #11
0
    def test_lineages(self):
        """
        Search trees (naming format: NumericTaxid.SequenceName)
        for nodes containing branches that separate two groups of primate genes where,
        in one side, the human gene has been lost,
        and the branch support value of the matching node is higher than 0.9.

                                  /-Any primate taxid (9443 in lineage)
        support >= 0.9--|
                                  \-Any primate taxid except human

        """
        t1 = PhyloTree("(9601.ENSPPYP00000022176:1,9593.ENSGGOP00000009720:1);")
        t2 = PhyloTree("(9361.ENSDNOP00000016844:1,9258.ENSOANP00000032529:1);")
        t3 = PhyloTree(
            "(((((37347.ENSTBEP00000010698:0.120098,(9361.ENSDNOP00000000113:0.0697238,(9785.ENSLAFP00000009564:0.0297499,(9371.ENSETEP00000002412:0.0588324,9813.ENSPCAP00000006440:0.026638)0.985184:0.0242194)0.99985:0.0211882)0.99706:0.0161759)0.756:0.00666819,((132908.ENSPVAP00000002358:0.0439546,59463.ENSMLUP00000004598:0.0635161)0.994843:0.00885432,(9796.ENSECAP00000009809:0.0292517,((9685.ENSFCAP00000004938:0.056779,(9615.ENSCAFP00000008559:0.039179,(9823.ENSSSCP00000024070:0.126803,(9669.ENSMPUP00000010096:0.0341928,9646.ENSAMEP00000005906:0.0189746)0.995231:0.00951966)0.915476:0.0046099)0.949664:0.00417374)0.99985:0.0133593,(9739.ENSTTRP00000009464:0.0664336,9913.ENSBTAP00000001687:0.036632)0.99985:0.0236174)0.939309:0.00508062)0.991475:0.00823937)0.99985:0.0107263)0.99985:0.0100107,((9986.ENSOCUP00000014919:0.0830612,10141.ENSCPOP00000005291:0.12195)0.99985:0.0202639,((9483.ENSCJAP00000047968:0.0446865,(9544.ENSMMUP00000007168:0.0201746,((9593.ENSGGOP00000005929:0.00916494,(9606.ENSP00000294053:1.3e-07,9598.ENSPTRP00000006940:0.0068176)0.955193:0.00220905)0.99985:0.00778854,(9601.ENSPPYP00000004174:0.00495163,61853.ENSNLEP00000020892:0.179569)0.290072:0.00153447)0.998732:0.00889714)0.99985:0.0144864)0.99985:0.0344562,(9478.ENSTSYP00000006073:0.129349,(30608.ENSMICP00000010690:0.0852248,30611.ENSOGAP00000013738:0.0467206)0.99985:0.0188861)0.232709:0.00179852)0.99985:0.00929928)0.51042:0.00516905)0.367617:0.00813494,(43179.ENSSTOP00000004287:0.0599707,(10020.ENSDORP00000000618:0.138502,(10116.ENSRNOP00000026665:0.0528487,10090.ENSMUSP00000001884:0.0307781)0.99985:0.089983)0.99985:0.018366)0.698647:0.00414256)0.995833:0.06629,(9258.ENSOANP00000012946:0.33344,(13616.ENSMODP00000032549:0.0348012,(9315.ENSMEUP00000011030:0.0138664,9305.ENSSHAP00000003293:0.0185119)0.570293:0.0137766)0.99985:0.143897)0.995833:0.06629);")
        t4 = PhyloTree("(9593.ENSGGOP00000025542:1,9601.ENSPPYP00000004907:1);")
        t5 = PhyloTree(
            "(9371.ENSETEP00000005103:0.0955875,(9785.ENSLAFP00000014743:0.0214619,(9813.ENSPCAP00000005573:0.0376639,(9796.ENSECAP00000019319:0.0196571,(37347.ENSTBEP00000012329:0.0242927,((9361.ENSDNOP00000011716:0.0676669,(9606.ENSP00000374323:9e-07,(9593.ENSGGOP00000028731:0.00246332,(61853.ENSNLEP00000002377:0.0030064,(9601.ENSPPYP00000015233:0.0112606,(9598.ENSPTRP00000026129:0.00246268,9483.ENSCJAP00000015834:0.0290829)0:1.2e-07)0:6.5e-07)0.146278:0.00614181)0.146329:0.00485474)0.991187:0.014264)0.763764:0.00352544,((10020.ENSDORP00000008692:0.0259566,(30608.ENSMICP00000002718:0.0380742,9478.ENSTSYP00000009200:0.0174548)0.197348:0.00155005)0.99985:0.0110622,((((132908.ENSPVAP00000013183:0.0099908,59463.ENSMLUP00000014424:0.0115111)0.99985:0.00655941,(10141.ENSCPOP00000003417:0.0535498,((9669.ENSMPUP00000002651:0.0156675,(9646.ENSAMEP00000014393:0.0142536,9615.ENSCAFP00000013394:0.00243184)0.930921:0.00345947)0.99985:0.015828,(9913.ENSBTAP00000053531:0.0545233,9739.ENSTTRP00000001508:0.0344514)0.985783:0.00536759)0:1.1e-07)0:1.1e-07)0.99985:0.00795592,(10090.ENSMUSP00000066734:0.0572278,(43179.ENSSTOP00000020881:0.021661,30611.ENSOGAP00000000479:0.00876016)0.955042:0.00724791)0.992776:0.0044053)0:3.4e-07,(9258.ENSOANP00000012014:0.10692,(9315.ENSMEUP00000001901:0.0451997,13616.ENSMODP00000021214:0.00830289)0.994926:0.0229072)0.99985:0.0500253)0.981032:0.00621499)0:9e-08)0.723103:0.00185076)0.580248:0.00162611)0.99985:0.0167207)0.863552:0.00574499)1:0.0955875);")
        t6 = PhyloTree(
            "((9305.ENSSHAP00000010229:0.0607855,13616.ENSMODP00000009656:0.0615237)0.99985:0.0877765,(9785.ENSLAFP00000028174:0.0885004,(((9823.ENSSSCP00000002806:0.0860827,9823.ENSSSCP00000002780:0.0111508)0.99985:0.122086,((9913.ENSBTAP00000038896:0.050358,(9685.ENSFCAP00000017257:0.0778567,(9986.ENSOCUP00000017975:0.161424,(9615.ENSCAFP00000020783:0.056902,(9646.ENSAMEP00000019763:0.0857189,9669.ENSMPUP00000019474:0.0325693)0.99985:0.0314116)0.875671:0.00690881)0.942895:0.0136375)0.798192:0.00741364)0.967573:0.0100004,(59463.ENSMLUP00000020576:0.0755216,9796.ENSECAP00000004613:0.0777605)0.799782:0.00471384)0.911021:0.00832673)0.659845:0.00664335,((43179.ENSSTOP00000021465:0.123042,9593.ENSGGOP00000020601:0.0781752)0.987812:0.0311266,(30611.ENSOGAP00000021055:0.090792,(10116.ENSRNOP00000016702:0.0112116,10090.ENSMUSP00000050705:0.0330259)0.99985:0.134681)0.972881:0.0174783)0.998643:0.0179346)0.901179:0.017737)0.99985:0.0877765);")
        t7 = PhyloTree(
            "(9258.ENSOANP00000017269:0.144169,(((10090.ENSMUSP00000089169:0.0424834,10116.ENSRNOP00000026070:0.0151696)0.99985:0.0742333,(((((132908.ENSPVAP00000008558:0.0138473,(30608.ENSMICP00000004293:1.5e-07,((9986.ENSOCUP00000020707:0.0691049,37347.ENSTBEP00000002617:0.0138881)0:1.2e-07,(9371.ENSETEP00000012957:0.0515389,(9785.ENSLAFP00000009919:0.0260641,9813.ENSPCAP00000013834:0.0329521)0.741149:0.0041225)0.998768:0.00855745)0.99985:0.0111961)0.867255:0.00524663)0:4.3e-07,(9361.ENSDNOP00000010929:0.0359312,(9739.ENSTTRP00000015818:0.0267351,9796.ENSECAP00000009501:0.0168218)0.868862:0.00355516)0:8e-08)0.99985:0.0056594,(9913.ENSBTAP00000012912:0.0231165,(9669.ENSMPUP00000002012:0.00320767,9823.ENSSSCP00000023102:0.0629927)0.99134:0.00309237)0.988361:0.00284581)0:1.5e-07,((59463.ENSMLUP00000015155:0.0360776,9615.ENSCAFP00000002053:0.00579656)0.961397:0.00553059,(9685.ENSFCAP00000023114:0.0115974,9646.ENSAMEP00000004090:0.00575272)0.959045:0.00279601)0.988458:0.00279093)0.998008:0.00284847,(30611.ENSOGAP00000001383:0.00849776,((9483.ENSCJAP00000006698:0.0114709,(9544.ENSMMUP00000006654:0.00568623,(61853.ENSNLEP00000004122:0.00566385,(9601.ENSPPYP00000021653:0.00853215,(9593.ENSGGOP00000020462:1.8e-07,(9598.ENSPTRP00000035990:1e-08,9606.ENSP00000365550:1e-08)0.99985:0.00282071)0.996162:0.00281965)0:1.7e-07)0:8e-08)0.954037:0.0027827)0.99985:0.00818313,(43179.ENSSTOP00000012068:0.0109022,(9478.ENSTSYP00000008441:0.0132658,10141.ENSCPOP00000000986:0.0564111)0.314526:0.00294575)0:7e-08)0.980721:0.00309462)0.991529:0.00280168)0:1.6e-07)0.99985:0.0483405,(9315.ENSMEUP00000015273:0.00839008,(9305.ENSSHAP00000020642:0.00542335,13616.ENSMODP00000010568:0.101485)0:2.1e-07)0.99985:0.0336521)1:0.144169);")
        # NOTE: this literal was previously split across two physical lines
        # in the middle of the id "132908.ENSPVAP00000013461", which made the
        # module a syntax error; it is rejoined here.
        t8 = PhyloTree(
            "(((9371.ENSETEP00000003671:0.0131637,(9258.ENSOANP00000006745:0.117598,(132908.ENSPVAP00000001122:0.0159907,(30611.ENSOGAP00000013217:0.0071702,(((9823.ENSSSCP00000000042:0.0144457,(9646.ENSAMEP00000009872:0.0154876,9361.ENSDNOP00000012437:0.0817179)0:1e-06)0.998538:0.00765581,(9544.ENSMMUP00000001765:1e-08,(10116.ENSRNOP00000010491:0.0292686,(9669.ENSMPUP00000016236:0.340739,9615.ENSCAFP00000001415:4e-07)0.989009:0.00985882)0:8.7e-07)0:8.7e-07)0.99736:0.00973955,(((9606.ENSP00000379704:1e-08,(9601.ENSPPYP00000013264:0.00772278,9598.ENSPTRP00000024873:1e-08)0:2.3e-07)0.996569:0.00720502,(9913.ENSBTAP00000017531:0.0145949,9739.ENSTTRP00000016448:0.00723237)0.996503:0.00710774)0:4.2e-07,((9593.ENSGGOP00000008768:0.270021,(9785.ENSLAFP00000013194:0.00881524,9478.ENSTSYP00000011482:6.1e-07)0.482225:0.00675219)0.500314:0.00675139,(((59463.ENSMLUP00000002337:0.0319341,30608.ENSMICP00000003266:6.2e-07)0.987498:0.010619,(9796.ENSECAP00000021110:0.0073991,(9986.ENSOCUP00000007142:0.0196352,37347.ENSTBEP00000000333:0.0989537)0:9.5e-07)0:1.09e-06)0.873107:0.00951386,((9685.ENSFCAP00000000826:3e-07,(43179.ENSSTOP00000011619:0.00863897,10090.ENSMUSP00000023095:1e-08)0:1e-08)0.99985:0.132958,(10020.ENSDORP00000013215:0.0339132,10141.ENSCPOP00000011894:4.1e-07)0:4.1e-07)0.524756:0.00714334)0:8.1e-07)0.99985:0.00971634)0:7e-08)0:7e-08)0.772739:0.0177399)0.992096:0.0404786)0.817723:0.0310407)0.522416:0.072068,(9305.ENSSHAP00000014579:0.246289,9315.ENSMEUP00000008760:0.0666798)0.977479:0.195421)0.99985:1.2587,((((37347.ENSTBEP00000000946:0.0956163,(9483.ENSCJAP00000024301:0.0743892,(9593.ENSGGOP00000012469:0.00721405,(9606.ENSP00000391249:1e-08,9606.ENSP00000461549:1e-08)0:1.3e-07)0.993649:0.00856538)0.99985:0.0230549)0.975176:0.0143781,(30611.ENSOGAP00000003324:0.104251,30608.ENSMICP00000007369:0.0381575)0.990656:0.0183563)0.916137:0.00581305,(9823.ENSSSCP00000018191:0.0558998,((10020.ENSDORP00000010153:0.197695,((9796.ENSECAP00000018039:0.0363101,132908.ENSPVAP00000013461:0.0941126)0.892367:0.013635,((9739.ENSTTRP00000004783:0.0138565,9913.ENSBTAP00000003415:0.0166473)0.99985:0.0326524,((9371.ENSETEP00000006140:0.107709,(9785.ENSLAFP00000006435:0.170692,9813.ENSPCAP00000005503:0.0655274)0:2.68e-06)0.99985:0.0526328,(9258.ENSOANP00000002804:0.150016,(9315.ENSMEUP00000001056:0.0197146,(13616.ENSMODP00000002021:0.0382813,9305.ENSSHAP00000007534:0.0357616)0.99985:0.0843541)0.99985:0.115238)0.99985:0.133971)0.964252:0.0135998)0.99559:0.0163904)0.732303:0.00993157)0.99985:0.0470037,(9685.ENSFCAP00000008713:0.124988,(9615.ENSCAFP00000007771:0.0225216,(9646.ENSAMEP00000014479:0.0718956,9669.ENSMPUP00000013273:0.0487162)0.99985:0.0148769)0:9.2e-07)0.99985:0.0433867)0.99277:0.027679)0.99985:0.0134312)0:4.7e-07,(43179.ENSSTOP00000019919:0.152642,((10116.ENSRNOP00000003891:0.158016,10090.ENSMUSP00000091435:0.0102936)0.99985:0.0704992,(10141.ENSCPOP00000011436:0.130601,9986.ENSOCUP00000015843:0.529405)0:5.42e-06)0.909203:0.011833)0.428577:0.0186403)0.99985:1.2587);")
        t9 = PhyloTree("(9305.ENSSHAP00000009662:1,9305.ENSSHAP00000009620:1);")
        t10 = PhyloTree("((9315.ENSMEUP00000008285:0.899711,9258.ENSOANP00000027752:0.559777)0.99985:0.11989,((9739.ENSTTRP00000010720:0.164873,9913.ENSBTAP00000003500:0.298158)0.99985:0.109903,((9685.ENSFCAP00000006440:0.239731,(9615.ENSCAFP00000042310:0.122399,(9646.ENSAMEP00000002314:0.18278,9669.ENSMPUP00000005544:0.270727)0.6117:0.0396991)0.99985:0.0702148)0.99985:0.082488,(132908.ENSPVAP00000014833:0.488081,(9796.ENSECAP00000022144:0.310699,(((9785.ENSLAFP00000009512:0.187095,9813.ENSPCAP00000004417:0.493329)0.99985:0.359095,(30611.ENSOGAP00000016876:0.334272,(9483.ENSCJAP00000021314:0.178043,(9601.ENSPPYP00000003401:0.0415077,((61853.ENSNLEP00000003253:0.196659,9544.ENSMMUP00000037769:0.326984)0.835225:0.0989423,(9593.ENSGGOP00000004740:0.101826,9606.ENSP00000182290:0.0204981)0.997196:0.020731)0.307827:0.0046059)0.99985:0.0991112)0.99985:0.162323)0.972253:0.0380139)0.70642:0.0193389,((10141.ENSCPOP00000016274:0.272126,43179.ENSSTOP00000015376:0.458416)0.996119:0.0901785,(37347.ENSTBEP00000013312:0.328061,(10020.ENSDORP00000010739:0.398341,(10116.ENSRNOP00000051746:0.0455948,10090.ENSMUSP00000009396:0.0811741)0.99985:0.269525)0.791467:0.0577236)0.536676:0.0461933)0.99985:0.0620583)0.99985:0.0788824)0.969465:0.0395994)0.635969:0.0171601)0.702925:0.0283261)0.99985:0.11989);")

        # (tree, label, whether the pattern below is expected to match)
        trees = [(t1, "t1", True), (t2, "t2", False), (t3, "t3", True),
                 (t4, "t4", True), (t5, "t5", True), (t6, "t6", False),
                 (t7, "t7", True), (t8, "t8", True), (t9, "t9", False),
                 (t10, "t10", True)]
        for tree, tree_name, has_matches in trees:
            # Species is the numeric taxid before the first '.' of the name.
            tree.set_species_naming_function(lambda n: n.name.split(".")[0] if "." in n.name else '')
            tree.annotate_ncbi_taxa()
            # Has support for two primates where at least one is not Homo sapiens
            pattern = """
                ( ' 9443 in @.lineage ' , ' 9443 in @.lineage and @.name!=9606 ' )' @.support >= 0.9 ';
                """
            pattern = TreePattern(pattern)
            if not has_matches:
                self.assertEqual(list(pattern.find_match(tree)), [])
            else:
                # next() builtin works on both Python 2 and 3, unlike the
                # Python-2-only generator method .next().
                match = next(pattern.find_match(tree))
                self.assertEqual(match.support >= 0.9, True)
                test_status = (9443 in match.children[0].lineage and \
                               9443 in match.children[1].lineage and \
                               match.children[1].name != '9606')
                # permute children and check again
                test_status2 = (9443 in match.children[1].lineage and \
                               9443 in match.children[0].lineage and \
                               match.children[0].name != '9606')
                self.assertEqual(test_status, True)
                self.assertEqual(test_status2, True)
예제 #12
0
def process_family_tree(fam_tree_fileName, prune_re, out_dirName):
    """Prune a family tree and write the result under ``out_dirName``.

    The tree is read from ``fam_tree_fileName`` (newick ``format=1``); its id
    is the file's base name without extension and is printed. The sequences
    selected by ``get_sequences_for_pruning(leaves, prune_re)`` are handed to
    ``prune_tree`` and the tree is written to ``<out_dirName>/<tree id>``.

    Returns:
        0 when pruning fails (nothing is written), otherwise None.
    """
    fam_tree = PhyloTree(fam_tree_fileName, format=1)
    fam_tree_id = os.path.splitext(os.path.basename(fam_tree_fileName))[0]
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(fam_tree_id)

    leaf_arr = get_node_leaves(fam_tree)
    prune_seq_arr = get_sequences_for_pruning(leaf_arr, prune_re)
    try:
        prune_tree(prune_seq_arr, fam_tree)
    except Exception:
        # Best effort: a tree that cannot be pruned is skipped. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        return 0

    fam_tree.write(format=1, outfile=out_dirName + "/" + fam_tree_id)
예제 #13
0
    def root_tree(self):
        """Root the family tree on its outgroup and write the rooted tree.

        The tree ``<outpath>/<fam_id>/<fam_id><fasttree extension>`` is read
        (newick ``format=1``). If the sequences matching the outgroup regex
        are monophyletic, the tree is rooted on their common ancestor;
        otherwise it is rooted on the first sequence of the arranged outgroup
        list. The result is passed to ``self.write_rooted_tree``.
        """
        outgrp_regex_str, species_dict, ingroup_regex_str, outgroup_id_arr = read_profile_file(
            BasePath.species_profile_filename)
        fam_tree_filename = BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id + BasePath.fasttree_fileextension
        fam_tree = PhyloTree(fam_tree_filename, format=1)

        outgrp_re = re.compile(outgrp_regex_str)

        outgroup_sequence_list = self.get_regex_matching_sequence_list_from_node(
            fam_tree, outgrp_re)
        outgroup_monophyly_check = fam_tree.check_monophyly(
            values=outgroup_sequence_list, target_attr="name")
        # Element 0 of the check result is the monophyly flag.
        if outgroup_monophyly_check[0]:
            print("Outgroups are monophyletic")
            root_node = fam_tree.get_common_ancestor(outgroup_sequence_list)
        else:
            print("Outgroups are not monophyletic")

            outgroup_sequence_list_from_seqlist = self.get_outgroup_sequences_from_seqlist(
            )
            arranged_outgroup_sequence_list = self.arrange_outgroup_sequence_ids(
                outgroup_sequence_list_from_seqlist, outgroup_id_arr)
            root_node = arranged_outgroup_sequence_list[0]
            print("Rooting using sequence {0}".format(root_node))

        # Shared tail of both branches.
        fam_tree.set_outgroup(root_node)
        self.write_rooted_tree(fam_tree)
예제 #14
0
def yesMake(cut_list, gene, tree_file_name):
	"""Write a pruned copy of a tree as a new gene family and register it.

	The tree in ``tree_file_name`` is pruned to ``cut_list`` and written to
	``<sys.argv[1]>/<new name>/<new name>.3.fa.tre``; the new name is also
	appended to the master list file ``sys.argv[3]``.

	New name: if the third-from-last character of ``gene`` is "_", the last
	digit is incremented; otherwise "_10" is appended.

	Returns:
		str: the new gene family name.
	"""
	tree=PhyloTree(tree_file_name)
	tree.prune(cut_list, preserve_branch_length=True)
	# len() guard: names shorter than 3 characters used to raise IndexError.
	if len(gene) >= 3 and gene[-3] == "_":
		# NOTE(review): only the last digit is incremented, so "x_19"
		# becomes "x_110" rather than "x_20" — confirm this is intended.
		n = int(gene[-1])+1
		gene1 = gene[:-1]+str(n)
	else:
		gene1 = gene+"_10"
	directory = "{}/{}".format(sys.argv[1], gene1)
	new_file = "{}/{}.3.fa.tre".format(directory, gene1)
	# Create the directory without going through a shell, so paths with
	# spaces or shell metacharacters are handled safely.
	if not os.path.isdir(directory):
		os.makedirs(directory)
	tree.write(format=1, outfile=new_file)
	with open(sys.argv[3], "a") as master:
		master.write("{}\n".format(gene1))
	return(gene1)
예제 #15
0
파일: tree.py 프로젝트: ballardt/mti
def tax_node(name, rel_abund=0):
    """Create a standalone node for the taxonomic tree.

    Args:
        name: value stored in the node's ``name`` attribute.
        rel_abund: value stored in the node's ``rel_abund`` attribute
            (defaults to 0).

    Returns:
        The newly created ``PhyloTree`` node.
    """
    new_node = PhyloTree()
    new_node.rel_abund = rel_abund
    new_node.name = name
    return new_node
예제 #16
0
def get_order(tree):
    """Return internal node names ordered by decreasing distance to a leaf.

    ``tree`` is loaded as a ``PhyloTree`` (newick ``format=1``). For every
    non-leaf node the distance to the first leaf below it is recorded under
    the node's name (a later node with the same name overwrites an earlier
    one). The names are returned as one comma-separated string, largest
    distance first.
    """
    phylo = PhyloTree(tree, format=1)

    depth_by_name = {}
    for node in phylo.traverse():
        if not node.is_leaf():
            first_leaf = node.get_leaves()[0]
            depth_by_name[node.name] = node.get_distance(first_leaf)

    # Sort ascending, then reverse: this keeps the original tie ordering.
    ascending = sorted(depth_by_name.items(), key=lambda pair: pair[1])
    names = [name for name, _depth in reversed(ascending)]
    return ",".join(names)
 def get_ingroup_monoplyletic_clades(self):
     """Load the rooted family tree and hand it to ``process_family_tree``.

     The species profile supplies the outgroup regex and the species dict;
     the tree is read (newick ``format=1``) from
     ``<outpath>/<fam_id>/<fam_id><rooted fasttree extension>``.
     """
     profile = read_profile_file(BasePath.species_profile_filename)
     # The profile has four fields; only the first two are used here.
     outgrp_regex_str, species_dict, _ingroup_regex_str, _outgroup_id_arr = profile

     tree_basename = self.fam_id + BasePath.rooted_fasttree_fileextension
     fam_tree_filename = BasePath.outpath + "/" + self.fam_id + "/" + tree_basename

     outgroup_re = re.compile(outgrp_regex_str)
     fam_tree = PhyloTree(fam_tree_filename, format=1)
     self.process_family_tree(fam_tree, outgroup_re, species_dict)
예제 #18
0
def prune_main(gene, speciesList, cladeDict):
    """Drive the interactive pruning workflow for one gene family.

    Previous output files are erased, the family is classified with
    ``count_summarize`` and then handled by type:

    * "small"  -> ``small_family``
    * "single" -> ``single_copy``
    * anything else -> the gene tree is shown and the user chooses between
      splitting the family (``pre_prune``) and pruning it as a single
      family (``make_clade_groups`` followed by ``make_all_lists``).

    An empty answer at any prompt is treated as "no".
    """
    gene = str(gene)
    erase_previous_files(gene)
    copy_list = copies_in_group(gene)
    gene_type = count_summarize(gene, copy_list, speciesList, cladeDict)
    # Stays "n" unless the user explicitly agrees to continue below.
    choice = "n"
    if gene_type == "small":
        small_family(gene)
    elif gene_type == "single":
        single_copy(gene, copy_list, cladeDict)
    else:
        print("\nShowing the gene tree.")
        clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
        view_rooted_tree(clade_tree)
        choice2 = raw_input(
            "\nWould you like to split this gene family into multiple families? (y/n)"
        )
        # startswith() instead of [0]: no IndexError on an empty answer.
        if choice2.startswith("y"):
            pre_prune(gene)
        else:
            choice = raw_input(
                "\nContinue with pruning as single gene family? (y/n)")
    if choice.startswith("y"):
        make_clade_groups(gene, cladeDict, copy_list, speciesList)
        make_all_lists(gene, cladeDict)
예제 #19
0
def make_other_groups(gene, species_keep, species_list):
    """Save the genes that belong to no clade as the "noclade" group.

    After stray copies are removed interactively (``cut_stray_other``), the
    user is asked to cut further genes until the group holds at most one
    gene per species, where the species key is the first three characters
    of the gene name. The group is appended to
    ``<gene>/<gene>_noclade_prune.txt`` and its name to
    ``<gene>/<gene>_master_tree_list.txt``.

    Nothing is written when ``species_keep`` is empty.
    """
    # Not used further in this function; kept so that a missing or
    # unreadable tree file still fails here, before any prompting.
    full_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    ######Checking if the list is empty######
    if len(species_keep) == 0:
        print("\nThere are no other genes in this gene family.")
    else:
        ######Removing stray within-clade gene copies from the clade######
        cut_list = cut_stray_other(gene, species_keep, species_list)
        ######Making it a group######
        group_list = cut_list
        ######Checking that there is only one gene per species######
        check_set = {str(item[0:3]) for item in group_list}
        while len(group_list) != len(check_set):
            # Show the counts of the genes currently in the group (not the
            # list from before earlier cuts in this loop).
            view_counts(group_list, species_list)
            group_str = raw_input(
                "\nYou can only have one gene per species. Enter more genes to cut, separated by a space: "
            )
            # Compare against the individual names entered; a substring test
            # on the raw input would also drop genes whose name merely
            # occurs inside another entered name.
            genes_to_cut = set(group_str.split())
            group_list = [
                item for item in group_list if item not in genes_to_cut
            ]
            check_set = {str(item[0:3]) for item in group_list}
        print("\nThere are " + str(len(group_list)) +
              " genes in this group.\nGroup looks like:")
        print(group_list)
        print("\nMaking the group.")
        ######Saving gene group as a file######
        with open(gene + "/" + gene + "_noclade_prune.txt", "a") as group_file:
            for i in group_list:
                group_file.write(i + "\n")
        ######Saving name of group to a master list######
        with open(gene + "/" + gene + "_master_tree_list.txt", "a") as master:
            master.write(gene + "_noclade\n")
def main(args):
    """Detect edges for every orthologous gene of a clade and save them.

    Genomes and CDSs of ``args.clade_name`` are loaded, gene names are
    attached from the clade's ``.ortho`` table, and optionally a simulated
    segmentation (``args.split_fp``) and a phylogenetic tree
    (``args.tree_fp``) are loaded. ``detect_edges_all`` is run per gene name
    (sorted) and the collected records are written to ``args.out_fp`` as a
    tab-separated table.
    """
    genome_names = load_genome_names_by_clade_name(args.clade_name)
    LOGGER.info("loaded {} {} genomes".format(len(genome_names), args.clade_name))
    cdss = load_cdss_by_genome_names(genome_names)
    LOGGER.info("loaded {} cdss".format(len(cdss)))

    clade_dir = pathlib.Path(build_clade_filepath(args.clade_name))
    ortho_fp = clade_dir.joinpath("./ortho/{}.ortho".format(args.clade_name))
    ortho_df = pd.read_csv(ortho_fp, sep='\t')
    cdss = set_gene_name_to_cdss(cdss, ortho_fp)
    LOGGER.info("loaded orthology from {}".format(ortho_fp))

    if args.split_fp:
        cdss = set_split_to_cdss(cdss, args.split_fp)
        LOGGER.info("loaded simulated segmentation from {}".format(args.split_fp))

    # The tree is optional; downstream code receives None without it.
    tree = None
    if args.tree_fp:
        tree = PhyloTree(args.tree_fp, format=1)
        LOGGER.info("loaded phylogenetic tree from {}".format(args.tree_fp))

    cdsDAO = CdsDAO(cdss)
    gene_names = sorted(set(ortho_df["gene_name"]))
    LOGGER.info("found {} genes to search".format(len(gene_names)))

    records = []
    for origin_gene_name in gene_names:
        LOGGER.info("start {}".format(origin_gene_name))
        records.extend(
            detect_edges_all(origin_gene_name, args.score_method, cdsDAO, tree))

    column_names = ["x", "y", "score", "score_naive", "total", "found", "bls",
                    "top_offset", "top_relationship", "top_ratio"]
    out_df = pd.DataFrame(records, columns=column_names)
    out_df.to_csv(args.out_fp, sep='\t', index=False)
    LOGGER.info("saved results to {}".format(args.out_fp))
 def get_ingroup_monoplyletic_clades(self):
     """Load the RAxML family tree and hand it to ``process_family_tree``.

     The species profile supplies the outgroup regex and the species dict;
     the tree is read (newick ``format=1``) from
     ``<outpath>/<fam_id>/<raxml tree prefix><fam_id>``.
     """
     profile = read_profile_file(BasePath.species_profile_filename)
     # The profile has three fields; the ingroup regex is not used here.
     outgrp_regex_str, species_dict, _ingroup_regex_str = profile

     tree_basename = BasePath.raxml_tree_fileprefix + self.fam_id
     fam_tree_filename = BasePath.outpath + "/" + self.fam_id + "/" + tree_basename

     outgroup_re = re.compile(outgrp_regex_str)
     fam_tree = PhyloTree(fam_tree_filename, format=1)
     self.process_family_tree(fam_tree, outgroup_re, species_dict)
    def score_family_tree(self):
        """Score the rooted family tree and write the score to a file.

        The tree ``<rooted_famtrees_dir>/<fam_id>`` is read (newick
        ``format=1``). When ``check_if_tree_contains_outgroups`` returns 1
        the method returns 0 without writing anything. Otherwise the value
        returned by ``inspect_ingroup_pairs`` is written as
        ``"<fam_id> <value>"`` to
        ``<outpath>/<fam_id>/<fam_id><tree score extension>``.

        Returns:
            0 on the early exit described above, otherwise None.
        """
        outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(
            BasePath.species_profile_filename)
        fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
        fam_tree = PhyloTree(fam_tree_filename, format=1)

        outgrp_re = re.compile(outgrp_regex_str)
        ingrp_re = re.compile(ingroup_regex_str)

        flag = self.check_if_tree_contains_outgroups(fam_tree, outgrp_re)
        if flag == 1:
            return 0

        ingroup_matches_arr = self.get_ingroup_sequence_list(
            fam_tree, ingrp_re)
        ingroup_pair_arr = self.get_ingroup_sequence_pairs(ingroup_matches_arr)
        precision_val = self.inspect_ingroup_pairs(fam_tree, ingroup_pair_arr,
                                                   outgrp_re)

        tree_score_filename = BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id + \
            BasePath.tree_score_fileextension

        # Context manager so the handle is closed even if the write fails.
        with open(tree_score_filename, "w") as tree_score_file:
            tree_score_file.write(self.fam_id + " " + str(precision_val) + "\n")
예제 #23
0
    def __init__(self,
                 newick,
                 alg,
                 taxid,
                 tid,
                 actions,
                 style,
                 predraw_fn=None):
        """Wrap a tree together with the ids and settings used to draw it.

        The newick is first loaded as a ``PhyloTree`` linked to the fasta
        alignment ``alg``; if that raises ``NewickError`` it is loaded as a
        plain ``Tree`` (``format=1``) instead. ``predraw_fn``, when given, is
        called with the tree before ``actions`` and ``style`` are attached.
        Every node receives a preorder index in ``_nid``.
        """
        try:
            loaded = PhyloTree(newick=newick,
                               alignment=alg,
                               alg_format="fasta")
        except NewickError:
            loaded = Tree(newick, format=1)
        self.tree = loaded

        if predraw_fn:
            predraw_fn(self.tree)
        self.tree.actions = actions
        self.tree.tree_style = style

        self.taxid = taxid

        # Element ids derived from the tree id.
        self.treeid = tid
        self.mapid = "map_" + tid
        self.imgid = "img_" + tid
        self.boxid = 'box_' + tid

        # Initialize node internal IDs
        for position, node in enumerate(self.tree.traverse('preorder')):
            node._nid = position
예제 #24
0
 def get_ingroup_monoplyletic_clades(self):
     """Load the rooted family tree and hand it to ``process_family_tree``.

     Prints the configured clade species representation cutoff, reads the
     species profile for the outgroup regex and species dict, and loads the
     tree (newick ``format=1``) from ``<rooted_famtrees_dir>/<fam_id>``.
     """
     # Parenthesised single-argument print behaves the same on Python 2 and 3.
     print('Clade species representation cutoff {0}'.format(self.species_representaion_cutoff))
     outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(BasePath.species_profile_filename)
     fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
     outgroup_re = re.compile(outgrp_regex_str)
     fam_tree = PhyloTree(fam_tree_filename, format=1)
     self.process_family_tree(fam_tree, outgroup_re, species_dict)
예제 #25
0
def safe_phylo_read(filename) -> PhyloTree:
    """Return *filename* as a ``PhyloTree``, trying several newick formats.

    If *filename* already is a ``PhyloTree`` it is returned unchanged.
    Otherwise the constructor is tried with ``format=3``, then with its
    default format, then ``format=1`` and finally ``format=5``.

    Raises:
        NewickError: when the last attempt (``format=5``) fails to parse; a
            hint is printed to stderr before the error is re-raised.
    """
    if isinstance(filename, PhyloTree):
        return filename

    # Earlier attempts are best-effort: any ordinary failure moves on to the
    # next format.  ``except Exception`` (rather than a bare ``except``) lets
    # KeyboardInterrupt and SystemExit propagate instead of being swallowed.
    for kwargs in ({"format": 3}, {}, {"format": 1}):
        try:
            return PhyloTree(filename, **kwargs)
        except Exception:
            continue

    try:
        return PhyloTree(filename, format=5)
    except NewickError:
        print(f"Are you sure tree {filename} exists?", file=sys.stderr, flush=True)
        # Bare ``raise`` keeps the original traceback intact.
        raise
예제 #26
0
    def test_species(self):
        """
        Check that species-based and NCBI-taxonomy-based patterns match the root.

        The first half builds a small tree whose species is derived from the
        leaf names and matches a pattern that uses ``species(@)``.  The second
        half annotates a tree of NCBI taxids and matches patterns that read
        ``@.sci_name``, ``@.lineage``, ``@.rank``, ``@.taxid`` and
        ``@.named_lineage``.
        """

        # test node.species

        species_tree = PhyloTree(
            """(Felis_catus_1:1,
                (Homo_sapiens_1:1, Pan_troglodytes_1:1),
                Saccharomyces_cerevisiae_1:1);""",
            format=1)
        # Species is the second "_"-separated token of the node name, or ''
        # for names without an underscore.
        species_tree.set_species_naming_function(lambda n: n.name.split("_")[1] if "_" in n.name else '')

        pattern0 = """('',
                       (' len(set(["sapiens","pygmaeus"]) & species(@))>0',
                       Pan_troglodytes_1)
                       );"""

        pattern0 = TreePattern(pattern0)


        root = species_tree.get_tree_root()
        self.assertEqual(list(pattern0.find_match(species_tree)), [root])

        # test ncbi taxonomy

        # NOTE(review): this instance is not used below; presumably it is kept
        # so the local taxonomy database is set up before annotation — confirm.
        ncbi = NCBITaxa()
        taxonomy_tree = PhyloTree("((9598, 9606), 10090);", sp_naming_function=lambda name: name)
        taxonomy_tree.annotate_ncbi_taxa()
        root = taxonomy_tree.get_tree_root()

        pattern1 = """ '  @.sci_name == "Euarchontoglires" ';"""
        # The scientific name below was garbled to "H**o sapiens" in this copy,
        # which can never equal a node's sci_name; restored to "Homo sapiens".
        pattern2 = """
          (( '@.sci_name=="Homo sapiens"' , '9526 in @.lineage ' )' @.rank=="subfamily" and @.taxid == 207598 ')
          '  @.sci_name == "Euarchontoglires" and "cellular organisms" in @.named_lineage';
          """

        pattern1 = TreePattern(pattern1)
        pattern2 = TreePattern(pattern2)

        match1 = pattern1.find_match(taxonomy_tree)
        match2 = pattern2.find_match(taxonomy_tree)

        self.assertEqual(list(match1), [root])
        self.assertEqual(list(match2), [root])
예제 #27
0
def open_tree(tree_file_path):
    """Load a tree from a 'contree' or 'treefile' path.

    A path containing 'contree' is loaded as is.  A path containing
    'treefile' is loaded with ``format=1`` and every internal node gets its
    ``support`` from the second '/'-separated field of the node name.  Any
    other path terminates the program with an error message.
    """
    if 'contree' in tree_file_path:
        return PhyloTree(tree_file_path, sp_naming_function=None)

    if 'treefile' not in tree_file_path:
        sys.exit('Error: tree format not recognised')

    # Node names hold "SH-aLRT support (%) / ultrafast bootstrap support (%)".
    loaded = PhyloTree(tree_file_path, sp_naming_function=None, format=1)
    for descendant in loaded.iter_descendants():
        if descendant.is_leaf():
            continue
        fields = descendant.name.split('/')
        try:
            bootstrap = fields[1]
        except IndexError:
            # No support values when sequences were identical: set support
            # artificially to 100.0.
            descendant.support = 100.0
        else:
            descendant.support = float(bootstrap)
    return loaded
예제 #28
0
    def test_shortcut_functions(self):
        """Match patterns built from the shortcut functions against a gene tree.

        Exercises ``n_duplications``, ``contains_leaves`` and
        ``n_speciations`` and checks both the number of hits and which nodes
        come first.
        """
        newick = """((((Human_1, Chimp_1), (Human_2, (Chimp_2, Chimp_3))),
            ((Fish_1, (Human_3, Fish_3)), Yeast_2)), Yeast_1);"""
        tree = PhyloTree(newick)
        tree.set_species_naming_function(lambda node: node.name.split("_")[0])
        tree.get_descendant_evol_events()  # DDDSSSDDS
        top = tree.get_tree_root()

        # First expression: two consecutive nodes that both carry duplications.
        expressions = (
            """('n_duplications(@) > 0')'n_duplications(@) > 0 '; """,
            """( 'contains_leaves(@, ["Chimp_2", "Chimp_3"])'); """,
            """'n_speciations(@) > 3 '; """,
        )
        # All patterns are compiled before any of them is matched.
        compiled = [TreePattern(expression) for expression in expressions]
        duplication_hits, leaf_hits, speciation_hits = [
            list(pattern.find_match(tree, maxhits=None)) for pattern in compiled
        ]

        self.assertEqual(len(duplication_hits), 5)

        self.assertEqual(len(leaf_hits), 4)
        self.assertEqual(leaf_hits[0], top)

        self.assertEqual(len(speciation_hits), 2)
        self.assertEqual(speciation_hits[0], top)
        self.assertEqual(speciation_hits[1], top.children[0])
예제 #29
0
def extract_ortho_from_trees(filename):
    """Derive orthologous pairs and ortho@para groups from a pickled tree dict.

    Loads ``dir_step2/dict_trees/<filename>`` (a mapping of reference leaf
    name to newick), and for every tree collects the leaves returned by
    ``custom_species_overlap`` for the reference leaf (the orthologs) and the
    remaining leaves (the paralogs).  Two pickles are written, both named
    *filename*: the ortho@para strings under ``path_tmp`` and the encoded
    pairs under ``path_tmp_ortho``.

    Returns the constant list ``[0, 0]``.
    """
    encoded_pairs = []
    ortho_para_groups = []

    trees_by_leaf = utils.get_pickle(Path('dir_step2') / 'dict_trees' / filename)

    for ref_leaf, newick in trees_by_leaf.items():
        tree = PhyloTree(newick)
        leaf_names = {leaf.name for leaf in tree}

        # Orthologs come from the node selected for the reference leaf; fall
        # back to the reference leaf alone when nothing was selected.
        ref_node = tree.search_nodes(name=ref_leaf)[0]
        ortho = custom_species_overlap(ref_node)
        if len(ortho) == 0:
            ortho.add(ref_leaf)

        para = leaf_names - ortho

        # Each sorted pair is encoded as int("<len(first)><first><second>");
        # the concatenation is passed to int(), so it must consist of digits.
        for first, second in itertools.combinations(sorted(ortho), 2):
            encoded_pairs.append(int(str(len(first)) + first + second))

        # Record "ortho@para" only when a paralogous group exists.
        if para:
            ortho_para_groups.append(' '.join(ortho) + '@' + ' '.join(para))

    utils.save_pickle(path_tmp / filename, ortho_para_groups)
    utils.save_pickle(path_tmp_ortho / filename, encoded_pairs)

    return [0,0]
예제 #30
0
def load_json(fp):
    """Build a tree of taxa from the 'ubiome_bacteriacounts' rows of a JSON file.

    Every row is normalised, turned into a ``PhyloTree`` node carrying the
    row's fields as features, and indexed by its 'taxon' value.  The node
    stored under the smallest 'taxon' key is treated as the root; its
    ``count_norm`` is the denominator for each node's ``count_pct`` and it
    receives ``alpha`` computed from all normalised counts.  Nodes are then
    attached to the node indexed under their ``parent`` feature, when one
    exists.

    Returns the root node.
    """
    payload = json.loads(clean_json(fp))
    nodes_by_taxon = {}
    norm_counts = []

    for row in payload['ubiome_bacteriacounts']:
        normalise_row(row)
        norm_counts.append(row['count_norm'])
        node = PhyloTree()
        node.name = row['tax_name']
        node.add_features(**row)
        nodes_by_taxon[row['taxon']] = node

    root = nodes_by_taxon[min(nodes_by_taxon)]
    total = root.count_norm
    root.alpha = alpha_function(norm_counts)

    for node in nodes_by_taxon.values():
        percentage = float(node.count_norm) / total * 100
        node.add_feature('count_pct', percentage)
        parent_node = nodes_by_taxon.get(node.parent)
        if parent_node is not None:
            parent_node.add_child(node)

    summary = 'loaded {} into tree depth {} diversity {:.2f}'.format(
        len(nodes_by_taxon), len(root), root.alpha)
    print(summary)
    return root
예제 #31
0
class DrawTree(object):
    """Command-line helper that renders a phylogenetic tree to an image file.

    Arguments are read from the command line on construction: a positional
    ``tree_file``, an optional ``-f/--img-format`` (default ``png``) and a
    required ``-r/--ref`` reference file.
    """

    def __init__(self):
        """Parse the command line and load the tree named by ``tree_file``."""
        parser = argparse.ArgumentParser(description="Draw phylogenetic tree")
        parser.add_argument('tree_file', action='store', type=str)
        parser.add_argument('-f',
                            '--img-format',
                            dest='img_format',
                            action='store',
                            required=False,
                            default="png",
                            type=str)
        # argparse.FileType('r') opens the path for reading on both Python 2
        # and Python 3; the ``file`` builtin used before exists only on
        # Python 2.
        parser.add_argument('-r',
                            '--ref',
                            dest='ref_file',
                            action='store',
                            required=True,
                            type=argparse.FileType('r'))
        self.args = parser.parse_args()
        # Image base name: everything before the first '.' of the tree path.
        self.tree_bname = self.args.tree_file.split('.')[0]
        self.tree = PhyloTree(self.args.tree_file)
        self.img_format = self.args.img_format

    def get_ref(self):
        """Return a dict mapping column 1 to column 2 of the reference file.

        Lines are split on tabs; lines with fewer than two fields are
        ignored and fields beyond the second are dropped.  The resulting
        mapping is also printed.
        """
        code2name = {}
        for line in self.args.ref_file.read().splitlines():
            data = tuple(line.split('\t'))[:2]
            if len(data) == 2:
                code2name[data[0]] = data[1]
        print(code2name)
        return code2name

    def draw(self, Ts):
        """Render the tree with tree style *Ts* and try to open the image.

        The image is written to ``<tree_bname>.<img_format>``.
        """
        img_fname = '.'.join([self.tree_bname, self.img_format])
        self.tree.render(img_fname, tree_style=Ts, w=400, units="mm")
        try:
            subprocess.call(['gpicview', img_fname])
        except OSError:
            # Opening the viewer is best-effort: a missing or unlaunchable
            # 'gpicview' must not fail the rendering that already succeeded.
            pass
예제 #32
0
def make_species_list(path):
    """Return the leaf labels of ``<path>.3.fa.tre`` with and without digits.

    Returns:
        A tuple ``(labels, labels_without_digits)`` of two lists.  Each label
        is the string form of a node yielded by iterating the tree, with any
        leading newline and '-' characters stripped.
    """
    t = PhyloTree("{}.3.fa.tre".format(path))
    # lstrip() removes any leading run of the characters '\n' and '-', i.e.
    # the prefix that the string form of a node starts with.
    labels = [str(leaf).lstrip("\n--") for leaf in t]
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    labels_without_digits = [re.sub(r"\d", "", label) for label in labels]
    return (labels, labels_without_digits)
def process_family_tree(fam_tree_fileName, profile_fileName):
	"""Run the duplication-event analysis on one family tree file.

	Reads the profile file, loads the tree with ``format=1`` and, when
	``detect_multifurcation`` returns a truthy value for it, collects the
	ingroup monophyletic clade nodes and passes them to
	``get_SO_duplication_events``.

	Returns 0 when ``detect_multifurcation`` is falsy, otherwise None.
	"""
	outgroup_pattern, species_lookup = read_profile_file(profile_fileName)
	outgroup_matcher = re.compile(outgroup_pattern)

	tree = PhyloTree(fam_tree_fileName, format=1)
	if detect_multifurcation(tree):
		clade_nodes = get_ingroup_monophyletic_clade_nodes(tree, outgroup_matcher)
		get_SO_duplication_events(tree, clade_nodes, species_lookup, fam_tree_fileName)
		return None
	return 0
예제 #34
0
def ultrametricer(node_order, tree_file):
    """Rewrite the branch lengths of a tree from a given node order.

    The first line of *tree_file* is parsed as a newick tree (``format=1``).
    Each name in *node_order* is assigned its 1-based position, every leaf
    is assigned the number of leaves, and the unnamed root is assigned 0.
    Each branch length is then set to the difference between the value of a
    node and the value of its parent.

    Returns:
        The modified tree serialised as newick with ``format=1``.

    Raises:
        KeyError: if a node name met on the way to the root is neither in
            *node_order*, a leaf name, nor the empty string.
    """
    with open(tree_file) as f:
        # next(f) works on Python 2 and 3; f.next() exists only on Python 2.
        mytree = PhyloTree(next(f).strip(), format=1)

    leaves = mytree.get_leaves()

    # Every leaf sits at this value, so all root-to-leaf paths add up to it.
    total = len(leaves)

    # Expected value of each named node: its 1-based position in node_order.
    distances = {}
    for position, name in enumerate(node_order, 1):
        distances[name] = position

    for leaf in leaves:
        distances[leaf.name] = total

    # The root has no name.
    distances[""] = 0

    for node in leaves:
        # Walk from the leaf up to the root, setting each branch length.
        while node.up:
            node.dist = distances[node.name] - distances[node.up.name]
            node = node.up

    return mytree.write(format=1)
예제 #35
0
def test():
    """Run two tree patterns with custom functions and print the hit counts.

    The first pattern looks for a node satisfying ``is_duplication`` with one
    child; the second combines ``contains``, ``num_species`` and
    ``num_leaves`` on two sibling nodes.
    """
    t = PhyloTree(
        "((((Human_1, Chimp_1), (Human_2, (Chimp_2, Chimp_3))), ((Fish_1, (Human_3, Fish_3)), Yeast_2)), Yeast_1);")
    t.set_species_naming_function(lambda node: node.name.split("_")[0])

    pattern = """('')' is_duplication(@) '; """
    # The keys are the names under which the callables are handed to
    # TreePattern.  The speciation helper used to be registered as
    # 'is speciation' (with a space), which is not a usable function name.
    pattern = TreePattern(pattern, format=8, quoted_node_names=True,
                          functions={'contains_species': contains_species,
                                     'is_duplication': is_duplication,
                                     'is_speciation': is_speciation,
                                     })
    #should return 5 results
    print(len(list(pattern.find_match(t, None, maxhits=None))))

    pattern1 = """( 'contains(@, ("Chimp_2", "Chimp_3"))' , 'num_species(@, 2) and num_leaves(@,2)' ); """
    tp1 = TreePattern(pattern1, format=8, quoted_node_names=True,
                      functions={'contains': contains,
                                 "num_species": number_of_species,
                                 "num_leaves": number_of_leaves})
    #should return 1 result
    print(len(list(tp1.find_match(t, None))))
예제 #36
0
    def test_cached_attributes(self):
        """Match patterns that use ``leaves(@)``, ``species(@)`` and ``n_leaves(@)``.

        Checks the number of hits for each pattern and the identity of
        selected hits.
        """
        leaf_pattern = TreePattern("""  '"Gallus_gallus_1" in leaves(@)' ;""")
        clade_pattern = TreePattern(
            """( '"Hom" in species(@) and n_leaves(@) > 2')'"Pan_troglodytes_1" in leaves(@)';""")

        newick = "((((Anolis_carolinensis_1:1, Gallus_gallus_1:1), (Felis_catus_1:1, (Homo_sapiens_1:1, Pan_troglodytes_1:1)primates)primates), ((Danio_rerio_1:1, (Xenopus_laevis_1:1, Anolis_carolinensis_1:1)), Saccharomyces_cerevisiae_2:1)), Saccharomyces_cerevisiae_1:1);"
        tree = PhyloTree(newick, format=1)
        top = tree.get_tree_root()

        leaf_hits = list(leaf_pattern.find_match(tree, maxhits=None))
        self.assertEqual(len(leaf_hits), 5)  # the leaf itself is returned too
        self.assertEqual(leaf_hits[0], top)
        self.assertEqual(leaf_hits[4].name, "Gallus_gallus_1")

        clade_hits = list(clade_pattern.find_match(tree, maxhits=None))
        self.assertEqual(len(clade_hits), 3)
        self.assertEqual(clade_hits[0], top)
        # Walk down the third hit: second child twice, then the first child.
        innermost = clade_hits[2].children[1].children[1]
        self.assertEqual(innermost.children[0].name, "Homo_sapiens_1")
예제 #37
0
def LoadTrees(treeFile, dlm):
    """Reads and stores phylogenetic trees from a file

    Parameters
    ------
    treeFile: str, path to a file of newick trees, 1 per line
    dlm: str, delimiter; the part of a node name before its first
        occurrence is used as the species name

    Returns
    ------
    treelist: list, one ete3 PhyloTree per parsed line

    """
    print("loading trees...")
    treelist = []
    with open(treeFile, 'r') as newick:
        for line in newick:
            # Blank lines hold no tree, so they are skipped rather than
            # handed to the parser.  Lines starting with "NA" are skipped as
            # well (presumably placeholders for missing trees).
            if not line.strip() or line.startswith("NA"):
                continue
            t = PhyloTree(line)
            t.set_species_naming_function(lambda node: node.name.split(dlm)[0])
            treelist.append(t)
    return treelist
예제 #38
0
def yes_choice(tree_file_name, gene, algae_choice):
	"""Interactively split one gene tree into two genes and update the to-do file.

	Loads the tree, roots it at its midpoint outgroup, asks the user for an
	optional algae clade, an optional outlier group and one monophyletic
	family, then calls ``yesMake`` twice with complementary cut lists.  The
	to-do file named by ``sys.argv[2]`` is rewritten with *gene* removed and
	the two names returned by ``yesMake`` appended.

	*algae_choice* is the user's earlier answer; only its first character is
	inspected.  Uses ``raw_input``, i.e. Python 2.
	"""
	t=PhyloTree(tree_file_name)
	# Root the tree at the midpoint outgroup before asking for clades.
	R = t.get_midpoint_outgroup()
	t.set_outgroup(R)
	gene_names = t.get_leaf_names()
	# NOTE(review): indexing [0] raises IndexError on an empty answer.
	if algae_choice[0] == "y":
		print("\nFirst, let's define the algae clade.")
		algae_list = clade_to_tree(t)
	else:
		algae_list = []
	outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
	if outlier_choice[0] == "y":
		print("\nLet's define the outlier group. \nFirst you can add any spceices that are in a monophyletic clade")
		outlier_list = clade_to_tree(t)
		# Any answer other than "n" is treated as a space-separated gene list.
		other_copies = raw_input("If there are other genes in the outlier group, enter them here, separated by a space, or else enter n.")
		if other_copies != "n":
			other_list = other_copies.split(" ")
			outlier_list = outlier_list + other_list
	else:
		outlier_list=[]
	print("\nSelect one monophyletic family to define. \nYou will have a later chance to split this family again if needed.")
	group_list = clade_to_tree(t)
	###tree1
	# First cut list: every leaf outside the chosen family, plus the algae
	# and outlier lists.
	cut_list = [i for i in gene_names if i not in group_list]
	cut_list = cut_list + algae_list + outlier_list
	gene1 = yesMake(cut_list, gene, tree_file_name)
	###tree2
	# Second cut list: every leaf not in the first cut list, plus the algae
	# and outlier lists.
	cut_list1 = [i for i in gene_names if i not in cut_list]
	cut_list1 = cut_list1 + algae_list + outlier_list
	gene2 = yesMake(cut_list1, gene1, tree_file_name)
	# Replace *gene* in the to-do file with the two new genes.
	with open(sys.argv[2], "r") as f:
		todo_list=[line.rstrip() for line in f]
	todo_list=[i for i in todo_list if i != gene]
	todo_list.append(gene1)
	todo_list.append(gene2)
	with open(sys.argv[2], "w") as todo:
		for i in todo_list:
			todo.write(i+"\n")
예제 #39
0
from ete3 import PhyloTree

# Reads a phylogenetic tree (using default species name encoding)
t = PhyloTree("(((Hsa_001,Ptr_001),(Cfa_001,Mms_001)),(Dme_001,Dme_002));")
#                              /-Hsa_001
#                    /--------|
#                   |          \-Ptr_001
#          /--------|
#         |         |          /-Cfa_001
#         |          \--------|
# ---------|                    \-Mms_001
#         |
#         |          /-Dme_001
#          \--------|
#                    \-Dme_002
#
# Prints current leaf names and species codes.
# (Python 2 print statements; the expected output is listed in the comments
# that follow.)
print "Deafult mode:"
for n in t.get_leaves():
    print "node:", n.name, "Species name:", n.species
# node: Dme_001 Species name: Dme
# node: Dme_002 Species name: Dme
# node: Hsa_001 Species name: Hsa
# node: Ptr_001 Species name: Ptr
# node: Cfa_001 Species name: Cfa
# node: Mms_001 Species name: Mms
#
# We can also use our own leaf name parsing function to obtain species
# names. All we need to do is create a python function that takes
# node's name as argument and return its corresponding species name.
def get_species_name(node_name_string):
 MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
"""
iphylip_txt = """
 4 76
      seqA   MAEIPDETIQ QFMALT---H NIAVQYLSEF GDLNEALNSY YASQTDDIKD RREEAHQFMA
      seqB   MAEIPDATIQ QFMALTNVSH NIAVQY--EF GDLNEALNSY YAYQTDDQKD RREEAHQFMA
      seqC   MAEIPDATIQ ---ALTNVSH NIAVQYLSEF GDLNEALNSY YASQTDDQPD RREEAHQFMA
      seqD   MAEAPDETIQ QFMALTNVSH NIAVQYLSEF GDLNEAL--- ---------- -REEAHQ---
             LTNVSHQFMA LTNVSH
             LTNVSH---- ------
             LTNVSH---- ------
             -------FMA LTNVSH
"""
# Load a tree and link it to an alignment. As usual, 'alignment' can
# be the path to a file or data in text format.
t = PhyloTree("(((seqA,seqB),seqC),seqD);", alignment=fasta_txt, alg_format="fasta")

#We can now access the sequence of every leaf node
print "These are the nodes and its sequences:"
for leaf in t.iter_leaves():
    print leaf.name, leaf.sequence
#seqD MAEAPDETIQQFMALTNVSHNIAVQYLSEFGDLNEAL--------------REEAH
#seqC MAEIPDATIQ---ALTNVSHNIAVQYLSEFGDLNEALNSYYASQTDDQPDRREEAH
#seqA MAEIPDETIQQFMALT---HNIAVQYLSEFGDLNEALNSYYASQTDDIKDRREEAH
#seqB MAEIPDATIQQFMALTNVSHNIAVQY--EFGDLNEALNSYYAYQTDDQKDRREEAH
#
# The associated alignment can be changed at any time
t.link_to_alignment(alignment=iphylip_txt, alg_format="iphylip")
# Let's check that sequences have changed
print "These are the nodes and its re-linked sequences:"
for leaf in t.iter_leaves():
예제 #41
0
# divergence from the taxonomic tree indicates important evolutionary events like duplications or losses.



#load a tree and associated alignment
#treefile = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/restricted/hapAndeff/strcutres_and_tcoffeeset_aln_struct.phy_phyml_tree.txtlabels.txt'

folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*labels.txt'
#folder = '/home/cactuskid/Dropbox/IIB/mergeLineages/yuyo/*/*/*labels.txt'

treefiles = glob.glob(folder)
for treefile in treefiles:
	print treefile
	colorSepcies = False
	#alg = '/home/cactuskid/Dropbox/IIB/mergeLineages/phylogeny/hybrid/merged_curate_aln.fasta'
	t = PhyloTree( treefile, sp_naming_function=None) #, alignment=alg, alg_format="fasta")
	# Calculate the midpoint node
	R = t.get_midpoint_outgroup()
	# and set it as tree outgroup
	t.set_outgroup(R)

	def save_obj(obj, name):
	    """Pickle *obj* into ``<name>.pkl`` with the highest pickle protocol."""
	    target = name + '.pkl'
	    with open(target, 'wb') as handle:
	        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

	def load_obj(name ):
	    """Return the object unpickled from ``<name>.pkl``."""
	    # Pickles are binary data: the file must be opened in 'rb' mode.
	    # Text mode fails on Python 3 and can corrupt the data on platforms
	    # that translate line endings.
	    with open( name + '.pkl', 'rb') as f:
	        return pickle.load(f)

	genedict = load_obj('genedict')
	speciescolors = load_obj('colors')
예제 #42
0
    '''
    layout for CodemlTree
    '''
    if hasattr(node, "collapsed"):
        if node.collapsed == 1:
            node.img_style["draw_descendants"]= False
    if node.is_leaf():
        if hasattr (node, "sequence"):
            seqface =  MySequenceFace(node.sequence, "nt",
                                      fsize=10,
                                      col_w=11, interactive=True)
            faces.add_face_to_node(seqface, node, 1, aligned=True)


if __name__ == "__main__":
    tree = PhyloTree('(Orangutan,Human,Chimp);')
    tree.link_to_alignment("""
                           >Chimp
                           HARWLNEKLRCELRTLKKLGLDGYKAVSQYVKGRA
                           >Orangutan
                           DARWINEKLRCVSRTLKKLGLDGYKGVSQYVKGRP
                           >Human
                           DARWHNVKLRCELRTLKKLGLVGFKAVSQFVIRRA
                           """)
    nt_sequences = {"Human"    : "GACGCACGGTGGCACAACGTAAAATTAAGATGTGAATTGAGAACTCTGAAAAAATTGGGACTGGTCGGCTTCAAGGCAGTAAGTCAATTCGTAATACGTCGTGCG",
                    "Chimp"    : "CACGCCCGATGGCTCAACGAAAAGTTAAGATGCGAATTGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGCAGTAAGTCAGTACGTTAAAGGTCGTGCG",
                    "Orangutan": "GATGCACGCTGGATCAACGAAAAGTTAAGATGCGTATCGAGAACTCTGAAAAAATTGGGACTGGACGGCTACAAGGGAGTAAGTCAATACGTTAAAGGTCGTCCG"
                }
    for l in nt_sequences:
        (tree & l).nt_sequence = nt_sequences[l]
    tree.dist = 0
예제 #43
0
from ete3 import PhyloTree, Tree, TreeStyle
from ete3 import add_face_to_node, TextFace, AttrFace, SequenceFace

# Two-leaf newick tree and a FASTA alignment whose record names ("0", "1")
# match the leaf names.
nw = '(0:0, 1:20);'
fa = """
>0
AA..
>1
CAA.
"""

# Load the tree with newick format 1 and link it to the alignment.
t = PhyloTree(nw, alignment=fa, alg_format='fasta', format=1)
# Tree style: branch lengths, leaf names and scale are switched off; guiding
# lines and table layout for aligned faces are switched on.
ts = TreeStyle()
ts.show_branch_length = False
ts.show_leaf_name = False
ts.draw_guiding_lines = True
ts.draw_aligned_faces_as_table = True
ts.show_scale = False

def my_layout(node):
    #
    # add names to all nodes (not just to leaf nodes)
    # ete3/test/test_treeview/face_rotation.py
    F = TextFace(node.name, tight_text=True)
    add_face_to_node(F, node, column=0, position="branch-right")
    #
    # add branch lengths
    # ete3/treeview/qt4_render.py
    if not node.is_root():
        bl_face = AttrFace("dist", fsize=8, ftype="Arial",
                fgcolor="black", formatter="%0.3g")
예제 #44
0
파일: pdf_tree.py 프로젝트: nickloman/ebov
def main(args):
	"""Render the tree named by ``args.tree`` to ``args.output``.

	The tree is optionally linked to a FASTA alignment, rooted on
	'EM_079422' and ladderized.  Depending on the flags in *args*, header
	columns, a ruler of positions, a prefecture colour legend and a
	half-circle layout are added to the tree style before rendering at a
	width of 1024.
	"""
	if args.alignment:
		t = PhyloTree(args.tree, alignment=args.alignment, alg_format='fasta')
	else:
		t = PhyloTree(args.tree)

	if args.highlight_new:
		# The returned runs are not used anywhere else in this function.
		runs = read_runs(args.highlight_new)

	t.set_outgroup('EM_079422')
	t.ladderize()

	ts = TreeStyle()
	ts.show_leaf_name = False
	ts.show_branch_support = False
	ts.layout_fn = layout

	# Two node styles are applied to the root one after the other, first the
	# horizontal line width and then the vertical one.
	for style_key, width in (("hz_line_width", 8), ("vt_line_width", 4)):
		line_style = NodeStyle()
		line_style[style_key] = width
		t.set_style(line_style)

	# Header titles occupy aligned columns 1 to 5.
	if not args.hide_annotations:
		titles = ('Sample identifier', 'Prefecture', 'Sous-prefecture',
			'Village', 'Sample received')
		for column_index, title in enumerate(titles, 1):
			title_face = MyTextFace(title, fstyle='Bold', fsize=8, tight_text=False)
			ts.aligned_header.add_face(title_face, column = column_index)

	if args.positions:
		positions = read_positions(args.positions)
		ruler = RulerFace(
			positions,
			col_width=11,
			height=0, # set to 0 if dont want to use values
			kind="stick",
			hlines = [0],
			hlines_col = ["white"], # trick to hide hz line
		)
		ts.aligned_header.add_face(ruler, 6)

	# Legend: one coloured circle plus label per prefecture, sorted by name.
	if args.legend:
		colour_by_prefecture = {}
		for sample in samples.values():
			colour_by_prefecture[sample['prefec']] = sample['prefec__colour']
		for prefecture in sorted(colour_by_prefecture):
			ts.legend.add_face(CircleFace(4, colour_by_prefecture[prefecture]), column=0)
			ts.legend.add_face(MyTextFace(prefecture, fsize=6, tight_text=False), column=1)
		ts.legend_position=1

	if args.circular:
		ts.mode = "c"
		ts.arc_start = -180 # 0 degrees = 3 o'clock
		ts.arc_span = 180

	t.render(args.output, tree_style=ts, w=1024)
예제 #45
0
from ete3 import PhyloTree

# Loads a gene tree and its corresponding species tree. Note that
# species names in sptree are the first 3 letters of the leaf names in
# genetree.
gene_tree_nw = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
species_tree_nw = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"
genetree = PhyloTree(gene_tree_nw)
sptree = PhyloTree(species_tree_nw)
# Python 2 print statement; the ASCII rendering is shown in the comment below.
print genetree
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# Let's reconcile our genetree with the species tree
                        """)

    parser.add_argument("--colorbar_save", dest="colorbar_save",
                        type=str,
                        help="""
                        save path of Colorbar for the heatmap with matplotlib
                        """)        
    

    args = parser.parse_args()
    infile = args.infile
    mode = args.mode
    newick = args.newick

    if newick:
        t = PhyloTree(args.newick)      
        species2taxid = dict([ line.split()[0], line.strip().split()[1] ] for line in open(infile))
        taxids = set(species2taxid.values())
    else:
        ncbi = NCBITaxa()
        taxids = set([ line.strip() for line in open(infile) ])


    if args.taxoncolors:
        taxon2color = dict([int(line.split()[0]), line.split()[1]] for line in open(args.taxoncolors))

    tNCBI = ncbi.get_topology(taxids, intermediate_nodes=True)
    tNCBI = tNCBI.search_nodes(name="2759")[0]
    ncbi.annotate_tree(tNCBI, taxid_attr="name")
    tax2node = dict([node.taxid, node] for node in tNCBI.traverse())
def pre_prune(gene):
	"""Interactively split the tree of *gene* into monophyletic sub-trees.

	The starting tree ``<gene>/<gene>.3.fa.tre`` is copied to a new directory
	``<gene>_all100``.  For every group in the work list the user is shown
	the tree and asked whether to split off a monophyletic copy; each split
	writes a new ``<gene>_all<m>`` directory with the pruned tree and
	rewrites the remaining tree in place.  Finally every group name in the
	work list is appended to the file named by ``sys.argv[1]``.

	Uses ``raw_input``, i.e. Python 2.  Answers are tested through their
	first character, so an empty answer raises IndexError.
	"""
	full_tree=PhyloTree(gene+"/"+gene+".3.fa.tre")
	gene_names=full_tree.get_leaf_names()
	# m is the numeric suffix of the next group name.
	m=100
	start_gene="{}_all{}".format(gene,str(m))
	# NOTE(review): the directory name is interpolated into a shell command;
	# a gene name with shell metacharacters would be interpreted by the shell.
	os.system("mkdir {}".format(start_gene))
	full_tree.write(format=1, outfile="{}/{}.3.fa.tre".format(start_gene,start_gene))
	m=m+1
	# Work list of group names; it grows inside the loop below, so groups
	# created by a split are visited as well.
	l=[start_gene]
	for item in l:
		full_tree=PhyloTree("{}/{}.3.fa.tre".format(item,item))
		view_rooted_tree(full_tree)
		print("Tree for {}".format(item))
		c=raw_input("Split off a monophyletic gene copy? (y/n)")
		if c[0] == "y":
			algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
			outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")
		# Keep splitting the current group until the user answers otherwise.
		while c[0]=="y":
			if algae_choice[0] == "y":
				print("\nFirst, let's define the algae clade.")
				algae_list = clade_to_tree(full_tree)
			else:
				algae_list = []
			if outlier_choice[0] == "y":
				print("\nLet's define the outlier group. ")
				outlier_list = []
				out_choice = raw_input("\nIs there a monophyletic clade in the outlier group? (y/n)")
				while out_choice[0] == "y":
					outlier_list2 = clade_to_tree(full_tree)
					outlier_list = outlier_list + outlier_list2
					out_choice = raw_input("\nIs there another monopyletic clade to add to the outlier group? (y/n)")
				other_choice = raw_input("Are there additional genes in the outlier group? (y/n)")
				while other_choice[0] == "y":
					other_copies = raw_input("\nEnter genes to include, separated by a space. Enter only up to ten genes at a time.")
					# NOTE(review): neither str.split nor list concatenation
					# raises ValueError, so the except branch looks
					# unreachable and the entered names are not checked
					# against the tree here.
					try:
						other_list = other_copies.split(" ")
						outlier_list = outlier_list + other_list
					except ValueError:
						other_choice = raw_input("\nAt least one gene is not found on the tree. Reenter genes? y/n")
					other_choice = raw_input("Are there more genes to enter? (y/n)")
			else:
				outlier_list=[]
			# Name of the group that would be split off in this round.
			# NOTE(review): it is appended to the work list before it is known
			# whether the split happens; in the "all copies" branch below no
			# directory or tree is written for it and m is not advanced.
			b="{}_all{}".format(gene, str(m))
			l.append(b)
			# Reload the current group's tree and root it at the midpoint.
			tree1=PhyloTree("{}/{}.3.fa.tre".format(item,item))
			R=tree1.get_midpoint_outgroup()
			tree1.set_outgroup(R)
			print("\nFor the monophyletic gene copy:")
			group_list=clade_to_tree(tree1)
			group_list=group_list + algae_list + outlier_list
			gene_names=tree1.get_leaf_names()
			# Same length as the leaf list: the selection covers the whole tree.
			if len(group_list)==len(gene_names):
				c1=raw_input("\nList includes all copies on tree.\nMake gene with all copies? (y/n)")
				if c1=="y":
					c="n"
				else:
					print("\nGroup crosses root. Unable to make group.\nChoose new group.")
					c="y"
			else:
				# Leaves kept in the current group: everything outside the
				# selection, plus the algae and outlier lists.
				cut_list=[i for i in gene_names if i not in group_list]
				cut_list = cut_list + algae_list + outlier_list
				os.system("mkdir {}".format(b))
				# tree2 becomes the split-off group, written to the new directory.
				tree2=PhyloTree("{}/{}.3.fa.tre".format(item,item))
				R=tree2.get_midpoint_outgroup()
				tree2.set_outgroup(R)
				tree2.prune(group_list,preserve_branch_length=True)
				tree2.write(format=1, outfile="{}/{}.3.fa.tre".format(b,b))
				# tree1 keeps the remainder and overwrites the current group's file.
				tree1.prune(cut_list,preserve_branch_length=True)
				tree1.write(format=1, outfile="{}/{}.3.fa.tre".format(item,item))
				m=m+1
				print ("\nTree now looks like this.")
				view_rooted_tree(tree1)
				c=raw_input("Split off a monophyletic clade? (y/n)")
				if c[0] == "y":
					algae_choice = raw_input("\nIs there an algae group that is sister to all shown families? (y/n)")
					outlier_choice = raw_input("\nIs there another monophyletic or non-monophyletic outlier group that is sister to all families shown? (y/n)")

	# Append every group name from the work list to the file given as the
	# first command-line argument.
	with open(sys.argv[1], "a") as p:
		for i in l:
			p.write(i+"\n")
예제 #48
0
from ete3 import PhyloTree

# Creates a gene phylogeny with several duplication events at
# different levels. Note that we are using the default method for
# detecting the species code of leaves (the first three letters of the
# node name are considered the species code).
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((((Hsa_001,Hsa_003),Ptr_001)
,Mmu_001),((Hsa_004,Ptr_004),Mmu_004))),(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# Python 2 print statements; the trailing comma on the first one keeps the
# output on the same line.
print "Original tree:",
print t
#
#             /-Dme_001
#   /--------|
#  |          \-Dme_002
#  |
#  |                              /-Cfa_001
#  |                    /--------|
#  |                   |          \-Mms_001
#  |                   |
# --|                   |                                        /-Hsa_001
#  |                   |                              /--------|
#  |          /--------|                    /--------|          \-Hsa_003
#  |         |         |                   |         |
#  |         |         |          /--------|          \-Ptr_001
#  |         |         |         |         |
#  |         |         |         |          \-Mmu_001
#  |         |          \--------|
#   \--------|                   |                    /-Hsa_004
from ete3 import PhyloTree
# Loads an example tree
nw = """
((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),
(Ptr_002,(Hsa_002,Mmu_002))));
"""
t = PhyloTree(nw)
# Python 2 print statement; the ASCII rendering is shown in the comment below.
print t
#                    /-Dme_001
#          /--------|
#         |          \-Dme_002
#         |
#         |                              /-Cfa_001
#         |                    /--------|
#---------|                   |          \-Mms_001
#         |          /--------|
#         |         |         |                    /-Hsa_001
#         |         |         |          /--------|
#         |         |          \--------|          \-Ptr_001
#          \--------|                   |
#                   |                    \-Mmu_001
#                   |
#                   |          /-Ptr_002
#                    \--------|
#                             |          /-Hsa_002
#                              \--------|
#                                        \-Mmu_002
#
# To obtain all the evolutionary events involving a given leaf node we
# use get_my_evol_events method
matches = t.search_nodes(name="Hsa_001")