예제 #1
0
def reconcile_etetoolkit(protein):
    """Reconcile the gene tree of ``protein`` against its species tree.

    Both trees are loaded from the paths produced by the ``SPECIES_TREE_FILE``
    and ``GENE_TREE_FILE`` templates (with the ``'nh'`` extension argument),
    and the reconciled tree is rendered to ``phylotree.png``.
    """
    def leaf_name_as_species(name):
        # The species name is the leaf name itself, unchanged.
        return name

    species_path = SPECIES_TREE_FILE.format(protein, 'nh')
    species_tree = PhyloTree(
        species_path, format=1, sp_naming_function=leaf_name_as_species)

    gene_path = GENE_TREE_FILE.format(protein, protein, 'nh')
    gene_tree = PhyloTree(
        gene_path, format=1, sp_naming_function=leaf_name_as_species)

    # reconcile() yields the reconciled tree plus the event list; only the
    # tree is needed here.
    recon_tree, _events = gene_tree.reconcile(species_tree)
    recon_tree.render("phylotree.png")
예제 #2
0
    def build_tree(self, sample, rank_limit='None'):
        """Build a PhyloTree from the taxonomy paths of the taxids in ``sample``.

        A node is created once per distinct non-empty entry found in the
        taxpath of each taxid whose rank position does not exceed the position
        of ``rank_limit``.  Nodes are then linked parent -> child in taxpath
        order, and every node of rank ``'superkingdom'`` is attached to a
        synthetic node named ``'root'``.

        Args:
            sample: forwarded to ``self.get_all_tax_ids`` (the original
                comment states that ``None`` selects all taxids).
            rank_limit: key looked up in ``self.rank_position``.
                NOTE(review): the default is the *string* ``'None'`` --
                presumably ``rank_position`` has such a key; confirm.

        Returns:
            The synthetic root, or, when it has exactly one child, that child
            detached from the root.
        """
        taxid2nodes = {}
        all_nodes = {}
        root_children = []

        for taxid in set(self.get_all_tax_ids(sample)):
            lineage = taxid2nodes[taxid] = []

            taxpath = self.get_taxpath(taxid)
            taxid_rank = self.get_rank(taxid)

            # Taxids ranked beyond the limit keep an empty lineage.
            if not (self.rank_position[taxid_rank]
                    <= self.rank_position[rank_limit]):
                continue

            for node_id in taxpath:
                if node_id == '':
                    continue

                node = all_nodes.get(node_id)
                if node is None:
                    # First time this taxonomy entry is seen: create it.
                    node = PhyloTree()
                    all_nodes[node_id] = node
                    node.name = str(node_id)
                    node.taxid = node_id
                    node_rank = self.get_rank(node_id)
                    node.add_feature("rank", node_rank)
                    node.add_feature("sci_name", self.get_name(node_id))

                    if node_rank == 'superkingdom':
                        root_children.append(node)

                lineage.append(node)

        # Link consecutive lineage entries as parent -> child.
        for lineage in taxid2nodes.values():
            for parent, child in zip(lineage, lineage[1:]):
                if parent and child not in parent.children:
                    parent.add_child(child)

        root = PhyloTree()
        root.name = 'root'
        root.taxid = '0'
        root.add_feature("rank", "root")
        root.add_feature("sci_name", "root")

        for superkingdom in root_children:
            root.add_child(superkingdom)

        # A root with a single child is redundant: return that child instead.
        if len(root.children) == 1:
            return root.children[0].detach()
        return root
예제 #3
0
def get_example_tree():
    """Return a reconciled demo tree together with a default ``TreeStyle``.

    A hard-coded gene tree is reconciled with a hard-coded species tree and
    the result is linked to the alignment held in ``alg`` (a name this
    function reads from the enclosing scope).
    """
    gene_newick = '((Dme_001,Dme_002),(((Cfa_001,Mms_001),((Hsa_001,Ptr_001),Mmu_001)),(Ptr_002,(Hsa_002,Mmu_002))));'
    species_newick = "((((Hsa, Ptr), Mmu), (Mms, Cfa)), Dme);"

    gene_tree = PhyloTree(gene_newick)
    species_tree = PhyloTree(species_newick)

    # Tree reconciliation; the returned event list is not used here.
    reconciled, _events = gene_tree.reconcile(species_tree)
    reconciled.link_to_alignment(alg)
    return reconciled, TreeStyle()
예제 #4
0
    def test_lineages(self):
        """
        Search trees (naming format: NumericTaxid.SequenceName)
        for nodes containing branches that separate two groups of primate genes where,
        in one side, the human gene has been lost,
        and the branch support value of the matching node is higher than 0.9.

                                  /-Any primate taxid (9443 in lineage)
        support >= 0.9--|
                                  \-Any primate taxid except human

        """
        t1 = PhyloTree("(9601.ENSPPYP00000022176:1,9593.ENSGGOP00000009720:1);")
        t2 = PhyloTree("(9361.ENSDNOP00000016844:1,9258.ENSOANP00000032529:1);")
        t3 = PhyloTree(
            "(((((37347.ENSTBEP00000010698:0.120098,(9361.ENSDNOP00000000113:0.0697238,(9785.ENSLAFP00000009564:0.0297499,(9371.ENSETEP00000002412:0.0588324,9813.ENSPCAP00000006440:0.026638)0.985184:0.0242194)0.99985:0.0211882)0.99706:0.0161759)0.756:0.00666819,((132908.ENSPVAP00000002358:0.0439546,59463.ENSMLUP00000004598:0.0635161)0.994843:0.00885432,(9796.ENSECAP00000009809:0.0292517,((9685.ENSFCAP00000004938:0.056779,(9615.ENSCAFP00000008559:0.039179,(9823.ENSSSCP00000024070:0.126803,(9669.ENSMPUP00000010096:0.0341928,9646.ENSAMEP00000005906:0.0189746)0.995231:0.00951966)0.915476:0.0046099)0.949664:0.00417374)0.99985:0.0133593,(9739.ENSTTRP00000009464:0.0664336,9913.ENSBTAP00000001687:0.036632)0.99985:0.0236174)0.939309:0.00508062)0.991475:0.00823937)0.99985:0.0107263)0.99985:0.0100107,((9986.ENSOCUP00000014919:0.0830612,10141.ENSCPOP00000005291:0.12195)0.99985:0.0202639,((9483.ENSCJAP00000047968:0.0446865,(9544.ENSMMUP00000007168:0.0201746,((9593.ENSGGOP00000005929:0.00916494,(9606.ENSP00000294053:1.3e-07,9598.ENSPTRP00000006940:0.0068176)0.955193:0.00220905)0.99985:0.00778854,(9601.ENSPPYP00000004174:0.00495163,61853.ENSNLEP00000020892:0.179569)0.290072:0.00153447)0.998732:0.00889714)0.99985:0.0144864)0.99985:0.0344562,(9478.ENSTSYP00000006073:0.129349,(30608.ENSMICP00000010690:0.0852248,30611.ENSOGAP00000013738:0.0467206)0.99985:0.0188861)0.232709:0.00179852)0.99985:0.00929928)0.51042:0.00516905)0.367617:0.00813494,(43179.ENSSTOP00000004287:0.0599707,(10020.ENSDORP00000000618:0.138502,(10116.ENSRNOP00000026665:0.0528487,10090.ENSMUSP00000001884:0.0307781)0.99985:0.089983)0.99985:0.018366)0.698647:0.00414256)0.995833:0.06629,(9258.ENSOANP00000012946:0.33344,(13616.ENSMODP00000032549:0.0348012,(9315.ENSMEUP00000011030:0.0138664,9305.ENSSHAP00000003293:0.0185119)0.570293:0.0137766)0.99985:0.143897)0.995833:0.06629);")
        t4 = PhyloTree("(9593.ENSGGOP00000025542:1,9601.ENSPPYP00000004907:1);")
        t5 = PhyloTree(
            "(9371.ENSETEP00000005103:0.0955875,(9785.ENSLAFP00000014743:0.0214619,(9813.ENSPCAP00000005573:0.0376639,(9796.ENSECAP00000019319:0.0196571,(37347.ENSTBEP00000012329:0.0242927,((9361.ENSDNOP00000011716:0.0676669,(9606.ENSP00000374323:9e-07,(9593.ENSGGOP00000028731:0.00246332,(61853.ENSNLEP00000002377:0.0030064,(9601.ENSPPYP00000015233:0.0112606,(9598.ENSPTRP00000026129:0.00246268,9483.ENSCJAP00000015834:0.0290829)0:1.2e-07)0:6.5e-07)0.146278:0.00614181)0.146329:0.00485474)0.991187:0.014264)0.763764:0.00352544,((10020.ENSDORP00000008692:0.0259566,(30608.ENSMICP00000002718:0.0380742,9478.ENSTSYP00000009200:0.0174548)0.197348:0.00155005)0.99985:0.0110622,((((132908.ENSPVAP00000013183:0.0099908,59463.ENSMLUP00000014424:0.0115111)0.99985:0.00655941,(10141.ENSCPOP00000003417:0.0535498,((9669.ENSMPUP00000002651:0.0156675,(9646.ENSAMEP00000014393:0.0142536,9615.ENSCAFP00000013394:0.00243184)0.930921:0.00345947)0.99985:0.015828,(9913.ENSBTAP00000053531:0.0545233,9739.ENSTTRP00000001508:0.0344514)0.985783:0.00536759)0:1.1e-07)0:1.1e-07)0.99985:0.00795592,(10090.ENSMUSP00000066734:0.0572278,(43179.ENSSTOP00000020881:0.021661,30611.ENSOGAP00000000479:0.00876016)0.955042:0.00724791)0.992776:0.0044053)0:3.4e-07,(9258.ENSOANP00000012014:0.10692,(9315.ENSMEUP00000001901:0.0451997,13616.ENSMODP00000021214:0.00830289)0.994926:0.0229072)0.99985:0.0500253)0.981032:0.00621499)0:9e-08)0.723103:0.00185076)0.580248:0.00162611)0.99985:0.0167207)0.863552:0.00574499)1:0.0955875);")
        t6 = PhyloTree(
            "((9305.ENSSHAP00000010229:0.0607855,13616.ENSMODP00000009656:0.0615237)0.99985:0.0877765,(9785.ENSLAFP00000028174:0.0885004,(((9823.ENSSSCP00000002806:0.0860827,9823.ENSSSCP00000002780:0.0111508)0.99985:0.122086,((9913.ENSBTAP00000038896:0.050358,(9685.ENSFCAP00000017257:0.0778567,(9986.ENSOCUP00000017975:0.161424,(9615.ENSCAFP00000020783:0.056902,(9646.ENSAMEP00000019763:0.0857189,9669.ENSMPUP00000019474:0.0325693)0.99985:0.0314116)0.875671:0.00690881)0.942895:0.0136375)0.798192:0.00741364)0.967573:0.0100004,(59463.ENSMLUP00000020576:0.0755216,9796.ENSECAP00000004613:0.0777605)0.799782:0.00471384)0.911021:0.00832673)0.659845:0.00664335,((43179.ENSSTOP00000021465:0.123042,9593.ENSGGOP00000020601:0.0781752)0.987812:0.0311266,(30611.ENSOGAP00000021055:0.090792,(10116.ENSRNOP00000016702:0.0112116,10090.ENSMUSP00000050705:0.0330259)0.99985:0.134681)0.972881:0.0174783)0.998643:0.0179346)0.901179:0.017737)0.99985:0.0877765);")
        t7 = PhyloTree(
            "(9258.ENSOANP00000017269:0.144169,(((10090.ENSMUSP00000089169:0.0424834,10116.ENSRNOP00000026070:0.0151696)0.99985:0.0742333,(((((132908.ENSPVAP00000008558:0.0138473,(30608.ENSMICP00000004293:1.5e-07,((9986.ENSOCUP00000020707:0.0691049,37347.ENSTBEP00000002617:0.0138881)0:1.2e-07,(9371.ENSETEP00000012957:0.0515389,(9785.ENSLAFP00000009919:0.0260641,9813.ENSPCAP00000013834:0.0329521)0.741149:0.0041225)0.998768:0.00855745)0.99985:0.0111961)0.867255:0.00524663)0:4.3e-07,(9361.ENSDNOP00000010929:0.0359312,(9739.ENSTTRP00000015818:0.0267351,9796.ENSECAP00000009501:0.0168218)0.868862:0.00355516)0:8e-08)0.99985:0.0056594,(9913.ENSBTAP00000012912:0.0231165,(9669.ENSMPUP00000002012:0.00320767,9823.ENSSSCP00000023102:0.0629927)0.99134:0.00309237)0.988361:0.00284581)0:1.5e-07,((59463.ENSMLUP00000015155:0.0360776,9615.ENSCAFP00000002053:0.00579656)0.961397:0.00553059,(9685.ENSFCAP00000023114:0.0115974,9646.ENSAMEP00000004090:0.00575272)0.959045:0.00279601)0.988458:0.00279093)0.998008:0.00284847,(30611.ENSOGAP00000001383:0.00849776,((9483.ENSCJAP00000006698:0.0114709,(9544.ENSMMUP00000006654:0.00568623,(61853.ENSNLEP00000004122:0.00566385,(9601.ENSPPYP00000021653:0.00853215,(9593.ENSGGOP00000020462:1.8e-07,(9598.ENSPTRP00000035990:1e-08,9606.ENSP00000365550:1e-08)0.99985:0.00282071)0.996162:0.00281965)0:1.7e-07)0:8e-08)0.954037:0.0027827)0.99985:0.00818313,(43179.ENSSTOP00000012068:0.0109022,(9478.ENSTSYP00000008441:0.0132658,10141.ENSCPOP00000000986:0.0564111)0.314526:0.00294575)0:7e-08)0.980721:0.00309462)0.991529:0.00280168)0:1.6e-07)0.99985:0.0483405,(9315.ENSMEUP00000015273:0.00839008,(9305.ENSSHAP00000020642:0.00542335,13616.ENSMODP00000010568:0.101485)0:2.1e-07)0.99985:0.0336521)1:0.144169);")
        t8 = PhyloTree(
            "(((9371.ENSETEP00000003671:0.0131637,(9258.ENSOANP00000006745:0.117598,(132908.ENSPVAP00000001122:0.0159907,(30611.ENSOGAP00000013217:0.0071702,(((9823.ENSSSCP00000000042:0.0144457,(9646.ENSAMEP00000009872:0.0154876,9361.ENSDNOP00000012437:0.0817179)0:1e-06)0.998538:0.00765581,(9544.ENSMMUP00000001765:1e-08,(10116.ENSRNOP00000010491:0.0292686,(9669.ENSMPUP00000016236:0.340739,9615.ENSCAFP00000001415:4e-07)0.989009:0.00985882)0:8.7e-07)0:8.7e-07)0.99736:0.00973955,(((9606.ENSP00000379704:1e-08,(9601.ENSPPYP00000013264:0.00772278,9598.ENSPTRP00000024873:1e-08)0:2.3e-07)0.996569:0.00720502,(9913.ENSBTAP00000017531:0.0145949,9739.ENSTTRP00000016448:0.00723237)0.996503:0.00710774)0:4.2e-07,((9593.ENSGGOP00000008768:0.270021,(9785.ENSLAFP00000013194:0.00881524,9478.ENSTSYP00000011482:6.1e-07)0.482225:0.00675219)0.500314:0.00675139,(((59463.ENSMLUP00000002337:0.0319341,30608.ENSMICP00000003266:6.2e-07)0.987498:0.010619,(9796.ENSECAP00000021110:0.0073991,(9986.ENSOCUP00000007142:0.0196352,37347.ENSTBEP00000000333:0.0989537)0:9.5e-07)0:1.09e-06)0.873107:0.00951386,((9685.ENSFCAP00000000826:3e-07,(43179.ENSSTOP00000011619:0.00863897,10090.ENSMUSP00000023095:1e-08)0:1e-08)0.99985:0.132958,(10020.ENSDORP00000013215:0.0339132,10141.ENSCPOP00000011894:4.1e-07)0:4.1e-07)0.524756:0.00714334)0:8.1e-07)0.99985:0.00971634)0:7e-08)0:7e-08)0.772739:0.0177399)0.992096:0.0404786)0.817723:0.0310407)0.522416:0.072068,(9305.ENSSHAP00000014579:0.246289,9315.ENSMEUP00000008760:0.0666798)0.977479:0.195421)0.99985:1.2587,((((37347.ENSTBEP00000000946:0.0956163,(9483.ENSCJAP00000024301:0.0743892,(9593.ENSGGOP00000012469:0.00721405,(9606.ENSP00000391249:1e-08,9606.ENSP00000461549:1e-08)0:1.3e-07)0.993649:0.00856538)0.99985:0.0230549)0.975176:0.0143781,(30611.ENSOGAP00000003324:0.104251,30608.ENSMICP00000007369:0.0381575)0.990656:0.0183563)0.916137:0.00581305,(9823.ENSSSCP00000018191:0.0558998,((10020.ENSDORP00000010153:0.197695,((9796.ENSECAP00000018039:0.0363101,132908.ENSPVAP00000013
461:0.0941126)0.892367:0.013635,((9739.ENSTTRP00000004783:0.0138565,9913.ENSBTAP00000003415:0.0166473)0.99985:0.0326524,((9371.ENSETEP00000006140:0.107709,(9785.ENSLAFP00000006435:0.170692,9813.ENSPCAP00000005503:0.0655274)0:2.68e-06)0.99985:0.0526328,(9258.ENSOANP00000002804:0.150016,(9315.ENSMEUP00000001056:0.0197146,(13616.ENSMODP00000002021:0.0382813,9305.ENSSHAP00000007534:0.0357616)0.99985:0.0843541)0.99985:0.115238)0.99985:0.133971)0.964252:0.0135998)0.99559:0.0163904)0.732303:0.00993157)0.99985:0.0470037,(9685.ENSFCAP00000008713:0.124988,(9615.ENSCAFP00000007771:0.0225216,(9646.ENSAMEP00000014479:0.0718956,9669.ENSMPUP00000013273:0.0487162)0.99985:0.0148769)0:9.2e-07)0.99985:0.0433867)0.99277:0.027679)0.99985:0.0134312)0:4.7e-07,(43179.ENSSTOP00000019919:0.152642,((10116.ENSRNOP00000003891:0.158016,10090.ENSMUSP00000091435:0.0102936)0.99985:0.0704992,(10141.ENSCPOP00000011436:0.130601,9986.ENSOCUP00000015843:0.529405)0:5.42e-06)0.909203:0.011833)0.428577:0.0186403)0.99985:1.2587);")
        t9 = PhyloTree("(9305.ENSSHAP00000009662:1,9305.ENSSHAP00000009620:1);")
        t10 = PhyloTree("((9315.ENSMEUP00000008285:0.899711,9258.ENSOANP00000027752:0.559777)0.99985:0.11989,((9739.ENSTTRP00000010720:0.164873,9913.ENSBTAP00000003500:0.298158)0.99985:0.109903,((9685.ENSFCAP00000006440:0.239731,(9615.ENSCAFP00000042310:0.122399,(9646.ENSAMEP00000002314:0.18278,9669.ENSMPUP00000005544:0.270727)0.6117:0.0396991)0.99985:0.0702148)0.99985:0.082488,(132908.ENSPVAP00000014833:0.488081,(9796.ENSECAP00000022144:0.310699,(((9785.ENSLAFP00000009512:0.187095,9813.ENSPCAP00000004417:0.493329)0.99985:0.359095,(30611.ENSOGAP00000016876:0.334272,(9483.ENSCJAP00000021314:0.178043,(9601.ENSPPYP00000003401:0.0415077,((61853.ENSNLEP00000003253:0.196659,9544.ENSMMUP00000037769:0.326984)0.835225:0.0989423,(9593.ENSGGOP00000004740:0.101826,9606.ENSP00000182290:0.0204981)0.997196:0.020731)0.307827:0.0046059)0.99985:0.0991112)0.99985:0.162323)0.972253:0.0380139)0.70642:0.0193389,((10141.ENSCPOP00000016274:0.272126,43179.ENSSTOP00000015376:0.458416)0.996119:0.0901785,(37347.ENSTBEP00000013312:0.328061,(10020.ENSDORP00000010739:0.398341,(10116.ENSRNOP00000051746:0.0455948,10090.ENSMUSP00000009396:0.0811741)0.99985:0.269525)0.791467:0.0577236)0.536676:0.0461933)0.99985:0.0620583)0.99985:0.0788824)0.969465:0.0395994)0.635969:0.0171601)0.702925:0.0283261)0.99985:0.11989);")

        trees = [(t1, "t1", True), (t2, "t2", False), (t3, "t3", True),
                 (t4, "t4", True), (t5, "t5", True), (t6, "t6", False),
                 (t7, "t7", True), (t8, "t8", True), (t9, "t9", False),
                 (t10, "t10", True)]
        for tree, tree_name, has_matches in trees:
            tree.set_species_naming_function(lambda n: n.name.split(".")[0] if "." in n.name else '')
            tree.annotate_ncbi_taxa()
            # Has support for two primates where at least one is not Homo sapiens
            pattern = """
                ( ' 9443 in @.lineage ' , ' 9443 in @.lineage and @.name!=9606 ' )' @.support >= 0.9 ';
                """
            pattern = TreePattern(pattern)
            if not has_matches:
                self.assertEqual(list(pattern.find_match(tree)), [])
            else:
                match = pattern.find_match(tree).next()
                self.assertEqual(match.support >= 0.9, True)
                test_status = (9443 in match.children[0].lineage and \
                               9443 in match.children[1].lineage and \
                               match.children[1].name != '9606')
                # permute children and check again
                test_status2 = (9443 in match.children[1].lineage and \
                               9443 in match.children[0].lineage and \
                               match.children[0].name != '9606')
                self.assertEqual(test_status, True)
                self.assertEqual(test_status2, True)
예제 #5
0
def extract_clades(newick_file, processed_newick_out=None):
    """Pre-process a tree and split it into clades.

    The tree is rooted on its midpoint outgroup, ladderized and converted to
    ultrametric form.  It is then handed to ``get_pruned_branch`` together
    with the two size predicates defined below.

    Args:
        newick_file: tree source passed to ``PhyloTree``.
        processed_newick_out: when not ``None``, the pre-processed tree is
            written there (``format=1``).

    Returns:
        dict mapping ``"1"``, ``"2"``, ... to the list of node names of each
        branch returned by ``get_pruned_branch``, largest branch first.
    """
    # preprocess tree
    print("Pre-processing tree ({})".format(newick_file))
    tree = PhyloTree(newick_file)
    tree.set_outgroup(tree.get_midpoint_outgroup())
    tree.ladderize()
    tree.convert_to_ultrametric()
    if processed_newick_out is not None:
        tree.write(format=1, outfile=processed_newick_out)

    # calculate clades
    print("Calling clades ({})".format(newick_file))

    # Size threshold: 1/50th of the tree size, but never below 10.  Computed
    # once because the tree size is fixed at this point.
    size_limit = max(10, len(tree) / 50)

    # Both predicates keep the (node, tree) signature used by
    # get_pruned_branch even though ``tree`` is not consulted.
    def condition_discard(node, tree):
        return len(node) < 3

    def condition_ok(node, tree):
        return len(node) < size_limit

    branches = get_pruned_branch(tree, tree, condition_discard, condition_ok,
                                 [])

    # Largest branch first; the sort is stable, so ties keep their order.
    ordered = sorted(branches, key=len, reverse=True)
    return {
        str(index): [node.name for node in branch]
        for index, branch in enumerate(ordered, start=1)
    }
예제 #6
0
def cut_stray_genes(gene, species_keep, species_list):
    """Interactively remove stray gene copies from a clade.

    The tree ``<gene>/<gene>.3.fa.tre`` is pruned to ``species_keep`` and
    shown.  The user is then asked repeatedly for gene names to cut.  A cut
    is applied only if every entered name is in ``species_keep`` and the tree
    can be pruned to the remaining names; otherwise it is abandoned and the
    kept list is left untouched.

    Args:
        gene: gene family identifier used to build the tree path.
        species_keep: names of the gene copies initially kept.
        species_list: forwarded to ``view_counts``.

    Returns:
        The list of gene names still kept after all accepted cuts
        (``species_keep`` itself when nothing was cut).
    """
    ######Showing the tree######
    clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
    clade_tree.prune(species_keep, preserve_branch_length=True)
    if len(species_keep) > 1:
        view_rooted_tree(clade_tree)
        print("\nThis is the clade tree. There are " + str(len(species_keep)) +
              " total gene copies.\n")
    else:
        print(
            "\nSpecies tree only contains 1 species. Tree will not be shown.")
    cut_list = species_keep
    view_counts(cut_list, species_list)
    ######Removing stray within-clade gene copies from the clade######
    cut_question = raw_input("\nAre there stray genes to cut? (y/n)")
    # startswith() treats an empty answer as "no" instead of raising.
    while cut_question.startswith("y"):
        cut_gene_str = raw_input(
            "\nEnter genes to cut, separated by a space: ")
        cut_gene_list = cut_gene_str.split()
        if set(cut_gene_list).issubset(species_keep):
            # Build the candidate list first; commit it only once the prune
            # has succeeded so an abandoned cut really changes nothing.
            remaining = [i for i in cut_list if i not in cut_gene_list]
            try:
                clade_tree.prune(remaining, preserve_branch_length=True)
                cut_list = remaining
                view_rooted_tree(clade_tree)
                view_counts(cut_list, species_list)
            except ValueError:
                print(
                    "\nSomething is wrong with the way the genes were entered. You entered:\n"
                    + cut_gene_str + "\nCut abandoned.")
        else:
            print(
                "\nAt least one gene is not found on the tree. You entered:\n"
                + cut_gene_str + "\nCut abandoned.")
        cut_question = raw_input("\nAre there more genes to cut? (y/n)")
    return cut_list
예제 #7
0
def make_other_groups(gene, species_keep, species_list):
    """Build the "noclade" group for ``gene`` and record it on disk.

    After stray copies are removed through ``cut_stray_other``, the user is
    prompted until the group holds at most one gene per species, where the
    species is identified by the first three characters of the gene name.
    The group is appended to ``<gene>/<gene>_noclade_prune.txt`` and its name
    to ``<gene>/<gene>_master_tree_list.txt``.

    Args:
        gene: gene family identifier used to build the output paths.
        species_keep: candidate gene names; nothing is done when empty.
        species_list: forwarded to ``cut_stray_other`` and ``view_counts``.
    """
    ######Checking if the list is empty######
    if len(species_keep) == 0:
        print("\nThere are no other genes in this gene family.")
        return

    ######Removing stray within-clade gene copies from the clade######
    group_list = cut_stray_other(gene, species_keep, species_list)

    ######Checking that there is only one gene per species######
    check_set = {str(item[0:3]) for item in group_list}
    while len(group_list) != len(check_set):
        # Show the counts of the group as it currently stands.
        view_counts(group_list, species_list)
        group_str = raw_input(
            "\nYou can only have one gene per species. Enter more genes to cut, separated by a space: "
        )
        # Compare whole names: a substring test against the raw input would
        # also drop e.g. "Hsa_1" when the user typed "Hsa_10".
        genes_to_cut = set(group_str.split())
        group_list = [item for item in group_list if item not in genes_to_cut]
        check_set = {str(item[0:3]) for item in group_list}

    print("\nThere are " + str(len(group_list)) +
          " genes in this group.\nGroup looks like:")
    print(group_list)
    print("\nMaking the group.")
    ######Saving gene group as a file######
    with open(gene + "/" + gene + "_noclade_prune.txt", "a") as group_file:
        for i in group_list:
            group_file.write(i + "\n")
    ######Saving name of group to a master list######
    with open(gene + "/" + gene + "_master_tree_list.txt", "a") as master:
        master.write(gene + "_noclade\n")
def main(args):
    """Run edge detection for every gene of a clade and save the result table.

    Reads ``clade_name``, ``split_fp``, ``tree_fp``, ``score_method`` and
    ``out_fp`` from ``args``.  The per-gene records returned by
    ``detect_edges_all`` are collected into one tab-separated file at
    ``args.out_fp``.
    """
    clade = args.clade_name

    genome_names = load_genome_names_by_clade_name(clade)
    LOGGER.info("loaded {} {} genomes".format(len(genome_names), clade))
    cdss = load_cdss_by_genome_names(genome_names)
    LOGGER.info("loaded {} cdss".format(len(cdss)))

    # Orthology table lives under the clade directory.
    clade_dir = pathlib.Path(build_clade_filepath(clade))
    ortho_fp = clade_dir.joinpath("./ortho/{}.ortho".format(clade))
    ortho_df = pd.read_csv(ortho_fp, sep='\t')
    cdss = set_gene_name_to_cdss(cdss, ortho_fp)
    LOGGER.info("loaded orthology from {}".format(ortho_fp))

    # Optional inputs: simulated segmentation and phylogenetic tree.
    if args.split_fp:
        cdss = set_split_to_cdss(cdss, args.split_fp)
        LOGGER.info("loaded simulated segmentation from {}".format(args.split_fp))

    tree = None
    if args.tree_fp:
        tree = PhyloTree(args.tree_fp, format=1)
        LOGGER.info("loaded phylogenetic tree from {}".format(args.tree_fp))

    cdsDAO = CdsDAO(cdss)
    gene_names = sorted(set(ortho_df["gene_name"]))
    LOGGER.info("found {} genes to search".format(len(gene_names)))

    records = []
    for origin_gene_name in gene_names:
        LOGGER.info("start {}".format(origin_gene_name))
        records.extend(
            detect_edges_all(origin_gene_name, args.score_method, cdsDAO, tree))

    columns = [
        "x", "y", "score", "score_naive", "total", "found", "bls",
        "top_offset", "top_relationship", "top_ratio",
    ]
    out_df = pd.DataFrame(records, columns=columns)
    out_df.to_csv(args.out_fp, sep='\t', index=False)
    LOGGER.info("saved results to {}".format(args.out_fp))
    def score_family_tree(self):
        """Score this family's rooted tree and write the score to disk.

        The tree is read from ``BasePath.rooted_famtrees_dir/<fam_id>``.
        When ``check_if_tree_contains_outgroups`` returns 1 the method
        returns 0 and writes nothing.  Otherwise the value produced by
        ``inspect_ingroup_pairs`` is written as ``"<fam_id> <value>"`` to the
        family's score file and the method returns ``None``.
        """
        outgrp_regex_str, _species_dict, ingroup_regex_str = read_profile_file(
            BasePath.species_profile_filename)
        fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
        fam_tree = PhyloTree(fam_tree_filename, format=1)

        outgrp_re = re.compile(outgrp_regex_str)
        ingrp_re = re.compile(ingroup_regex_str)

        flag = self.check_if_tree_contains_outgroups(fam_tree, outgrp_re)
        if flag == 1:
            return 0

        ingroup_matches_arr = self.get_ingroup_sequence_list(
            fam_tree, ingrp_re)
        ingroup_pair_arr = self.get_ingroup_sequence_pairs(ingroup_matches_arr)
        precision_val = self.inspect_ingroup_pairs(fam_tree, ingroup_pair_arr,
                                                   outgrp_re)

        tree_score_filename = BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id + \
            BasePath.tree_score_fileextension

        # Context manager guarantees the handle is closed even if the write
        # raises.
        with open(tree_score_filename, "w") as tree_score_file:
            tree_score_file.write(self.fam_id + " " + str(precision_val) + "\n")
예제 #10
0
def LoadTrees(treeFile, dlm):
    """Reads and stores phylogenetic trees from a file

    Parameters
    ------
    treeFile: str, path to a file of newick trees, 1 per line; blank lines
        and lines starting with "NA" are skipped
    dlm: str, delimiter used to split a leaf name; the first field is used
        as the species name

    Returns
    ------
    treelist: list, one PhyloTree object per parsed line

    """
    print("loading trees...")

    def species_from_name(node):
        return node.name.split(dlm)[0]

    treelist = []
    pbar = tqdm(total=file_len(treeFile))
    try:
        with open(treeFile, 'r') as newick:
            for line in newick:
                pbar.update(1)
                # An empty line is not a tree; skip it rather than hand it
                # to the newick parser.
                if not line.strip() or line.startswith("NA"):
                    continue
                t = PhyloTree(line)
                t.set_species_naming_function(species_from_name)
                treelist.append(t)
    finally:
        # Release the progress bar even when a line fails to parse.
        pbar.close()
    return treelist
예제 #11
0
def do_alnntree(pref, ndf, fasta, refs, congen, targetids, gaps=0.9, cpus=-1):
    """Align reference and target sequences, build a tree and annotate it.

    A FASTA text is assembled from ``congen``, one reference sequence per
    ``saccver`` group of ``ndf`` (headed ``>taxid.name``) and the entries of
    the ``fasta`` shelf whose header, minus its first character, is in
    ``targetids``.  The text is aligned with ``mafft``, trimmed with
    ``trimaln`` and passed to ``fasttreeMP``.  The trimmed alignment, the
    tree and a pickled tree are written to ``<pref>.aln``, ``<pref>.tree``
    and ``<pref>.treepickle``.

    Returns:
        ``(tree, tax2)`` where ``tax2`` is the value returned by
        ``tree.annotate_ncbi_taxa()``.
    """
    # TODO: add checkpoint to avoid repeating
    to_phy = congen
    for name, hits in ndf.groupby('saccver'):
        taxid = hits.staxid.iloc[0]
        key = '>%s' % name
        if key not in refs:
            pass
        try:
            seq = refs[key].replace('\n', '').strip()
        except KeyError:
            # Fall back to the part of the accession before the first '|'.
            name = name.split('|')[0]
            seq = refs['>%s' % name].replace('\n', '').strip()
        to_phy += '>%d.%s\n%s\n' % (taxid, name, seq)

    with shelve.open(fasta) as dic:
        for header, sequence in dic.items():
            if header.strip()[1:] in targetids:
                print(header)
                to_phy += '%s\n%s\n' % (header,
                                        sequence.strip().replace('\n', ''))
            else:
                print(header, 'not in')

    mafft_cmd = ['mafft', '--thread', str(cpus), '--auto', '-']
    aln, _ = stdin_run(mafft_cmd, to_phy)
    trm = trimaln(aln.decode('utf-8'), targetids, gaps=gaps)

    fasttree_cmd = ['fasttreeMP', '-nt', '-gtr', '-gamma']
    raw_tree, _ = stdin_run(fasttree_cmd, trm)
    # Drop the last character of the stripped output, replace any remaining
    # ';' with '-', then terminate the newick with a single ';'.
    newick = raw_tree.strip()[:-1].replace(b';', b'-').decode('utf-8') + ';'

    t = PhyloTree(newick, sp_naming_function=lambda name: name.split('.')[0])

    aln_path = '%s.aln' % pref
    pickle_path = '%s.treepickle' % pref
    with open(aln_path, 'w') as al, open(pickle_path, 'wb') as tp:
        al.write(trm)
        t.write(outfile='%s.tree' % pref)
        dill.dump(t, tp)

    tax2 = t.annotate_ncbi_taxa()
    fix_species(t)
    print(t)
    return t, tax2
예제 #12
0
    def __init__(self, newick, alg, taxid, tid, actions, style,
                 predraw_fn=None):
        """Load the tree and set up the identifiers derived from ``tid``.

        The newick is first loaded as a ``PhyloTree`` linked to the FASTA
        alignment ``alg``; if that raises ``NewickError`` it is loaded as a
        plain ``Tree`` with ``format=1`` instead.  ``predraw_fn``, when
        given, is called with the loaded tree.  Every node receives a
        ``_nid`` equal to its position in a preorder traversal.
        """
        try:
            loaded = PhyloTree(newick=newick,
                               alignment=alg,
                               alg_format="fasta")
        except NewickError:
            loaded = Tree(newick, format=1)
        self.tree = loaded

        if predraw_fn:
            predraw_fn(self.tree)
        self.tree.actions = actions
        self.tree.tree_style = style

        self.taxid = taxid

        # Identifiers derived from the tree id.
        self.treeid = tid
        self.mapid = "map_" + tid
        self.imgid = "img_" + tid
        self.boxid = 'box_' + tid

        # Initialize node internal IDs
        for position, node in enumerate(self.tree.traverse('preorder')):
            node._nid = position
예제 #13
0
    def test_shortcut_functions(self):
        """Check the pattern shortcut functions against a small gene tree.

        Covers ``n_duplications``, ``contains_leaves`` and ``n_speciations``.
        """
        t = PhyloTree(
            """((((Human_1, Chimp_1), (Human_2, (Chimp_2, Chimp_3))),
            ((Fish_1, (Human_3, Fish_3)), Yeast_2)), Yeast_1);""")
        t.set_species_naming_function(lambda node: node.name.split("_")[0])
        t.get_descendant_evol_events()  # DDDSSSDDS

        root = t.get_tree_root()

        pattern_texts = (
            # Detects two consecutive nodes with duplications
            """('n_duplications(@) > 0')'n_duplications(@) > 0 '; """,
            """( 'contains_leaves(@, ["Chimp_2", "Chimp_3"])'); """,
            """'n_speciations(@) > 3 '; """,
        )

        # Compile every pattern first, then collect all matches of each.
        compiled = [TreePattern(text) for text in pattern_texts]
        duplication_hits, leaves_hits, speciation_hits = [
            list(pattern.find_match(t, maxhits=None)) for pattern in compiled
        ]

        self.assertEqual(len(duplication_hits), 5)

        self.assertEqual(len(leaves_hits), 4)
        self.assertEqual(leaves_hits[0], root)

        self.assertEqual(len(speciation_hits), 2)
        self.assertEqual(speciation_hits[0], root)
        self.assertEqual(speciation_hits[1], root.children[0])
 def get_ingroup_monoplyletic_clades(self):
     """Load this family's RAxML tree and pass it to ``process_family_tree``.

     The outgroup regex and the species dictionary come from the species
     profile file.
     """
     profile = read_profile_file(BasePath.species_profile_filename)
     outgrp_regex_str, species_dict, ingroup_regex_str = profile

     family_dir = BasePath.outpath + "/" + self.fam_id
     fam_tree_filename = family_dir + "/" + BasePath.raxml_tree_fileprefix + self.fam_id

     outgroup_re = re.compile(outgrp_regex_str)
     fam_tree = PhyloTree(fam_tree_filename, format=1)
     self.process_family_tree(fam_tree, outgroup_re, species_dict)
예제 #15
0
def load_json(fp):
    """Build a taxonomy tree from the ``ubiome_bacteriacounts`` rows of ``fp``.

    Each row is normalised with ``normalise_row`` and turned into a
    ``PhyloTree`` node carrying all row fields as features.  The node with
    the smallest ``taxon`` key is used as root; its ``count_norm`` is the
    denominator for every node's ``count_pct``, and ``alpha_function`` of all
    normalised counts is stored on it as ``alpha``.  Nodes are attached to
    the node whose taxon equals their ``parent`` feature, when one exists.

    Returns:
        The root node.
    """
    payload = json.loads(clean_json(fp))
    taxonomy = {}
    counts = []

    for row in payload['ubiome_bacteriacounts']:
        normalise_row(row)
        counts.append(row['count_norm'])
        node = PhyloTree()
        node.name = row['tax_name']
        node.add_features(**row)
        taxonomy[row['taxon']] = node

    root = taxonomy[min(taxonomy)]
    count_total = root.count_norm
    root.alpha = alpha_function(counts)

    for node in taxonomy.values():
        percentage = float(node.count_norm) / count_total * 100
        node.add_feature('count_pct', percentage)
        # Rows whose parent taxon is not in the table stay unattached.
        parent_node = taxonomy.get(node.parent)
        if parent_node is not None:
            parent_node.add_child(node)

    summary = 'loaded {} into tree depth {} diversity {:.2f}'.format(
        len(taxonomy), len(root), root.alpha)
    print(summary)
    return root
 def get_ingroup_monoplyletic_clades(self):
     """Load this family's rooted FastTree tree and pass it to ``process_family_tree``.

     The outgroup regex and the species dictionary come from the species
     profile file, which yields four values here.
     """
     profile = read_profile_file(BasePath.species_profile_filename)
     outgrp_regex_str, species_dict, ingroup_regex_str, outgroup_id_arr = profile

     family_dir = BasePath.outpath + "/" + self.fam_id
     fam_tree_filename = family_dir + "/" + self.fam_id + \
         BasePath.rooted_fasttree_fileextension

     outgroup_re = re.compile(outgrp_regex_str)
     fam_tree = PhyloTree(fam_tree_filename, format=1)
     self.process_family_tree(fam_tree, outgroup_re, species_dict)
예제 #17
0
def run(args):
    """Print the evolutionary events of every source tree.

    Each tree in ``args.src_tree_iterator`` is handled only when
    ``args.orthologs`` is not ``None``.  For each event returned by
    ``get_descendant_evol_events`` the ``in_seqs`` and ``out_seqs`` are
    printed.
    """
    from ete3 import Tree, PhyloTree
    for nw in args.src_tree_iterator:
        if args.orthologs is None:
            continue
        tree = PhyloTree(nw)
        for event in tree.get_descendant_evol_events():
            print(event.in_seqs, event.out_seqs)
예제 #18
0
    def root_tree(self):
        """Root this family's FastTree tree on its outgroup and write it out.

        If the sequences matching the outgroup regex are monophyletic, the
        tree is rooted on their common ancestor.  Otherwise it is rooted on
        the first entry returned by ``arrange_outgroup_sequence_ids``.  The
        rooted tree is passed to ``write_rooted_tree`` in both cases.
        """
        outgrp_regex_str, _species_dict, _ingroup_regex_str, outgroup_id_arr = read_profile_file(
            BasePath.species_profile_filename)
        fam_tree_filename = BasePath.outpath + "/" + self.fam_id + "/" + self.fam_id + BasePath.fasttree_fileextension
        fam_tree = PhyloTree(fam_tree_filename, format=1)

        outgrp_re = re.compile(outgrp_regex_str)

        outgroup_sequence_list = self.get_regex_matching_sequence_list_from_node(
            fam_tree, outgrp_re)
        outgroup_monophyly_check = fam_tree.check_monophyly(
            values=outgroup_sequence_list, target_attr="name")

        # Single-argument print(...) behaves the same on Python 2 and 3.
        if outgroup_monophyly_check[0]:
            print("Outgroups are monophyletic")
            root_node = fam_tree.get_common_ancestor(outgroup_sequence_list)
        else:
            print("Outgroups are not monophyletic")

            outgroup_sequence_list_from_seqlist = self.get_outgroup_sequences_from_seqlist(
            )
            arranged_outgroup_sequence_list = self.arrange_outgroup_sequence_ids(
                outgroup_sequence_list_from_seqlist, outgroup_id_arr)
            root_node = arranged_outgroup_sequence_list[0]
            print("Rooting using sequence {0}".format(root_node))

        # Rooting and writing are identical for both branches.
        fam_tree.set_outgroup(root_node)
        self.write_rooted_tree(fam_tree)
예제 #19
0
def prune_main(gene, speciesList, cladeDict):
    """Drive the interactive pruning workflow for one gene family.

    Families reported as ``"small"`` or ``"single"`` by ``count_summarize``
    are handled by ``small_family`` / ``single_copy``.  For any other family
    the gene tree is shown and the user chooses between splitting the family
    (``pre_prune``) and pruning it as a single family (``make_clade_groups``
    followed by ``make_all_lists``).

    Args:
        gene: gene family identifier; converted to ``str``.
        speciesList: forwarded to ``count_summarize`` and
            ``make_clade_groups``.
        cladeDict: forwarded to the clade-handling helpers.
    """
    gene = str(gene)
    erase_previous_files(gene)
    copy_list = copies_in_group(gene)
    gene_type = count_summarize(gene, copy_list, speciesList, cladeDict)
    choice = "n"
    if gene_type == "small":
        small_family(gene)
    elif gene_type == "single":
        single_copy(gene, copy_list, cladeDict)
    else:
        print("\nShowing the gene tree.")
        clade_tree = PhyloTree(gene + "/" + gene + ".3.fa.tre")
        view_rooted_tree(clade_tree)
        choice2 = raw_input(
            "\nWould you like to split this gene family into multiple families? (y/n)"
        )
        # startswith() treats an empty answer as "no" instead of raising
        # IndexError on [0].
        if choice2.startswith("y"):
            pre_prune(gene)
        else:
            choice = raw_input(
                "\nContinue with pruning as single gene family? (y/n)")
    if choice.startswith("y"):
        make_clade_groups(gene, cladeDict, copy_list, speciesList)
        make_all_lists(gene, cladeDict)
예제 #20
0
 def get_ingroup_monoplyletic_clades(self):
     """Load this family's rooted tree and pass it to ``process_family_tree``.

     Prints the configured species representation cutoff first.  The
     outgroup regex and the species dictionary come from the species profile
     file.
     """
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print('Clade species representation cutoff {0}'.format(self.species_representaion_cutoff))
     outgrp_regex_str, species_dict, ingroup_regex_str = read_profile_file(BasePath.species_profile_filename)
     fam_tree_filename = BasePath.rooted_famtrees_dir + "/" + self.fam_id
     outgroup_re = re.compile(outgrp_regex_str)
     fam_tree = PhyloTree(fam_tree_filename, format=1)
     self.process_family_tree(fam_tree, outgroup_re, species_dict)
예제 #21
0
def name_ancestors(timetreefile, to_table=False, ete3_algo=False, uniq=True):
    """Annotate a time tree with NCBI taxonomy and print the result.

    Leaves are matched to NCBI taxids by name (underscores read as spaces);
    leaves whose name is unknown are logged and removed from the tree.
    Internal nodes are then annotated and renamed.

    Args:
        timetreefile: Tree to load, parsed with ``format=1`` and
            ``quoted_node_names=True``.
        to_table: When true, print one ``oldname<TAB>sci_name`` line per
            internal node; otherwise print the whole tree as newick.
        ete3_algo: When true, annotate with ``NCBITaxa.annotate_tree``;
            otherwise use ``myannotate``.
        uniq: Passed through to ``matchrename_ncbitax``.
    """
    logger.info('Loading data')
    # NOTE: the quoted_node_names argument requires ete3 >= 3.1.1.
    timetree = PhyloTree(timetreefile, format=1, quoted_node_names=True)
    ncbi = NCBITaxa()

    species_names = [leaf_name.replace('_', ' ')
                     for leaf_name in timetree.get_leaf_names()]
    name2taxid = ncbi.get_name_translator(species_names)

    for leaf in timetree.get_leaves():
        try:
            taxid = name2taxid[leaf.name.replace('_', ' ')][0]
            leaf.add_feature('taxid', taxid)
        except KeyError:
            logger.warning('Species %r not found', leaf.name)
            leaf.delete(prevent_nondicotomic=True, preserve_branch_length=True)

    logger.info('Placing common ancestors')
    if ete3_algo:
        ncbi.annotate_tree(timetree, 'taxid')
    else:
        myannotate(timetree, ncbi)
    matchrename_ncbitax(timetree, uniq)

    if to_table:
        for node in timetree.traverse():
            if node.is_leaf():
                continue
            # `oldname` is presumably set by matchrename_ncbitax -- confirm.
            print(node.oldname + '\t' + getattr(node, 'sci_name', ''))
    else:
        print(timetree.write(format=1, format_root_node=True))
예제 #22
0
def etealign(tree, MA):
    """Display a tree together with its multiple sequence alignment.

    Only the first line of the tree file is read; single quotes are removed
    from it before it is parsed as newick.

    Args:
        tree: Path to a file whose first line is the newick tree.
        MA: Alignment passed to ``link_to_alignment`` as FASTA.
    """
    # The context manager closes the handle even if reading fails.
    with open(tree, "r") as tree_file:
        newick = tree_file.readline().replace("'", "")

    phylo = PhyloTree(newick)
    print(phylo)
    phylo.link_to_alignment(alignment=MA, alg_format="fasta")
    phylo.show()
예제 #23
0
def open_tree(tree_file_path):
    """Load a tree from a ``contree`` or ``treefile`` path.

    The file kind is decided by substring match on the path, with 'contree'
    taking precedence.  For a 'treefile' the internal node names hold
    "SH-aLRT support (%) / ultrafast bootstrap support (%)"; the second value
    is copied into ``node.support``.  Exits the program when the path
    contains neither marker.
    """
    is_consensus = 'contree' in tree_file_path
    if not is_consensus and 'treefile' not in tree_file_path:
        sys.exit('Error: tree format not recognised')

    if is_consensus:
        return PhyloTree(tree_file_path, sp_naming_function=None)

    tree = PhyloTree(tree_file_path, sp_naming_function=None, format=1)
    for node in tree.iter_descendants():
        if node.is_leaf():
            continue
        fields = node.name.split('/')
        # A node without a second field has no support values (sequences were
        # identical), so its support is set artificially to 100.0.
        node.support = float(fields[1]) if len(fields) > 1 else 100.0
    return tree
예제 #24
0
def safe_phylo_read(filename) -> PhyloTree:
    """Return a ``PhyloTree`` for ``filename``, trying several newick formats.

    An argument that is already a ``PhyloTree`` is returned unchanged.
    Otherwise parsing is attempted with format 3, the library default,
    format 1 and finally format 5, in that order.

    Raises:
        NewickError: If the final attempt (format 5) fails; a hint is written
            to stderr before the error is re-raised.
    """
    if isinstance(filename, PhyloTree):
        return filename

    # Empty kwargs means "use PhyloTree's default format".
    for parse_kwargs in ({"format": 3}, {}, {"format": 1}):
        try:
            return PhyloTree(filename, **parse_kwargs)
        except Exception:
            # Best-effort fallback to the next format.  Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            continue

    try:
        return PhyloTree(filename, format=5)
    except NewickError:
        print(f"Are you sure tree {filename} exists?", file=sys.stderr, flush=True)
        raise
예제 #25
0
    def test_species(self):
        """
        Check that ``species(@)`` and the NCBI taxonomy attributes
        (``sci_name``, ``lineage``, ``rank``, ``taxid``, ``named_lineage``)
        can be used inside ``TreePattern`` expressions.
        """

        # test node.species

        species_tree = PhyloTree(
            """(Felis_catus_1:1,
                (Homo_sapiens_1:1, Pan_troglodytes_1:1),
                Saccharomyces_cerevisiae_1:1);""",
            format=1)
        # Species code is the second "_"-separated field of the leaf name.
        species_tree.set_species_naming_function(lambda n: n.name.split("_")[1] if "_" in n.name else '')

        pattern0 = """('',
                       (' len(set(["sapiens","pygmaeus"]) & species(@))>0',
                       Pan_troglodytes_1)
                       );"""

        pattern0 = TreePattern(pattern0)


        root = species_tree.get_tree_root()
        self.assertEqual(list(pattern0.find_match(species_tree)), [root])

        # test ncbi taxonomy

        # Unused below; presumably kept so the local taxonomy database is
        # initialised before annotate_ncbi_taxa() runs -- confirm.
        ncbi = NCBITaxa()
        taxonomy_tree = PhyloTree("((9598, 9606), 10090);", sp_naming_function=lambda name: name)
        taxonomy_tree.annotate_ncbi_taxa()
        root = taxonomy_tree.get_tree_root()

        pattern1 = """ '  @.sci_name == "Euarchontoglires" ';"""
        # Leaf 9606 is annotated with the scientific name "Homo sapiens"; the
        # pattern must spell it exactly or it can never match.
        pattern2 = """
          (( '@.sci_name=="Homo sapiens"' , '9526 in @.lineage ' )' @.rank=="subfamily" and @.taxid == 207598 ')
          '  @.sci_name == "Euarchontoglires" and "cellular organisms" in @.named_lineage';
          """

        pattern1 = TreePattern(pattern1)
        pattern2 = TreePattern(pattern2)

        match1 = pattern1.find_match(taxonomy_tree)
        match2 = pattern2.find_match(taxonomy_tree)

        self.assertEqual(list(match1), [root])
        self.assertEqual(list(match2), [root])
예제 #26
0
def make_species_list(path):
    """Return the leaf labels of ``<path>.3.fa.tre`` and their digit-free forms.

    Each leaf is converted with ``str`` and any leading newline or hyphen
    characters are stripped from the result.

    Args:
        path: Path prefix of the tree file (``.3.fa.tre`` is appended).

    Returns:
        A tuple ``(labels, species)``: ``labels`` is the list of cleaned leaf
        labels, and ``species`` is the same list with every digit removed.
    """
    t = PhyloTree("{}.3.fa.tre".format(path))
    # lstrip() takes a set of characters: this removes leading "\n" and "-".
    labels = [str(leaf).lstrip("\n--") for leaf in t]
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    species = [re.sub(r"\d", "", label) for label in labels]
    return (labels, species)
def process_family_tree(fam_tree_fileName, profile_fileName):
	"""Run the duplication-event analysis on one family tree.

	Reads the outgroup regex and species dictionary from the profile file,
	parses the tree with ``format=1`` and, when ``detect_multifurcation``
	gives a truthy result, collects the ingroup monophyletic clade nodes and
	passes them to ``get_SO_duplication_events``.

	Returns 0 when ``detect_multifurcation`` gives a falsy result, otherwise
	None.
	"""
	outgroup_pattern, species_map = read_profile_file(profile_fileName)
	outgroup_matcher = re.compile(outgroup_pattern)
	tree = PhyloTree(fam_tree_fileName, format=1)

	if detect_multifurcation(tree):
		clade_nodes = get_ingroup_monophyletic_clade_nodes(tree, outgroup_matcher)
		get_SO_duplication_events(tree, clade_nodes, species_map, fam_tree_fileName)
		return None
	return 0
def build_tree(aln, tree, basename, show, output_format):
    """Build phylogenetic tree from files

    This function creates a file with the phylogenetic tree and a second file
    with the tree plus alignment, from the fasta multiple alignment and the
    tree in newick format.

    Parameters
    -------------
    aln: string
        Alignment string in fasta format
    tree: string
        Tree string in newick format; a missing terminating ";" is added and
        surrounding whitespace is ignored
    basename: string
        Basename of the original alignment file
    show: boolean
        Show ETE tree browser (yes/no)
    output_format: string
        Format of the output
    """
    # Strip first so a trailing newline does not hide an existing ";" (which
    # would otherwise get a second one appended); endswith() is also safe on
    # an empty string, unlike tree[-1].
    newick = tree.strip()
    if not newick.endswith(";"):
        newick += ";"
    genetree = PhyloTree(newick)

    ts = TreeStyle()
    ts.show_leaf_name = False

    new_tree = "{BASENAME}_Tree.{FORMAT}".format(BASENAME=basename,
                                                 FORMAT=output_format)
    new_tree_aln = "{BASENAME}_Tree_aln.{FORMAT}".format(BASENAME=basename,
                                                         FORMAT=output_format)

    # The tree is rendered once bare and once after the alignment is linked.
    genetree.render(new_tree, tree_style=ts)
    genetree.link_to_alignment(aln)
    genetree.render(new_tree_aln, tree_style=ts)

    if show:
        genetree.show(tree_style=ts)
예제 #29
0
def main():
    """Print a per-gene-tree table of species copy numbers as TSV.

    Command-line options:
        --genetree: file with one gene tree (nhx format) per line.
        --speciesorder: comma-separated species list; it fixes the column
            order and the preference order for choosing the reference gene.

    Each leaf name must have the form ``gene_species``.  For every tree the
    number of leaves per species is counted, and the "gene" column is set to
    the gene of the first leaf belonging to the first listed species that
    occurs in the tree.

    Raises:
        Exception: If a leaf name is not in ``gene_species`` form, or if none
            of the listed species occurs in a tree.
    """
    parser = argparse.ArgumentParser(description='Gene Copy Number Finder')
    parser.add_argument('--genetree',
                        required=True,
                        help='GeneTree in nhx format')
    parser.add_argument('--speciesorder',
                        required=True,
                        help='Comma-separated species list')
    args = parser.parse_args()

    species_list = [name.strip() for name in args.speciesorder.split(",")]
    table = []

    with open(args.genetree, "r") as f:
        # The file holds one gene tree per line.
        for line in f:
            # Remove empty NHX features that can be produced by TreeBest but break ete3
            line = line.replace('[&&NHX]', '')

            # A blank line (e.g. a trailing newline at end of file) is not a
            # tree; skip it instead of letting the parser abort the run.
            if not line.strip():
                continue

            genetree = PhyloTree(line)
            leaves = genetree.get_leaf_names()

            leaves_parts = [leaf.split("_") for leaf in leaves]
            for leaf, leaf_parts in zip(leaves, leaves_parts):
                if len(leaf_parts) != 2:
                    raise Exception(
                        "Leaf node '%s' is not in gene_species format" %
                        leaf)

            species_counter = collections.Counter(
                parts[1] for parts in leaves_parts)

            # Assign to ref_species the first element of species_list which
            # appears in a leaf node
            for ref_species in species_list:
                if ref_species in species_counter:
                    break
            else:
                raise Exception(
                    "None of the specified species was found in the GeneTree '%s'"
                    % line)

            # Find the gene of the (first) leaf node for the ref_species
            for leaf_parts in leaves_parts:
                if leaf_parts[1] == ref_species:
                    species_counter['gene'] = leaf_parts[0]
                    break

            table.append(species_counter)

    colList = ["gene"] + species_list
    printTSV(table, colList)
def main():
    """Split a gene tree at its duplication nodes and write each subtree.

    Reads the gene tree given by ``--genetree``.  With ``--gainlose`` the gene
    tree is first reconciled with the species tree given by ``--speciestree``.
    Every subtree returned by ``split_by_dups()`` is written to
    ``<n>_genetree.nhx`` (numbered from 1) in the current directory, using the
    newick format selected by ``--output_format``.
    """
    usage = "usage: %prog --genetree <genetree-file> --speciestree <speciestree-file> [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('--genetree', help='GeneTree in nhx format')
    parser.add_option('--speciestree', help='Species Tree in nhx format')
    parser.add_option('--species_format', type='int', default=8, help='Species Tree input format (0-9)')
    parser.add_option('--gene_node', type='int', default=0, help='Gene node format 0=gene_species, 1=species_gene')
    parser.add_option('--gainlose', action='store_true', default=False, help='Find out gene gain/lose')
    parser.add_option('--output_format', type='int', default=9, help='GeneTree output format (0-9)')
    options, args = parser.parse_args()

    # optparse has no "required" options, so the check is done by hand.
    if options.genetree is None:
        parser.error("--genetree option must be specified, GeneTree in nhx format")

    # Read the single gene tree.
    genetree = PhyloTree(options.genetree)

    # Set the species naming function for the gene_species leaf format.
    # NOTE(review): nothing is configured here when --gene_node is 1 -- confirm
    # that the default naming function is intended in that case.
    if options.gene_node == 0:
        genetree.set_species_naming_function(parse_sp_name)

    # Reconcile the gene tree with the species tree to help find gene gain/loss.
    if options.gainlose:

        if options.speciestree is None:
            parser.error("--speciestree option must be specified, species tree in nhx format")

        # Read the species tree.
        speciestree = PhyloTree(options.speciestree, format=options.species_format)

        # Remove the '*' markers that a species tree configured for TreeBest
        # carries on its species names.
        for leaf in speciestree:
            leaf.name = leaf.name.strip('*')

        # From here on `genetree` is the reconciled tree; `events` is unused.
        genetree, events = genetree.reconcile(speciestree)

    # split_by_dups() returns the list of all subtrees obtained by splitting
    # the current tree at its duplication nodes; each is written to its own file.
    for cluster_id, node in enumerate(genetree.split_by_dups(), 1):
        outfile = str(cluster_id) + '_genetree.nhx'
        with open(outfile, 'w') as f:
            f.write(node.write(format=options.output_format))