Пример #1
0
    def testCandidates(self):
        tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots = False))
        mcTree.computeSubtreeRoots()
        og = GreedyOutgroup()
        og.importTree(mcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(mcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        candidateFrac = 1
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'
def createMCProject(tree, experiment, config, options):
    """
    Creates a properly initialized MultiCactusProject.

    TODO: This should really all be in the constructor for MultiCactusProject.
    """
    mcTree = MultiCactusTree(tree)
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    for genome in experiment.getGenomesWithSequence():
        mcProj.inputSequenceMap[genome] = experiment.getSequenceID(genome)
    mcProj.mcTree = mcTree
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except:
            raise RuntimeError("Specified root name %s not found in tree" %
                               options.root)

    fillInOutgroups(mcProj, options.outgroupNames, config, alignmentRootId)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, experiment, alignmentRootId)
    return mcProj
Пример #3
0
    def testDynamicOutgroupsJustLeaves(self):
        tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots = False))
        mcTree.computeSubtreeRoots()
        og = DynamicOutgroup()
        og.importTree(mcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(mcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))

        # we keep dynamic outgroups sorted by distance too
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                               og.ogMap.values()))
Пример #4
0
 def testAddSelf(self):
     trueSelf = '((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;'
     tree = MultiCactusTree(self.mcTree1)
     tree.nameUnlabeledInternalNodes()
     tree.computeSubtreeRoots()
     tree.addSelfEdges()
     treeString = NXNewick().writeString(tree)
     self.assertEqual(treeString, trueSelf)
Пример #5
0
 def testAddSelf(self):
     trueSelf = "((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;"
     tree = MultiCactusTree(self.mcTree1)
     tree.nameUnlabeledInternalNodes()
     tree.computeSubtreeRoots()
     tree.addSelfEdges()
     treeString = NXNewick().writeString(tree)
     self.assertEqual(treeString, trueSelf)
Пример #6
0
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree)
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(
            borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(
            backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p
Пример #7
0
 def getTree(self, onlyThisSubtree=False):
     treeString = self.xmlRoot.attrib["species_tree"]
     ret = NXNewick().parseString(treeString, addImpliedRoots=False)
     if onlyThisSubtree:
         # Get a subtree containing only the reference node and its
         # children, rather than a species tree including the
         # outgroups as well
         multiCactus = MultiCactusTree(ret)
         multiCactus.nameUnlabeledInternalNodes()
         multiCactus.computeSubtreeRoots()
         ret = multiCactus.extractSubTree(self.getRootGenome())
     return ret
Пример #8
0
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree, tree.degree())
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event +".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p
Пример #9
0
 def getTree(self, onlyThisSubtree=False):
     treeString = self.xmlRoot.attrib["species_tree"]
     ret = NXNewick().parseString(treeString, addImpliedRoots = False)
     if onlyThisSubtree:
         # Get a subtree containing only the reference node and its
         # children, rather than a species tree including the
         # outgroups as well
         multiCactus = MultiCactusTree(ret)
         multiCactus.nameUnlabeledInternalNodes()
         multiCactus.computeSubtreeRoots()
         ret = multiCactus.extractSubTree(self.getReferenceNameFromConfig())
     return ret
Пример #10
0
 def testJustLeaves(self):
     tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
     mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots = False))
     mcTree.computeSubtreeRoots()
     og = GreedyOutgroup()
     og.importTree(mcTree)
     candidates = set([mcTree.getName(x) for x in mcTree.getLeaves()])
     og.greedy(candidateSet=candidates, candidateChildFrac=2.)
     assert og.ogMap['Anc1'][0][0] == 'HUMAN'
     assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
     assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
     assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
     assert og.ogMap['Anc5'][0][0] == 'HUMAN'
     assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
     assert og.ogMap['Anc7'][0][0] == 'BABOON'
Пример #11
0
    def testAddOutgroup(self):
        trueOg = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;'
        tree = MultiCactusTree(self.mcTree1)
        tree.nameUnlabeledInternalNodes()
        tree.computeSubtreeRoots()
        tree.addOutgroup("outgroup", 1.7)
        treeString = NXNewick().writeString(tree)
        self.assertEqual(treeString, trueOg)

        trueLeafOg = "(A:1.1,outgroup:1.1);"
        leafTreeString = "A;"
        parser = NXNewick()
        leafTree = MultiCactusTree(parser.parseString(leafTreeString, addImpliedRoots = False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        leafTreeOutString = NXNewick().writeString(leafTree)
        self.assertEqual(leafTreeOutString, trueLeafOg)
Пример #12
0
    def testAddOutgroup(self):
        trueOg = "((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;"
        tree = MultiCactusTree(self.mcTree1)
        tree.nameUnlabeledInternalNodes()
        tree.computeSubtreeRoots()
        tree.addOutgroup("outgroup", 1.7)
        treeString = NXNewick().writeString(tree)
        self.assertEqual(treeString, trueOg)

        trueLeafOg = "(A:1.1,outgroup:1.1);"
        leafTreeString = "A;"
        parser = NXNewick()
        leafTree = MultiCactusTree(parser.parseString(leafTreeString, addImpliedRoots=False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        leafTreeOutString = NXNewick().writeString(leafTree)
        self.assertEqual(leafTreeOutString, trueLeafOg)
Пример #13
0
 def testMultipleOutgroups(self):
     tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
     mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots = False))
     mcTree.computeSubtreeRoots()
     og = GreedyOutgroup()
     og.importTree(mcTree)
     og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
     # make sure all entries have <= 3 outgroups.
     assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
     # and for all entries, the closest must be first.
     assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                    og.ogMap.values()))
     # ordering is important!
     assert map(itemgetter(0), og.ogMap['Anc4']) == ['Anc1']
     assert map(itemgetter(0), og.ogMap['Anc7']) == ['BABOON', 'Anc1',
                                                     'Anc5']
     # We avoid cycles, and choose post-order first, so this only
     # uses leaves.
     assert map(itemgetter(0), og.ogMap['Anc1']) == ['HUMAN', 'CHIMP',
                                                     'BABOON']
Пример #14
0
 def testMultipleOutgroupsJustLeaves(self):
     tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
     mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots = False))
     mcTree.computeSubtreeRoots()
     og = GreedyOutgroup()
     og.importTree(mcTree)
     candidates = set([mcTree.getName(x) for x in mcTree.getLeaves()])
     og.greedy(candidateSet=candidates, candidateChildFrac=2.,
               maxNumOutgroups=3)
     # make sure all entries have <= 3 outgroups.
     assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
     # and for all entries, the closest must be first.
     assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                    og.ogMap.values()))
     # ordering is important!
     assert map(itemgetter(0), og.ogMap['Anc1']) == ['HUMAN', 'CHIMP',
                                                     'BABOON']
     assert og.ogMap['Anc7'][0][0] == 'BABOON'
     assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
     assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']
Пример #15
0
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.trees = randomTreeSet()
     self.mcTrees = []
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempFa = os.path.join(self.tempDir, "seq.fa")
     with open(self.tempFa, "w") as f:
         f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
     self.dummySeqMaps = []
     for tree in self.trees:
         if tree.size() < 500:
             mcTree = MultiCactusTree(tree, tree.degree())
             seqMap = dict()
             for i in mcTree.breadthFirstTraversal():
                 mcTree.setName(i, "Node%s" % str(i))
                 seqMap["Node%s" % str(i)] = self.tempFa
             mcTree.computeSubtreeRoots()
             self.mcTrees.append(mcTree)
             self.dummySeqMaps.append(seqMap)
             
     seqLens = dict()
     seqLens["HUMAN"] = 57553
     seqLens["CHIMP"] = 57344
     seqLens["BABOON"] = 58960
     seqLens["MOUSE"] = 32750
     seqLens["RAT"] = 38436
     seqLens["DOG"] = 54187
     seqLens["CAT"] = 50283
     seqLens["PIG"] = 54843
     seqLens["COW"] = 55508
     self.blanchetteSeqMap = dict()
     for event, seqLen in seqLens.items():
         p = os.path.join(self.tempDir, event +".fa")
         with open(p, "w") as f:
             f.write(">%s\n" % event)
             f.write(''.join(['A'] * seqLen))
             f.write('\n')
         self.blanchetteSeqMap[event] = p
Пример #16
0
def createMCProject(tree, experiment, config, options):
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    mcProj.inputSequences = experiment.getSequences()[:] 
    mcProj.outputSequenceDir = experiment.getOutputSequenceDir()
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception as e:
            raise RuntimeError("Specified root name %s not found in tree" % options.root)
    mcProj.outgroup = None
    if config.getOutgroupStrategy() == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=options.outgroupNames,
                               candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set([mcProj.mcTree.getName(x) for x in mcProj.mcTree.getLeaves()])
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=options.outgroupNames,
                               candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                               maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=None,
                               candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically. 
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, mcProj.getInputSequenceMap(), alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" % config.getOutgroupStrategy())

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing. 
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
Пример #17
0
class TestCase(unittest.TestCase):

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree, tree.degree())
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event +".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        self.assertEquals(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEquals(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEquals(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEquals(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEquals(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEquals(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)] \
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert map(itemgetter(0), og.ogMap['Anc4']) == ['Anc1']
        assert map(itemgetter(0), og.ogMap['Anc7']) == ['BABOON', 'Anc1',
                                                        'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert map(itemgetter(0), og.ogMap['Anc1']) == ['HUMAN', 'CHIMP',
                                                        'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.,
                  maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert map(itemgetter(0), og.ogMap['Anc1']) == ['HUMAN', 'CHIMP',
                                                        'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           og.ogMap.values()))

    def testDynamicOutgroupsOnRandomTrees(self):
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([len(tree.getChildren(x)) for x in
                         tree.breadthFirstTraversal()])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                               og.ogMap.values()))

    def testDynamicOutgroupsJustLeaves(self):
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))

        # we keep dynamic outgroups sorted by distance too
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                               og.ogMap.values()))
                        

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.
        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogMultipleTimes.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           ogMultipleTimes.ogMap.values()))
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different candidate
        sets will behave properly, i.e. keep all the existing outgroup
        assignments and fill in more on the second run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([tree.getName(i) for i in random.sample(nodes, min(20, len(nodes)))])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogTwice.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           ogTwice.ogMap.values()))
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(sink1Id)
Пример #18
0
class TestCase(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.mcTree1 = None
        self.mcTree1a = None
        self.mcTree2 = None
        self.__generateTrees()

    def testSanity(self):
        parser = NXNewick()
        mcTree1 = MultiCactusTree(parser.parseString(self.tree1, addImpliedRoots=False))
        tree1String = NXNewick().writeString(mcTree1)
        self.assertEqual(tree1String, self.tree1)
        mcTree2 = MultiCactusTree(parser.parseString(self.tree2, addImpliedRoots=False), subtreeSize=3)
        tree2String = NXNewick().writeString(mcTree2)
        self.assertEqual(tree2String, self.tree2)

    def testSubtrees(self):
        roots1 = ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"]
        roots1a = ["Anc0", "Anc3", "Anc4", "Anc5", "Anc6"]
        roots2 = ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4"]

        subTree1_a3 = "(Anc7:0.025291,BABOON:0.044568)Anc3;"
        subTree1a_a0 = "((Anc3:0.11,Anc4:0.260342)Anc1:0.02326,(Anc5:0.087381,Anc6:0.104728)Anc2:0.04)Anc0;"
        subTree2_a3 = "(monkey:100.8593,cat:47.14069)Anc5;"

        trueRoots = [roots1, roots1a, roots2]
        trueSubtrees = [subTree1_a3, subTree1a_a0, subTree2_a3]
        trees = [self.mcTree1, self.mcTree1a, self.mcTree2]
        ancs = ["Anc3", "Anc0", "Anc5"]

        for i in range(0, 3):
            roots = trees[i].getSubtreeRootNames()
            self.assertEqual(sorted(roots), sorted(trueRoots[i]))
            subtree = trees[i].extractSubTree(ancs[i])
            subtree = NXNewick().writeString(subtree)
            self.assertEqual(subtree, trueSubtrees[i])

    def testAddSelf(self):
        trueSelf = "((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;"
        tree = MultiCactusTree(self.mcTree1)
        tree.nameUnlabeledInternalNodes()
        tree.computeSubtreeRoots()
        tree.addSelfEdges()
        treeString = NXNewick().writeString(tree)
        self.assertEqual(treeString, trueSelf)

    def testAddOutgroup(self):
        trueOg = "((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;"
        tree = MultiCactusTree(self.mcTree1)
        tree.nameUnlabeledInternalNodes()
        tree.computeSubtreeRoots()
        tree.addOutgroup("outgroup", 1.7)
        treeString = NXNewick().writeString(tree)
        self.assertEqual(treeString, trueOg)

        trueLeafOg = "(A:1.1,outgroup:1.1);"
        leafTreeString = "A;"
        parser = NXNewick()
        leafTree = MultiCactusTree(parser.parseString(leafTreeString, addImpliedRoots=False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        leafTreeOutString = NXNewick().writeString(leafTree)
        self.assertEqual(leafTreeOutString, trueLeafOg)

    def testExtractSpanningTree(self):
        """Tests whether extracting a binary spanning tree works correctly."""
        prevNewick1 = NXNewick().writeString(self.mcTree1)
        # Check a dead-simple spanning tree with 3 closely related leaves.
        spanHCB = self.mcTree1.extractSpanningTree(["HUMAN", "CHIMP", "BABOON"])
        # Check that the existing tree hasn't been modified (OK, a bit
        # silly, but just in case).
        self.assertEqual(NXNewick().writeString(self.mcTree1), prevNewick1)
        # Check the actual spanning tree.
        self.assertEqual(
            NXNewick().writeString(spanHCB), "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3;"
        )

        # Now test a more complicated tree, where we should remove as
        # many of the ancestors as possible (they will add extra
        # losses for no reason!).
        spanHCC = self.mcTree1.extractSpanningTree(["HUMAN", "CHIMP", "CAT"])
        self.assertEqual(NXNewick().writeString(self.mcTree1), prevNewick1)
        self.assertEqual(
            NXNewick().writeString(spanHCC), "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.158551,CAT:0.197381)Anc0;"
        )

    def __generateTrees(self):
        self.tree1 = "((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);"
        self.tree2 = "((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);"
        parser = NXNewick()
        self.mcTree1 = MultiCactusTree(parser.parseString(self.tree1, addImpliedRoots=False))
        self.mcTree1a = MultiCactusTree(parser.parseString(self.tree1, addImpliedRoots=False), subtreeSize=4)
        self.mcTree2 = MultiCactusTree(parser.parseString(self.tree2, addImpliedRoots=False), subtreeSize=3)
        self.mcTree1.nameUnlabeledInternalNodes()
        self.mcTree1a.nameUnlabeledInternalNodes()
        self.mcTree2.nameUnlabeledInternalNodes()
        self.mcTree1.computeSubtreeRoots()
        self.mcTree1a.computeSubtreeRoots()
        self.mcTree2.computeSubtreeRoots()
Пример #19
0
class TestCase(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.mcTree1 = None
        self.mcTree1a = None
        self.mcTree2 = None
        self.__generateTrees()

    def testSanity(self):
        parser = NXNewick()
        mcTree1 = MultiCactusTree(
            parser.parseString(self.tree1, addImpliedRoots=False))
        tree1String = NXNewick().writeString(mcTree1)
        self.assertEqual(tree1String, self.tree1)
        mcTree2 = MultiCactusTree(parser.parseString(self.tree2,
                                                     addImpliedRoots=False),
                                  subtreeSize=3)
        tree2String = NXNewick().writeString(mcTree2)
        self.assertEqual(tree2String, self.tree2)

    def testSubtrees(self):
        roots1 = [
            "Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"
        ]
        roots1a = ["Anc0", "Anc3", "Anc4", "Anc5", "Anc6"]
        roots2 = ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4"]

        subTree1_a3 = '(Anc7:0.025291,BABOON:0.044568)Anc3;'
        subTree1a_a0 = '((Anc3:0.11,Anc4:0.260342)Anc1:0.02326,(Anc5:0.087381,Anc6:0.104728)Anc2:0.04)Anc0;'
        subTree2_a3 = '(monkey:100.8593,cat:47.14069)Anc5;'

        trueRoots = [roots1, roots1a, roots2]
        trueSubtrees = [subTree1_a3, subTree1a_a0, subTree2_a3]
        trees = [self.mcTree1, self.mcTree1a, self.mcTree2]
        ancs = ["Anc3", "Anc0", "Anc5"]

        for i in range(0, 3):
            roots = trees[i].getSubtreeRootNames()
            self.assertEqual(sorted(roots), sorted(trueRoots[i]))
            subtree = trees[i].extractSubTree(ancs[i])
            subtree = NXNewick().writeString(subtree)
            self.assertEqual(subtree, trueSubtrees[i])

    def testAddSelf(self):
        trueSelf = '((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;'
        tree = MultiCactusTree(self.mcTree1)
        tree.nameUnlabeledInternalNodes()
        tree.computeSubtreeRoots()
        tree.addSelfEdges()
        treeString = NXNewick().writeString(tree)
        self.assertEqual(treeString, trueSelf)

    def testAddOutgroup(self):
        trueOg = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;'
        tree = MultiCactusTree(self.mcTree1)
        tree.nameUnlabeledInternalNodes()
        tree.computeSubtreeRoots()
        tree.addOutgroup("outgroup", 1.7)
        treeString = NXNewick().writeString(tree)
        self.assertEqual(treeString, trueOg)

        trueLeafOg = "(A:1.1,outgroup:1.1);"
        leafTreeString = "A;"
        parser = NXNewick()
        leafTree = MultiCactusTree(
            parser.parseString(leafTreeString, addImpliedRoots=False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        leafTreeOutString = NXNewick().writeString(leafTree)
        self.assertEqual(leafTreeOutString, trueLeafOg)

    def testExtractSpanningTree(self):
        """Tests whether extracting a binary spanning tree works correctly."""
        prevNewick1 = NXNewick().writeString(self.mcTree1)
        # Check a dead-simple spanning tree with 3 closely related leaves.
        spanHCB = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "BABOON"])
        # Check that the existing tree hasn't been modified (OK, a bit
        # silly, but just in case).
        self.assertEqual(NXNewick().writeString(self.mcTree1), prevNewick1)
        # Check the actual spanning tree.
        self.assertEqual(
            NXNewick().writeString(spanHCB),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3;"
        )

        # Now test a more complicated tree, where we should remove as
        # many of the ancestors as possible (they will add extra
        # losses for no reason!).
        spanHCC = self.mcTree1.extractSpanningTree(["HUMAN", "CHIMP", "CAT"])
        self.assertEqual(NXNewick().writeString(self.mcTree1), prevNewick1)
        self.assertEqual(
            NXNewick().writeString(spanHCC),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.158551,CAT:0.197381)Anc0;")

    def __generateTrees(self):
        self.tree1 = '((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
        self.tree2 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);'
        parser = NXNewick()
        self.mcTree1 = MultiCactusTree(
            parser.parseString(self.tree1, addImpliedRoots=False))
        self.mcTree1a = MultiCactusTree(parser.parseString(
            self.tree1, addImpliedRoots=False),
                                        subtreeSize=4)
        self.mcTree2 = MultiCactusTree(parser.parseString(
            self.tree2, addImpliedRoots=False),
                                       subtreeSize=3)
        self.mcTree1.nameUnlabeledInternalNodes()
        self.mcTree1a.nameUnlabeledInternalNodes()
        self.mcTree2.nameUnlabeledInternalNodes()
        self.mcTree1.computeSubtreeRoots()
        self.mcTree1a.computeSubtreeRoots()
        self.mcTree2.computeSubtreeRoots()
def createMCProject(tree, experiment, config, options):
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    mcProj.inputSequences = experiment.getSequences()[:]
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except:
            raise RuntimeError("Specified root name %s not found in tree" %
                               options.root)
    mcProj.outgroup = None
    if config.getOutgroupStrategy() == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set(
                [mcProj.mcTree.getName(x) for x in mcProj.mcTree.getLeaves()])
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree,
                                   mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" %
                           config.getOutgroupStrategy())

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
Пример #21
0
class TestCase(unittest.TestCase):
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree)
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(
            borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(
            backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        self.assertEquals(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEquals(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEquals(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEquals(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEquals(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEquals(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),
                  threshold=0,
                  maxNumOutgroups=3,
                  candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)] \
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        assert map(itemgetter(0), og.ogMap['Anc4']) == ['Anc1']
        assert map(itemgetter(0),
                   og.ogMap['Anc7']) == ['BABOON', 'Anc1', 'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert map(itemgetter(0),
                   og.ogMap['Anc1']) == ['HUMAN', 'CHIMP', 'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates,
                  candidateChildFrac=2.,
                  maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        assert map(itemgetter(0),
                   og.ogMap['Anc1']) == ['HUMAN', 'CHIMP', 'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    og.ogMap.values()))

    def testDynamicOutgroupsOnRandomTrees(self):
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([
                len(tree.getChildren(x)) for x in tree.breadthFirstTraversal()
            ])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(
                    map(lambda x: x == sorted(x, key=itemgetter(1)),
                        og.ogMap.values()))

    def testDynamicOutgroupsJustLeaves(self):
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))

        # we keep dynamic outgroups sorted by distance too
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.
        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(
                map(lambda x: len(x) <= 3, ogMultipleTimes.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    ogMultipleTimes.ogMap.values()))
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different candidate
        sets will behave properly, i.e. keep all the existing outgroup
        assignments and fill in more on the second run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([
                tree.getName(i)
                for i in random.sample(nodes, min(20, len(nodes)))
            ])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogTwice.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    ogTwice.ogMap.values()))
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(
                                sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(
                                sink1Id)