def testJustLeaves(self):
    """Run greedy outgroup selection with only leaf genomes as candidates
    and spot-check the closest outgroup chosen for each ancestor of the
    Boreoeutherian test tree."""
    newick = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    cactusTree = MultiCactusTree(NXNewick().parseString(newick, addImpliedRoots=False))
    cactusTree.computeSubtreeRoots()
    selector = GreedyOutgroup()
    selector.importTree(cactusTree)
    # Restrict candidates to the leaf genomes only.
    leafNames = {cactusTree.getName(node) for node in cactusTree.getLeaves()}
    selector.greedy(candidateSet=leafNames, candidateChildFrac=2.)
    # Ancestors with a single expected closest outgroup.
    assert selector.ogMap['Anc1'][0][0] == 'HUMAN'
    assert selector.ogMap['Anc5'][0][0] == 'HUMAN'
    assert selector.ogMap['Anc7'][0][0] == 'BABOON'
    # Ancestors where either member of an equidistant cherry is acceptable.
    assert selector.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
    assert selector.ogMap['Anc3'][0][0] in ['PIG', 'COW']
    assert selector.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
    assert selector.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
def testMultipleOutgroupsJustLeaves(self):
    """Greedy outgroup selection with maxNumOutgroups=3 and leaf-only
    candidates: every ancestor gets at most three outgroups sorted by
    distance, and Anc1/Anc7 get the expected assignments."""
    tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots=False))
    mcTree.computeSubtreeRoots()
    og = GreedyOutgroup()
    og.importTree(mcTree)
    candidates = set([mcTree.getName(x) for x in mcTree.getLeaves()])
    og.greedy(candidateSet=candidates, candidateChildFrac=2.,
              maxNumOutgroups=3)
    # make sure all entries have <= 3 outgroups.
    assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
    # and for all entries, the closest must be first.
    assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                   og.ogMap.values()))
    # ordering is important!
    # BUG FIX: map() must be materialized with list() — under Python 3 a
    # map object never compares equal to a list, so the bare comparison
    # was always False and this assert always fired.
    assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']
    assert og.ogMap['Anc7'][0][0] == 'BABOON'
    assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
    assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']
def runCactusAfterBlastOnly(options):
    """Run the cactus alignment phase on the output of a prior cactus-blast
    invocation, then export the resulting HAL file to options.outputHal.

    Builds a progressive-cactus project from the seq file, imports the
    blast-phase cigars/fragments into the Toil file store, and starts the
    run_cactus_align job.  Supports --restart to resume a failed run.

    :param options: parsed command-line options (Toil options plus cactus
        options such as seqFile, configFile, cigarsFile, root, pangenome).
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # Resume the previously-started workflow from the job store.
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                # Rewrite the per-genome paths, then point options.seqFile at
                # a rewritten copy of the seq file.
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            # Only the subtree below options.root is aligned here.
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                # Prefer an explicitly-supplied path ending in the suffix;
                # otherwise derive it from the shortest common base path.
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    # A directory of fasta files is concatenated into one
                    # temporary file before import.
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                # Secondary alignments are optional; best-effort import.
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except:
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                # NOTE(review): coverage files appear to be indexed by leaf
                # position (one '.ig_coverage_{i}' per leaf) — confirm this
                # matches what cactus-blast writes.
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
class TestCase(unittest.TestCase):
    """Unit tests for GreedyOutgroup / DynamicOutgroup selection.

    Fixtures: a set of random trees (size < 50), the Boreoeutherian
    Blanchette tree, and a Eutherian backbone tree, plus dummy fasta
    files so the dynamic (sequence-aware) outgroup code has input.

    Fixes applied relative to the original: `map(...)` results are
    materialized with `list(...)` before comparing against lists (a map
    object never equals a list under Python 3, so those asserts always
    failed), and the deprecated `assertEquals` alias (removed in Python
    3.12) is replaced by `assertEqual`.
    """

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree)
                seqMap = dict()
                # Give every node a deterministic name and map it to the
                # shared dummy fasta.
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(
            borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(
            backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        # Per-genome sequence lengths for the Blanchette-style fixtures.
        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        """Leaf-only candidates: check the closest outgroup per ancestor."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        # assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual.
        self.assertEqual(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no
        additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),
                  threshold=0,
                  maxNumOutgroups=3,
                  candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)] \
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        """Candidate sets with candidateChildFrac 0.5 vs 1.0 give different
        ancestral-node admissibility."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        """Unrestricted candidate sets must find outgroups at least as close
        as leaf-only candidate sets."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)
            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        """Same as above, but with a depth threshold on the general run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)
            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        """maxNumOutgroups=3 with unrestricted candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        # (map() results materialized with list() so the comparison is
        # meaningful under Python 3.)
        assert list(map(itemgetter(0), og.ogMap['Anc4'])) == ['Anc1']
        assert list(map(itemgetter(0),
                        og.ogMap['Anc7'])) == ['BABOON', 'Anc1', 'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert list(map(itemgetter(0),
                        og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        """maxNumOutgroups=3 with leaf-only candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates,
                  candidateChildFrac=2.,
                  maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        # (materialized with list() for Python 3 — see testMultipleOutgroups)
        assert list(map(itemgetter(0),
                        og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    og.ogMap.values()))

    def testDynamicOutgroupsOnRandomTrees(self):
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([
                len(tree.getChildren(x))
                for x in tree.breadthFirstTraversal()
            ])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(
                    map(lambda x: x == sorted(x, key=itemgetter(1)),
                        og.ogMap.values()))

    def testDynamicOutgroupsJustLeaves(self):
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # we keep dynamic outgroups sorted by distance too
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.

        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(
                map(lambda x: len(x) <= 3, ogMultipleTimes.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    ogMultipleTimes.ogMap.values()))
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different candidate
        sets will behave properly, i.e. keep all the existing outgroup
        assignments and fill in more on the second run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([
                tree.getName(i)
                for i in random.sample(nodes, min(20, len(nodes)))
            ])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogTwice.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    ogTwice.ogMap.values()))
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(
                                sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(
                                sink1Id)
def make_align_job(options, toil):
    """Build and return the Toil job that runs the cactus alignment phase.

    Sets up the progressive-cactus project, imports sequences, cigars and
    (optionally) outgroup fragments into the Toil file store, applies
    config overrides (--singleCopySpecies, --barMaskFilter, --pangenome),
    and wraps run_cactus_align as a job for the caller to start/append.

    :param options: parsed command-line options (seqFile, configFile,
        cigarsFile, root, acyclic, pangenome, batch, ...); mutated here
        (cactusDir, seqFile, root may be rewritten).
    :param toil: an open Toil context used to import input files.
    :return: a Job wrapping run_cactus_align.
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides. this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile). one way to fix would be to add support
    # for s3 paths and force wdl to use it. a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # Default the alignment root to the tree root when not specified.
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    # --acyclic must name an actual leaf genome; fail fast otherwise.
    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    # Only the subtree below options.root is aligned in this job.
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        # Prefer an explicitly-supplied path ending in the suffix;
        # otherwise derive it from the shortest common base path.
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            # A directory of fasta files is concatenated into one temporary
            # file before import.
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)
            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            # For s3 output, write locally then upload.
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(
        options,
        experimentFile=experimentFile,
        configNode=configNode,
        seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        # Secondary alignments are optional; best-effort import.
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        # NOTE(review): one '.ig_coverage_{i}' file per leaf is assumed —
        # confirm this matches what cactus-blast writes.
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
class TestCase(unittest.TestCase):
    """Tests for outgroup selection (GreedyOutgroup / DynamicOutgroup) run
    against a set of random trees plus two fixed reference trees: the
    Boreoeutherian tree and a Eutherian backbone tree."""

    def setUp(self):
        """Build the test fixtures: random MultiCactusTrees with dummy
        sequence maps, the two fixed reference trees, and per-event FASTA
        files of known lengths for the dynamic-outgroup tests."""
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            # keep fixture sizes manageable
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree, tree.degree())
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        # per-event FASTA files (all 'A's) of realistic lengths, for the
        # sequence-size-aware DynamicOutgroup tests
        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        """Remove the temporary fixture directory."""
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        """Greedy selection restricted to leaf candidates on the fixed tree."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        # assertEqual: assertEquals is a deprecated alias in python3
        self.assertEqual(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no
        additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']), threshold=0,
                  maxNumOutgroups=3, candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)]
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        """candidateChildFrac controls whether ancestral nodes qualify as
        candidates based on the fraction of candidate children they have."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        """Unrestricted candidate sets should find outgroups at least as
        close as leaf-only candidate sets."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)
            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        """Same as above, but with a depth threshold on the general run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)
            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        """Check multiple-outgroup assignment on the fixed tree."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(len(x) <= 3 for x in og.ogMap.values())
        # and for all entries, the closest must be first.
        assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())
        # ordering is important!
        # NOTE: list(...) is required -- in python3 map() returns an
        # iterator, which never compares equal to a list, so the bare
        # map(...) == [...] form would always fail.
        assert list(map(itemgetter(0), og.ogMap['Anc4'])) == ['Anc1']
        assert list(map(itemgetter(0), og.ogMap['Anc7'])) == ['BABOON', 'Anc1', 'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        """Multiple-outgroup assignment restricted to leaf candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2., maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(len(x) <= 3 for x in og.ogMap.values())
        # and for all entries, the closest must be first.
        assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())
        # ordering is important!
        # (list(...) needed: map() is a lazy iterator in python3)
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        """Invariants of multiple-outgroup assignment on every test tree."""
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(len(x) <= 3 for x in og.ogMap.values())
            # and for all entries, the closest must be first.
            assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())

    def testDynamicOutgroupsOnRandomTrees(self):
        """Invariants of DynamicOutgroup on the random trees."""
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([len(tree.getChildren(x)) for x in tree.breadthFirstTraversal()])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(len(x) <= 3 for x in og.ogMap.values())
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())

    def testDynamicOutgroupsJustLeaves(self):
        """DynamicOutgroup on the fixed tree, with and without the
        sequence-loss term."""
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(len(x) <= 3 for x in og.ogMap.values())
        # and for all entries, the closest must be first.
        assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(len(x) <= 3 for x in og.ogMap.values())
        # we keep dynamic outgroups sorted by distance too
        assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates. Check that running
        greedy() multiple times with the same parameters gives the same
        result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(len(x) <= 3 for x in ogMultipleTimes.ogMap.values())
            # and for all entries, the closest must be first.
            assert all(x == sorted(x, key=itemgetter(1)) for x in ogMultipleTimes.ogMap.values())
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different candidate
        sets will behave properly, i.e. keep all the existing outgroup
        assignments and fill in more on the second run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([tree.getName(i) for i in random.sample(nodes, min(20, len(nodes)))])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(len(x) <= 3 for x in ogTwice.ogMap.values())
            # and for all entries, the closest must be first.
            assert all(x == sorted(x, key=itemgetter(1)) for x in ogTwice.ogMap.values())
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(sink1Id)
def runCactusAfterBlastOnly(options):
    """Run the alignment stage of cactus on the output of cactus-blast and
    export the resulting HAL file to options.outputHal.

    options -- parsed command-line options; must carry jobStore/Toil fields
    plus seqFile/configFile/root/blastOutput/nonBlastInput/outputHal.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # BUGFIX: this used to assign to `alignmentID`, leaving `halID`
            # undefined and crashing the exportFile call on every restart.
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir,
                                  ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(options.blastOutput) + '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except Exception:
                    # missing fragments are fatal only for true cactus-blast
                    # input; otherwise fall back to whole-sequence outgroups
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input and genome in outgroups):
                    # a directory of FASTAs is concatenated into one file
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                                 tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not cactus_blast_input:
                outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options,
                                                   experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except Exception:
                # secondary alignments are optional
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                # one coverage file per (ingroup) leaf
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(options.blastOutput) + '.ig_coverage_{}'.format(i)))

            halID = toil.start(Job.wrapJobFn(run_cactus_align, configWrapper,
                                             workFlowArgs, project,
                                             cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))