Example #1
    def testChildTreeJob(self):
        """Check that the ChildTreeJob class runs all children."""
        numChildren = 100
        flagDir = getTempDirectory()

        options = Job.Runner.getDefaultOptions(getTempDirectory())
        shutil.rmtree(options.jobStore)

        with Toil(options) as toil:
            toil.start(CTTestParent(flagDir, numChildren))

        # Check that all jobs ran
        for i in range(numChildren):
            self.assertTrue(os.path.exists(os.path.join(flagDir, str(i))))
        shutil.rmtree(flagDir)
Example #3
def scriptTree_SortTest(testNo, batchSystem, lines=10000, maxLineLength=10, N=10000):
    """Tests scriptTree/jobTree by sorting a file in parallel.
    """
    for test in range(testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "testJobTree")
        makeFileToSort(tempFile, lines=lines, maxLineLength=maxLineLength)
        #First make our own sorted version
        fileHandle = open(tempFile, 'r')
        l = fileHandle.readlines()
        l.sort()
        fileHandle.close()
        #Sort the file
        while True:
            command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %i --batchSystem %s --jobTime 1.0 --maxCpus 20 --retryCount 2" % (jobTreeDir, tempFile, N, batchSystem) #, retryCount)
            system(command)
            try:
                system("jobTreeStatus --jobTree %s --failIfNotComplete" % jobTreeDir)
                break
            except:
                print "The jobtree failed and will be restarted"
                #raise RuntimeError()
                continue
                
        #Now check the file is properly sorted..
        #Now get the sorted file
        fileHandle = open(tempFile, 'r')
        l2 = fileHandle.readlines()
        fileHandle.close()
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
Example #4
    def run(self, fileStore):
        sequenceFiles1 = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.sequenceFileIDs1
        ]
        chunks = runGetChunks(
            sequenceFiles=sequenceFiles1,
            chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
            chunkSize=self.blastOptions.chunkSize,
            overlapSize=self.blastOptions.overlapSize)
        assert len(chunks) > 0
        logger.info(
            "Broken up the sequence files into individual 'chunk' files")
        chunkIDs = [
            fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks
        ]

        diagonalResultsID = self.addChild(
            MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(
            MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(
            CollateBlasts(self.blastOptions,
                          [diagonalResultsID, offDiagonalResultsID])).rv()
Example #5
    def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
        """We compare the output with a naive run of the blast program, to check the results are nearly
        equivalent.
        """
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ] #, 2) ] #Could go to six
        species = ("human", "mouse", "dog")
        #Other species to try "rat", "monodelphis", "macaque", "chimp"
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            for i in range(len(species)):
                species1 = species[i]
                for species2 in species[i+1:]:
                    seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                    seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))

                    #Run simple blast
                    runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile, self.tempDir)
                    logger.info("Ran the naive blast okay")
                    
                    #Run cactus blast pipeline
                    toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
                    if blastMode == "allAgainstAll":
                        runCactusBlast(sequenceFiles=[ seqFile1, seqFile2 ],
                                       alignmentsFile=self.tempOutputFile2, toilDir=toilDir,
                                       chunkSize=500000, overlapSize=10000)
                    else:
                        runCactusBlast(sequenceFiles=[ seqFile1 ], alignmentsFile=self.tempOutputFile2,
                                       toilDir=toilDir, chunkSize=500000, overlapSize=10000,
                                       targetSequenceFiles=[ seqFile2 ])
                    logger.info("Ran cactus_blast okay")
                    logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                    checkCigar(self.tempOutputFile)
                    checkCigar(self.tempOutputFile2)
                    compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
Example #6
def runWorkflow_multipleExamples(inputGenFunction,
                                 testNumber=1,
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM,
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG),
                                 inverseTestRestrictions=False,
                                 batchSystem="single_machine",
                                 buildAvgs=False, buildReference=False,
                                 buildReferenceSequence=False,
                                 buildCactusPDF=False, buildAdjacencyPDF=False,
                                 buildReferencePDF=False,
                                 makeCactusTreeStats=False, makeMAFs=False,
                                 configFile=None, buildJobTreeStats=False):
    """A wrapper to run a number of examples.
    """
    if (inverseTestRestrictions and TestStatus.getTestStatus() not in testRestrictions) or \
        (not inverseTestRestrictions and TestStatus.getTestStatus() in testRestrictions):
        for test in range(testNumber):
            tempDir = getTempDirectory(os.getcwd())
            sequences, newickTreeString = inputGenFunction(regionNumber=test,
                                                           tempDir=tempDir)
            runWorkflow_TestScript(sequences,
                                   newickTreeString,
                                   batchSystem=batchSystem,
                                   buildAvgs=buildAvgs,
                                   buildReference=buildReference,
                                   buildCactusPDF=buildCactusPDF,
                                   buildAdjacencyPDF=buildAdjacencyPDF,
                                   makeCactusTreeStats=makeCactusTreeStats,
                                   makeMAFs=makeMAFs,
                                   configFile=configFile,
                                   buildJobTreeStats=buildJobTreeStats)
            system("rm -rf %s" % tempDir)
            logger.info("Finished random test %i" % test)
Example #7
 def testBlastRandom(self):
     """Make some sequences, put them in a file, call blast with random parameters 
     and check it runs okay.
     """
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     self.tempFiles.append(tempSeqFile)
     for test in range(self.testNo):
         seqNo = random.choice(range(0, 10))
         seq = getRandomSequence(8000)[1]
         fileHandle = open(tempSeqFile, 'w')
         for fastaHeader, seq in [(str(i),
                                   mutateSequence(seq,
                                                  0.3 * random.random()))
                                  for i in range(seqNo)]:
             if random.random() > 0.5:
                 seq = reverseComplement(seq)
             fastaWrite(fileHandle, fastaHeader, seq)
         fileHandle.close()
         chunkSize = random.choice(range(500, 9000))
         overlapSize = random.choice(range(2, 100))
         toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
         runCactusBlast([tempSeqFile], self.tempOutputFile, toilDir,
                        chunkSize, overlapSize)
         #runToilStatusAndFailIfNotComplete(toilDir)
         if getLogLevelString() == "DEBUG":
             system("cat %s" % self.tempOutputFile)
         system("rm -rf %s " % toilDir)
Example #8
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.testNo = TestStatus.getTestSetup(1, 2, 10, 10)
     self.tempDir = getTempDirectory(os.getcwd())
     self.jobTreeDir = os.path.join(
         self.tempDir,
         "testJobTree")  #A directory for the job tree to be created in
Example #10
    def testCactusSetup(self):
        """Creates a bunch of random inputs and then passes them to cactus setup.
        """
        for test in range(self.testNo):
            tempDir = os.path.relpath(getTempDirectory(os.getcwd()))
            sequenceNumber = random.choice(range(100))
            sequences, newickTreeString = getCactusInputs_random(
                tempDir=tempDir, sequenceNumber=sequenceNumber)

            #Setup the flower disk.
            experiment = getCactusWorkflowExperimentForTest(
                sequences, newickTreeString,
                os.path.join('/data', os.path.relpath(tempDir)))
            cactusDiskDatabaseString = experiment.getDiskDatabaseString()
            cactusSequencesPath = os.path.join(experiment.getDbDir(),
                                               "cactusSequences")

            runCactusSetup(cactusDiskDatabaseString=cactusDiskDatabaseString,
                           cactusSequencesPath=cactusSequencesPath,
                           sequences=sequences,
                           newickTreeString=newickTreeString)
            runCactusSetup(cactusDiskDatabaseString=cactusDiskDatabaseString,
                           cactusSequencesPath=cactusSequencesPath,
                           sequences=sequences,
                           newickTreeString=newickTreeString)

            experiment.cleanupDb()
            system("rm -rf %s" % tempDir)
            logger.info("Finished test %i of cactus_setup.py", test)
Example #11
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")

        inSequence = fileStore.readGlobalFile(self.inSequenceID)

        if self.prepOptions.chunkSize <= 0:
            # In this first case we don't need to break up the sequence
            chunked = False
            inChunkList = [inSequence]
        else:
            # chunk it up
            chunked = True
            inChunkDirectory = getTempDirectory(
                rootDir=fileStore.getLocalTempDir())
            inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                       chunksDir=inChunkDirectory,
                                       chunkSize=self.prepOptions.chunkSize,
                                       overlapSize=0)
            inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

        inChunkIDList = [
            fileStore.writeGlobalFile(chunk, cleanup=True)
            for chunk in inChunkList
        ]
        outChunkIDList = []
        #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = list(range(len(inChunkList)))
        for i in self.chunksToCompute:
            #Calculate the number of chunks to use
            inChunkNumber = int(
                max(
                    1,
                    math.ceil(
                        len(inChunkList) *
                        self.prepOptions.proportionToSample)))
            assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
            #Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(
                    inChunkIDs
            ) < inChunkNumber:  #This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(
                self.addChild(
                    self.getChunkedJobForCurrentStage(
                        inChunkIDs,
                        float(inChunkNumber) / len(inChunkIDList),
                        inChunkIDList[i])).rv())

        if chunked:
            # Merge results of the chunking process back into a genome-wide file
            return self.addFollowOn(
                MergeChunks(self.prepOptions, outChunkIDList)).rv()
        else:
            # Didn't chunk--we have a genome-wide fasta file
            return outChunkIDList[0]
Example #12
    def run(self, fileStore):
        sequenceFiles1 = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.sequenceFileIDs1
        ]
        if self.blastOptions.gpuLastz:
            # wga-gpu has a 3G limit.
            self.blastOptions.chunkSize = 3000000000
        chunks = runGetChunks(
            sequenceFiles=sequenceFiles1,
            chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
            chunkSize=self.blastOptions.chunkSize,
            overlapSize=self.blastOptions.overlapSize)
        if len(chunks) == 0:
            raise Exception(
                "no chunks produced for files: {} ".format(sequenceFiles1))
        logger.info(
            "Broken up the sequence files into individual 'chunk' files")
        chunkIDs = [
            fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks
        ]

        diagonalResultsID = self.addChild(
            MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
        offDiagonalResultsID = self.addChild(
            MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
        logger.debug("Collating the blasts after blasting all-against-all")
        return self.addFollowOn(
            CollateBlasts(self.blastOptions,
                          [diagonalResultsID, offDiagonalResultsID])).rv()
Example #13
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.encodeRegion = "ENm001"
     self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
     self.regionPath = os.path.join(self.encodePath, self.encodeRegion)
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
Example #14
    def testKeepingCoverageOnIngroups(self):
        """Tests whether the --ingroupCoverageDir option works as
        advertised."""
        encodeRegion = "ENm001"
        ingroups = ["human", "cow"]
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa") for ingroup in ingroups]
        outgroupPaths = [os.path.join(regionPath, outgroup + "." + encodeRegion + ".fa") for outgroup in outgroups]
        # Run blast in "ingroup vs outgroups" mode, requesting to keep
        # the bed files that show outgroup coverage on the ingroup.
        toilDir = os.path.join(self.tempDir, "tmp_toil")
        outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for outgroup in outgroups]
        ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for ingroup in ingroups]
        runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths, outgroups=outgroupPaths, alignmentsFile=self.tempOutputFile, outgroupFragmentPaths=outgroupFragmentPaths, ingroupCoveragePaths=ingroupCoveragePaths, toilDir=toilDir)
        for i, ingroupPath in enumerate(ingroupPaths):
            # Get the coverage from the outgroups independently and
            # check that it's the same as the file in
            # ingroupCoverageDir
            otherIngroupPath = ingroupPaths[1] if i == 0 else ingroupPaths[0]
            # To filter out alignments from the other ingroup and
            # self-alignments we need to create a fasta with all the
            # outgroup fragments in it.
            outgroupsCombined = getTempFile(rootDir=self.tempDir)
            for outgroupFragmentPath in outgroupFragmentPaths:
                system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
            independentCoverageFile = getTempFile(rootDir=self.tempDir)
            coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
            calculateCoverage(work_dir=coverageWorkDir, fromGenome=outgroupsCombined, sequenceFile=ingroupPath, cigarFile=self.tempOutputFile, outputFile=independentCoverageFile)

            # find the coverage file cactus_blast kept (should be
            # named according to the basename of the ingroup path
            # file)
            keptCoverageFile = ingroupCoveragePaths[i]
            self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
Example #16
   def run(self):
      #----------------------------------------
      # Run cactus_workflow.py and report time#
      #----------------------------------------
      logger.info("CactusWorkflowWrapper: going to issue cactus run for simulation %s, parameter %s\n" %(self.simulation, self.paraFile))
      tempDir = getTempDirectory(self.outDir)
      flowerdisk = os.path.join(tempDir, "cactusDisk")
      jobtreeDir = os.path.join(tempDir, "jobTree")
      #batchSystem = "single_machine"
      batchSystem = "parasol"
      retryCount = 0
      command = "cactus_workflow.py --speciesTree='%s' %s --configFile %s --buildTrees --setupAndBuildAlignments --cactusDisk %s --logDebug --job=JOB_FILE" %(self.tree, self.sequenceFiles, self.paraFile, flowerdisk)
      starttime = time.time()
      runJobTree(command, jobtreeDir, "DEBUG", retryCount, batchSystem, None)
      #runCactusWorkflow(flowerdisk, self.sequenceFiles, self.tree, jobtreeDir, "DEBUG", 0, batchSystem, None, True, True, False, False, self.config)
      runtime = time.time() - starttime
      logger.info("Done cactus_workflow for simulation %s, config %s\n" %(self.simulation, self.paraFile))

      #-----------------------
      # Run cactus_treeStats #
      #-----------------------
      #statsFile = os.path.join(self.outDir, "stats", "%s.xml" % self.simNum)
      statsFile = os.path.join(self.outDir, "stats", "%s.xml" % self.simName)
      runCactusTreeStats(outputFile=statsFile, cactusDisk=flowerdisk)
      #self.addChildCommand(command)

      #------------------- Adding child ------------------------#
      #self.addChildTarget(CactusMAFGeneratorWrapper(self.outDir, tempDir, self.simNum, runtime))
      self.addChildTarget(CactusMAFGeneratorWrapper(self.outDir, tempDir, self.simTrueMafDir, self.simName, runtime))
      logger.info("Added child CactusMAFGeneratorWrapper at %s\n" % self.outDir)

      #------------------- Cleaning up -------------------------#
      self.setFollowOnTarget(CactusWorkflowWrapperCleanup(tempDir))
Example #17
 def setUp(self):
     #This is the number of random problems to solve, handed to the test code
     self.testNo = TestStatus.getTestSetup(shortTestNo=1, mediumTestNo=5, 
                                           longTestNo=10, veryLongTestNo=100)
     self.tempFiles = []
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempBlossomDirectory = self.tempDir + "/tempBlossom"
     unittest.TestCase.setUp(self)
Example #18
 def run(self, fileStore):
     sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
     sequenceFiles2 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs2]
     chunks1 = runGetChunks(sequenceFiles=sequenceFiles1, chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()), chunkSize=self.blastOptions.chunkSize, overlapSize=self.blastOptions.overlapSize)
     chunks2 = runGetChunks(sequenceFiles=sequenceFiles2, chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()), chunkSize=self.blastOptions.chunkSize, overlapSize=self.blastOptions.overlapSize)
     chunkIDs1 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks1]
     chunkIDs2 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks2]
     resultsIDs = []
     #Make the list of blast jobs.
     for chunkID1 in chunkIDs1:
         for chunkID2 in chunkIDs2:
             #TODO: Make the compression work
             self.blastOptions.compressFiles = False
             resultsIDs.append(self.addChild(RunBlast(self.blastOptions, chunkID1, chunkID2)).rv())
     logger.info("Made the list of blasts")
     #Set up the job to collate all the results
     return self.addFollowOn(CollateBlasts(self.blastOptions, resultsIDs)).rv()
Example #20
 def setUp(self):
     #This is the number of random problems to solve, handed to the test code
     self.testNo = TestStatus.getTestSetup(shortTestNo=1, mediumTestNo=5,
                                           longTestNo=10, veryLongTestNo=100)
     self.tempFiles = []
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempMatchGraphDirectory = self.tempDir + "/tempMatchGraph"
     unittest.TestCase.setUp(self)
Example #21
    def progressiveFunction(self, experimentFile, toilDir,
                            batchSystem, buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers and sequences in the output are the
        # same as the sequences in the input (minus differences in
        # repeat-masking)
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> headers in fasta
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = list(map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(header in headers[genome],
                                            'Header %s from output c2h %s not found in input fa %s'
                                            ' for genome %s' % (header, c2hPath, seqMap[genome], genome))


        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
Example #22
 def setUp(self):
     self.testNo = TestStatus.getTestSetup(1, 5, 10, 100)
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempFiles = []
     unittest.TestCase.setUp(self)
     self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
     self.tempFiles.append(self.tempOutputFile)
     self.tempOutputFile2 = os.path.join(self.tempDir, "results2.txt")
     self.tempFiles.append(self.tempOutputFile2) 
     self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
Example #23
def repeat_masking_job(job, input_fasta, species):
    temp_dir = getTempDirectory()
    os.chdir(temp_dir)
    local_fasta = os.path.join(temp_dir, 'input.fa')
    job.fileStore.readGlobalFile(input_fasta, local_fasta, cache=False)
    system("chmod a+rw %s" % local_fasta)
    system("RepeatMasker -pa 10 -species {species} {input}".format(species=species, input=local_fasta))
    output_path = local_fasta + '.out'
    masked_out = job.fileStore.writeGlobalFile(output_path)
    return masked_out
Example #24
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.encodeRegion = "ENm001"
     self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
     self.regionPath = os.path.join(self.encodePath, self.encodeRegion)
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
     self.toilDir = os.path.join(self.tempDir, "toil")
     self.toilOptions = Job.Runner.getDefaultOptions(self.toilDir)
     self.toilOptions.disableCaching = True
Example #26
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree)
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(
            borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(
            backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p
Example #28
 def testSort(self):
     for test in range(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile1 = getTempFile(rootDir=tempDir)
         makeFileToSort(tempFile1)
         lines1 = loadFile(tempFile1)
         lines1.sort()
         sort(tempFile1)
         lines2 = loadFile(tempFile1)
         checkEqual(lines1, lines2)
         system("rm -rf %s" % tempDir)
Example #30
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding an extra outgroup only adds alignments if
        possible, and doesn't lose any
        """
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ]
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        # subselect 4 random ordered outgroups
        outgroups = [outgroups[i] for i in sorted(random.sample(range(len(outgroups)), 4))]
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            ingroupPaths = [os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa") for ingroup in ingroups]
            outgroupPaths = [os.path.join(regionPath, outgroup + "." + encodeRegion + ".fa") for outgroup in outgroups]
            results = []
            for numOutgroups in range(1, 5):
                # Align w/ increasing numbers of outgroups
                subResults = getTempFile()
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                print "aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths))
                tmpToil = os.path.join(self.tempDir, "outgroupToil")
                runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)
                results.append(subResults)

            # Print diagnostics about coverage
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    ingroupCoverage = getTempFile(rootDir=self.tempDir)
                    coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                    calculateCoverage(work_dir=coverageWorkDir, sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                    coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                    print "covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases)

            resultsSets = [loadResults(x) for x in results]
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Make sure the results from (n+1) outgroups are
                # (very nearly) a superset of the results from n outgroups
                print "Using %d addl outgroup(s):" % (i + 1)
                comparator =  ResultComparator(resultsSets[0], moreOutgroupsResults)
                print comparator
                self.assertTrue(comparator.sensitivity >= 0.99)

            # Ensure that the new alignments don't cover more than
            # x% of already existing alignments to human
            for i in range(1, len(resultsSets)):
                prevResults = resultsSets[i-1][0]
                curResults = resultsSets[i][0]
                prevResultsHumanPos = set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], prevResults)))
                newAlignments = curResults.difference(prevResults)
                newAlignmentsHumanPos =  set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], newAlignments)))
                print "addl outgroup %d:" % i
                print "bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)))
            for subResult in results:
                os.remove(subResult)
Example #31
def runCactusProgressive(options):
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([
                        os.path.join(seq, subSeq) for subSeq in os.listdir(seq)
                    ], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
Example #32
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree, tree.degree())
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event +".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p
Example #33
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempFiles = []
     self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
     self.tempFiles.append(self.tempOutputFile)
     self.tempOutputFile2 = os.path.join(self.tempDir, "results2.txt")
     self.tempFiles.append(self.tempOutputFile2) 
     self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
     self.defaultLastzArguments = "--ambiguous=iupac"
     self.defaultRealignArguments = ""
Example #34
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding an extra outgroup only adds alignments if
        possible, and doesn't lose any
        """
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ]
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        # subselect 4 random ordered outgroups
        outgroups = [outgroups[i] for i in sorted(random.sample(range(len(outgroups)), 4))]
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            ingroupPaths = [os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa") for ingroup in ingroups]
            outgroupPaths = [os.path.join(regionPath, outgroup + "." + encodeRegion + ".fa") for outgroup in outgroups]
            results = []
            for numOutgroups in range(1, 5):
                # Align w/ increasing numbers of outgroups
                subResults = getTempFile()
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                tmpJobTree = getTempDirectory()
                print "aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths))
                system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/jobTree" % (",".join(ingroupPaths), ",".join(subOutgroupPaths), subResults, tmpJobTree))
                system("rm -fr %s" % (tmpJobTree))
                results.append(subResults)

            # Print diagnostics about coverage
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    coveredBases = popenCatch("cactus_coverage %s %s | awk '{ total += $3 - $2 } END { print total }'" % (ingroupPath, subResults))
                    print "covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases)

            resultsSets = [loadResults(x) for x in results]
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Make sure the results from (n+1) outgroups are
                # (very nearly) a superset of the results from n outgroups
                print "Using %d addl outgroup(s):" % (i + 1)
                comparator =  ResultComparator(resultsSets[0], moreOutgroupsResults)
                print comparator
                self.assertTrue(comparator.sensitivity >= 0.99)

            # Ensure that the new alignments don't cover more than
            # x% of already existing alignments to human
            for i in range(1, len(resultsSets)):
                prevResults = resultsSets[i-1][0]
                curResults = resultsSets[i][0]
                prevResultsHumanPos = set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], prevResults)))
                newAlignments = curResults.difference(prevResults)
                newAlignmentsHumanPos =  set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], newAlignments)))
                print "addl outgroup %d:" % i
                print "bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)))
            for subResult in results:
                os.remove(subResult)
Example #35
 def testGetMidPoint(self):
     for test in range(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile = getTempFile(rootDir=tempDir)
         makeFileToSort(tempFile)
         l = open(tempFile, 'r').read()
         fileSize = os.path.getsize(tempFile)
         midPoint = getMidPoint(tempFile, 0, fileSize)
         print "the mid point is %i of a file of %i bytes woth byte" % (midPoint, fileSize)
         assert midPoint < fileSize
         assert l[midPoint] == '\n'
         assert midPoint >= 0
         system("rm -rf %s" % tempDir)
Example #36
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.useOutgroup = False
     self.doSelfAlignment = False
     #Load the config file, turn on the checks.
     configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot())
     configWrapper.turnAllModesOn()
     self.tempDir = getTempDirectory(os.getcwd())
     self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
     configWrapper.writeXML(self.configFile)
Example #37
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.useOutgroup = False
     self.doSelfAlignment = False
     #Load the config file, turn on the checks.
     configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot())
     configWrapper.turnAllModesOn()
     configWrapper.turnOffHeaderChecks()
     self.tempDir = getTempDirectory(os.getcwd())
     self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
     configWrapper.writeXML(self.configFile)
Example #38
 def testCopySubRangeOfFile(self):
     for test in range(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile = getTempFile(rootDir=tempDir)
         outputFile = getTempFile(rootDir=tempDir)
         makeFileToSort(tempFile)
         fileSize = os.path.getsize(tempFile)
         assert fileSize > 0
         fileStart = random.choice(range(0, fileSize))
         fileEnd = random.choice(range(fileStart, fileSize))
         copySubRangeOfFile(tempFile, fileStart, fileEnd, outputFile)
         l = open(outputFile, 'r').read()
         l2 = open(tempFile, 'r').read()[fileStart:fileEnd]
         checkEqual(l, l2)
         system("rm -rf %s" % tempDir)
Example #39
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")
        # chunk it up
        inSequence = fileStore.readGlobalFile(self.inSequenceID)
        inChunkDirectory = getTempDirectory(
            rootDir=fileStore.getLocalTempDir())
        inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                   chunksDir=inChunkDirectory,
                                   chunkSize=self.prepOptions.chunkSize,
                                   overlapSize=0)
        inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)
        logger.info("Chunks dir = %s" % os.listdir(inChunkDirectory))

        inChunkIDList = [
            fileStore.writeGlobalFile(chunk) for chunk in inChunkList
        ]
        outChunkIDList = []
        #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = range(len(inChunkList))
        for i in self.chunksToCompute:
            #Calculate the number of chunks to use
            inChunkNumber = int(
                max(
                    1,
                    math.ceil(
                        len(inChunkList) *
                        self.prepOptions.proportionToSample)))
            assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
            #Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j + inChunkNumber]
            if len(
                    inChunkIDs
            ) < inChunkNumber:  #This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(
                self.addChild(
                    PreprocessChunk(self.prepOptions, inChunkIDs,
                                    float(inChunkNumber) / len(inChunkIDList),
                                    inChunkIDList[i])).rv())
        # follow on to merge chunks
        return self.addFollowOn(MergeChunks(self.prepOptions,
                                            outChunkIDList)).rv()
Example #40
 def testCactusSetup(self): 
     """Creates a bunch of random inputs and then passes them to cactus setup.
     """
     for test in range(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         sequenceNumber = random.choice(range(100))
         sequences, newickTreeString = getCactusInputs_random(tempDir=tempDir, sequenceNumber=sequenceNumber)
         
         #Setup the flower disk.
         experiment = getCactusWorkflowExperimentForTest(sequences, newickTreeString, tempDir)
         cactusDiskDatabaseString = experiment.getDiskDatabaseString() 
         
         runCactusSetup(cactusDiskDatabaseString, sequences, newickTreeString)
         runCactusSetup(cactusDiskDatabaseString, sequences, newickTreeString)
         
         experiment.cleanupDb()
         system("rm -rf %s" % tempDir)
         logger.info("Finished test %i of cactus_setup.py", test) 
Example #41
 def testCPecanEmMultipleTrials(self):
     """Runs uns cPecanEm with multiple different trials.
     """
     for seqFile1, seqFile2 in seqFilePairGenerator():
         tempDir = getTempDirectory(rootDir=os.getcwd())
         jobTreeDir = os.path.join(tempDir, "jobTree")
         alignmentsFile = os.path.join(tempDir, "alignments.cigars")
         computeAlignments(seqFile1, seqFile2, alignmentsFile)
         logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
         outputModelFile = os.path.join(tempDir, "outputModel.txt")
         outputModelXMLFile = os.path.join(tempDir, "outputModel.xml")
         outputBlastFile = os.path.join(tempDir, "outputBlast.txt")
         #First run the script to generate a model and do one iteration of EM to 
         #get the likelihood to compare with the final likelihood
         trials=3
         runCPecanEm(sequenceFiles=[ seqFile1, seqFile2 ], 
                      alignmentsFile=alignmentsFile, outputModelFile=outputModelFile, 
                      jobTreeDir=jobTreeDir,
                      trials=trials,
                      outputTrialHmms=True,
                      iterations=5, randomStart=True, logLevel=getLogLevelString(),
                      optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100",
                      outputXMLModelFile=outputModelXMLFile,
                      blastScoringMatrixFile=outputBlastFile)
         trialHmms = [ Hmm.loadHmm(outputModelFile + ("_%i" % i)) for i in xrange(trials) ]
         hmm = Hmm.loadHmm(outputModelFile)
         node = ET.parse(outputModelXMLFile).getroot()
         logger.info("After multiple trials and iterations of EM the best likelihood found was %s, the likelihoods of the variants were: %s" % 
                     (hmm.likelihood, " ".join(map(lambda x : str(x.likelihood), trialHmms))))
         
         matchProbs, gapOpen, gapExtend = makeBlastScoringMatrix(hmm, ("ACTG",))
         logger.info("Gap open: %s, Gap extend: %s, Match probs %s" % (gapOpen, gapExtend, " ".join(map(str, matchProbs))))
         
         self.assertTrue(float(node.attrib["maxLikelihood"]) == hmm.likelihood)
         
         #Now use the blast file to compute a new matrix
         computeAlignments(seqFile1, seqFile2, alignmentsFile, lastzArguments=("--ambiguous=iupac --scores=%s" % outputBlastFile))
         
         #Run modifyHmm to check it works
         system("cPecanModifyHmm %s %s --gcContent=0.5 --substitutionRate=0.05 --setFlatIndelEmissions" % (outputModelFile, outputModelFile))
         hmm = Hmm.loadHmm(outputModelFile)
         node = ET.parse(outputModelXMLFile).getroot()
         
         system("rm -rf %s" % tempDir)
Example #42
def runWorkflow_multipleExamples(testId,
                                 inputGenFunction,
                                 testNumber=1,
                                 batchSystem="single_machine",
                                 buildAvgs=False,
                                 configFile=None,
                                 buildToilStats=False,
                                 useConstraints=False,
                                 cactusWorkflowFunction=runCactusWorkflow,
                                 logLevel=None,
                                 buildHal=False,
                                 buildFasta=False,
                                 progressive=False):
    """A wrapper to run a number of examples.
    The testId parameter is used to allocate a unique port so that tests
    can run in parallel.
    """
    if logLevel is None:
        logLevel = _LOG_LEVEL
    for test in range(testNumber):
        tempDir = getTempDirectory(os.getcwd())
        if useConstraints:
            sequences, newickTreeString, constraints = inputGenFunction(
                regionNumber=test, tempDir=tempDir)
        else:
            sequences, newickTreeString = inputGenFunction(regionNumber=test,
                                                           tempDir=tempDir)
            constraints = None
        runWorkflow_TestScript(testId,
                               sequences,
                               newickTreeString,
                               outputDir=tempDir,
                               batchSystem=batchSystem,
                               buildAvgs=buildAvgs,
                               buildHal=buildHal,
                               buildFasta=buildFasta,
                               configFile=configFile,
                               buildToilStats=buildToilStats,
                               constraints=constraints,
                               progressive=progressive,
                               cactusWorkflowFunction=cactusWorkflowFunction,
                               logLevel=logLevel)
        system("rm -rf %s" % tempDir)
        logger.info("Finished random test %i" % test)
Example #43
    def testCactus_Random_fixedAncestor(self):
        """Tests that cactus doesn't crash when aligning to a fixed ancestral sequence."""
        sequences, _ = getCactusInputs_random(treeLeafNumber=3)
        rootSeq = sequences.pop()
        # Create a star tree
        tree = '(%s)root;' % ",".join([str(x) + ":1.0" for x in range(len(sequences))])
        outputDir = getTempDirectory()
        experiment = getCactusWorkflowExperimentForTest(sequences, tree,
                                                        outputDir,
                                                        progressive=True)
        experiment.setSequenceID("root", rootSeq)
        experiment.setRootReconstructed(False)
        experimentFile = os.path.join(outputDir, "experiment.xml")
        experiment.writeXML(experimentFile)

        jobTreeDir = os.path.join(outputDir, "jobTree")

        self.progressiveFunction(experimentFile, jobTreeDir, 'singleMachine',
                                 False, True, True, False)
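For concreteness, with the three leaf sequences used here the star-tree expression above expands to the following Newick string:

tree = '(%s)root;' % ",".join([str(x) + ":1.0" for x in range(3)])
assert tree == '(0:1.0,1:1.0,2:1.0)root;'  # a star tree rooted at "root"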
Example #44
 def testJobTreeStats_SortSimple(self):
     """Tests the jobTreeStats utility using the scriptTree_sort example.
     """
     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         tempFile = getTempFile(rootDir=tempDir)
         outputFile = getTempFile(rootDir=tempDir)
         jobTreeDir = os.path.join(tempDir, "jobTree")
         lines=100000
         maxLineLength=10
         N=1000
         makeFileToSort(tempFile, lines, maxLineLength)
         #Sort the file
         command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %s --stats --jobTime 0.5" % (jobTreeDir, tempFile, N)
         system(command)
         #Now get the stats
         system("jobTreeStats --jobTree %s --outputFile %s" % (jobTreeDir, outputFile))
         #Cleanup
         system("rm -rf %s" % tempDir)
Example #45
    def run(self, fileStore):
        logger.info("Preparing sequence for preprocessing")

        inSequence = fileStore.readGlobalFile(self.inSequenceID)

        if self.prepOptions.chunkSize <= 0:
            # In this first case we don't need to break up the sequence
            chunked = False
            inChunkList = [inSequence]
        else:
            # chunk it up
            chunked = True
            inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
            inChunkList = runGetChunks(sequenceFiles=[inSequence], chunksDir=inChunkDirectory,
                                       chunkSize=self.prepOptions.chunkSize,
                                       overlapSize=0)
            inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

        inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
        outChunkIDList = []
        #For each input chunk we create an output chunk; it is the output chunks that get concatenated together.
        if not self.chunksToCompute:
            self.chunksToCompute = range(len(inChunkList))
        for i in self.chunksToCompute:
            #Calculate the number of chunks to use
            inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
            assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
            #Now get the list of chunks flanking and including the current chunk
            j = max(0, i - inChunkNumber // 2)
            inChunkIDs = inChunkIDList[j:j+inChunkNumber]
            if len(inChunkIDs) < inChunkNumber: #This logic is like making the list circular
                inChunkIDs += inChunkIDList[:inChunkNumber-len(inChunkIDs)]
            assert len(inChunkIDs) == inChunkNumber
            outChunkIDList.append(self.addChild(self.getChunkedJobForCurrentStage(inChunkIDs, float(inChunkNumber)/len(inChunkIDList), inChunkIDList[i])).rv())

        if chunked:
            # Merge results of the chunking process back into a genome-wide file
            return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
        else:
            # Didn't chunk--we have a genome-wide fasta file
            return outChunkIDList[0]
Example #46
def runCactusGraphMapJoin(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the vgs
            vg_ids = []
            for vg_path in options.vg:
                logger.info("Importing {}".format(vg_path))
                vg_ids.append(toil.importFile(makeURL(vg_path)))

            # tack on the decoys
            if options.decoyGraph:
                logger.info("Importing decoys {}".format(options.decoyGraph))
                vg_ids.append(toil.importFile(makeURL(options.decoyGraph)))
                # we'll treat it like any other graph downstream, except clipping
                # where we'll check first using the path name
                options.vg.append(options.decoyGraph)

            # load up the hals
            hal_ids = []
            for hal_path in options.hal:
                logger.info("Importing {}".format(hal_path))
                hal_ids.append(toil.importFile(makeURL(hal_path)))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_join_workflow, options, config, vg_ids,
                              hal_ids))

        #export the split data
        export_join_data(toil, options, wf_output[0], wf_output[1],
                         wf_output[2])
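Toil's importFile expects a URL rather than a bare path, which is why every input is wrapped in makeURL. A minimal sketch of such a helper, assuming it merely attaches a file:// scheme to local paths (the real cactus helper may handle more cases):

import os

def makeURL(path):
    # Leave real URLs (s3://, http://, file://, ...) untouched; give local
    # paths an explicit file:// scheme with an absolute path.
    if "://" in path:
        return path
    return "file://" + os.path.abspath(path)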
Example #47
def runWorkflow_multipleExamples(inputGenFunction,
                                 testNumber=1,
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM, \
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                                 inverseTestRestrictions=False,
                                 batchSystem="single_machine",
                                 buildAvgs=False, buildReference=False,
                                 configFile=None, buildToilStats=False,
                                 useConstraints=False,
                                 cactusWorkflowFunction=runCactusWorkflow,
                                 buildHal=False,
                                 buildFasta=False,
                                 progressive=False):
    """A wrapper to run a number of examples.
    """
    if (inverseTestRestrictions and TestStatus.getTestStatus() not in testRestrictions) or \
        (not inverseTestRestrictions and TestStatus.getTestStatus() in testRestrictions):
        for test in xrange(testNumber):
            tempDir = getTempDirectory(os.getcwd())
            if useConstraints:
                sequences, newickTreeString, constraints = inputGenFunction(
                    regionNumber=test, tempDir=tempDir)
            else:
                sequences, newickTreeString = inputGenFunction(
                    regionNumber=test, tempDir=tempDir)
                constraints = None
            runWorkflow_TestScript(
                sequences,
                newickTreeString,
                outputDir=tempDir,
                batchSystem=batchSystem,
                buildAvgs=buildAvgs,
                buildReference=buildReference,
                buildHal=buildHal,
                buildFasta=buildFasta,
                configFile=configFile,
                buildToilStats=buildToilStats,
                constraints=constraints,
                progressive=progressive,
                cactusWorkflowFunction=cactusWorkflowFunction)
            system("rm -rf %s" % tempDir)
            logger.info("Finished random test %i" % test)
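The two-branch gating condition above reduces to an exclusive-or between inverseTestRestrictions and set membership. A standalone restatement (illustrative, not from the source):

def shouldRun(status, testRestrictions, inverseTestRestrictions=False):
    # Run when the status is in the restriction set, unless inverted, in
    # which case run when it is *not* in the set.
    return (status in testRestrictions) != inverseTestRestrictions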
Example #48
 def testCPecanEm(self):
     """Runs cPecanEm. 
     """
     trial = 0
     for modelType in ("fiveState", "fiveStateAsymmetric", "threeState", "threeStateAsymmetric"):
         for seqFile1, seqFile2 in seqFilePairGenerator():
             tempDir = getTempDirectory(rootDir=os.getcwd())
             jobTreeDir = os.path.join(tempDir, "jobTree")
             alignmentsFile = os.path.join(tempDir, "alignments.cigars")
             computeAlignments(seqFile1, seqFile2, alignmentsFile)
             logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
             outputModelFile = os.path.join(tempDir, "outputModel.txt")
             #First run the script to generate a model and do one iteration of EM to 
             #get the likelihood to compare with the final likelihood
             runCPecanEm(sequenceFiles=[ seqFile1, seqFile2 ], 
                          alignmentsFile=alignmentsFile, outputModelFile=outputModelFile, 
                          modelType=modelType,
                          jobTreeDir=jobTreeDir,
                          iterations=1, trials=1, randomStart=False, logLevel=getLogLevelString(),
                          setJukesCantorStartingEmissions=0.2,
                          #useDefaultModelAsStart=,
                          trainEmissions=True,
                          tieEmissions=True,
                          optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100")
             hmm = Hmm.loadHmm(outputModelFile)
             system("rm -rf %s" % jobTreeDir) #Cleanup the old jobTree
             logger.info("For trial %s the likelihood after 1 iteration of EM is %s" % (trial, hmm.likelihood))
             iterations = 5
             runCPecanEm(sequenceFiles=[ seqFile1, seqFile2 ], 
                         alignmentsFile=alignmentsFile, outputModelFile=outputModelFile, jobTreeDir=jobTreeDir,
                         optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100",
                         iterations=iterations, inputModelFile=outputModelFile, logLevel=getLogLevelString(),
                         maxAlignmentLengthPerJob=10000) #, updateTheBand=True)
             hmm2 = Hmm.loadHmm(outputModelFile)
             logger.info("For trial %s the likelihood after a further %s iterations of EM is %s" % (trial, iterations, hmm2.likelihood))
             self.assertTrue(hmm.likelihood < hmm2.likelihood)
             hmm2.normalise()
             logger.info("Final transitions: %s" % " ".join(map(str, hmm2.transitions)))
             logger.info("Final emissions: %s" % " ".join(map(str, hmm2.emissions)))
             system("rm -rf %s" % tempDir)
             trial += 1
Example #49
 def progressiveFunction(self, experimentFile, jobTreeDir,
                         batchSystem, buildAvgs,
                         buildReference,
                         buildHal,
                         buildFasta,
                         jobTreeStats,
                         subtreeRoot=None):
     tempDir = getTempDirectory(os.getcwd())
     tempExperimentDir = os.path.join(tempDir, "exp")
     runCactusCreateMultiCactusProject(experimentFile,
                                       tempExperimentDir,
                                       fixNames=False,
                                       root=subtreeRoot)
     logger.info("Put the temporary files in %s" % tempExperimentDir)
     runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                          jobTreeDir,
                          batchSystem=batchSystem,
                          buildAvgs=buildAvgs,
                          jobTreeStats=jobTreeStats)
     runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
     system("rm -rf %s" % tempDir)
Example #50
    def run(self):
        #----------------------------------------
        # Run cactus_workflow.py and report time#
        #----------------------------------------
        logger.info(
            "CactusWorkflowWrapper: going to issue cactus run for simulation %s, parameter %s\n"
            % (self.simulation, self.paraFile))
        tempDir = getTempDirectory(self.outDir)
        flowerdisk = os.path.join(tempDir, "cactusDisk")
        jobtreeDir = os.path.join(tempDir, "jobTree")
        #batchSystem = "single_machine"
        batchSystem = "parasol"
        retryCount = 0
        command = "cactus_workflow.py --speciesTree='%s' %s --configFile %s --buildTrees --setupAndBuildAlignments --cactusDisk %s --logDebug --job=JOB_FILE" % (
            self.tree, self.sequenceFiles, self.paraFile, flowerdisk)
        starttime = time.time()
        runJobTree(command, jobtreeDir, "DEBUG", retryCount, batchSystem, None)
        #runCactusWorkflow(flowerdisk, self.sequenceFiles, self.tree, jobtreeDir, "DEBUG", 0, batchSystem, None, True, True, False, False, self.config)
        runtime = time.time() - starttime
        logger.info("Done cactus_workflow for simulation %s, config %s\n" %
                    (self.simulation, self.paraFile))

        #-----------------------
        # Run cactus_treeStats #
        #-----------------------
        #statsFile = os.path.join(self.outDir, "stats", "%s.xml" % self.simNum)
        statsFile = os.path.join(self.outDir, "stats", "%s.xml" % self.simName)
        runCactusTreeStats(outputFile=statsFile, cactusDisk=flowerdisk)
        #self.addChildCommand(command)

        #------------------- Adding child ------------------------#
        #self.addChildTarget(CactusMAFGeneratorWrapper(self.outDir, tempDir, self.simNum, runtime))
        self.addChildTarget(
            CactusMAFGeneratorWrapper(self.outDir, tempDir, self.simTrueMafDir,
                                      self.simName, runtime))
        logger.info("Added child CactusMAFGeneratorWrapper at %s\n" %
                    self.outDir)

        #------------------- Cleaning up -------------------------#
        self.setFollowOnTarget(CactusWorkflowWrapperCleanup(tempDir))
Example #51
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.trees = randomTreeSet()
     self.mcTrees = []
     self.tempDir = getTempDirectory(os.getcwd())
     self.tempFa = os.path.join(self.tempDir, "seq.fa")
     with open(self.tempFa, "w") as f:
         f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
     self.dummySeqMaps = []
     for tree in self.trees:
         if tree.size() < 500:
             mcTree = MultiCactusTree(tree, tree.degree())
             seqMap = dict()
             for i in mcTree.breadthFirstTraversal():
                 mcTree.setName(i, "Node%s" % str(i))
                 seqMap["Node%s" % str(i)] = self.tempFa
             mcTree.computeSubtreeRoots()
             self.mcTrees.append(mcTree)
             self.dummySeqMaps.append(seqMap)
             
     seqLens = dict()
     seqLens["HUMAN"] = 57553
     seqLens["CHIMP"] = 57344
     seqLens["BABOON"] = 58960
     seqLens["MOUSE"] = 32750
     seqLens["RAT"] = 38436
     seqLens["DOG"] = 54187
     seqLens["CAT"] = 50283
     seqLens["PIG"] = 54843
     seqLens["COW"] = 55508
     self.blanchetteSeqMap = dict()
     for event, seqLen in seqLens.items():
         p = os.path.join(self.tempDir, event +".fa")
         with open(p, "w") as f:
             f.write(">%s\n" % event)
             f.write(''.join(['A'] * seqLen))
             f.write('\n')
         self.blanchetteSeqMap[event] = p
Example #52
def runWorkflow_multipleExamples(inputGenFunction,
                                 testNumber=1, 
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM, \
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                                 inverseTestRestrictions=False,
                                 batchSystem="single_machine",
                                 buildAvgs=False, buildReference=False,
                                 configFile=None, buildJobTreeStats=False, 
                                 useConstraints=False,
                                 cactusWorkflowFunction=runCactusWorkflow,
                                 buildHal=False,
                                 buildFasta=False,
                                 progressive=False):
    """A wrapper to run a number of examples.
    """
    if (inverseTestRestrictions and TestStatus.getTestStatus() not in testRestrictions) or \
        (not inverseTestRestrictions and TestStatus.getTestStatus() in testRestrictions):
        for test in xrange(testNumber): 
            tempDir = getTempDirectory(os.getcwd())
            if useConstraints:
                sequences, newickTreeString, constraints = inputGenFunction(regionNumber=test, tempDir=tempDir)
            else:
                sequences, newickTreeString = inputGenFunction(regionNumber=test, tempDir=tempDir)
                constraints = None
            experiment = runWorkflow_TestScript(sequences, newickTreeString,
                                                outputDir=tempDir,
                                                batchSystem=batchSystem,
                                                buildAvgs=buildAvgs, buildReference=buildReference, 
                                                buildHal=buildHal,
                                                buildFasta=buildFasta,
                                                configFile=configFile,
                                                buildJobTreeStats=buildJobTreeStats,
                                                constraints=constraints,
                                                progressive=progressive,
                                                cactusWorkflowFunction=cactusWorkflowFunction)
            experiment.cleanupDb()
            system("rm -rf %s" % tempDir)
            logger.info("Finished random test %i" % test)
Example #53
def scriptTree_SortTest(testNo, batchSystem, lines=100000, maxLineLength=10, N=1000):
    """Tests scriptTree/jobTree by sorting a file in parallel.
    """
    for test in xrange(testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "jobTree")
        makeFileToSort(tempFile, lines=lines, maxLineLength=maxLineLength)
        #First make our own sorted version
        fileHandle = open(tempFile, 'r')
        l = fileHandle.readlines()
        l.sort()
        fileHandle.close()
        #Sort the file
        command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %i --batchSystem %s --jobTime 1.0 --maxJobs 20" % (jobTreeDir, tempFile, N, batchSystem)
        system(command)
        #Now check the file is properly sorted..
        #Now get the sorted file
        fileHandle = open(tempFile, 'r')
        l2 = fileHandle.readlines()
        fileHandle.close()
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
Example #54
 def testBlastRandom(self):
     """Make some sequences, put them in a file, call blast with random parameters 
     and check it runs okay.
     """
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     self.tempFiles.append(tempSeqFile)
     for test in xrange(self.testNo):
         seqNo = random.choice(xrange(0, 10))
         seq = getRandomSequence(8000)[1]
         fileHandle = open(tempSeqFile, 'w')
         for fastaHeader, seq in [ (str(i), mutateSequence(seq, 0.3*random.random())) for i in xrange(seqNo) ]:
             if random.random() > 0.5:
                 seq = reverseComplement(seq)
             fastaWrite(fileHandle, fastaHeader, seq)
         fileHandle.close()
         chunkSize = random.choice(xrange(500, 9000))
         overlapSize = random.choice(xrange(2, 100))
         toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
         runCactusBlast([ tempSeqFile ], self.tempOutputFile, toilDir, chunkSize, overlapSize)
         #runToilStatusAndFailIfNotComplete(toilDir)
         if getLogLevelString() == "DEBUG":
             system("cat %s" % self.tempOutputFile)
         system("rm -rf %s " % toilDir)
Example #55
def main(argv=None):
    if argv is None:
        argv = sys.argv

    seed = random.randint(0, 2**31)
    parser = argparse.ArgumentParser(description='Run little hal test')
    parser.add_argument('--preset', type=str,
                        help='halGenRandom preset to use [small, medium, big, large]', default='small')
    args = parser.parse_args(argv[1:])  # parse the supplied argv rather than always sys.argv
    rval = 0
    print "chunk, comp, time(gen), time(cons), fsize(k)"
    try:
        for chunkSize in [10000, 100000, 1000000, 10000000]:
            for compression in [0, 2, 5, 7, 9]:
                try:
                    tempDir = getTempDirectory(rootDir="./")
                    tempFile = getTempFile(suffix=".h5", rootDir=tempDir)
                except:
                    traceback.print_exc(file=sys.stdout)
                    return 1

                t = time.time()
                runHalGen(args.preset, seed, chunkSize, compression, tempFile)
                fsize = os.path.getsize(tempFile)
                th = time.time() - t
                runHalCons(tempFile, getTempFile(rootDir=tempDir))
                tc = time.time() - th - t
                print "%d, %d, %f.3, %f.3, %f.2" % (
                    chunkSize, compression, th, tc, fsize / 1024.)

    except:
        traceback.print_exc(file=sys.stdout)
        return 1

    system("rm -rf %s" % tempDir)
    return rval
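The timing bookkeeping in the loop above recovers the halCons wall time by subtraction rather than by taking a fresh start mark. Spelled out with placeholder work (a sketch, not the benchmark itself):

import time

t0 = time.time()
time.sleep(0.1)               # stands in for runHalGen
th = time.time() - t0         # generation time
time.sleep(0.1)               # stands in for runHalCons
tc = time.time() - t0 - th    # consumption time = total elapsed minus th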