def testChildTreeJob(self):
    """Check that the ChildTreeJob class runs all children."""
    numChildren = 100
    flagDir = getTempDirectory()

    options = Job.Runner.getDefaultOptions(getTempDirectory())
    shutil.rmtree(options.jobStore)
    with Toil(options) as toil:
        toil.start(CTTestParent(flagDir, numChildren))

    # Check that all jobs ran
    for i in range(numChildren):
        self.assertTrue(os.path.exists(os.path.join(flagDir, str(i))))
    shutil.rmtree(flagDir)
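# A minimal sketch of the jobs the test above exercises, assuming
# ChildTreeJob is importable from toil.job; CTTestChild is hypothetical and
# simply touches a flag file named after its index so the test can verify
# that every child ran.
import os
from toil.job import Job, ChildTreeJob

class CTTestParent(ChildTreeJob):
    def __init__(self, flagDir, numChildren):
        self.flagDir = flagDir
        self.numChildren = numChildren
        super(CTTestParent, self).__init__()

    def run(self, fileStore):
        for i in range(self.numChildren):
            self.addChild(CTTestChild(self.flagDir, i))

class CTTestChild(Job):
    def __init__(self, flagDir, index):
        self.flagDir = flagDir
        self.index = index
        super(CTTestChild, self).__init__()

    def run(self, fileStore):
        # Touch the flag file the parent test will look for.
        open(os.path.join(self.flagDir, str(self.index)), 'w').close()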
def scriptTree_SortTest(testNo, batchSystem, lines=10000, maxLineLength=10, N=10000):
    """Tests scriptTree/jobTree by sorting a file in parallel.
    """
    for test in range(testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "testJobTree")
        makeFileToSort(tempFile, lines=lines, maxLineLength=maxLineLength)
        #First make our own sorted version
        fileHandle = open(tempFile, 'r')
        l = fileHandle.readlines()
        l.sort()
        fileHandle.close()
        #Sort the file, restarting the jobtree until it completes
        while True:
            command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %i --batchSystem %s --jobTime 1.0 --maxCpus 20 --retryCount 2" % (jobTreeDir, tempFile, N, batchSystem)
            system(command)
            try:
                system("jobTreeStatus --jobTree %s --failIfNotComplete" % jobTreeDir)
                break
            except Exception:
                print("The jobtree failed and will be restarted")
                continue
        #Now check the file is properly sorted
        fileHandle = open(tempFile, 'r')
        l2 = fileHandle.readlines()
        fileHandle.close()
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
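# A hypothetical sketch of the makeFileToSort helper the sort tests rely on,
# assuming it writes `lines` random lowercase lines of at most
# `maxLineLength` characters; the real helper is defined elsewhere.
import random
import string

def makeFileToSortSketch(fileName, lines=10000, maxLineLength=10):
    with open(fileName, 'w') as f:
        for _ in range(lines):
            length = random.randint(1, maxLineLength)
            f.write(''.join(random.choice(string.ascii_lowercase)
                            for _ in range(length)) + '\n')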
def run(self, fileStore):
    sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
    chunks = runGetChunks(sequenceFiles=sequenceFiles1,
                          chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                          chunkSize=self.blastOptions.chunkSize,
                          overlapSize=self.blastOptions.overlapSize)
    assert len(chunks) > 0
    logger.info("Broken up the sequence files into individual 'chunk' files")
    chunkIDs = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks]
    diagonalResultsID = self.addChild(MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
    offDiagonalResultsID = self.addChild(MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
    logger.debug("Collating the blasts after blasting all-against-all")
    return self.addFollowOn(CollateBlasts(self.blastOptions,
                                          [diagonalResultsID, offDiagonalResultsID])).rv()
def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
    """We compare the output with a naive run of the blast program, to check the results are nearly equivalent.
    """
    encodeRegions = ["ENm00" + str(i) for i in range(1, 2)]  #Could go to six
    species = ("human", "mouse", "dog")  #Other species to try "rat", "monodelphis", "macaque", "chimp"
    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        for i in range(len(species)):
            species1 = species[i]
            for species2 in species[i+1:]:
                seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))

                #Run simple blast
                runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile, self.tempDir)
                logger.info("Ran the naive blast okay")

                #Run cactus blast pipeline
                toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
                if blastMode == "allAgainstAll":
                    runCactusBlast(sequenceFiles=[seqFile1, seqFile2],
                                   alignmentsFile=self.tempOutputFile2,
                                   toilDir=toilDir,
                                   chunkSize=500000,
                                   overlapSize=10000)
                else:
                    runCactusBlast(sequenceFiles=[seqFile1],
                                   alignmentsFile=self.tempOutputFile2,
                                   toilDir=toilDir,
                                   chunkSize=500000,
                                   overlapSize=10000,
                                   targetSequenceFiles=[seqFile2])
                logger.info("Ran cactus_blast okay")
                logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                checkCigar(self.tempOutputFile)
                checkCigar(self.tempOutputFile2)
                compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
def runWorkflow_multipleExamples(inputGenFunction, testNumber=1,
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM,
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                                 inverseTestRestrictions=False, batchSystem="single_machine",
                                 buildAvgs=False, buildReference=False, buildReferenceSequence=False,
                                 buildCactusPDF=False, buildAdjacencyPDF=False, buildReferencePDF=False,
                                 makeCactusTreeStats=False, makeMAFs=False, configFile=None,
                                 buildJobTreeStats=False):
    """A wrapper to run a number of examples.
    """
    if (inverseTestRestrictions and TestStatus.getTestStatus() not in testRestrictions) or \
       (not inverseTestRestrictions and TestStatus.getTestStatus() in testRestrictions):
        for test in range(testNumber):
            tempDir = getTempDirectory(os.getcwd())
            sequences, newickTreeString = inputGenFunction(regionNumber=test, tempDir=tempDir)
            runWorkflow_TestScript(sequences, newickTreeString,
                                   batchSystem=batchSystem,
                                   buildAvgs=buildAvgs,
                                   buildReference=buildReference,
                                   buildCactusPDF=buildCactusPDF,
                                   buildAdjacencyPDF=buildAdjacencyPDF,
                                   makeCactusTreeStats=makeCactusTreeStats,
                                   makeMAFs=makeMAFs,
                                   configFile=configFile,
                                   buildJobTreeStats=buildJobTreeStats)
            system("rm -rf %s" % tempDir)
            logger.info("Finished random test %i" % test)
def testBlastRandom(self):
    """Make some sequences, put them in a file, call blast with random parameters and check it runs okay.
    """
    tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(tempSeqFile)
    for test in range(self.testNo):
        seqNo = random.choice(range(0, 10))
        seq = getRandomSequence(8000)[1]
        fileHandle = open(tempSeqFile, 'w')
        for fastaHeader, seq in [(str(i), mutateSequence(seq, 0.3 * random.random())) for i in range(seqNo)]:
            if random.random() > 0.5:
                seq = reverseComplement(seq)
            fastaWrite(fileHandle, fastaHeader, seq)
        fileHandle.close()
        chunkSize = random.choice(range(500, 9000))
        overlapSize = random.choice(range(2, 100))
        toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
        runCactusBlast([tempSeqFile], self.tempOutputFile, toilDir, chunkSize, overlapSize)
        #runToilStatusAndFailIfNotComplete(toilDir)
        if getLogLevelString() == "DEBUG":
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s" % toilDir)
def setUp(self):
    unittest.TestCase.setUp(self)
    self.testNo = TestStatus.getTestSetup(1, 2, 10, 10)
    self.tempDir = getTempDirectory(os.getcwd())
    self.jobTreeDir = os.path.join(self.tempDir, "testJobTree")  #A directory for the job tree to be created in
def testCactusSetup(self):
    """Creates a bunch of random inputs and then passes them to cactus setup.
    """
    for test in range(self.testNo):
        tempDir = os.path.relpath(getTempDirectory(os.getcwd()))
        sequenceNumber = random.choice(range(100))
        sequences, newickTreeString = getCactusInputs_random(tempDir=tempDir,
                                                             sequenceNumber=sequenceNumber)

        #Setup the flower disk.
        experiment = getCactusWorkflowExperimentForTest(sequences, newickTreeString,
                                                        os.path.join('/data', os.path.relpath(tempDir)))
        cactusDiskDatabaseString = experiment.getDiskDatabaseString()
        cactusSequencesPath = os.path.join(experiment.getDbDir(), "cactusSequences")
        #Run setup twice to check it can be run repeatedly.
        runCactusSetup(cactusDiskDatabaseString=cactusDiskDatabaseString,
                       cactusSequencesPath=cactusSequencesPath,
                       sequences=sequences,
                       newickTreeString=newickTreeString)
        runCactusSetup(cactusDiskDatabaseString=cactusDiskDatabaseString,
                       cactusSequencesPath=cactusSequencesPath,
                       sequences=sequences,
                       newickTreeString=newickTreeString)
        experiment.cleanupDb()
        system("rm -rf %s" % tempDir)
        logger.info("Finished test %i of cactus_setup.py", test)
def run(self, fileStore):
    logger.info("Preparing sequence for preprocessing")

    inSequence = fileStore.readGlobalFile(self.inSequenceID)

    if self.prepOptions.chunkSize <= 0:
        # In this first case we don't need to break up the sequence
        chunked = False
        inChunkList = [inSequence]
    else:
        # chunk it up
        chunked = True
        inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
        inChunkList = runGetChunks(sequenceFiles=[inSequence],
                                   chunksDir=inChunkDirectory,
                                   chunkSize=self.prepOptions.chunkSize,
                                   overlapSize=0)
        inChunkList = [os.path.abspath(path) for path in inChunkList]
        logger.info("Chunks = %s" % inChunkList)

    inChunkIDList = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in inChunkList]
    outChunkIDList = []
    #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
    if not self.chunksToCompute:
        self.chunksToCompute = list(range(len(inChunkList)))
    for i in self.chunksToCompute:
        #Calculate the number of chunks to use
        inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
        #Now get the list of chunks flanking and including the current chunk
        j = max(0, i - inChunkNumber // 2)
        inChunkIDs = inChunkIDList[j:j + inChunkNumber]
        if len(inChunkIDs) < inChunkNumber:  #This logic is like making the list circular
            inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
        assert len(inChunkIDs) == inChunkNumber
        outChunkIDList.append(self.addChild(self.getChunkedJobForCurrentStage(
            inChunkIDs,
            float(inChunkNumber) / len(inChunkIDList),
            inChunkIDList[i])).rv())

    if chunked:
        # Merge results of the chunking process back into a genome-wide file
        return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
    else:
        # Didn't chunk--we have a genome-wide fasta file
        return outChunkIDList[0]
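# A worked example of the circular flanking-window logic above, assuming 10
# chunks and a window of 4: near the end of the list the window wraps back
# to the first chunks instead of shrinking.
inChunkIDList = list(range(10))
inChunkNumber = 4
i = 9                                              # the last chunk
j = max(0, i - inChunkNumber // 2)                 # j == 7
inChunkIDs = inChunkIDList[j:j + inChunkNumber]    # [7, 8, 9]
if len(inChunkIDs) < inChunkNumber:
    inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
assert inChunkIDs == [7, 8, 9, 0]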
def run(self, fileStore):
    sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
    if self.blastOptions.gpuLastz == True:
        # wga-gpu has a 3G limit.
        self.blastOptions.chunkSize = 3000000000
    chunks = runGetChunks(sequenceFiles=sequenceFiles1,
                          chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                          chunkSize=self.blastOptions.chunkSize,
                          overlapSize=self.blastOptions.overlapSize)
    if len(chunks) == 0:
        raise Exception("no chunks produced for files: {} ".format(sequenceFiles1))
    logger.info("Broken up the sequence files into individual 'chunk' files")
    chunkIDs = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks]
    diagonalResultsID = self.addChild(MakeSelfBlasts(self.blastOptions, chunkIDs)).rv()
    offDiagonalResultsID = self.addChild(MakeOffDiagonalBlasts(self.blastOptions, chunkIDs)).rv()
    logger.debug("Collating the blasts after blasting all-against-all")
    return self.addFollowOn(CollateBlasts(self.blastOptions,
                                          [diagonalResultsID, offDiagonalResultsID])).rv()
def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised."""
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = [os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa") for ingroup in ingroups]
    outgroupPaths = [os.path.join(regionPath, outgroup + "." + encodeRegion + ".fa") for outgroup in outgroups]
    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    toilDir = os.path.join(self.tempDir, "tmp_toil")
    outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for outgroup in outgroups]
    ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for ingroup in ingroups]
    runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                       outgroups=outgroupPaths,
                                       alignmentsFile=self.tempOutputFile,
                                       outgroupFragmentPaths=outgroupFragmentPaths,
                                       ingroupCoveragePaths=ingroupCoveragePaths,
                                       toilDir=toilDir)
    for i, ingroupPath in enumerate(ingroupPaths):
        # Get the coverage from the outgroups independently and
        # check that it's the same as the file in
        # ingroupCoverageDir
        otherIngroupPath = ingroupPaths[1] if i == 0 else ingroupPaths[0]
        # To filter out alignments from the other ingroup and
        # self-alignments we need to create a fasta with all the
        # outgroup fragments in it.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupFragmentPath in outgroupFragmentPaths:
            system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
        calculateCoverage(work_dir=coverageWorkDir,
                          fromGenome=outgroupsCombined,
                          sequenceFile=ingroupPath,
                          cigarFile=self.tempOutputFile,
                          outputFile=independentCoverageFile)
        # find the coverage file cactus_blast kept (should be
        # named according to the basename of the ingroup path
        # file)
        keptCoverageFile = ingroupCoveragePaths[i]
        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
def run(self):
    #----------------------------------------
    # Run cactus_workflow.py and report time#
    #----------------------------------------
    logger.info("CactusWorkflowWrapper: going to issue cactus run for simulation %s, parameter %s\n" % (self.simulation, self.paraFile))
    tempDir = getTempDirectory(self.outDir)
    flowerdisk = os.path.join(tempDir, "cactusDisk")
    jobtreeDir = os.path.join(tempDir, "jobTree")
    batchSystem = "parasol"
    retryCount = 0
    command = "cactus_workflow.py --speciesTree='%s' %s --configFile %s --buildTrees --setupAndBuildAlignments --cactusDisk %s --logDebug --job=JOB_FILE" % (self.tree, self.sequenceFiles, self.paraFile, flowerdisk)
    starttime = time.time()
    runJobTree(command, jobtreeDir, "DEBUG", retryCount, batchSystem, None)
    runtime = time.time() - starttime
    logger.info("Done cactus_workflow for simulation %s, config %s\n" % (self.simulation, self.paraFile))

    #-----------------------
    # Run cactus_treeStats #
    #-----------------------
    statsFile = os.path.join(self.outDir, "stats", "%s.xml" % self.simName)
    runCactusTreeStats(outputFile=statsFile, cactusDisk=flowerdisk)

    #------------------- Adding child ------------------------#
    self.addChildTarget(CactusMAFGeneratorWrapper(self.outDir, tempDir,
                                                  self.simTrueMafDir, self.simName, runtime))
    logger.info("Added child CactusMAFGeneratorWrapper at %s\n" % self.outDir)

    #------------------- Cleaning up -------------------------#
    self.setFollowOnTarget(CactusWorkflowWrapperCleanup(tempDir))
def setUp(self):
    #This is the number of random problems to solve, handed to the test code
    self.testNo = TestStatus.getTestSetup(shortTestNo=1, mediumTestNo=5,
                                          longTestNo=10, veryLongTestNo=100)
    self.tempFiles = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempBlossomDirectory = self.tempDir + "/tempBlossom"
    unittest.TestCase.setUp(self)
def run(self, fileStore):
    sequenceFiles1 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs1]
    sequenceFiles2 = [fileStore.readGlobalFile(fileID) for fileID in self.sequenceFileIDs2]
    chunks1 = runGetChunks(sequenceFiles=sequenceFiles1,
                           chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                           chunkSize=self.blastOptions.chunkSize,
                           overlapSize=self.blastOptions.overlapSize)
    chunks2 = runGetChunks(sequenceFiles=sequenceFiles2,
                           chunksDir=getTempDirectory(rootDir=fileStore.getLocalTempDir()),
                           chunkSize=self.blastOptions.chunkSize,
                           overlapSize=self.blastOptions.overlapSize)
    chunkIDs1 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks1]
    chunkIDs2 = [fileStore.writeGlobalFile(chunk, cleanup=True) for chunk in chunks2]
    resultsIDs = []
    #Make the list of blast jobs.
    #TODO: Make the compression work
    self.blastOptions.compressFiles = False
    for chunkID1 in chunkIDs1:
        for chunkID2 in chunkIDs2:
            resultsIDs.append(self.addChild(RunBlast(self.blastOptions, chunkID1, chunkID2)).rv())
    logger.info("Made the list of blasts")
    #Set up the job to collate all the results
    return self.addFollowOn(CollateBlasts(self.blastOptions, resultsIDs)).rv()
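# A minimal sketch of a collation job in the spirit of CollateBlasts,
# assuming each promised results ID resolves to a cigar file in the job
# store; the real CollateBlasts is defined elsewhere in the codebase.
from toil.job import Job

class CollateBlastsSketch(Job):
    def __init__(self, resultsIDs):
        super(CollateBlastsSketch, self).__init__()
        self.resultsIDs = resultsIDs  # promises are concrete file IDs by run time

    def run(self, fileStore):
        combined = fileStore.getLocalTempFile()
        with open(combined, 'w') as out:
            for resultsID in self.resultsIDs:
                with open(fileStore.readGlobalFile(resultsID)) as part:
                    out.write(part.read())
        return fileStore.writeGlobalFile(combined)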
def setUp(self):
    #This is the number of random problems to solve, handed to the test code
    self.testNo = TestStatus.getTestSetup(shortTestNo=1, mediumTestNo=5,
                                          longTestNo=10, veryLongTestNo=100)
    self.tempFiles = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempMatchGraphDirectory = self.tempDir + "/tempMatchGraph"
    unittest.TestCase.setUp(self)
def progressiveFunction(self, experimentFile, toilDir, batchSystem, buildAvgs,
                        buildReference, buildHal, buildFasta, toilStats,
                        subtreeRoot=None):
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile, tempExperimentDir,
                                fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)

    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> headers in fasta
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        if os.path.isdir(inputSequencePath):
            # Some "input sequence paths" are actually provided as
            # directories containing multiple FASTAs
            concatenatedPath = getTempFile()
            system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
            inputSequencePath = concatenatedPath
        headers[genomeName] = list(map(itemgetter(0), fastaRead(inputSequencePath)))

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % tempExperimentDir):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] == 's':
                    # Sequence line
                    genome = fields[1][1:-1]
                    header = fields[2][1:-1]
                    if genome in headers and genome not in outgroups:
                        # This genome is an input genome
                        self.assertTrue(header in headers[genome],
                                        'Header %s from output c2h %s not found in input fa %s'
                                        ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
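# For context, a hypothetical c2h sequence line and how the parsing above
# reads it: fields[1] holds the quoted genome name and fields[2] the quoted
# sequence header, with the quotes stripped by the [1:-1] slices.
line = "s\t'human'\t'chr6_sample'\t1\n"
fields = line.split('\t')
assert fields[0] == 's'
assert fields[1][1:-1] == 'human'
assert fields[2][1:-1] == 'chr6_sample'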
def setUp(self):
    self.testNo = TestStatus.getTestSetup(1, 5, 10, 100)
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFiles = []
    unittest.TestCase.setUp(self)
    self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
    self.tempFiles.append(self.tempOutputFile)
    self.tempOutputFile2 = os.path.join(self.tempDir, "results2.txt")
    self.tempFiles.append(self.tempOutputFile2)
    self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
def repeat_masking_job(job, input_fasta, species):
    temp_dir = getTempDirectory()
    os.chdir(temp_dir)
    local_fasta = os.path.join(temp_dir, 'input.fa')
    job.fileStore.readGlobalFile(input_fasta, local_fasta, cache=False)
    system("chmod a+rw %s" % local_fasta)
    system("RepeatMasker -pa 10 -species {species} {input}".format(species=species,
                                                                   input=local_fasta))
    output_path = local_fasta + '.out'
    masked_out = job.fileStore.writeGlobalFile(output_path)
    return masked_out
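# A minimal sketch of wiring repeat_masking_job into a Toil workflow; the
# job store path, input file name, and species value are placeholders.
import os
from toil.common import Toil
from toil.job import Job

options = Job.Runner.getDefaultOptions('./masking-jobstore')
with Toil(options) as toil:
    input_fasta = toil.importFile('file://' + os.path.abspath('genome.fa'))
    masked_out = toil.start(Job.wrapJobFn(repeat_masking_job, input_fasta, 'human'))
    toil.exportFile(masked_out, 'file://' + os.path.abspath('genome.fa.out'))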
def setUp(self):
    unittest.TestCase.setUp(self)
    self.encodeRegion = "ENm001"
    self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
    self.regionPath = os.path.join(self.encodePath, self.encodeRegion)
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
    self.toilDir = os.path.join(self.tempDir, "toil")
    self.toilOptions = Job.Runner.getDefaultOptions(self.toilDir)
    self.toilOptions.disableCaching = True
def setUp(self):
    unittest.TestCase.setUp(self)
    self.trees = randomTreeSet()
    self.mcTrees = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFa = os.path.join(self.tempDir, "seq.fa")
    with open(self.tempFa, "w") as f:
        f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
    self.dummySeqMaps = []
    for tree in self.trees:
        if tree.size() < 50:
            mcTree = MultiCactusTree(tree)
            seqMap = dict()
            for i in mcTree.breadthFirstTraversal():
                mcTree.setName(i, "Node%s" % str(i))
                seqMap["Node%s" % str(i)] = self.tempFa
            mcTree.computeSubtreeRoots()
            mcTree.nameUnlabeledInternalNodes()
            self.mcTrees.append(mcTree)
            self.dummySeqMaps.append(seqMap)

    # Boreoeutherian tree
    borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
    self.borMcTree.computeSubtreeRoots()
    self.borMcTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.borMcTree)

    # Eutherian backbone tree
    backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
    self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
    self.backboneTree.computeSubtreeRoots()
    self.backboneTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.backboneTree)

    seqLens = dict()
    seqLens["HUMAN"] = 57553
    seqLens["CHIMP"] = 57344
    seqLens["BABOON"] = 58960
    seqLens["MOUSE"] = 32750
    seqLens["RAT"] = 38436
    seqLens["DOG"] = 54187
    seqLens["CAT"] = 50283
    seqLens["PIG"] = 54843
    seqLens["COW"] = 55508
    self.blanchetteSeqMap = dict()
    for event, seqLen in seqLens.items():
        p = os.path.join(self.tempDir, event + ".fa")
        with open(p, "w") as f:
            f.write(">%s\n" % event)
            f.write(''.join(['A'] * seqLen))
            f.write('\n')
        self.blanchetteSeqMap[event] = p
def testSort(self):
    for test in range(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile1 = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile1)
        lines1 = loadFile(tempFile1)
        lines1.sort()
        sort(tempFile1)
        lines2 = loadFile(tempFile1)
        checkEqual(lines1, lines2)
        system("rm -rf %s" % tempDir)
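# Hypothetical stand-ins for the helpers the test above leans on: loadFile
# reads a file's lines and checkEqual asserts two values are equal; the
# real helpers are defined elsewhere in the test utilities.
def loadFileSketch(fileName):
    with open(fileName) as f:
        return f.readlines()

def checkEqualSketch(expected, observed):
    assert expected == observed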
def testAddingOutgroupsImprovesResult(self):
    """Run blast on "ingroup" and "outgroup" encode regions, and ensure
    that adding an extra outgroup only adds alignments if possible, and
    doesn't lose any
    """
    encodeRegions = ["ENm00" + str(i) for i in range(1, 2)]
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    # subselect 4 random ordered outgroups
    outgroups = [outgroups[i] for i in sorted(random.sample(range(len(outgroups)), 4))]
    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa") for ingroup in ingroups]
        outgroupPaths = [os.path.join(regionPath, outgroup + "." + encodeRegion + ".fa") for outgroup in outgroups]
        results = []
        for numOutgroups in range(1, 5):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
            tmpToil = os.path.join(self.tempDir, "outgroupToil")
            runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                               alignmentsFile=subResults, toilDir=tmpToil)
            results.append(subResults)

        # Print diagnostics about coverage
        for i, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                ingroupCoverage = getTempFile(rootDir=self.tempDir)
                coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                calculateCoverage(work_dir=coverageWorkDir,
                                  sequenceFile=ingroupPath,
                                  cigarFile=subResults,
                                  outputFile=ingroupCoverage)
                coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

        resultsSets = [loadResults(x) for x in results]
        for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Make sure the results from (n+1) outgroups are
            # (very nearly) a superset of the results from n outgroups
            print("Using %d addl outgroup(s):" % (i + 1))
            comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Ensure that the new alignments don't cover more than
        # x% of already existing alignments to human
        for i in range(1, len(resultsSets)):
            prevResults = resultsSets[i - 1][0]
            curResults = resultsSets[i][0]
            prevResultsHumanPos = set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                                      for x in prevResults
                                      if "human" in x[0] or "human" in x[2])
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                                        for x in newAlignments
                                        if "human" in x[0] or "human" in x[2])
            print("addl outgroup %d:" % i)
            print("bases re-covered: %f (%d)" % (
                len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)) / float(len(prevResultsHumanPos)),
                len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))))
        for subResult in results:
            os.remove(subResult)
def runCactusProgressive(options):
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir,
                                  ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
def setUp(self):
    unittest.TestCase.setUp(self)
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFiles = []
    self.tempOutputFile = os.path.join(self.tempDir, "results1.txt")
    self.tempFiles.append(self.tempOutputFile)
    self.tempOutputFile2 = os.path.join(self.tempDir, "results2.txt")
    self.tempFiles.append(self.tempOutputFile2)
    self.encodePath = os.path.join(TestStatus.getPathToDataSets(), "MAY-2005")
    self.defaultLastzArguments = "--ambiguous=iupac"
    self.defaultRealignArguments = ""
def testAddingOutgroupsImprovesResult(self):
    """Run blast on "ingroup" and "outgroup" encode regions, and ensure
    that adding an extra outgroup only adds alignments if possible, and
    doesn't lose any
    """
    encodeRegions = ["ENm00" + str(i) for i in range(1, 2)]
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    # subselect 4 random ordered outgroups
    outgroups = [outgroups[i] for i in sorted(random.sample(range(len(outgroups)), 4))]
    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa") for ingroup in ingroups]
        outgroupPaths = [os.path.join(regionPath, outgroup + "." + encodeRegion + ".fa") for outgroup in outgroups]
        results = []
        for numOutgroups in range(1, 5):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            tmpJobTree = getTempDirectory()
            print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
            system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/jobTree" % (
                ",".join(ingroupPaths), ",".join(subOutgroupPaths), subResults, tmpJobTree))
            system("rm -fr %s" % tmpJobTree)
            results.append(subResults)

        # Print diagnostics about coverage
        for i, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                coveredBases = popenCatch("cactus_coverage %s %s | awk '{ total += $3 - $2 } END { print total }'" % (ingroupPath, subResults))
                print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

        resultsSets = [loadResults(x) for x in results]
        for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Make sure the results from (n+1) outgroups are
            # (very nearly) a superset of the results from n outgroups
            print("Using %d addl outgroup(s):" % (i + 1))
            comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Ensure that the new alignments don't cover more than
        # x% of already existing alignments to human
        for i in range(1, len(resultsSets)):
            prevResults = resultsSets[i - 1][0]
            curResults = resultsSets[i][0]
            prevResultsHumanPos = set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                                      for x in prevResults
                                      if "human" in x[0] or "human" in x[2])
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                                        for x in newAlignments
                                        if "human" in x[0] or "human" in x[2])
            print("addl outgroup %d:" % i)
            print("bases re-covered: %f (%d)" % (
                len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)) / float(len(prevResultsHumanPos)),
                len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))))
        for subResult in results:
            os.remove(subResult)
def testGetMidPoint(self):
    for test in range(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile)
        l = open(tempFile, 'r').read()
        fileSize = os.path.getsize(tempFile)
        midPoint = getMidPoint(tempFile, 0, fileSize)
        print("the mid point is %i of a file of %i bytes" % (midPoint, fileSize))
        assert midPoint < fileSize
        assert l[midPoint] == '\n'
        assert midPoint >= 0
        system("rm -rf %s" % tempDir)
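# A sketch of the contract getMidPoint must satisfy for the assertions
# above, assuming it returns the byte offset of a newline at or after the
# middle of the [fileStart, fileEnd) range so a parallel merge sort can
# split the file on a line boundary; the real implementation lives
# elsewhere in the codebase.
def getMidPointSketch(fileName, fileStart, fileEnd):
    with open(fileName) as f:
        f.seek(fileStart + (fileEnd - fileStart) // 2)
        f.readline()          # advance to the end of the current line
        return f.tell() - 1   # offset of that line's terminating newline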
def setUp(self):
    self.batchSystem = "singleMachine"
    if getBatchSystem() is not None:
        self.batchSystem = getBatchSystem()
    unittest.TestCase.setUp(self)
    self.useOutgroup = False
    self.doSelfAlignment = False
    #Load the config file, turn on the checks.
    configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(),
                                                        "cactus_progressive_config.xml")).getroot())
    configWrapper.turnAllModesOn()
    configWrapper.turnOffHeaderChecks()
    self.tempDir = getTempDirectory(os.getcwd())
    self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
    configWrapper.writeXML(self.configFile)
def testCopySubRangeOfFile(self):
    for test in range(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        outputFile = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile)
        fileSize = os.path.getsize(tempFile)
        assert fileSize > 0
        fileStart = random.choice(range(0, fileSize))
        fileEnd = random.choice(range(fileStart, fileSize))
        copySubRangeOfFile(tempFile, fileStart, fileEnd, outputFile)
        l = open(outputFile, 'r').read()
        l2 = open(tempFile, 'r').read()[fileStart:fileEnd]
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
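# A minimal sketch of the semantics the test above assumes for
# copySubRangeOfFile: bytes [fileStart, fileEnd) of the input file are
# copied to the output file; the real implementation is defined elsewhere.
def copySubRangeOfFileSketch(inputFile, fileStart, fileEnd, outputFile):
    with open(inputFile, 'rb') as f:
        f.seek(fileStart)
        data = f.read(fileEnd - fileStart)
    with open(outputFile, 'wb') as f:
        f.write(data)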
def run(self, fileStore):
    logger.info("Preparing sequence for preprocessing")
    # chunk it up
    inSequence = fileStore.readGlobalFile(self.inSequenceID)
    inChunkDirectory = getTempDirectory(rootDir=fileStore.getLocalTempDir())
    inChunkList = runGetChunks(sequenceFiles=[inSequence],
                               chunksDir=inChunkDirectory,
                               chunkSize=self.prepOptions.chunkSize,
                               overlapSize=0)
    inChunkList = [os.path.abspath(path) for path in inChunkList]
    logger.info("Chunks = %s" % inChunkList)
    logger.info("Chunks dir = %s" % os.listdir(inChunkDirectory))

    inChunkIDList = [fileStore.writeGlobalFile(chunk) for chunk in inChunkList]
    outChunkIDList = []
    #For each input chunk we create an output chunk, it is the output chunks that get concatenated together.
    if not self.chunksToCompute:
        self.chunksToCompute = list(range(len(inChunkList)))
    for i in self.chunksToCompute:
        #Calculate the number of chunks to use
        inChunkNumber = int(max(1, math.ceil(len(inChunkList) * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= len(inChunkList) and inChunkNumber > 0
        #Now get the list of chunks flanking and including the current chunk
        j = max(0, i - inChunkNumber // 2)
        inChunkIDs = inChunkIDList[j:j + inChunkNumber]
        if len(inChunkIDs) < inChunkNumber:  #This logic is like making the list circular
            inChunkIDs += inChunkIDList[:inChunkNumber - len(inChunkIDs)]
        assert len(inChunkIDs) == inChunkNumber
        outChunkIDList.append(self.addChild(PreprocessChunk(
            self.prepOptions, inChunkIDs,
            float(inChunkNumber) / len(inChunkIDList),
            inChunkIDList[i])).rv())
    # follow on to merge chunks
    return self.addFollowOn(MergeChunks(self.prepOptions, outChunkIDList)).rv()
def testCactusSetup(self):
    """Creates a bunch of random inputs and then passes them to cactus setup.
    """
    for test in range(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        sequenceNumber = random.choice(range(100))
        sequences, newickTreeString = getCactusInputs_random(tempDir=tempDir,
                                                             sequenceNumber=sequenceNumber)

        #Setup the flower disk.
        experiment = getCactusWorkflowExperimentForTest(sequences, newickTreeString, tempDir)
        cactusDiskDatabaseString = experiment.getDiskDatabaseString()
        #Run setup twice to check it can be run repeatedly.
        runCactusSetup(cactusDiskDatabaseString, sequences, newickTreeString)
        runCactusSetup(cactusDiskDatabaseString, sequences, newickTreeString)
        experiment.cleanupDb()
        system("rm -rf %s" % tempDir)
        logger.info("Finished test %i of cactus_setup.py", test)
def testCPecanEmMultipleTrials(self):
    """Runs cPecanEm with multiple different trials.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        tempDir = getTempDirectory(rootDir=os.getcwd())
        jobTreeDir = os.path.join(tempDir, "jobTree")
        alignmentsFile = os.path.join(tempDir, "alignments.cigars")
        computeAlignments(seqFile1, seqFile2, alignmentsFile)
        logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
        outputModelFile = os.path.join(tempDir, "outputModel.txt")
        outputModelXMLFile = os.path.join(tempDir, "outputModel.xml")
        outputBlastFile = os.path.join(tempDir, "outputBlast.txt")
        #First run the script to generate a model and do one iteration of EM to
        #get the likelihood to compare with the final likelihood
        trials = 3
        runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                    alignmentsFile=alignmentsFile,
                    outputModelFile=outputModelFile,
                    jobTreeDir=jobTreeDir,
                    trials=trials,
                    outputTrialHmms=True,
                    iterations=5,
                    randomStart=True,
                    logLevel=getLogLevelString(),
                    optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100",
                    outputXMLModelFile=outputModelXMLFile,
                    blastScoringMatrixFile=outputBlastFile)
        trialHmms = [Hmm.loadHmm(outputModelFile + ("_%i" % i)) for i in range(trials)]
        hmm = Hmm.loadHmm(outputModelFile)
        node = ET.parse(outputModelXMLFile).getroot()
        logger.info("After multiple trials and iterations of EM the best likelihood found was %s, the likelihoods of the variants were: %s" % (
            hmm.likelihood, " ".join(map(lambda x: str(x.likelihood), trialHmms))))

        matchProbs, gapOpen, gapExtend = makeBlastScoringMatrix(hmm, ("ACTG",))
        logger.info("Gap open: %s, Gap extend: %s, Match probs %s" % (
            gapOpen, gapExtend, " ".join(map(str, matchProbs))))

        self.assertTrue(float(node.attrib["maxLikelihood"]) == hmm.likelihood)

        #Now use the blast file to compute a new matrix
        computeAlignments(seqFile1, seqFile2, alignmentsFile,
                          lastzArguments=("--ambiguous=iupac --scores=%s" % outputBlastFile))

        #Run modifyHmm to check it works
        system("cPecanModifyHmm %s %s --gcContent=0.5 --substitutionRate=0.05 --setFlatIndelEmissions" % (
            outputModelFile, outputModelFile))
        hmm = Hmm.loadHmm(outputModelFile)
        node = ET.parse(outputModelXMLFile).getroot()

        system("rm -rf %s" % tempDir)
def runWorkflow_multipleExamples(testId, inputGenFunction, testNumber=1,
                                 batchSystem="single_machine", buildAvgs=False,
                                 configFile=None, buildToilStats=False,
                                 useConstraints=False,
                                 cactusWorkflowFunction=runCactusWorkflow,
                                 logLevel=None, buildHal=False, buildFasta=False,
                                 progressive=False):
    """A wrapper to run a number of examples.

    The testId parameter is used to allocate a unique port so that tests can
    run in parallel.
    """
    if logLevel is None:
        logLevel = _LOG_LEVEL
    for test in range(testNumber):
        tempDir = getTempDirectory(os.getcwd())
        if useConstraints:
            sequences, newickTreeString, constraints = inputGenFunction(regionNumber=test, tempDir=tempDir)
        else:
            sequences, newickTreeString = inputGenFunction(regionNumber=test, tempDir=tempDir)
            constraints = None
        runWorkflow_TestScript(testId, sequences, newickTreeString,
                               outputDir=tempDir,
                               batchSystem=batchSystem,
                               buildAvgs=buildAvgs,
                               buildHal=buildHal,
                               buildFasta=buildFasta,
                               configFile=configFile,
                               buildToilStats=buildToilStats,
                               constraints=constraints,
                               progressive=progressive,
                               cactusWorkflowFunction=cactusWorkflowFunction,
                               logLevel=logLevel)
        system("rm -rf %s" % tempDir)
        logger.info("Finished random test %i" % test)
def testCactus_Random_fixedAncestor(self):
    """Tests that cactus doesn't crash when aligning to a fixed ancestral sequence."""
    sequences, _ = getCactusInputs_random(treeLeafNumber=3)
    rootSeq = sequences.pop()
    # Create a star tree
    tree = '(%s)root;' % ",".join([str(x) + ":1.0" for x in range(len(sequences))])
    outputDir = getTempDirectory()
    experiment = getCactusWorkflowExperimentForTest(sequences, tree, outputDir,
                                                    progressive=True)
    experiment.setSequenceID("root", rootSeq)
    experiment.setRootReconstructed(False)
    experimentFile = os.path.join(outputDir, "experiment.xml")
    experiment.writeXML(experimentFile)
    jobTreeDir = os.path.join(outputDir, "jobTree")
    self.progressiveFunction(experimentFile, jobTreeDir, 'singleMachine',
                             False, True, True, False)
def testJobTreeStats_SortSimple(self):
    """Tests the jobTreeStats utility using the scriptTree_sort example.
    """
    for test in range(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        outputFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "jobTree")
        lines = 100000
        maxLineLength = 10
        N = 1000
        makeFileToSort(tempFile, lines, maxLineLength)
        #Sort the file
        command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %s --stats --jobTime 0.5" % (jobTreeDir, tempFile, N)
        system(command)
        #Now get the stats
        system("jobTreeStats --jobTree %s --outputFile %s" % (jobTreeDir, outputFile))
        #Cleanup
        system("rm -rf %s" % tempDir)
def runCactusGraphMapJoin(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the vgs
            vg_ids = []
            for vg_path in options.vg:
                logger.info("Importing {}".format(vg_path))
                vg_ids.append(toil.importFile(makeURL(vg_path)))

            # tack on the decoys
            if options.decoyGraph:
                logger.info("Importing decoys {}".format(options.decoyGraph))
                vg_ids.append(toil.importFile(makeURL(options.decoyGraph)))
                # we'll treat it like any other graph downstream, except clipping
                # where we'll check first using the path name
                options.vg.append(options.decoyGraph)

            # load up the hals
            hal_ids = []
            for hal_path in options.hal:
                logger.info("Importing {}".format(hal_path))
                hal_ids.append(toil.importFile(makeURL(hal_path)))

            # run the workflow
            wf_output = toil.start(Job.wrapJobFn(graphmap_join_workflow,
                                                 options, config, vg_ids, hal_ids))

        #export the split data
        export_join_data(toil, options, wf_output[0], wf_output[1], wf_output[2])
def runWorkflow_multipleExamples(inputGenFunction, testNumber=1,
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM,
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                                 inverseTestRestrictions=False, batchSystem="single_machine",
                                 buildAvgs=False, buildReference=False,
                                 configFile=None, buildToilStats=False,
                                 useConstraints=False, cactusWorkflowFunction=runCactusWorkflow,
                                 buildHal=False, buildFasta=False, progressive=False):
    """A wrapper to run a number of examples.
    """
    if (inverseTestRestrictions and TestStatus.getTestStatus() not in testRestrictions) or \
       (not inverseTestRestrictions and TestStatus.getTestStatus() in testRestrictions):
        for test in range(testNumber):
            tempDir = getTempDirectory(os.getcwd())
            if useConstraints:
                sequences, newickTreeString, constraints = inputGenFunction(regionNumber=test, tempDir=tempDir)
            else:
                sequences, newickTreeString = inputGenFunction(regionNumber=test, tempDir=tempDir)
                constraints = None
            runWorkflow_TestScript(sequences, newickTreeString,
                                   outputDir=tempDir,
                                   batchSystem=batchSystem,
                                   buildAvgs=buildAvgs,
                                   buildReference=buildReference,
                                   buildHal=buildHal,
                                   buildFasta=buildFasta,
                                   configFile=configFile,
                                   buildToilStats=buildToilStats,
                                   constraints=constraints,
                                   progressive=progressive,
                                   cactusWorkflowFunction=cactusWorkflowFunction)
            system("rm -rf %s" % tempDir)
            logger.info("Finished random test %i" % test)
def testCPecanEm(self):
    """Runs cPecanEm.
    """
    trial = 0
    for modelType in ("fiveState", "fiveStateAsymmetric", "threeState", "threeStateAsymmetric"):
        for seqFile1, seqFile2 in seqFilePairGenerator():
            tempDir = getTempDirectory(rootDir=os.getcwd())
            jobTreeDir = os.path.join(tempDir, "jobTree")
            alignmentsFile = os.path.join(tempDir, "alignments.cigars")
            computeAlignments(seqFile1, seqFile2, alignmentsFile)
            logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
            outputModelFile = os.path.join(tempDir, "outputModel.txt")
            #First run the script to generate a model and do one iteration of EM to
            #get the likelihood to compare with the final likelihood
            runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                        alignmentsFile=alignmentsFile,
                        outputModelFile=outputModelFile,
                        modelType=modelType,
                        jobTreeDir=jobTreeDir,
                        iterations=1,
                        trials=1,
                        randomStart=False,
                        logLevel=getLogLevelString(),
                        setJukesCantorStartingEmissions=0.2,
                        trainEmissions=True,
                        tieEmissions=True,
                        optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100")
            hmm = Hmm.loadHmm(outputModelFile)
            system("rm -rf %s" % jobTreeDir)  #Cleanup the old jobTree
            logger.info("For trial %s the likelihood after 1 iteration of EM is %s" % (trial, hmm.likelihood))
            iterations = 5
            runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                        alignmentsFile=alignmentsFile,
                        outputModelFile=outputModelFile,
                        jobTreeDir=jobTreeDir,
                        optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100",
                        iterations=iterations,
                        inputModelFile=outputModelFile,
                        logLevel=getLogLevelString(),
                        maxAlignmentLengthPerJob=10000)
            hmm2 = Hmm.loadHmm(outputModelFile)
            logger.info("For trial %s the likelihood after a further %s iterations of EM is %s" % (trial, iterations, hmm2.likelihood))
            self.assertTrue(hmm.likelihood < hmm2.likelihood)
            hmm2.normalise()
            logger.info("Final transitions: %s" % " ".join(map(str, hmm2.transitions)))
            logger.info("Final emissions: %s" % " ".join(map(str, hmm2.emissions)))
            system("rm -rf %s" % tempDir)
            trial += 1
def progressiveFunction(self, experimentFile, jobTreeDir, batchSystem, buildAvgs,
                        buildReference, buildHal, buildFasta, jobTreeStats,
                        subtreeRoot=None):
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCactusCreateMultiCactusProject(experimentFile, tempExperimentDir,
                                      fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         jobTreeDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         jobTreeStats=jobTreeStats)
    runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
    system("rm -rf %s" % tempDir)
def setUp(self):
    unittest.TestCase.setUp(self)
    self.trees = randomTreeSet()
    self.mcTrees = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFa = os.path.join(self.tempDir, "seq.fa")
    with open(self.tempFa, "w") as f:
        f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
    self.dummySeqMaps = []
    for tree in self.trees:
        if tree.size() < 500:
            mcTree = MultiCactusTree(tree, tree.degree())
            seqMap = dict()
            for i in mcTree.breadthFirstTraversal():
                mcTree.setName(i, "Node%s" % str(i))
                seqMap["Node%s" % str(i)] = self.tempFa
            mcTree.computeSubtreeRoots()
            self.mcTrees.append(mcTree)
            self.dummySeqMaps.append(seqMap)
    seqLens = dict()
    seqLens["HUMAN"] = 57553
    seqLens["CHIMP"] = 57344
    seqLens["BABOON"] = 58960
    seqLens["MOUSE"] = 32750
    seqLens["RAT"] = 38436
    seqLens["DOG"] = 54187
    seqLens["CAT"] = 50283
    seqLens["PIG"] = 54843
    seqLens["COW"] = 55508
    self.blanchetteSeqMap = dict()
    for event, seqLen in seqLens.items():
        p = os.path.join(self.tempDir, event + ".fa")
        with open(p, "w") as f:
            f.write(">%s\n" % event)
            f.write(''.join(['A'] * seqLen))
            f.write('\n')
        self.blanchetteSeqMap[event] = p
def runWorkflow_multipleExamples(inputGenFunction, testNumber=1,
                                 testRestrictions=(TestStatus.TEST_SHORT, TestStatus.TEST_MEDIUM,
                                                   TestStatus.TEST_LONG, TestStatus.TEST_VERY_LONG,),
                                 inverseTestRestrictions=False, batchSystem="single_machine",
                                 buildAvgs=False, buildReference=False,
                                 configFile=None, buildJobTreeStats=False,
                                 useConstraints=False, cactusWorkflowFunction=runCactusWorkflow,
                                 buildHal=False, buildFasta=False, progressive=False):
    """A wrapper to run a number of examples.
    """
    if (inverseTestRestrictions and TestStatus.getTestStatus() not in testRestrictions) or \
       (not inverseTestRestrictions and TestStatus.getTestStatus() in testRestrictions):
        for test in range(testNumber):
            tempDir = getTempDirectory(os.getcwd())
            if useConstraints:
                sequences, newickTreeString, constraints = inputGenFunction(regionNumber=test, tempDir=tempDir)
            else:
                sequences, newickTreeString = inputGenFunction(regionNumber=test, tempDir=tempDir)
                constraints = None
            experiment = runWorkflow_TestScript(sequences, newickTreeString,
                                                outputDir=tempDir,
                                                batchSystem=batchSystem,
                                                buildAvgs=buildAvgs,
                                                buildReference=buildReference,
                                                buildHal=buildHal,
                                                buildFasta=buildFasta,
                                                configFile=configFile,
                                                buildJobTreeStats=buildJobTreeStats,
                                                constraints=constraints,
                                                progressive=progressive,
                                                cactusWorkflowFunction=cactusWorkflowFunction)
            experiment.cleanupDb()
            system("rm -rf %s" % tempDir)
            logger.info("Finished random test %i" % test)
def scriptTree_SortTest(testNo, batchSystem, lines=100000, maxLineLength=10, N=1000):
    """Tests scriptTree/jobTree by sorting a file in parallel.
    """
    for test in range(testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "jobTree")
        makeFileToSort(tempFile, lines=lines, maxLineLength=maxLineLength)
        #First make our own sorted version
        fileHandle = open(tempFile, 'r')
        l = fileHandle.readlines()
        l.sort()
        fileHandle.close()
        #Sort the file
        command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %i --batchSystem %s --jobTime 1.0 --maxJobs 20" % (jobTreeDir, tempFile, N, batchSystem)
        system(command)
        #Now check the file is properly sorted
        fileHandle = open(tempFile, 'r')
        l2 = fileHandle.readlines()
        fileHandle.close()
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    seed = random.randint(0, 2**31)
    parser = argparse.ArgumentParser(description='Run little hal test')
    parser.add_argument('--preset', type=str,
                        help='halGenRandom preset to use [small, medium, big, large]',
                        default='small')
    args = parser.parse_args()
    rval = 0
    print("chunk, comp, time(gen), time(cons), fsize(k)")
    try:
        for chunkSize in [10000, 100000, 1000000, 10000000]:
            for compression in [0, 2, 5, 7, 9]:
                try:
                    tempDir = getTempDirectory(rootDir="./")
                    tempFile = getTempFile(suffix=".h5", rootDir=tempDir)
                except Exception:
                    traceback.print_exc(file=sys.stdout)
                    return 1
                t = time.time()
                runHalGen(args.preset, seed, chunkSize, compression, tempFile)
                fsize = os.path.getsize(tempFile)
                th = time.time() - t
                runHalCons(tempFile, getTempFile(rootDir=tempDir))
                tc = time.time() - th - t
                print("%d, %d, %.3f, %.3f, %.2f" % (chunkSize, compression, th, tc, fsize / 1024.))
    except Exception:
        traceback.print_exc(file=sys.stdout)
        return 1
    system("rm -rf %s" % tempDir)
    return rval