def getChunks(self, sequenceFiles, chunksDir):
    """Run cactus_blast_chunkSequences over the sequence files and list its output.

    The chunk size and overlap size come from ``self.blastOptions``; the
    chunker is told to work in ``chunksDir``. Returns the non-empty lines
    printed by the tool (presumably one chunk file path per line).
    """
    chunkerArgs = (getLogLevelString(),
                   self.blastOptions.chunkSize,
                   self.blastOptions.overlapSize,
                   chunksDir,
                   " ".join(sequenceFiles))
    chunkerOutput = popenCatch("cactus_blast_chunkSequences %s %i %i %s %s" % chunkerArgs)
    # Splitting on newlines leaves empty entries (e.g. after a trailing newline); drop them.
    return [line for line in chunkerOutput.split("\n") if line != ""]
def getRandomConfigFile():
    """Write a randomised variant of cactus_config.xml and return the new file's path.

    The single "caf" node gets randomly drawn values for its annealing,
    trimming and coverage attributes, the "check" node has runCheck switched
    on and the "normal" node is set to two iterations. The result is written
    to a fresh temp file under "./"; when the log level is DEBUG the file is
    also cat'ed.
    """
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1

    # NOTE: the random draws below are kept in their original order so that a
    # seeded run produces the same configuration as before.
    annealingRounds = 1 + int(random.random() * 10)
    cafNode.attrib["annealingRounds"] = " ".join(
        [str(1 + int(random.random() * 10)) for _ in xrange(annealingRounds)])
    # De-annealing rounds are a sorted set of distinct values (possibly empty).
    deannealingRounds = sorted(set(
        [1 + int(random.random() * 10) for _ in xrange(int(random.random() * 10))]))
    cafNode.attrib["deannealingRounds"] = " ".join([str(i) for i in deannealingRounds])
    # One trim value per annealing round.
    cafNode.attrib["trim"] = " ".join(
        [str(1 + int(random.random() * 5)) for _ in xrange(annealingRounds)])
    cafNode.attrib["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)
    cafNode.attrib["minimumTreeCoverage"] = str(random.random())
    cafNode.attrib["blockTrim"] = str(int(random.random() * 5))
    cafNode.attrib["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(random.choice([0, 1]))
    cafNode.attrib["minimumBlockDegree"] = str(random.choice([0, 5]))

    checkNode = config.find("check")
    checkNode.attrib["runCheck"] = "1"
    normalNode = config.find("normal")
    normalNode.attrib["iterations"] = "2"

    # Now print the file; the context manager closes the handle even if
    # serialisation raises.
    with open(tempConfigFile, 'w') as fileHandle:
        ET.ElementTree(config).write(fileHandle)
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
def createJob(attrib, parent, config):
    """Creates an XML record for the job in a file within the hierarchy of jobs.

    attrib: mapping copied onto the job's first "followOn" element; its
        "time" entry becomes the job's "total_time".
    parent: value stored as the job's "parent" attribute, or None for no parent.
    config: element whose ``attrib`` supplies the temp file/directory
        allocators and the settings copied onto the job.

    Returns the new "job" element (it is not written to disk here).
    """
    job = ET.Element("job")
    job.attrib["file"] = config.attrib["job_file_dir"].getTempFile(".xml")
    job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
    job.attrib["colour"] = "grey"
    followOns = ET.SubElement(job, "followOns")
    ET.SubElement(followOns, "followOn", attrib.copy())
    if parent is not None:
        job.attrib["parent"] = parent
    job.attrib["child_count"] = str(0)
    job.attrib["black_child_count"] = str(0)
    job.attrib["log_level"] = getLogLevelString()
    # The log file for the actual command
    job.attrib["log_file"] = config.attrib["log_file_dir"].getTempFile(".log")
    # The log file for the slave
    job.attrib["slave_log_file"] = config.attrib["slave_log_file_dir"].getTempFile(".log")
    job.attrib["global_temp_dir"] = config.attrib["temp_dir_dir"].getTempDirectory()
    job.attrib["job_creation_time"] = str(time.time())
    # Settings passed straight through from the config.
    for key in ("environment_file", "job_time", "max_log_file_size",
                "default_memory", "default_cpu"):
        job.attrib[key] = config.attrib[key]
    job.attrib["total_time"] = attrib["time"]
    if bool(int(config.attrib["reportAllJobLogFiles"])):
        job.attrib["reportAllJobLogFiles"] = ""
    # "in" replaces dict.has_key, which only exists in Python 2.
    if "stats" in config.attrib:
        # The file to store stats in
        job.attrib["stats"] = config.attrib["log_file_dir"].getTempFile(".xml")
    ET.SubElement(job, "children")
    return job
def run(self):
    """Chunk the input sequence and schedule one PreprocessChunk child per chunk.

    The input is split with cactus_blast_chunkSequences (overlap 0). For every
    input chunk an output chunk path is reserved, and a child target is added
    that receives a window of input chunks around the current one. A
    MergeChunks follow-on is given all the output chunk paths.
    """
    logger.info("Preparing sequence for preprocessing")
    # chunk it up
    inChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksIn"))
    chunkerOutput = popenCatch("cactus_blast_chunkSequences %s %i 0 %s %s" % (
        getLogLevelString(), self.prepOptions.chunkSize,
        inChunkDirectory, self.inSequencePath))
    inChunkList = [chunk for chunk in chunkerOutput.split("\n") if chunk != ""]
    outChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksOut"))
    totalChunks = len(inChunkList)
    # For each input chunk we create an output chunk, it is the output chunks
    # that get concatenated together.
    outChunkList = [os.path.join(outChunkDirectory, "chunk_%i" % i) for i in xrange(totalChunks)]

    if totalChunks > 0:
        # The number of chunks to sample is the same for every chunk, so it is
        # calculated once (and only when there is something to process, as the
        # sanity check below cannot hold for an empty list).
        inChunkNumber = int(max(1, math.ceil(totalChunks * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= totalChunks and inChunkNumber > 0
        proportionSampled = float(inChunkNumber) / totalChunks

    for i, (inChunk, outChunk) in enumerate(zip(inChunkList, outChunkList)):
        # Now get the list of chunks flanking and including the current chunk.
        # Floor division keeps the slice index an integer on every Python version.
        j = max(0, i - inChunkNumber // 2)
        inChunks = inChunkList[j:j + inChunkNumber]
        if len(inChunks) < inChunkNumber:
            # This logic is like making the list circular
            inChunks += inChunkList[:inChunkNumber - len(inChunks)]
        assert len(inChunks) == inChunkNumber
        self.addChildTarget(PreprocessChunk(self.prepOptions, inChunks,
                                            proportionSampled, inChunk, outChunk))
    # follow on to merge chunks
    self.setFollowOnTarget(MergeChunks(self.prepOptions, outChunkList, self.outSequencePath))
def testBlastRandom(self):
    """Make some sequences, put them in a file, call blast with random parameters and check it runs okay.

    Each of the ``self.testNo`` rounds writes 0-9 mutated (and randomly
    reverse-complemented) copies of one random sequence to a FASTA file, runs
    runCactusBlast on it with a random chunk and overlap size, and removes the
    toil directory afterwards.
    """
    tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(tempSeqFile)
    for test in xrange(self.testNo):
        seqNo = random.choice(xrange(0, 10))
        baseSeq = getRandomSequence(8000)[1]
        # The context manager closes the file even if writing fails.
        with open(tempSeqFile, 'w') as fileHandle:
            # Every copy is mutated from the same base sequence, and all the
            # mutations are drawn before any strand flips.
            mutatedSeqs = [(str(i), mutateSequence(baseSeq, 0.3 * random.random()))
                           for i in xrange(seqNo)]
            for fastaHeader, mutatedSeq in mutatedSeqs:
                if random.random() > 0.5:
                    mutatedSeq = reverseComplement(mutatedSeq)
                fastaWrite(fileHandle, fastaHeader, mutatedSeq)
        chunkSize = random.choice(xrange(500, 9000))
        overlapSize = random.choice(xrange(2, 100))
        toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
        runCactusBlast([tempSeqFile], self.tempOutputFile, toilDir, chunkSize, overlapSize)
        if getLogLevelString() == "DEBUG":
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s " % toilDir)
def test_bedParsing(self):
    """ mafComparator should parse a bed file and use the intervals for testing

    For every case in ``self.knownValues`` the two mafs and the bed text are
    written out, mafComparator is run on them and the 'totalTrue' counts in
    its XML output are compared with the expected values. A case whose
    expected in-interval total is None must produce no 'A' result element.
    """
    outputPath = os.path.join('tempTestFiles', 'output.xml')
    for maf1, maf2, bed, totalTrue, totalTrueInInterval in self.knownValues:
        if not os.path.exists('tempTestFiles'):
            os.mkdir('tempTestFiles')
        # Context managers close each file even if a write fails.
        with open(self.maf1path, 'w') as f:
            f.write('%s%s%s' % (self.header, maf1, self.footer))
        with open(self.maf2path, 'w') as f:
            f.write('%s%s%s' % (self.header, maf2, self.footer))
        with open(self.bedpath, 'w') as f:
            f.write('%s' % bed)
        cmd = ['mafComparator',
               '--mafFile1=%s' % self.maf1path,
               '--mafFile2=%s' % self.maf2path,
               '--outputFile=%s' % outputPath]
        if bed != '':
            cmd.append('--bedFiles=%s' % os.path.join('tempTestFiles', 'bed.bed'))
        cmd.append('--sampleNumber=1000 --logLevel %s' % getLogLevelString())
        system(" ".join(cmd))
        homTests = ET.parse(outputPath).findall('homologyTests')
        aggregate = homTests[0].find('aggregateResults')
        # assertAlmostEqual is the supported name; assertAlmostEquals is a deprecated alias.
        self.assertAlmostEqual(totalTrue, float(aggregate.find('all').attrib['totalTrue']))
        if totalTrueInInterval is None:
            self.assertEqual(None, aggregate.find('A'))
        else:
            self.assertAlmostEqual(totalTrueInInterval,
                                   float(aggregate.find('A').attrib['totalTrue']))
        # NOTE(review): cleanup is assumed to run once per case, matching the
        # per-case directory check at the top of the loop - confirm.
        shutil.rmtree(os.path.dirname(self.maf1path))
def run(self):
    """Chunk the flower's sequences and add an all-against-all blast child target.

    cactus_blast_chunkFlowerSequences is run with the chunk size, overlap size
    and minimum sequence length from ``self.blastOptions``; the non-empty lines
    of its output are handed to MakeBlastsAllAgainstAll together with
    ``self.finalResultsFile``.
    """
    chunksDir = makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks"))
    chunkerCommand = "cactus_blast_chunkFlowerSequences %s '%s' %s %i %i %i %s" % (
        getLogLevelString(), self.cactusDisk, self.flowerName,
        self.blastOptions.chunkSize, self.blastOptions.overlapSize,
        self.blastOptions.minimumSequenceLength, chunksDir)
    chunks = []
    for outputLine in popenCatch(chunkerCommand).split("\n"):
        # Skip the blank entries produced by splitting on newlines.
        if outputLine != "":
            chunks.append(outputLine)
    logger.info("Broken up the flowers into individual 'chunk' files")
    self.addChildTarget(MakeBlastsAllAgainstAll(self.blastOptions, chunks, self.finalResultsFile))
def reloadJobTree(jobTree):
    """Load an existing job tree from its directory.

    Checks that the config file, the environment file and the job file
    directory exist, refreshes the stored log level with the current one,
    rewrites the config and loads the batch system.

    Returns the tuple ``(config, batchSystem)``.
    """
    logger.info("The job tree appears to already exist, so we'll reload it")
    configFile = getConfigFileName(jobTree)
    # A valid job tree must contain the config file ...
    assert os.path.isfile(configFile)
    # ... a pickle file which encodes the path environment of the job ...
    assert os.path.isfile(getEnvironmentFileName(jobTree))
    # ... and a directory of jobs.
    assert os.path.isdir(getJobFileDirName(jobTree))
    config = ET.parse(configFile).getroot()
    config.attrib["log_level"] = getLogLevelString()
    # This updates the on disk config file with the new logging setting
    writeConfig(config)
    batchSystem = loadTheBatchSystem(config)
    logger.info("Reloaded the jobtree")
    return config, batchSystem
def testCPecanEmMultipleTrials(self):
    """Runs cPecanEm with multiple different trials.

    For each sequence pair: compute alignments, train with three random-start
    trials, check the XML model's maxLikelihood equals the loaded model's
    likelihood, recompute the alignments with the generated scoring matrix
    and finally run cPecanModifyHmm on the model.
    """
    realignOptions = "--diagonalExpansion=6 --splitMatrixBiggerThanThis=100"
    trials = 3
    for seqFile1, seqFile2 in seqFilePairGenerator():
        tempDir = getTempDirectory(rootDir=os.getcwd())
        jobTreeDir = os.path.join(tempDir, "jobTree")
        alignmentsFile = os.path.join(tempDir, "alignments.cigars")
        computeAlignments(seqFile1, seqFile2, alignmentsFile)
        logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
        outputModelFile = os.path.join(tempDir, "outputModel.txt")
        outputModelXMLFile = os.path.join(tempDir, "outputModel.xml")
        outputBlastFile = os.path.join(tempDir, "outputBlast.txt")

        # Train the model, keeping the HMM produced by every trial.
        runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                    alignmentsFile=alignmentsFile,
                    outputModelFile=outputModelFile,
                    jobTreeDir=jobTreeDir,
                    trials=trials,
                    outputTrialHmms=True,
                    iterations=5,
                    randomStart=True,
                    logLevel=getLogLevelString(),
                    optionsToRealign=realignOptions,
                    outputXMLModelFile=outputModelXMLFile,
                    blastScoringMatrixFile=outputBlastFile)
        trialHmms = []
        for trialIndex in xrange(trials):
            trialHmms.append(Hmm.loadHmm(outputModelFile + ("_%i" % trialIndex)))
        hmm = Hmm.loadHmm(outputModelFile)
        node = ET.parse(outputModelXMLFile).getroot()
        trialLikelihoods = " ".join([str(trialHmm.likelihood) for trialHmm in trialHmms])
        logger.info("After multiple trials and iterations of EM the best likelihood found was %s, the likelihoods of the variants were: %s" %
                    (hmm.likelihood, trialLikelihoods))

        matchProbs, gapOpen, gapExtend = makeBlastScoringMatrix(hmm, ("ACTG",))
        logger.info("Gap open: %s, Gap extend: %s, Match probs %s" %
                    (gapOpen, gapExtend, " ".join([str(prob) for prob in matchProbs])))
        self.assertTrue(float(node.attrib["maxLikelihood"]) == hmm.likelihood)

        # Now use the blast file to compute a new matrix
        computeAlignments(seqFile1, seqFile2, alignmentsFile,
                          lastzArguments=("--ambiguous=iupac --scores=%s" % outputBlastFile))

        # Run modifyHmm to check it works; reloading both outputs is part of the check.
        system("cPecanModifyHmm %s %s --gcContent=0.5 --substitutionRate=0.05 --setFlatIndelEmissions" %
               (outputModelFile, outputModelFile))
        hmm = Hmm.loadHmm(outputModelFile)
        node = ET.parse(outputModelXMLFile).getroot()
        system("rm -rf %s" % tempDir)
def createJobTree(options):
    """Create the job tree directories and config for the first time.

    Makes the job tree directory and its job file directory, builds the
    "config" element from ``options``, loads the batch system, records the
    rescue-jobs polling frequency and writes the config with ``writeConfig``.

    Returns the tuple ``(config, batchSystem)``.
    """
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = absSymPath(options.jobTree)
    os.mkdir(options.jobTree)
    os.mkdir(getJobFileDirName(options.jobTree))

    config = ET.Element("config")
    settings = config.attrib  # alias for the element's attribute dict
    settings["log_level"] = getLogLevelString()
    settings["job_tree"] = options.jobTree
    settings["parasol_command"] = options.parasolCommand
    # The try count is one more than the requested number of retries.
    settings["try_count"] = str(int(options.retryCount) + 1)
    settings["max_job_duration"] = str(float(options.maxJobDuration))
    settings["batch_system"] = options.batchSystem
    settings["job_time"] = str(float(options.jobTime))
    settings["max_log_file_size"] = str(int(options.maxLogFileSize))
    settings["default_memory"] = str(int(options.defaultMemory))
    settings["default_cpu"] = str(int(options.defaultCpu))
    settings["max_cpus"] = str(int(options.maxCpus))
    settings["max_memory"] = str(int(options.maxMemory))
    settings["max_threads"] = str(int(options.maxThreads))
    # The "big" batch system settings are only recorded when one is configured.
    if options.bigBatchSystem is not None:
        settings["big_batch_system"] = options.bigBatchSystem
        settings["big_memory_threshold"] = str(int(options.bigMemoryThreshold))
        settings["big_cpu_threshold"] = str(int(options.bigCpuThreshold))
        settings["big_max_cpus"] = str(int(options.bigMaxCpus))
        settings["big_max_memory"] = str(int(options.bigMaxMemory))
    if options.stats:
        # Only the presence of the attribute matters, hence the empty value.
        settings["stats"] = ""

    # Load the batch system.
    batchSystem = loadTheBatchSystem(config, options)
    logger.info("Loaded the batch system %s" % batchSystem)

    # Set the parameters determining the polling frequency of the system;
    # an explicit option overrides the batch system's own value.
    settings["rescue_jobs_frequency"] = str(float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency is not None:
        settings["rescue_jobs_frequency"] = str(float(options.rescueJobsFrequency))
    writeConfig(config)
    logger.info("Finished the job tree setup")
    return config, batchSystem
def getRandomConfigFile():
    """Write a randomised variant of cactus_config.xml and return the new file's path.

    The single "caf" node gets randomly drawn values for its annealing,
    trimming and coverage attributes, the "check" node has runCheck switched
    on and the "normal" node is set to two iterations. The result is written
    to a fresh temp file under "./"; when the log level is DEBUG the file is
    also cat'ed.
    """
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1

    # NOTE: the random draws below are kept in their original order so that a
    # seeded run produces the same configuration as before.
    annealingRounds = 1 + int(random.random() * 10)
    cafNode.attrib["annealingRounds"] = " ".join(
        [str(1 + int(random.random() * 10)) for _ in xrange(annealingRounds)])
    # De-annealing rounds are a sorted set of distinct values (possibly empty).
    deannealingRounds = sorted(set(
        [1 + int(random.random() * 10) for _ in xrange(int(random.random() * 10))]))
    cafNode.attrib["deannealingRounds"] = " ".join([str(i) for i in deannealingRounds])
    # One trim value per annealing round.
    cafNode.attrib["trim"] = " ".join(
        [str(1 + int(random.random() * 5)) for _ in xrange(annealingRounds)])
    cafNode.attrib["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)
    cafNode.attrib["minimumTreeCoverage"] = str(random.random())
    cafNode.attrib["blockTrim"] = str(int(random.random() * 5))
    cafNode.attrib["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(random.choice([0, 1]))
    cafNode.attrib["minimumBlockDegree"] = str(random.choice([0, 5]))

    checkNode = config.find("check")
    checkNode.attrib["runCheck"] = "1"
    normalNode = config.find("normal")
    normalNode.attrib["iterations"] = "2"

    # Now print the file; the context manager closes the handle even if
    # serialisation raises.
    with open(tempConfigFile, 'w') as fileHandle:
        ET.ElementTree(config).write(fileHandle)
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
def testCPecanEm(self):
    """Runs cPecanEm.

    For every model type and sequence pair, one iteration of EM is run from a
    non-random start, then five further iterations are run starting from that
    model; the test asserts that the likelihood increased.
    """
    realignOptions = "--diagonalExpansion=6 --splitMatrixBiggerThanThis=100"
    modelTypes = ("fiveState", "fiveStateAsymmetric", "threeState", "threeStateAsymmetric")
    trial = 0
    for modelType in modelTypes:
        for seqFile1, seqFile2 in seqFilePairGenerator():
            tempDir = getTempDirectory(rootDir=os.getcwd())
            jobTreeDir = os.path.join(tempDir, "jobTree")
            alignmentsFile = os.path.join(tempDir, "alignments.cigars")
            computeAlignments(seqFile1, seqFile2, alignmentsFile)
            logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
            outputModelFile = os.path.join(tempDir, "outputModel.txt")
            sequencePair = [seqFile1, seqFile2]

            # First run the script to generate a model and do one iteration
            # of EM to get the likelihood to compare with the final likelihood
            runCPecanEm(sequenceFiles=sequencePair,
                        alignmentsFile=alignmentsFile,
                        outputModelFile=outputModelFile,
                        modelType=modelType,
                        jobTreeDir=jobTreeDir,
                        iterations=1,
                        trials=1,
                        randomStart=False,
                        logLevel=getLogLevelString(),
                        setJukesCantorStartingEmissions=0.2,
                        trainEmissions=True,
                        tieEmissions=True,
                        optionsToRealign=realignOptions)
            hmm = Hmm.loadHmm(outputModelFile)
            # Cleanup the old jobTree
            system("rm -rf %s" % jobTreeDir)
            logger.info("For trial %s the likelihood after 1 iteration of EM is %s" %
                        (trial, hmm.likelihood))

            # Continue training from the model that was just written.
            iterations = 5
            runCPecanEm(sequenceFiles=sequencePair,
                        alignmentsFile=alignmentsFile,
                        outputModelFile=outputModelFile,
                        jobTreeDir=jobTreeDir,
                        optionsToRealign=realignOptions,
                        iterations=iterations,
                        inputModelFile=outputModelFile,
                        logLevel=getLogLevelString(),
                        maxAlignmentLengthPerJob=10000)
            hmm2 = Hmm.loadHmm(outputModelFile)
            logger.info("For trial %s the likelihood after a further %s iterations of EM is %s" %
                        (trial, iterations, hmm2.likelihood))
            self.assertTrue(hmm.likelihood < hmm2.likelihood)
            hmm2.normalise()
            logger.info("Final transitions: %s" %
                        " ".join([str(value) for value in hmm2.transitions]))
            logger.info("Final emissions: %s" %
                        " ".join([str(value) for value in hmm2.emissions]))
            system("rm -rf %s" % tempDir)
            trial += 1
def createJobTree(options):
    """Create the job tree directories and config for the first time.

    Makes the job tree directory and its job file directory, builds the
    "config" element from ``options``, loads the batch system, records the
    rescue-jobs polling frequency and writes the config with ``writeConfig``.

    Returns the tuple ``(config, batchSystem)``.
    """
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = absSymPath(options.jobTree)
    os.mkdir(options.jobTree)
    os.mkdir(getJobFileDirName(options.jobTree))

    config = ET.Element("config")
    settings = config.attrib  # alias for the element's attribute dict
    settings["log_level"] = getLogLevelString()
    settings["job_tree"] = options.jobTree
    settings["parasol_command"] = options.parasolCommand
    # The try count is one more than the requested number of retries.
    settings["try_count"] = str(int(options.retryCount) + 1)
    settings["max_job_duration"] = str(float(options.maxJobDuration))
    settings["batch_system"] = options.batchSystem
    settings["job_time"] = str(float(options.jobTime))
    settings["max_log_file_size"] = str(int(options.maxLogFileSize))
    settings["default_memory"] = str(int(options.defaultMemory))
    settings["default_cpu"] = str(int(options.defaultCpu))
    settings["max_cpus"] = str(int(options.maxCpus))
    settings["max_memory"] = str(int(options.maxMemory))
    settings["max_threads"] = str(int(options.maxThreads))
    # The "big" batch system settings are only recorded when one is configured.
    if options.bigBatchSystem is not None:
        settings["big_batch_system"] = options.bigBatchSystem
        settings["big_memory_threshold"] = str(int(options.bigMemoryThreshold))
        settings["big_cpu_threshold"] = str(int(options.bigCpuThreshold))
        settings["big_max_cpus"] = str(int(options.bigMaxCpus))
        settings["big_max_memory"] = str(int(options.bigMaxMemory))
    if options.stats:
        # Only the presence of the attribute matters, hence the empty value.
        settings["stats"] = ""

    # Load the batch system.
    batchSystem = loadTheBatchSystem(config)

    # Set the parameters determining the polling frequency of the system;
    # an explicit option overrides the batch system's own value.
    settings["rescue_jobs_frequency"] = str(float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency is not None:
        settings["rescue_jobs_frequency"] = str(float(options.rescueJobsFrequency))
    writeConfig(config)
    logger.info("Finished the job tree setup")
    return config, batchSystem
def testBlastRandom(self):
    """Make some sequences, put them in a file, call blast with random parameters and check it runs okay.

    Each of the ``self.testNo`` rounds writes 0-9 mutated (and randomly
    reverse-complemented) copies of one random sequence to a FASTA file, runs
    runCactusBlast on it with a random chunk and overlap size, and removes the
    toil directory afterwards.
    """
    tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(tempSeqFile)
    for test in xrange(self.testNo):
        seqNo = random.choice(xrange(0, 10))
        baseSeq = getRandomSequence(8000)[1]
        # The context manager closes the file even if writing fails.
        with open(tempSeqFile, 'w') as fileHandle:
            # Every copy is mutated from the same base sequence, and all the
            # mutations are drawn before any strand flips.
            mutatedSeqs = [(str(i), mutateSequence(baseSeq, 0.3 * random.random()))
                           for i in xrange(seqNo)]
            for fastaHeader, mutatedSeq in mutatedSeqs:
                if random.random() > 0.5:
                    mutatedSeq = reverseComplement(mutatedSeq)
                fastaWrite(fileHandle, fastaHeader, mutatedSeq)
        chunkSize = random.choice(xrange(500, 9000))
        overlapSize = random.choice(xrange(2, 100))
        toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
        runCactusBlast([tempSeqFile], self.tempOutputFile, toilDir, chunkSize, overlapSize)
        if getLogLevelString() == "DEBUG":
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s " % toilDir)
def test_seedTesting(self):
    """ mafComparator should have replicatable runs via the --seed command

    For every maf pair in ``self.knownValues`` mafComparator is run once
    without a seed, the seed it reports is read back from the output, and ten
    further runs with that ``--seed`` must report the same seed and the same
    aggregate 'totalTrue', 'totalFalse' and 'average' values for both
    homology tests.
    """
    outputPath = os.path.join('tempTestFiles', 'output.xml')
    for maf1, maf2 in self.knownValues:
        if not os.path.exists('tempTestFiles'):
            os.mkdir('tempTestFiles')
        # Context managers close each file even if a write fails.
        with open(self.maf1path, 'w') as f:
            f.write('%s%s%s' % (self.header, maf1, self.footer))
        with open(self.maf2path, 'w') as f:
            f.write('%s%s%s' % (self.header, maf2, self.footer))
        cmd = ['mafComparator',
               '--mafFile1=%s' % self.maf1path,
               '--mafFile2=%s' % self.maf2path,
               '--outputFile=%s' % outputPath,
               '--sampleNumber=10 --logLevel %s' % getLogLevelString()]
        system(" ".join(cmd))
        tree = ET.parse(outputPath)
        seed = int(tree.getroot().attrib['seed'])
        origHomTests = tree.findall('homologyTests')

        # Re-run with the reported seed and expect identical results.
        cmd.append('--seed=%d' % seed)
        for i in xrange(0, 10):
            system(" ".join(cmd))
            tree = ET.parse(outputPath)
            homTests = tree.findall('homologyTests')
            self.assertEqual(seed, int(tree.getroot().attrib['seed']))
            for elm in ['totalTrue', 'totalFalse', 'average']:
                for testIndex in (0, 1):
                    self.assertEqual(
                        homTests[testIndex].find('aggregateResults').find('all').attrib[elm],
                        origHomTests[testIndex].find('aggregateResults').find('all').attrib[elm])
            # NOTE(review): the output is assumed to be removed after every
            # seeded run and the directory once per maf pair - confirm.
            os.remove(outputPath)
        shutil.rmtree(os.path.dirname(self.maf1path))
def testHalGeneratorFunctions(self):
    """Run all the CuTests, fail if any of them fail.

    Invokes cactus_halGeneratorTests through cactus_call, passing the current
    log level string as its only argument.
    """
    command = ["cactus_halGeneratorTests", getLogLevelString()]
    cactus_call(parameters=command)
def testCuTest(self):
    """Invoke referenceTests through cactus_call with the current log level string."""
    command = ["referenceTests", getLogLevelString()]
    cactus_call(parameters=command)
def getLogLevelString2(logLevelString):
    """Gets the log level string for the binary.

    Returns ``logLevelString`` unchanged when one is given; when it is None,
    falls back to the value of ``getLogLevelString()``.
    """
    # Identity check: only a genuinely missing value triggers the fallback.
    if logLevelString is None:
        return getLogLevelString()
    return logLevelString
def testCuTest(self):
    """Invoke stCafTests through cactus_call with the current log level string."""
    command = ["stCafTests", getLogLevelString()]
    cactus_call(parameters=command)
def testHalGeneratorFunctions(self):
    """Run all the CuTests, fail if any of them fail.

    Invokes cactus_halGeneratorTests through cactus_call, passing the current
    log level string as its only argument.
    """
    command = ["cactus_halGeneratorTests", getLogLevelString()]
    cactus_call(parameters=command)
def testCuTest(self):
    """Run the matchingAndOrderingTests binary with the current log level string."""
    logLevel = getLogLevelString()
    system("matchingAndOrderingTests %s" % logLevel)
def testAPI(self):
    """Run all the cactusAPI CuTests, fail if any of them fail.

    The cactusAPITests binary is started with the current log level string.
    """
    logLevel = getLogLevelString()
    system("cactusAPITests %s" % logLevel)
def run(self):
    """Sort the alignments in ``self.cigarFile`` in place.

    cactus_blast_sortAlignments writes the sorted alignments to a file in the
    local temp dir, which is then moved over the original cigar file.
    """
    tempResultsFile = os.path.join(self.getLocalTempDir(), "tempResults.cig")
    # The output path is a string, so it must be formatted with %s; the
    # previous %i raised a TypeError before the command could run.
    system("cactus_blast_sortAlignments %s %s %s" %
           (getLogLevelString(), self.cigarFile, tempResultsFile))
    logger.info("Sorted the alignments okay")
    system("mv %s %s" % (tempResultsFile, self.cigarFile))
def testSonLibCTests(self):
    """Run most of the sonLib CuTests, fail if any of them fail.

    The sonLibTests binary is started with the current log level string.
    """
    logLevel = getLogLevelString()
    system("sonLibTests %s" % logLevel)
def testHalGeneratorFunctions(self):
    """Run all the CuTests, fail if any of them fail.

    The cactus_halGeneratorTests binary is started with the current log level
    string.
    """
    logLevel = getLogLevelString()
    system("cactus_halGeneratorTests %s" % logLevel)
def testCuTest(self):
    """Run the stPinchesAndCactiTests binary with the current log level string."""
    logLevel = getLogLevelString()
    system("stPinchesAndCactiTests %s" % logLevel)
def test3Edge(self):
    """Run the 3-edge connected CuTests, fail if any of them fail.

    The 3EdgeTests binary is started with the current log level string.
    """
    logLevel = getLogLevelString()
    system("3EdgeTests %s" % logLevel)
def testPosetAlignerAPI(self):
    """Run all the cactus base aligner CuTests, fail if any of them fail.

    The cactus_barTests binary is started with the current log level string.
    """
    logLevel = getLogLevelString()
    system("cactus_barTests %s" % logLevel)
def testReferenceAndAsMedianAlgorithms(self):
    """Iterates through a list of simulation variants and prints results

    Every combination of the simulation parameters held on ``self`` is run
    ``self.replicates`` times. Each run builds a MedianHistory, permutes its
    leaf genomes, runs the reference median algorithm and reports one
    tab-separated line of distances. Lines are only printed when the log
    level is DEBUG or INFO. The AsMedian comparison is switched off, so its
    columns always read "n/a".
    """
    columnNames = (
        "elementNumber", "chromosomeNumber", "leafGenomeNumber",
        "operationNumber", "totalOperationNumber",
        "doInversion", "doShortInversion", "doDcj",
        "doTranslocation", "doShortTranslocation",
        "greedyIterations", "theta", "replicate",
        "medianDCJDistance", "medianOutOfOrderDistance",
        "weightedMedianOutOfOrderDistance",
        "medianDCJDistanceForReferenceAlgorithm",
        "medianOutOfOrderDistanceForReferenceAlgorithm",
        "weightedMedianOutOfOrderDistanceForReferenceAlgorithm",
        "dCJDistanceForReferenceAlgorithmFromMedian",
        "outOfOrderDistanceForReferenceAlgorithmFromMedian",
        "weightedOutOfOrderDistanceForReferenceAlgorithmFromMedian",
        "medianDCJDistanceForAsMedian",
        "medianOutOfOrderDistanceForAsMedian",
        "weightedMedianOutOfOrderDistanceForAsMedian",
        "dCJDistanceForAsMedianFromMedian",
        "outOfOrderDistanceForAsMedianFromMedian",
        "weightedOutOfOrderDistanceForAsMedianFromMedian",
        "medianGenomeForReferenceAlgorithm", "medianGenomeForAsMedian")

    def report(text):
        # Output is limited to the two verbose log levels. A parenthesised
        # single argument prints the same with the print statement.
        if getLogLevelString() in ("DEBUG", "INFO"):
            print(text)

    report("\t".join(columnNames))
    for elementNumber in self.elementNumbers:
        for chromosomeNumber in self.chromosomeNumbers:
            for leafGenomeNumber in self.leafGenomeNumbers:
                for operationNumber in self.operationNumber:
                    for doInversion, doShortInversion, doDcj, doTranslocation, doShortTranslocation in self.operationType:
                        operationFlags = (doInversion, doShortInversion, doDcj,
                                          doTranslocation, doShortTranslocation)
                        for greedyIterations in self.greedyIterations:
                            for theta in self.theta:
                                for replicate in xrange(self.replicates):
                                    medianHistory = MedianHistory(
                                        Genome(elementNumber=elementNumber,
                                               chromosomeNumber=chromosomeNumber),
                                        leafGenomeNumber=leafGenomeNumber)
                                    medianHistory.permuteLeafGenomes(
                                        operationNumber=operationNumber,
                                        doInversion=doInversion,
                                        doDcj=doDcj,
                                        doTranslocation=doTranslocation,
                                        doShortInversion=doShortInversion,
                                        doShortTranslocation=doShortTranslocation)
                                    # Distances for the history's own median genome
                                    trueMedianStats = (
                                        medianHistory.getMedianDcjDistance(
                                            medianHistory.getMedianGenome()),
                                        medianHistory.getMedianOutOfOrderDistance(
                                            medianHistory.getMedianGenome()),
                                        medianHistory.getWeightedMedianOutOfOrderDistance(
                                            medianHistory.getMedianGenome(), theta=theta))
                                    # Now run reference problem algorithm
                                    referenceGenome = runReferenceMedianProblemTest(
                                        medianHistory, greedyIterations, theta)
                                    referenceStats = (
                                        medianHistory.getMedianDcjDistance(referenceGenome),
                                        medianHistory.getMedianOutOfOrderDistance(referenceGenome),
                                        medianHistory.getWeightedMedianOutOfOrderDistance(
                                            referenceGenome, theta=theta),
                                        medianHistory.getMedianGenome().getCircularDcjDistance(
                                            referenceGenome),
                                        medianHistory.getMedianGenome().getOutOfOrderDistance(
                                            referenceGenome),
                                        medianHistory.getMedianGenome().getWeightedOutOfOrderDistance(
                                            referenceGenome, theta=theta))
                                    enabledOperations = [flag for flag in operationFlags
                                                         if flag == True]
                                    totalOperationNumber = operationNumber * len(enabledOperations)
                                    # Biomedian comparison turned off
                                    if False and leafGenomeNumber == 3 and doDcj == False and float(totalOperationNumber) / elementNumber <= 0.5:
                                        asMedianGenome = runAsMedianMedianProblemTest(medianHistory)
                                        asMedianStats = (
                                            medianHistory.getMedianDcjDistance(asMedianGenome),
                                            medianHistory.getMedianOutOfOrderDistance(asMedianGenome),
                                            medianHistory.getWeightedMedianOutOfOrderDistance(
                                                asMedianGenome, theta=theta),
                                            medianHistory.getMedianGenome().getCircularDcjDistance(
                                                asMedianGenome),
                                            medianHistory.getMedianGenome().getOutOfOrderDistance(
                                                asMedianGenome),
                                            medianHistory.getMedianGenome().getWeightedOutOfOrderDistance(
                                                asMedianGenome, theta=theta))
                                    else:
                                        asMedianGenome = "n/a"
                                        asMedianStats = ("n/a",) * 6
                                    # Now prepare line to print, in header column order
                                    fields = (elementNumber, chromosomeNumber, leafGenomeNumber,
                                              operationNumber, totalOperationNumber)
                                    fields += operationFlags
                                    fields += (greedyIterations, theta, replicate)
                                    fields += trueMedianStats + referenceStats + asMedianStats
                                    fields += ("'%s'" % str(referenceGenome),
                                               "'%s'" % str(asMedianGenome))
                                    # Print line
                                    report("\t".join([str(field) for field in fields]))
def testCPecanEmMultipleTrials(self):
    """Runs cPecanEm with multiple different trials.

    For each sequence pair: compute alignments, train with three random-start
    trials, check the XML model's maxLikelihood equals the loaded model's
    likelihood, recompute the alignments with the generated scoring matrix
    and finally run cPecanModifyHmm on the model.
    """
    realignOptions = "--diagonalExpansion=6 --splitMatrixBiggerThanThis=100"
    trials = 3
    for seqFile1, seqFile2 in seqFilePairGenerator():
        tempDir = getTempDirectory(rootDir=os.getcwd())
        jobTreeDir = os.path.join(tempDir, "jobTree")
        alignmentsFile = os.path.join(tempDir, "alignments.cigars")
        computeAlignments(seqFile1, seqFile2, alignmentsFile)
        logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))
        outputModelFile = os.path.join(tempDir, "outputModel.txt")
        outputModelXMLFile = os.path.join(tempDir, "outputModel.xml")
        outputBlastFile = os.path.join(tempDir, "outputBlast.txt")

        # Train the model, keeping the HMM produced by every trial.
        runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                    alignmentsFile=alignmentsFile,
                    outputModelFile=outputModelFile,
                    jobTreeDir=jobTreeDir,
                    trials=trials,
                    outputTrialHmms=True,
                    iterations=5,
                    randomStart=True,
                    logLevel=getLogLevelString(),
                    optionsToRealign=realignOptions,
                    outputXMLModelFile=outputModelXMLFile,
                    blastScoringMatrixFile=outputBlastFile)
        trialHmms = []
        for trialIndex in xrange(trials):
            trialHmms.append(Hmm.loadHmm(outputModelFile + ("_%i" % trialIndex)))
        hmm = Hmm.loadHmm(outputModelFile)
        node = ET.parse(outputModelXMLFile).getroot()
        trialLikelihoods = " ".join([str(trialHmm.likelihood) for trialHmm in trialHmms])
        logger.info("After multiple trials and iterations of EM the best likelihood found was %s, the likelihoods of the variants were: %s" %
                    (hmm.likelihood, trialLikelihoods))

        matchProbs, gapOpen, gapExtend = makeBlastScoringMatrix(hmm, ("ACTG",))
        logger.info("Gap open: %s, Gap extend: %s, Match probs %s" %
                    (gapOpen, gapExtend, " ".join([str(prob) for prob in matchProbs])))
        self.assertTrue(float(node.attrib["maxLikelihood"]) == hmm.likelihood)

        # Now use the blast file to compute a new matrix
        computeAlignments(seqFile1, seqFile2, alignmentsFile,
                          lastzArguments=("--ambiguous=iupac --scores=%s" % outputBlastFile))

        # Run modifyHmm to check it works; reloading both outputs is part of the check.
        system("cPecanModifyHmm %s %s --gcContent=0.5 --substitutionRate=0.05 --setFlatIndelEmissions" %
               (outputModelFile, outputModelFile))
        hmm = Hmm.loadHmm(outputModelFile)
        node = ET.parse(outputModelXMLFile).getroot()
        system("rm -rf %s" % tempDir)
def runDbTestScript(options, firstKey=0, keyNumber=0, addRecords=False, setRecords=False):
    """Build and run a dbTestScript command line.

    options: supplies the database conf (through getDatabaseConf) and the
        minRecordSize / maxRecordSize values.
    firstKey, keyNumber: passed as --firstKey and --keyNumber.
    addRecords, setRecords: when truthy, the matching --addRecords /
        --setRecords flag is included; otherwise its slot is left empty.
    """
    addRecordsFlag = "--addRecords" if addRecords else ""
    setRecordsFlag = "--setRecords" if setRecords else ""
    commandArgs = (getDatabaseConf(options), firstKey, keyNumber,
                   addRecordsFlag, setRecordsFlag,
                   options.minRecordSize, options.maxRecordSize,
                   getLogLevelString())
    command = "dbTestScript --databaseConf '%s' --firstKey %s --keyNumber %s %s %s --minRecordSize %s --maxRecordSize %s --logLevel %s" % commandArgs
    system(command)
def testCuTest(self):
    """Run the referenceTests binary with the current log level string."""
    logLevel = getLogLevelString()
    system("referenceTests %s" % logLevel)
def testCPecanLib(self):
    """Run all the cPecanLib CuTests, fail if any of them fail.

    The cPecanLibTests binary is started with the current log level string.
    """
    logLevel = getLogLevelString()
    system("cPecanLibTests %s" % logLevel)
def testCactusWorkflow_Blanchette(self):
    """Runs the workflow on blanchette's simulated (colinear) regions.

    The test returns immediately when SON_TRACE_DATASETS is not set in the
    environment. Otherwise, self.testNo times, it:

    1. reads the true alignment and its headers from the data set directory,
    2. picks a random window of 5000 columns and extracts the ungapped
       sequence of each of the first 9 rows,
    3. writes each sequence to its own FASTA file and runs the cactus
       workflow on them,
    4. if mfaToMaf, cactus_MAFGenerator, mafComparator and cactus_treeStats
       all respond to "--help", generates the true and predicted MAFs, the
       tree stats and the MAF comparison, printing each with "cat",
    5. cleans up the experiment database and the temporary directories.
    """
    if "SON_TRACE_DATASETS" not in os.environ:
        return

    sequenceNumber = 9
    alignmentLength = 5000
    newickTreeString = "((((HUMAN:0.006969, CHIMP:0.009727):0.025291, BABOON:0.044568):0.11,(RAT:0.072818, MOUSE:0.081244):0.260342):0.023260,((DOG:0.07, CAT:0.07):0.087381,(PIG:0.06, COW:0.06):0.104728):0.04);"
    # Optional command line tools; the comparison step needs all of them.
    comparisonTools = ("mfaToMaf", "cactus_MAFGenerator", "mafComparator",
                       "cactus_treeStats")

    for _ in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        trueAlignment = os.path.join(TestStatus.getPathToDataSets(),
                                     "blanchettesSimulation", "00.job",
                                     "true.mfa")

        # Load the true alignment.
        columnAlignment = list(fastaAlignmentRead(trueAlignment))
        fastaHeaders = list(fastaReadHeaders(trueAlignment))

        # Get random dir
        testDir = getTempDirectory(tempDir)

        # Pick a random window of columns from the true alignment.
        randomStart = random.choice(
            xrange(len(columnAlignment) - alignmentLength))
        subAlignment = columnAlignment[randomStart:
                                       randomStart + alignmentLength]
        logger.info("Got a sub alignment, it is %i columns long" %
                    len(subAlignment))

        # Rebuild each ungapped sequence by dropping the '-' characters.
        sequences = [
            (fastaHeaders[seqNo],
             "".join([column[seqNo] for column in subAlignment
                      if column[seqNo] != '-']))
            for seqNo in xrange(sequenceNumber)
        ]
        logger.info("Got the sequences")

        # Write sequences into temp files
        tempFastaFiles = []
        for seqNo, (header, sequence) in enumerate(sequences):
            logger.info("Making temp file for header: %s, seq: %s" %
                        (header, sequence))
            tempFastaFile = os.path.join(testDir, "%i.fa" % seqNo)
            tempFastaFiles.append(tempFastaFile)
            # "with" guarantees the handle is closed even if fastaWrite raises.
            with open(tempFastaFile, "w") as fileHandle:
                fastaWrite(fileHandle, header, sequence)
        logger.info("Got the temp sequence files")

        experiment = getCactusWorkflowExperimentForTest(
            tempFastaFiles, newickTreeString, testDir)
        experimentFile = os.path.join(testDir, "experiment.xml")
        experiment.writeXML(experimentFile)
        cactusDiskDatabaseString = experiment.getDiskDatabaseString()

        jobTree = os.path.join(testDir, "jobTree")
        runCactusWorkflow(experimentFile, jobTree)
        logger.info("Ran the the workflow")

        # Check the output alignment
        runJobTreeStatusAndFailIfNotComplete(jobTree)
        logger.info("Checked the job tree dir")

        # Only compare against the true alignment when every tool is present;
        # all() stops probing at the first tool that is missing.
        if all(os.system("%s --help > /dev/null 2>&1" % tool) == 0
               for tool in comparisonTools):
            # Output the 'TRUE' alignment file
            trueMFAFile = os.path.join(testDir, "true.mfa")
            fastaAlignmentWrite(subAlignment, fastaHeaders,
                                len(fastaHeaders), trueMFAFile)
            trueMAFFile = os.path.join(testDir, "true.maf")
            system("mfaToMaf --mfaFile %s --outputFile %s --logLevel %s" %
                   (trueMFAFile, trueMAFFile, getLogLevelString()))
            system("cat %s" % trueMAFFile)

            # Now get mafs for the region.
            mAFFile = os.path.join(testDir, "flower.maf")
            system("cactus_MAFGenerator --flowerName 0 --cactusDisk '%s' --outputFile %s --logLevel %s" %
                   (cactusDiskDatabaseString, mAFFile, getLogLevelString()))
            logger.info("Got the MAFs from the flower disk")
            system("cat %s" % mAFFile)

            statsFile = os.path.join(testDir, "stats.xml")
            system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s --logLevel %s" %
                   (cactusDiskDatabaseString, statsFile, getLogLevelString()))
            system("cat %s" % statsFile)
            logger.info("Got the cactus tree stats")

            # Now compare the mafs to the output.
            resultsFile = os.path.join(testDir, "results.xml")
            system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s --logLevel %s" %
                   (trueMAFFile, mAFFile, resultsFile, getLogLevelString()))
            logger.info("Ran the maf comparator")
            system("cat %s" % resultsFile)

        # Cleanup
        experiment.cleanupDb()
        system("rm -rf %s" % testDir)
        logger.info("Successfully ran test for the problem")
        system("rm -rf %s" % tempDir)
def testPosetAlignerAPI(self):
    """Runs the cactus base aligner CuTests (the cactus_barTests executable).

    The executable is started through cactus_call with the current log level
    as its only argument.
    """
    command = ["cactus_barTests", getLogLevelString()]
    cactus_call(parameters=command)
def testCuTest(self):
    """Runs the stCafTests executable at the current log level."""
    logLevel = getLogLevelString()
    command = "stCafTests %s" % logLevel
    system(command)
def testCPecanEm(self):
    """Runs cPecanEm and checks that further EM iterations raise the likelihood.

    For each model type and each sequence pair from seqFilePairGenerator(),
    a model is trained for one EM iteration, then training continues from
    that model for 5 more iterations. The test asserts that the likelihood
    after the extra iterations is strictly greater than after the first.
    """
    realignOptions = "--diagonalExpansion=6 --splitMatrixBiggerThanThis=100"
    modelTypes = ("fiveState", "fiveStateAsymmetric", "threeState",
                  "threeStateAsymmetric")
    furtherIterations = 5
    trial = 0
    for modelType in modelTypes:
        for seqFile1, seqFile2 in seqFilePairGenerator():
            workDir = getTempDirectory(rootDir=os.getcwd())
            jobTreeDir = os.path.join(workDir, "jobTree")
            cigarsPath = os.path.join(workDir, "alignments.cigars")
            modelPath = os.path.join(workDir, "outputModel.txt")

            computeAlignments(seqFile1, seqFile2, cigarsPath)
            logger.info("Computed alignments for seqs %s and %s" %
                        (seqFile1, seqFile2))

            # Baseline: build a model with a single EM iteration so its
            # likelihood can be compared with the final one.
            runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                        alignmentsFile=cigarsPath,
                        outputModelFile=modelPath,
                        modelType=modelType,
                        jobTreeDir=jobTreeDir,
                        iterations=1,
                        trials=1,
                        randomStart=False,
                        logLevel=getLogLevelString(),
                        setJukesCantorStartingEmissions=0.2,
                        trainEmissions=True,
                        tieEmissions=True,
                        optionsToRealign=realignOptions)
            baselineModel = Hmm.loadHmm(modelPath)
            # The second run reuses the same jobTree path, so clear it first.
            system("rm -rf %s" % jobTreeDir)
            logger.info(
                "For trial %s the likelihood after 1 iteration of EM is %s" %
                (trial, baselineModel.likelihood))

            # Continue training, starting from the model just written.
            runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                        alignmentsFile=cigarsPath,
                        outputModelFile=modelPath,
                        jobTreeDir=jobTreeDir,
                        optionsToRealign=realignOptions,
                        iterations=furtherIterations,
                        inputModelFile=modelPath,
                        logLevel=getLogLevelString(),
                        maxAlignmentLengthPerJob=10000)
            refinedModel = Hmm.loadHmm(modelPath)
            logger.info(
                "For trial %s the likelihood after a further %s iterations of EM is %s"
                % (trial, furtherIterations, refinedModel.likelihood))
            self.assertTrue(
                baselineModel.likelihood < refinedModel.likelihood)

            refinedModel.normalise()
            transitionsText = " ".join(
                [str(value) for value in refinedModel.transitions])
            logger.info("Final transitions: %s" % transitionsText)
            emissionsText = " ".join(
                [str(value) for value in refinedModel.emissions])
            logger.info("Final emissions: %s" % emissionsText)

            system("rm -rf %s" % workDir)
            trial += 1
def testAPI(self):
    """Runs the cactusAPI CuTests (the cactusAPITests executable).

    The executable is started through cactus_call with the current log level
    as its only argument.
    """
    command = ["cactusAPITests", getLogLevelString()]
    cactus_call(parameters=command)
def testReferenceAndAsMedianAlgorithms(self):
    """Sweeps the configured simulation variants and prints one row per run.

    Every combination of self.elementNumbers, self.chromosomeNumbers,
    self.leafGenomeNumbers, self.operationNumber, self.operationType,
    self.greedyIterations and self.theta is simulated self.replicates times.
    For each run a MedianHistory is built and permuted, the reference median
    algorithm is run, and the resulting distances are joined into a
    tab-separated row. The header and the rows are printed only when the log
    level is DEBUG or INFO. The AsMedian comparison is switched off, so its
    columns always contain "n/a".
    """
    columnNames = (
        "elementNumber", "chromosomeNumber", "leafGenomeNumber",
        "operationNumber", "totalOperationNumber", "doInversion",
        "doShortInversion", "doDcj", "doTranslocation",
        "doShortTranslocation", "greedyIterations", "theta", "replicate",
        "medianDCJDistance", "medianOutOfOrderDistance",
        "weightedMedianOutOfOrderDistance",
        "medianDCJDistanceForReferenceAlgorithm",
        "medianOutOfOrderDistanceForReferenceAlgorithm",
        "weightedMedianOutOfOrderDistanceForReferenceAlgorithm",
        "dCJDistanceForReferenceAlgorithmFromMedian",
        "outOfOrderDistanceForReferenceAlgorithmFromMedian",
        "weightedOutOfOrderDistanceForReferenceAlgorithmFromMedian",
        "medianDCJDistanceForAsMedian",
        "medianOutOfOrderDistanceForAsMedian",
        "weightedMedianOutOfOrderDistanceForAsMedian",
        "dCJDistanceForAsMedianFromMedian",
        "outOfOrderDistanceForAsMedianFromMedian",
        "weightedOutOfOrderDistanceForAsMedianFromMedian",
        "medianGenomeForReferenceAlgorithm", "medianGenomeForAsMedian")

    # Biomedian (AsMedian) comparison turned off.
    asMedianEnabled = False

    def emit(text):
        # Rows are only shown at the DEBUG and INFO log levels; the level is
        # looked up again for every row.
        if getLogLevelString() in ("DEBUG", "INFO"):
            print(text)

    emit("\t".join(columnNames))

    for elementNumber in self.elementNumbers:
        for chromosomeNumber in self.chromosomeNumbers:
            for leafGenomeNumber in self.leafGenomeNumbers:
                for operationNumber in self.operationNumber:
                    for (doInversion, doShortInversion, doDcj,
                         doTranslocation,
                         doShortTranslocation) in self.operationType:
                        for greedyIterations in self.greedyIterations:
                            for theta in self.theta:
                                for replicate in xrange(self.replicates):
                                    startGenome = Genome(
                                        elementNumber=elementNumber,
                                        chromosomeNumber=chromosomeNumber)
                                    history = MedianHistory(
                                        startGenome,
                                        leafGenomeNumber=leafGenomeNumber)
                                    history.permuteLeafGenomes(
                                        operationNumber=operationNumber,
                                        doInversion=doInversion,
                                        doDcj=doDcj,
                                        doTranslocation=doTranslocation,
                                        doShortInversion=doShortInversion,
                                        doShortTranslocation=doShortTranslocation)

                                    # Distances of the true median genome.
                                    trueDcj = history.getMedianDcjDistance(
                                        history.getMedianGenome())
                                    trueOutOfOrder = history.getMedianOutOfOrderDistance(
                                        history.getMedianGenome())
                                    trueWeighted = history.getWeightedMedianOutOfOrderDistance(
                                        history.getMedianGenome(),
                                        theta=theta)

                                    # Now run reference problem algorithm
                                    refGenome = runReferenceMedianProblemTest(
                                        history, greedyIterations, theta)
                                    refDcj = history.getMedianDcjDistance(
                                        refGenome)
                                    refOutOfOrder = history.getMedianOutOfOrderDistance(
                                        refGenome)
                                    refWeighted = history.getWeightedMedianOutOfOrderDistance(
                                        refGenome, theta=theta)
                                    refDcjFromMedian = history.getMedianGenome(
                                    ).getCircularDcjDistance(refGenome)
                                    refOutOfOrderFromMedian = history.getMedianGenome(
                                    ).getOutOfOrderDistance(refGenome)
                                    refWeightedFromMedian = history.getMedianGenome(
                                    ).getWeightedOutOfOrderDistance(
                                        refGenome, theta=theta)

                                    # Count the operation types that are on.
                                    enabledTypes = [
                                        flag for flag in
                                        (doInversion, doShortInversion,
                                         doDcj, doTranslocation,
                                         doShortTranslocation)
                                        if flag == True
                                    ]
                                    totalOperationNumber = operationNumber * len(
                                        enabledTypes)

                                    if asMedianEnabled and leafGenomeNumber == 3 and doDcj == False and float(
                                            totalOperationNumber
                                    ) / elementNumber <= 0.5:
                                        asGenome = runAsMedianMedianProblemTest(
                                            history)
                                        asDcj = history.getMedianDcjDistance(
                                            asGenome)
                                        asOutOfOrder = history.getMedianOutOfOrderDistance(
                                            asGenome)
                                        asWeighted = history.getWeightedMedianOutOfOrderDistance(
                                            asGenome, theta=theta)
                                        asDcjFromMedian = history.getMedianGenome(
                                        ).getCircularDcjDistance(asGenome)
                                        asOutOfOrderFromMedian = history.getMedianGenome(
                                        ).getOutOfOrderDistance(asGenome)
                                        asWeightedFromMedian = history.getMedianGenome(
                                        ).getWeightedOutOfOrderDistance(
                                            asGenome, theta=theta)
                                    else:
                                        asGenome = "n/a"
                                        asDcj = asOutOfOrder = asWeighted = "n/a"
                                        asDcjFromMedian = "n/a"
                                        asOutOfOrderFromMedian = "n/a"
                                        asWeightedFromMedian = "n/a"

                                    # Values in the same order as columnNames.
                                    rowValues = (
                                        elementNumber, chromosomeNumber,
                                        leafGenomeNumber, operationNumber,
                                        totalOperationNumber, doInversion,
                                        doShortInversion, doDcj,
                                        doTranslocation,
                                        doShortTranslocation,
                                        greedyIterations, theta, replicate,
                                        trueDcj, trueOutOfOrder,
                                        trueWeighted, refDcj,
                                        refOutOfOrder, refWeighted,
                                        refDcjFromMedian,
                                        refOutOfOrderFromMedian,
                                        refWeightedFromMedian, asDcj,
                                        asOutOfOrder, asWeighted,
                                        asDcjFromMedian,
                                        asOutOfOrderFromMedian,
                                        asWeightedFromMedian,
                                        "'%s'" % str(refGenome),
                                        "'%s'" % str(asGenome))
                                    emit("\t".join(map(str, rowValues)))