def testJobReadWriteAndDelete(self): jobDir = os.path.join(os.getcwd(), "testJobDir") os.mkdir(jobDir) #If directory already exists then the test will fail command = "by your command" memory = 2^32 cpu = 1 tryCount = 100 for i in xrange(10): startTime = time.time() for j in xrange(100): j = Job(command, memory, cpu, tryCount, jobDir) self.assertEquals(j.remainingRetryCount, tryCount) self.assertEquals(j.jobDir, jobDir) self.assertEquals(j.children, []) self.assertEquals(j.followOnCommands, [ (command, memory, cpu, 0)]) self.assertEquals(j.messages, []) j.write() j = Job.read(j.getJobFileName()) self.assertEquals(j.remainingRetryCount, tryCount) self.assertEquals(j.jobDir, jobDir) self.assertEquals(j.children, []) self.assertEquals(j.followOnCommands, [ (command, memory, cpu, 0)]) self.assertEquals(j.messages, []) self.assertTrue(os.path.exists(j.getJobFileName())) j.delete() self.assertTrue(not os.path.exists(j.getJobFileName())) print "It took %f seconds to load/unload jobs" % (time.time() - startTime) #We've just used it for benchmarking, so far #Would be good to extend this trivial test system("rm -rf %s" % jobDir)
def testJobUpdate(self): jobDir = os.path.join(os.getcwd(), "testJobDir") os.mkdir(jobDir) #If directory already exists then the test will fail command = "by your command" memory = 2^32 cpu = 1 tryCount = 100 for i in xrange(40): startTime = time.time() j = Job(command, memory, cpu, tryCount, jobDir) childNumber = random.choice(range(20)) for k in xrange(childNumber): j.children.append((command, memory, cpu)) self.assertEquals(len(j.children), childNumber) j.update(tryCount=tryCount, depth=0) j = Job.read(j.getJobFileName()) self.assertEquals(len(j.children) + len(j.followOnCommands), childNumber + 1) for childJobFile, memory, cpu in j.children: cJ = Job.read(childJobFile) self.assertEquals(cJ.remainingRetryCount, tryCount) #self.assertEquals(cJ.jobDir, os.path.split(cJ)[0]) self.assertEquals(cJ.children, []) self.assertEquals(cJ.followOnCommands, [ (command, memory, cpu, 0)]) self.assertEquals(cJ.messages, []) self.assertTrue(os.path.exists(cJ.getJobFileName())) cJ.delete() self.assertTrue(not os.path.exists(cJ.getJobFileName())) self.assertEquals(os.listdir(jobDir), [ "job" ]) j.delete() print "It took %f seconds to update jobs" % (time.time() - startTime) #We've just used it for benchmarking, so far system("rm -rf %s" % jobDir)
def testCPecanRealignSplitSequences(self):
    """Runs cPecanRealign, splitting indels longer than 100bp, and check
    that the coverage from the results is the same as the coverage from
    realigning with no arguments."""
    for seqFile1, seqFile2 in seqFilePairGenerator():
        # Drop the lastz command since it's not needed. But this
        # is still convenient to use the same parameters as all
        # the other tests
        realignCommand, _ = getCommands(seqFile1, seqFile2)
        splitRealignCommand = realignCommand + " --splitIndelsLongerThanThis 100"
        realignOutput = getTempFile()
        splitRealignOutput = getTempFile()
        # Redirect each realign run's stdout into its own temp file.
        realignCommand += " > %s" % realignOutput
        splitRealignCommand += " > %s" % splitRealignOutput
        system(realignCommand)
        system(splitRealignCommand)
        # Check coverage on seqFile1
        #The following will fail until we refactor.
        splitRealignCoverage = popenCatch("cactus_coverage %s %s" % (seqFile1, splitRealignOutput))
        realignCoverage = popenCatch("cactus_coverage %s %s" % (seqFile1, realignOutput))
        self.assertTrue(splitRealignCoverage == realignCoverage)
        # Check coverage on seqFile2
        splitRealignCoverage = popenCatch("cactus_coverage %s %s" % (seqFile2, splitRealignOutput))
        realignCoverage = popenCatch("cactus_coverage %s %s" % (seqFile2, realignOutput))
        self.assertTrue(splitRealignCoverage == realignCoverage)
        os.remove(realignOutput)
        os.remove(splitRealignOutput)
def run(self):
    """Enumerate candidate recombination events for this clone and spawn a
    child job per D-segment event; results are aggregated by the
    CloneEventsAgg follow-on."""
    self.logToMaster("Getting recomb. events for clone %s ..." % self.clone)
    # Deletion ranges: upper bound leaves 3 nt of each segment (len - 3);
    # lower bound comes from the helper functions.
    max_vdel = len(self.vseq) - 3
    min_vdel = find_min_vdel(self.vseq, self.aaseq)
    max_jdel = len(self.jseq) - 3
    min_jdel = find_min_jdel(self.jseq, self.aaseq)
    self.logToMaster("Vdel: <%d-%d>" % (min_vdel, max_vdel))
    self.logToMaster("Jdel: <%d-%d>" % (min_jdel, max_jdel))
    for d, dseq in self.d2seq.iteritems():
        devents = find_devents(dseq, self.aaseq)
        self.logToMaster("%d number of devents" % (len(devents)))
        # DEBUG
        #numempty = 0
        #for devent in devents:
        #    if devent.cdr3aa_dstart == -1:
        #        numempty += 1
        #self.logToMaster("\t%d empty D, %d non_empty_D\n" % (numempty, len(devents) - numempty))
        # END DEBUG
        for i, devent in enumerate(devents):
            outdir = os.path.join(self.outdir, d, str(i)) #outdir/clone/d/i
            system("mkdir -p %s" % outdir)
            if devent.cdr3aa_dstart == -1:
                # cdr3aa_dstart == -1 marks an event where D contributes
                # nothing; only V-J insertions are enumerated for it.
                dempty_file = os.path.join(outdir, "d_empty")
                self.addChildTarget(Get_Vjins(self.clone, self.vseq, min_vdel, max_vdel, self.jseq, min_jdel, max_jdel, d, devent, self.aaseq, dempty_file))
            else:
                self.addChildTarget(Get_Vd_Dj_Ins(self.clone, self.vseq, min_vdel, max_vdel, self.jseq, min_jdel, max_jdel, d, dseq, devent, self.aaseq, outdir))
    self.setFollowOnTarget(CloneEventsAgg(self.outdir))
def run(self):
    """Invoke cnvPlot.py on copyNumberStats.xml when it is present; no-op
    otherwise. --filteredSamples is only passed when non-empty."""
    statsFile = os.path.join(self.indir, "copyNumberStats.xml")
    if not os.path.exists(statsFile):
        return
    pieces = ["cnvPlot.py %s --outdir %s " % (statsFile, self.outdir)]
    if self.filteredSamples != "":
        pieces.append(" --filteredSamples %s" % (self.filteredSamples))
    system("".join(pieces))
def runComparisonOfBlastScriptVsNaiveBlast(self, blastMode):
    """We compare the output with a naive run of the blast program, to check
    the results are nearly equivalent.

    blastMode selects how runCactusBlast is invoked: "allAgainstAll" passes
    both sequences together, anything else uses targetSequenceFiles.
    """
    encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ] #, 2) ] #Could go to six
    species = ("human", "mouse", "dog") #Other species to try "rat", "monodelphis", "macaque", "chimp"
    # Compare every unordered pair of species within each encode region.
    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        for i in xrange(len(species)):
            species1 = species[i]
            for species2 in species[i+1:]:
                seqFile1 = os.path.join(regionPath, "%s.%s.fa" % (species1, encodeRegion))
                seqFile2 = os.path.join(regionPath, "%s.%s.fa" % (species2, encodeRegion))
                #Run the naive blast to produce the reference alignment
                runNaiveBlast(seqFile1, seqFile2, self.tempOutputFile)
                logger.info("Ran the naive blast okay")
                #Run the blast script under test
                jobTreeDir = os.path.join(getTempDirectory(self.tempDir), "jobTree")
                if blastMode == "allAgainstAll":
                    runCactusBlast([ seqFile1, seqFile2 ], self.tempOutputFile2, jobTreeDir,
                                   chunkSize=500000, overlapSize=10000)
                else:
                    runCactusBlast([ seqFile1 ], self.tempOutputFile2, jobTreeDir,
                                   chunkSize=500000, overlapSize=10000,
                                   targetSequenceFiles=[ seqFile2 ])
                runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
                system("rm -rf %s " % jobTreeDir)
                logger.info("Ran cactus_blast okay")
                logger.critical("Comparing cactus_blast and naive blast; using mode: %s" % blastMode)
                compareResultsFile(self.tempOutputFile, self.tempOutputFile2)
def trimGenome(sequenceFile, coverageFile, outputFile, complement=False, flanking=0, minSize=1, windowSize=10, threshold=1, depth=None):
    """Run cactus_trimSequences.py over sequenceFile/coverageFile, writing
    the trimmed output to outputFile.

    The keyword arguments are forwarded as command-line flags via
    nameValue(); defaults mirror the script's own.
    """
    flags = (nameValue("complement", complement, valueType=bool),
             nameValue("flanking", flanking),
             nameValue("minSize", minSize),
             nameValue("windowSize", windowSize),
             nameValue("threshold", threshold),
             nameValue("depth", depth))
    system("cactus_trimSequences.py %s %s %s %s %s %s %s %s > %s"
           % (flags + (sequenceFile, coverageFile, outputFile)))
def run(self): newmodfile = "%s-modified" %self.modfile #modify small branch lengths (change all the xxxe-1y to xxxe-10) system("sed 's/e-1./e-08/g' %s > %s" %(self.modfile, newmodfile)) #get conservation bigwig and liftover files: cmd = "halTreePhyloP.py %s %s %s --bigWig --numProc %d" %(self.halfile, newmodfile, self.outdir, self.numproc) system(cmd)
def extractOutput(workDir, outputHalFile, options):
    """Copy the final alignment out of the work directory.

    Moves the root ancestor's maf to options.outputMaf (when requested) and
    exports the alignment to HAL via cactus2hal.py, appending progress
    markers and tool output to cactus.log.
    """
    if options.outputMaf is not None:
        # Locate the root maf inside the progressive alignment directory.
        mcProj = MultiCactusProject()
        mcProj.readXML( os.path.join(workDir, ProjectWrapper.alignmentDirName,
                                     ProjectWrapper.alignmentDirName + "_project.xml"))
        rootName = mcProj.mcTree.getRootName()
        rootPath = os.path.join(workDir, ProjectWrapper.alignmentDirName,
                                rootName, rootName + '.maf')
        cmd = 'mv %s %s' % (rootPath, options.outputMaf)
        system(cmd)
    envFile = getEnvFilePath()
    logFile = os.path.join(workDir, 'cactus.log')
    pjPath = os.path.join(workDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    logHandle = open(logFile, "a")
    logHandle.write("\n\n%s: Beginning HAL Export\n\n" % str(
        datetime.datetime.now()))
    logHandle.close()
    # Source the environment file so cactus2hal.py runs with the project's
    # paths; stdout and stderr are both appended to the log.
    cmd = '. %s && cactus2hal.py %s %s >> %s 2>&1' % (envFile, pjPath,
                                                      outputHalFile, logFile)
    system(cmd)
    logHandle = open(logFile, "a")
    logHandle.write("\n%s: Finished HAL Export \n" % str(
        datetime.datetime.now()))
    logHandle.close()
def runCactus(workDir, jtCommands, jtPath, options):
    """Run the progressive cactus alignment, logging progress to cactus.log.

    When options.overwrite is set the old log is removed and --overwrite is
    forwarded to cactus_progressive.py. A JobStatusMonitor watches the
    jobTree; it is started (as a daemon) only for the kyoto_tycoon backend.
    """
    envFile = getEnvFilePath()
    pjPath = os.path.join(workDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    logFile = os.path.join(workDir, 'cactus.log')
    if options.overwrite:
        overwriteFlag = '--overwrite'
        system("rm -f %s" % logFile)
    else:
        overwriteFlag = ''
    logHandle = open(logFile, "a")
    logHandle.write("\n%s: Beginning Progressive Cactus Alignment\n\n" % str(
        datetime.datetime.now()))
    logHandle.close()
    # Source the environment file, then append all tool output to the log.
    cmd = '. %s && cactus_progressive.py %s %s %s >> %s 2>&1' % (envFile,
        jtCommands, pjPath, overwriteFlag, logFile)
    jtMonitor = JobStatusMonitor(jtPath, pjPath, logFile,
                                 deadlockCallbackFn=abortFunction(jtPath, options))
    if options.database == "kyoto_tycoon":
        jtMonitor.daemon = True
        jtMonitor.start()
    system(cmd)
    logHandle = open(logFile, "a")
    logHandle.write("\n%s: Finished Progressive Cactus Alignment\n" % str(
        datetime.datetime.now()))
    logHandle.close()
def checkOptions(parser, args, options):
    """Validate the input directory option and ensure the output directory
    exists (creating it when missing).

    `parser` and `args` are part of the call interface but are not used.
    Raises InputOptionError when the input directory is unset or absent.
    """
    indir = options.indir
    if not indir:
        raise InputOptionError("Input directory is required. None was given.\n")
    if not os.path.exists(indir):
        raise InputOptionError("Input directory %s does not exist\n" % indir)
    if not os.path.exists(options.outdir):
        system("mkdir -p %s" % options.outdir)
def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised."""
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), ingroups)
    outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    toilDir = os.path.join(self.tempDir, "tmp_toil")
    outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for outgroup in outgroups]
    ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for ingroup in ingroups]
    runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths, outgroups=outgroupPaths,
                                       alignmentsFile=self.tempOutputFile,
                                       outgroupFragmentPaths=outgroupFragmentPaths,
                                       ingroupCoveragePaths=ingroupCoveragePaths,
                                       toilDir=toilDir)
    for i, ingroupPath in enumerate(ingroupPaths):
        # Get the coverage from the outgroups independently and
        # check that it's the same as the file in
        # ingroupCoverageDir
        # NOTE(review): otherIngroupPath is computed but never used below.
        otherIngroupPath = ingroupPaths[1] if i == 0 else ingroupPaths[0]
        # To filter out alignments from the other ingroup and
        # self-alignments we need to create a fasta with all the
        # outgroup fragments in it.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupFragmentPath in outgroupFragmentPaths:
            system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        calculateCoverage(fromGenome=outgroupsCombined, sequenceFile=ingroupPath,
                          cigarFile=self.tempOutputFile, outputFile=independentCoverageFile)
        # find the coverage file cactus_blast kept (should be
        # named according to the basename of the ingroup path
        # file)
        keptCoverageFile = ingroupCoveragePaths[i]
        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
def testScriptTree_Example2(self):
    """Tests that the global and local temp dirs of a job behave as expected. """
    for _ in xrange(self.testNo):
        system("scriptTreeTest_Wrapper2.py --jobTree %s --logLevel=INFO --retryCount=0"
               % self.jobTreeDir)
        runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def run(self):
    """Copy input bams to local disk, merge and sort them with samtools,
    copy the sorted result to outdir, then schedule the Snp follow-on.

    Inputs that no longer exist on disk are silently skipped (HACK below).
    """
    localTempDir = self.getLocalTempDir()
    i = 0
    localfiles = []
    for f in self.files:
        if not os.path.exists(f): #HACK
            continue
        # Unique local name per input: <basename-prefix><index>.bam
        localname = os.path.join(localTempDir, "%s%d.bam" %(os.path.basename(f).split('.')[0], i))
        system("scp -C %s %s" %(f, localname))
        localfiles.append(localname)
        i += 1
    mergeFile = os.path.join(localTempDir, "merge.bam")
    if len(localfiles) == 1:
        # Single input: nothing to merge, just rename.
        system("mv %s %s" %(localfiles[0], mergeFile))
    else:
        bamStr = " ".join(localfiles)
        logger.info("Merging bams...\n")
        mergeCmd = "samtools merge %s %s" %(mergeFile, bamStr)
        system( mergeCmd )
    # samtools sort takes an output *prefix* and appends ".bam" itself.
    sortPrefix = os.path.join(localTempDir, "mergeSorted")
    sortCmp = "samtools sort %s %s" %( mergeFile, sortPrefix )
    system( sortCmp )
    system( "cp %s.bam %s" %(sortPrefix, self.outdir) )
    #Get Snps info:
    self.setFollowOnTarget( Snp(self.outdir, self.options) )
def run(self):
    """Spawn one down-sampling child job per sample, then either normalize
    or go straight to the analyses follow-on."""
    self.logToMaster("DownSampling\n")
    opts = self.options
    # NOTE(review): global_dir is fetched but unused since the output moved
    # to opts.outdir (see the commented-out line below).
    global_dir = self.getGlobalTempDir()
    #sampling_dir = os.path.join(global_dir, "down_sampling")
    sampling_dir = os.path.join(opts.outdir, "down_sampling")
    system("mkdir -p %s" % sampling_dir)
    for sam in os.listdir(self.sampledir):
        samdir = os.path.join(self.sampledir, sam)
        # Each sample directory holds a gzipped pickle named after the sample.
        sample = pickle.load(gzip.open(os.path.join(samdir, sam), "rb"))
        out_samdir = os.path.join(sampling_dir, sam)
        system("mkdir -p %s" % out_samdir)
        if opts.sampling_uniq:  # sampling uniq clones
            self.addChildTarget(libsample.SampleAnalysis0(sample, samdir, out_samdir,
                                libsample.sampling, opts.sampling_uniq, 'uniq'))
        elif opts.sampling_top:  # sampling reads, then report top clones
            self.addChildTarget(libsample.SampleAnalysis0(sample, samdir, out_samdir,
                                libsample.sampling, opts.sampling, "top", opts.sampling_top))
        else:  # sampling reads
            self.addChildTarget(libsample.SampleAnalysis0(sample, samdir, out_samdir,
                                libsample.sampling, opts.sampling))
    if opts.normalize:
        self.setFollowOnTarget(Normalize(sampling_dir, opts))
    else:
        self.setFollowOnTarget(Analyses(sampling_dir, opts))
def run(self):
    """Lift self.bedfile from the query genome to the target genome with
    halLiftover (PSL output), then compute and print the liftover status."""
    opts = self.opts
    system("halLiftover --outPSL --tab %s %s %s %s %s"
           % (opts.halfile, opts.query, self.bedfile, opts.target, self.liftfile))
    #system("cp %s %s_liftoverpsl" % (self.liftfile, self.opts.outfile))
    liftStatus = get_liftover_status(self.bedfile, self.liftfile, opts.edge)
    print_status(liftStatus, self.statusfile)
def testScriptTree_Example(self):
    """Uses the jobTreeTest code to test the scriptTree Target wrapper. """
    wrapperCmd = "scriptTreeTest_Wrapper.py --jobTree %s --logLevel=INFO --retryCount=10" % self.jobTreeDir
    for _ in xrange(self.testNo):
        system(wrapperCmd)
        runJobTreeStatusAndFailIfNotComplete(self.jobTreeDir)
def getRandomConfigFile():
    """Write a cactus config with randomised caf parameters to a temp xml
    file and return its path. Caller is responsible for cleanup."""
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1
    # Randomise the annealing schedule: 1-10 rounds, each valued 1-10.
    annealingRounds = 1 + int(random.random() * 10)
    cafNode.attrib["annealingRounds"] = " ".join([ str(1 + int(random.random() * 10)) for i in xrange(annealingRounds) ])
    # Deannealing rounds are a sorted set of distinct random values.
    deannealingRounds = list(set([ 1 + int(random.random() * 10) for i in xrange(int(random.random() * 10)) ]))
    deannealingRounds.sort()
    cafNode.attrib["deannealingRounds"] = " ".join([ str(i) for i in deannealingRounds ])
    # One trim value per annealing round.
    cafNode.attrib["trim"] = " ".join([ str(1 + int(random.random() * 5)) for i in xrange(annealingRounds) ])
    cafNode.attrib["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)
    cafNode.attrib["minimumTreeCoverage"] = str(random.random())
    cafNode.attrib["blockTrim"] = str(int(random.random() * 5))
    cafNode.attrib["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(random.choice([0, 1]))
    cafNode.attrib["minimumBlockDegree"] = str(random.choice([0, 5]))
    # Always run the check phase; limit normalisation to two iterations.
    checkNode = config.find("check")
    checkNode.attrib["runCheck"] = "1"
    checkNode = config.find("normal")
    checkNode.attrib["iterations"] = "2"
    #Now print the file..
    fileHandle = open(tempConfigFile, 'w')
    ET.ElementTree(config).write(fileHandle)
    fileHandle.close()
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
def run(self):
    """Compare predicted alignments against the Blanchette truth set with
    mafComparator, merging results across repeats into one
    mafComparison.xml in the output directory."""
    previousOutputFile = None
    # NOTE(review): previousOutputFile2 is assigned but never used.
    previousOutputFile2 = None
    blanchettePath = os.path.join(TestStatus.getPathToDataSets(), "blanchettesSimulation")
    for i in xrange(self.options.blanchetteRepeats):
        # Convert this repeat's true mfa alignment to maf.
        trueAlignmentMFA = os.path.join(os.path.join(blanchettePath, "%.2i.job" % i), "true.mfa")
        trueAlignmentMAF = os.path.join(self.getLocalTempDir(), "temp.maf")
        treeFile = os.path.join(blanchettePath, "tree.newick")
        system("mfaToMaf --mfaFile %s --outputFile %s --treeFile %s" % (trueAlignmentMFA, trueAlignmentMAF, treeFile))
        # Rename the maf's sequences to match the experiment's naming.
        trueRenamedMAF = trueAlignmentMAF + ".renamed"
        expPath = os.path.join(self.outputDir, str(i), "experiment.xml")
        applyNamingToMaf(expPath, trueAlignmentMAF, trueRenamedMAF)
        trueAlignmentMAF = trueRenamedMAF
        if self.params.vanilla == False:
            predictedAlignmentMaf = os.path.join(self.outputDir, str(i), "progressiveCactusAlignment", "Anc0", "Anc0.maf")
        else:
            predictedAlignmentMaf = os.path.join(self.outputDir, str(i), "cactusVanilla.maf")
        outputFile = os.path.join(self.getLocalTempDir(), "temp%i" % i)
        system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s" % (trueAlignmentMAF, predictedAlignmentMaf, outputFile))
        system("cp %s %s" % (outputFile, os.path.join(self.outputDir, str(i), "mafComparison.xml")))
        # Fold this repeat's results into the running aggregate.
        if previousOutputFile != None:
            system("mergeMafComparatorResults.py --results1 %s --results2 %s --outputFile %s" % (outputFile, previousOutputFile, outputFile))
        previousOutputFile = outputFile
    # NOTE(review): with blanchetteRepeats == 0 this runs "mv None ..."
    # since previousOutputFile is still None -- confirm repeats is always > 0.
    system("mv %s %s" % (previousOutputFile, os.path.join(self.outputDir, "mafComparison.xml")))
def run(self):
    """Filter a sample's clones by size, split them by productive status,
    save each group as a gzipped pickle, then clean up the input file."""
    # filter by size
    starttime = time.time()
    opts = self.opts
    clones = pickle.load(gzip.open(self.samplefile, 'rb'))
    # Only run the size filter when at least one bound was actually set.
    if (opts.mincount > 1 or opts.maxcount > 0 or opts.minfreq > 0 or opts.maxfreq > 0):
        clones = filter_by_size(clones, opts.mincount, opts.maxcount, opts.minfreq, opts.maxfreq)
    msg = ("Filter_by_size for file %s done in %.4f s" % (self.samplefile, time.time() - starttime))
    logger.info(msg)
    starttime = time.time()
    # filter by status: True -> saved under "productive",
    # False -> saved under "non_productive"
    pclones = filter_by_status(clones, True)
    npclones = filter_by_status(clones, False)
    filename = os.path.basename(self.samplefile)
    if pclones:
        pdir = os.path.join(self.outdir, "productive", self.name)
        system("mkdir -p %s" % pdir)
        pfile = os.path.join(pdir, filename)
        pickle.dump(pclones, gzip.open(pfile, "wb"))
    if npclones:
        npdir = os.path.join(self.outdir, "non_productive", self.name)
        system("mkdir -p %s" % npdir)
        npfile = os.path.join(npdir, filename)
        pickle.dump(npclones, gzip.open(npfile, "wb"))
    msg = ("Filter_by_status for file %s done in %.4f s" % (self.samplefile, time.time() - starttime))
    logger.info(msg)
    # The unfiltered input file is no longer needed.
    self.setFollowOnTarget(libcommon.CleanupFile(self.samplefile))
def runCactusProgressive(inputDir, jobTreeDir,
                         logLevel=None, retryCount=0,
                         batchSystem="single_machine",
                         rescueJobFrequency=None,
                         skipAlignments=False,
                         buildHal=None,
                         buildFasta=None,
                         buildAvgs=False,
                         jobTreeStats=False,
                         maxThreads=None,
                         maxCpus=None,
                         defaultMemory=None,
                         recursive=None,
                         logFile=None,
                         event=None,
                         extraJobTreeArgumentsString="",
                         profileFile=None):
    """Run cactus_progressive.py on inputDir, forwarding the jobTree options.

    When profileFile is given the script is instead run under cProfile
    (resolved from the cactus bin directory) and the profile written there.
    """
    # _fn assembles the shared jobTree option string; the literal None fills
    # one positional slot of _fn -- confirm against _fn's signature.
    command = ("cactus_progressive.py %s" % inputDir) + " " + \
        _fn(jobTreeDir, logLevel, retryCount, batchSystem, rescueJobFrequency,
            skipAlignments, buildAvgs, None, buildHal, buildFasta, jobTreeStats,
            maxThreads, maxCpus, defaultMemory, logFile,
            extraJobTreeArgumentsString=extraJobTreeArgumentsString) + \
        (" %s %s" % (nameValue("recursive", recursive, bool),
                     nameValue("event", event)))
    if profileFile != None:
        command = "python -m cProfile -o %s %s/bin/%s" % (profileFile, cactusRootPath(), command)
    system(command)
    logger.info("Ran the cactus progressive okay")
def runCactusBlast(sequenceFiles, outputFile, jobTreeDir,
                   chunkSize=None, overlapSize=None,
                   logLevel=None, blastString=None,
                   selfBlastString=None, compressFiles=None,
                   lastzMemory=None, targetSequenceFiles=None):
    """Run cactus_blast.py on sequenceFiles, writing cigars to outputFile.

    Optional keyword arguments are rendered into command-line flags via
    nameValue() (presumably yielding an empty string when the value is
    None, so unset flags are omitted -- confirm nameValue's behaviour).
    """
    logLevel = getLogLevelString2(logLevel)
    chunkSize = nameValue("chunkSize", chunkSize, int)
    overlapSize = nameValue("overlapSize", overlapSize, int)
    blastString = nameValue("blastString", blastString, str)
    selfBlastString = nameValue("selfBlastString", selfBlastString, str)
    compressFiles = nameValue("compressFiles", compressFiles, bool)
    lastzMemory = nameValue("lastzMemory", lastzMemory, int)
    # Multiple target files are joined into one quoted flag value.
    if targetSequenceFiles != None:
        targetSequenceFiles = " ".join(targetSequenceFiles)
    targetSequenceFiles = nameValue("targetSequenceFiles", targetSequenceFiles, quotes=True)
    command = "cactus_blast.py %s --cigars %s %s %s %s %s %s %s %s --jobTree %s --logLevel %s" % \
        (" ".join(sequenceFiles), outputFile, chunkSize, overlapSize, blastString,
         selfBlastString, compressFiles, lastzMemory, targetSequenceFiles,
         jobTreeDir, logLevel)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the cactus_blast command okay")
def split_fasta(input_fasta, work_dir):
    """Split a multi-record fasta into per-record files under work_dir/out
    using faSplit, returning the list of produced file paths."""
    splitDir = os.path.join(work_dir, "out") + '/'
    os.mkdir(splitDir)
    splitCmd = "faSplit byname {input} {out_root}".format(input=input_fasta,
                                                          out_root=splitDir)
    system(splitCmd)
    return glob(os.path.join(work_dir, "out/*"))
def run(self):
    """Enumerate V-J insertion candidates for a clone whose D segment is
    empty, batching them into Get_Vjins_Batch child jobs; results are
    aggregated by the Get_Vd_Dj_Ins_Agg3 follow-on."""
    #self.logToMaster("Get_Vjins")
    tempdir = "%s_tempdir" % os.path.splitext(self.outfile)[0]
    system("mkdir -p %s" % tempdir)
    model = pickle.load(gzip.open(self.modelfile, 'rb'))
    # The clone id appears to encode "<v>_<cdr3aa>_<j>" (see the companion
    # aggregator, which also reads items[1] as the cdr3 aa sequence).
    items = self.clone.split('_')
    v = items[0]
    j = items[2]
    batchsize = 100000
    for vdel in self.vdels:
        # Apply the V-side deletion; v_hang is the leftover nt count mod 3.
        v_cdr3_nt = self.v_nt if vdel == 0 else self.v_nt[: -1 * vdel]
        v_hang = len(v_cdr3_nt) % 3
        for jdel in self.jdels:
            j_cdr3_nt = self.j_nt if jdel == 0 else self.j_nt[jdel: ]
            d_nts = self.devent.left_nts + self.devent.right_nts
            vjins_nts = get_vjins_emptyd(self.v_nt, vdel, self.j_nt, jdel, d_nts, self.cdr3_aa)
            if vjins_nts is None:
                continue
            #self.logToMaster("Empty D: vdel: %d, jdel: %d, vjins: %d\n" % (vdel, jdel, len(vjins_nts)))
            # Split the candidate insertions into batches of <= batchsize.
            numbatches = len(vjins_nts) / batchsize
            if len(vjins_nts) % batchsize > 0:
                numbatches += 1
            for index in xrange(numbatches):
                outfile = os.path.join(tempdir, "%d_%d_%d" % (vdel, jdel, index))
                endindex = min(len(vjins_nts), (index + 1) * batchsize)
                batch_vjins_nts = vjins_nts[index * batchsize: endindex]
                self.addChildTarget(Get_Vjins_Batch(batch_vjins_nts, v, v_hang, v_cdr3_nt, j, j_cdr3_nt, self.d, d_nts, self.cdr3_aa, vdel, jdel, self.devent.d5del, self.devent.d3del, model, outfile))
    self.setFollowOnTarget(Get_Vd_Dj_Ins_Agg3(tempdir, self.outfile))
def run(self):
    """Concatenate per-batch clone pickles from self.indir into self.outfile.

    Any pre-existing output file is removed first so the appends below
    start from a clean slate.
    """
    if os.path.exists(self.outfile):
        # Fixed bug: the original was system("rm -f" % self.outfile) -- a
        # format string with no %s placeholder, which raises TypeError.
        system("rm -f %s" % self.outfile)
    for batch in os.listdir(self.indir):
        batchfile = os.path.join(self.indir, batch)
        clones = pickle.load(gzip.open(batchfile, "rb"))
        write_clones(self.outfile, clones, True)
def run(self):
    """Fan out insertion-enumeration child jobs over every V- and
    J-deletion count, then aggregate via Get_Vd_Dj_Ins_Agg."""
    #self.logToMaster("Get_Vd_Dj_Ins")
    model = pickle.load(gzip.open(self.modelfile, "rb"))
    # Keep only the non-negative insertion lengths present in the model.
    ins_vds = []
    for vd in model.ins_vd.keys():
        if vd >= 0:
            ins_vds.append(vd)
    ins_djs = []
    for dj in model.ins_dj.keys():
        if dj >= 0:
            ins_djs.append(dj)
    # One child per V-deletion count, writing to outdir/vdels/<vdel>.
    vdir = os.path.join(self.outdir, "vdels")
    system("mkdir -p %s" % vdir)
    for vdel in self.vdels:
        voutfile = os.path.join(vdir, str(vdel))
        self.addChildTarget(Get_Ins(get_vdins_events, vdel, self.v_nt, self.devent, self.cdr3_aa, voutfile, ins_vds))
    # Likewise per J-deletion count, under outdir/jdels/<jdel>.
    jdir = os.path.join(self.outdir, 'jdels')
    system("mkdir -p %s" % jdir)
    for jdel in self.jdels:
        joutfile = os.path.join(jdir, str(jdel))
        self.addChildTarget(Get_Ins(get_djins_events, jdel, self.j_nt, self.devent, self.cdr3_aa, joutfile, ins_djs))
    outfile = os.path.join(self.outdir, "events")
    self.setFollowOnTarget(Get_Vd_Dj_Ins_Agg(self.clone, vdir, jdir, self.v_nt, self.j_nt, self.d, self.d_nt, self.devent, outfile, self.modelfile))
def run(self):
    """Load the model and likelihood tables, spawn a length-count child job
    per sample directory, then compute likelihoods in the follow-on."""
    system("mkdir -p %s" % self.outdir)
    clone2sams = read_clone_file(self.clone_file, True)
    # self.model is either a directory of models (use the median model) or
    # a single gzipped pickle.
    if os.path.isdir(self.model):
        model = rcommon.get_median_model(self.model)
    else:
        model = pickle.load(gzip.open(self.model, "rb"))
    # NOTE(review): sam2total is read but unused below.
    sam2total, group2sams = read_clonesize(self.numclone_file)
    len2llh = read_llh(self.lenllh, intkey=True)
    clone2llh = read_llh(self.clonellh)
    global_dir = self.getGlobalTempDir()
    lencount_dir = os.path.join(global_dir, "sam2len2count")
    system("mkdir -p %s" % lencount_dir)
    # One child per sample directory: tally length counts into its own file.
    for s in os.listdir(self.db_dir):
        samdir = os.path.join(self.db_dir, s)
        lencount_file = os.path.join(lencount_dir, s)
        self.addChildTarget(GetLencount(samdir, lencount_file))
    self.setFollowOnTarget(
        GetLlhs(
            clone2sams,
            self.outdir,
            model,
            lencount_dir,
            group2sams,
            self.ingroup,
            self.outgroup,
            len2llh,
            clone2llh,
        )
    )
def test_bedParsing(self):
    """ mafComparator should parse a bed file and use the intervals for testing
    """
    for maf1, maf2, bed, totalTrue, totalTrueInInterval in self.knownValues:
        if not os.path.exists('tempTestFiles'):
            os.mkdir('tempTestFiles')
        # Write the two maf fixtures and the bed file for this case.
        f = open(self.maf1path, 'w')
        f.write('%s%s%s' % (self.header, maf1, self.footer))
        f.close()
        f = open(self.maf2path, 'w')
        f.write('%s%s%s' % (self.header, maf2, self.footer))
        f.close()
        f = open(self.bedpath, 'w')
        f.write('%s' % bed)
        f.close()
        # Build the mafComparator command; --bedFiles only when a bed exists.
        cmd = ['mafComparator']
        cmd.append('--mafFile1=%s' % self.maf1path)
        cmd.append('--mafFile2=%s' % self.maf2path)
        cmd.append('--outputFile=%s' % os.path.join('tempTestFiles', 'output.xml'))
        if bed != '':
            cmd.append('--bedFiles=%s' % os.path.join('tempTestFiles', 'bed.bed'))
        cmd.append('--sampleNumber=1000 --logLevel %s' % getLogLevelString())
        system(" ".join(cmd))
        # Check the aggregate results in the produced xml.
        tree = ET.parse(os.path.join('tempTestFiles', 'output.xml'))
        homTests = tree.findall('homologyTests')
        self.assertAlmostEquals(totalTrue, float(homTests[0].find('aggregateResults').find('all').attrib['totalTrue']))
        if totalTrueInInterval is None:
            # Without a bed there must be no per-interval ("A") aggregate.
            self.assertEqual(None, homTests[0].find('aggregateResults').find('A'))
        else:
            self.assertAlmostEquals(totalTrueInInterval, float(homTests[0].find('aggregateResults').find('A').attrib['totalTrue']))
        shutil.rmtree(os.path.dirname(self.maf1path))
def realignSamFile(self):
    """Chains and then realigns the resulting global alignments. """
    # Work on a scratch copy so the realign child can rewrite the original.
    scratchSam = os.path.join(self.getGlobalTempDir(), "temp.sam")
    system("cp %s %s" % (self.outputSamFile, scratchSam))
    childArgs = (scratchSam, self.outputSamFile, self.readFastqFile,
                 self.referenceFastaFile, self.options)
    self.addChildTargetFn(realignSamFileTargetFn, args=childArgs)
def run(self):
    """Pair every (vdel, jdel) batch file and spawn an aggregation child per
    pair; Get_Vd_Dj_Ins_Agg3 merges everything into self.outfile."""
    #self.logToMaster("Get_Vd_Dj_Ins_Agg")
    tempdir = "%s_tempdir" % os.path.splitext(self.outfile)[0]
    system("mkdir -p %s" % tempdir)
    # The clone id appears to encode "<v>_<cdr3aa>_<j>".
    items = self.clone.split('_')
    v = items[0]
    cdr3_aa = items[1]
    j = items[2]
    # D sequence after trimming the 5' and 3' deletions.
    if self.devent.d3del == 0:
        d_cdr3_nt = self.d_nt[self.devent.d5del: ]
    else:
        d_cdr3_nt = self.d_nt[self.devent.d5del: -1 * self.devent.d3del]
    for vdelname in os.listdir(self.vdir):
        vfile = os.path.join(self.vdir, vdelname)
        # Batch file names start with the deletion count: "<vdel>_batch...".
        vdel = int(vdelname.split("_batch")[0])
        v_cdr3_nt = self.v_nt if vdel == 0 else self.v_nt[: -1 * vdel]
        for jdelname in os.listdir(self.jdir):
            jfile = os.path.join(self.jdir, jdelname)
            jdel = int(jdelname.split("_batch")[0])
            j_cdr3_nt = self.j_nt if jdel == 0 else self.j_nt[jdel: ]
            outfile = os.path.join(tempdir, "%s_%s" % (vdelname, jdelname))
            self.addChildTarget(Get_Vd_Dj_Ins_Agg2(v, vdel, vfile, v_cdr3_nt, j, jdel, jfile, j_cdr3_nt, self.d, self.devent.d5del, self.devent.d3del, d_cdr3_nt, cdr3_aa, self.modelfile, outfile))
    self.setFollowOnTarget(Get_Vd_Dj_Ins_Agg3(tempdir, self.outfile))
##Record time to run baseRuntime = runNaiveBlast(seqFile1, seqFile2, tempOutputFile, lastzOptions="--ambiguous=iupac,100 --ydrop=3000") results1 = loadResults(tempOutputFile) logger.info("Loaded first results") for setting in settings: #Run the blast ##Record time to run runtime = runNaiveBlast(seqFile1, seqFile2, tempOutputFile2, lastzOptions=setting) #Now compare the results results2 = loadResults(tempOutputFile2) logger.info("Loaded second results") def fm(f): return "%.5f" % float(f) def fm2(f): return str(int(f)) resultsComparator = ResultComparator(results1, results2) print((",".join([ species1, species2, "_".join(("_".join(setting.split())).split(",")), fm(distance), fm(resultsComparator.sensitivity), fm(resultsComparator.specificity), fm2(resultsComparator.intersectionSize), fm2(resultsComparator.unionSize), fm2(resultsComparator.trueDifference), fm2(resultsComparator.predictedDifference), fm2(resultsComparator.trueHits), fm2(resultsComparator.predictedHits), fm2(resultsComparator.trueHits -resultsComparator.predictedHits), fm(baseRuntime), fm(runtime) ]))) system("rm -rf %s" % tempDir)
def runHalCons(halPath, outputPath):
    """Run halCons on halPath, redirecting its stdout to outputPath.

    Fixed bug: the original command was "halCons %s > outputPath", which
    wrote to a literal file named "outputPath" instead of substituting the
    outputPath parameter.
    """
    system("halCons %s > %s" % (halPath, outputPath))
def runHalGen(preset, seed, hdf5Chunk, hdf5Compression, outPath): system("halRandGen --preset %s --seed %d --hdf5Chunk %d\ --hdf5Compression %d %s" % (preset, seed, hdf5Chunk, hdf5Compression, outPath))
def tearDown(self):
    """Remove the per-test temp directory and all registered temp files."""
    unittest.TestCase.tearDown(self)
    system("rm -rf %s" % self.tempDir)
    for tempFile in self.tempFiles:
        os.remove(tempFile)
def run(self):
    """Archive the genemap result files into the extra-info directory, then
    prune the remaining per-chunk outputs."""
    system("mv %s/%s-all.xml %s" % (self.outdir, "genemapHomolog", self.extraInfoDir))
    system("rm -f %s/%s-*.xml" % (self.outdir, "genemapHomolog"))
    system("mv %s/%s-all.txt %s" % (self.outdir, "genemapHomolog", self.extraInfoDir))
    # NOTE(review): this glob is "%s*.txt" (no dash) unlike the xml case
    # above ("%s-*.xml") -- confirm whether the dash was dropped on purpose.
    system("rm -f %s/%s*.txt" % (self.outdir, "genemapHomolog"))
    system("mv %s/%s-*.xml %s" % (self.outdir, "genemapChain", self.extraInfoDir))
    system("mv %s/%s %s" % (self.outdir, "gene2chain", self.extraInfoDir))
def checkHalTree(halfile, outdir, options):
    """Extract the newick tree from the hal file, parse it with Bio.Phylo,
    and stash both the tree file path and the parsed tree on options."""
    treefile = os.path.join(outdir, "haltree.nw")
    system("halStats --tree %s > %s" % (halfile, treefile))
    tree = Phylo.read(treefile, "newick")
    options.treeFile = treefile
    options.tree = tree
def run(self):
    """Delete every *bed file inside this clade's directory."""
    cleanupCmd = "rm %s/*bed" % self.cladedir
    system(cleanupCmd)
def dless(target, split_ss_path, gff_path, model):
    """ Main function for running dless. Strips all headers out of final gff.

    `target` is part of the jobTree target-function interface but unused.
    """
    cmd = 'dless {} {} | sed "/^#/ d" > {}'.format(split_ss_path, model, gff_path)
    system(cmd)
def runJobTreeStatusAndFailIfNotComplete(jobTreeDir):
    """Check jobTreeDir with jobTreeStatus, failing if any job is unfinished."""
    system("jobTreeStatus --jobTree %s --failIfNotComplete --verbose" % jobTreeDir)
def runJobTreeStats(jobTree, outputFile):
    """Run jobTreeStats over jobTree, writing the stats report to outputFile."""
    statsCmd = "jobTreeStats --jobTree %s --outputFile %s" % (jobTree, outputFile)
    system(statsCmd)
    logger.info("Ran the job-tree stats command apparently okay")
def getChromSizesFromHal(halfile, genome, outfile):
    """Write the chromosome sizes of `genome` in `halfile` to `outfile`."""
    sizesCmd = "halStats --chromSizes %s %s > %s" % (genome, halfile, outfile)
    system(sizesCmd)
def run(self):
    """Stage config and sequences to local disk, run the cactus workflow for
    this region, archive the results, then launch the genemap chain and
    homolog analyses as child jobs."""
    #localTempDir = getTempFile(rootDir=self.getGlobalTempDir())
    localTempDir = self.getLocalTempDir()
    config = os.path.join(localTempDir, "cactus_workflow_config.xml")
    system("cp %s %s" % (self.config, config)) #Copy the config file to local disk
    #Copy sequences to localTempDir:
    localSeqdir = os.path.join(localTempDir, "data")
    system("mkdir -p %s" % localSeqdir)
    for spc in self.species.split():
        currseqdir = os.path.join(self.seqdir, spc)
        system("cp -r %s %s" % (currseqdir, localSeqdir))
    #Make dir for this region if not already existed
    #system("rm -fR %s" %self.region)
    system("mkdir -p %s" % os.path.join(os.getcwd(), self.region))
    #Write experiment.xml for this region:
    experimentFile = os.path.join(localTempDir, "experiment.xml")
    writeExpCommand = "cactus_writeExperimentXml.py --species \"%s\" --tree \"%s\" --output %s --sequenceDir %s --config %s --databaseString %s"\
        %(self.species, self.tree, experimentFile, localSeqdir, config, self.dbStr)
    system("%s" % writeExpCommand)
    # Keep a copy of the experiment file alongside the region's results.
    system("cp %s %s" % (experimentFile, os.path.join(os.getcwd(), self.region, "experiment.xml")))
    logger.info("Got experiment.xml file for %s with command: %s\n" % (self.region, writeExpCommand))
    #Now ready to runCactus:
    batchSystem = "singleMachine"
    jobTree = os.path.join(localTempDir, "jobTree")
    cactusCommand = "cactus_workflow.py --stats --batchSystem %s --experiment %s --buildReference --setupAndBuildAlignments --logDebug --jobTree %s" \
        %(batchSystem, experimentFile, jobTree)
    logger.info("Going to run cactus now, the command is %s" % cactusCommand)
    system("%s" % cactusCommand)
    # Archive the jobTree for post-mortem inspection.
    system("cp -r %s %s" % (jobTree, os.path.join(os.getcwd(), self.region, "jobTree")))
    logger.info("Done cactusRun for %s\n" % self.region)
    #Run genemapChain and genemapHomolog analyses as children:
    self.addChildTarget( RunGenemapChain(self.region, self.dbStr, self.options.outdir, self.options.refSpecies, self.genedir))
    self.addChildTarget( RunGenemapHomolog(self.region, self.dbStr, self.options.outdir, self.options.refSpecies, self.genedir))
def runWorkflow_TestScript(testId, sequences, newickTreeString,
                           outputDir=None,
                           batchSystem="single_machine",
                           buildAvgs=False,
                           buildHal=False,
                           buildFasta=False,
                           configFile=None,
                           buildToilStats=False,
                           constraints=None,
                           progressive=False,
                           cactusWorkflowFunction=runCactusWorkflow,
                           logLevel=None):
    """Runs the workflow and various downstream utilities.

    The testId parameter is used to allocate a unique port so that tests can
    run in parallel.

    Requires outputDir to be supplied (despite the None default); the toil
    dir and experiment file created under it are removed before returning.
    Returns the experiment object so the calling function can clean up.
    """
    logger.info("Running cactus workflow test script")
    logger.info("Got the following sequence dirs/files: %s" % " ".join(sequences))
    logger.info("Got the following tree %s" % newickTreeString)
    #Setup the output dir.
    #Fix: identity test ('is not None') instead of '!= None' for the None check.
    assert outputDir is not None
    logger.info("Using the output dir: %s" % outputDir)
    #Setup the flower disk.
    experiment = getCactusWorkflowExperimentForTest(testId, sequences, newickTreeString,
                                                    outputDir=outputDir,
                                                    configFile=configFile,
                                                    constraints=constraints,
                                                    progressive=progressive)
    experimentFile = os.path.join(outputDir, "experiment.xml")
    experiment.writeXML(experimentFile)
    logger.info("The experiment file %s\n" % experimentFile)
    #Setup the job tree dir.
    toilDir = os.path.join(outputDir, "toil")
    logger.info("Got a job tree dir for the test: %s" % toilDir)
    #Run the actual workflow
    cactusWorkflowFunction(experimentFile, toilDir,
                           batchSystem=batchSystem,
                           buildAvgs=buildAvgs,
                           buildHal=buildHal,
                           buildFasta=buildFasta,
                           toilStats=buildToilStats,
                           logLevel=logLevel)
    #Fix: log message had a duplicated word ("the the").
    logger.info("Ran the workflow")
    #Now run various utilities..
    if buildToilStats:
        toilStatsFile = os.path.join(outputDir, "toilStats.xml")
        runToilStats(toilDir, toilStatsFile)
    #Now remove everything we generate
    system("rm -rf %s %s" % (toilDir, experimentFile))
    #Return so calling function can cleanup
    return experiment
def testSonLibCTests(self):
    """Run most of the sonLib CuTests, fail if any of them fail.
    """
    # sonLibTests exits non-zero on failure; system() is expected to raise then.
    system("sonLibTests %s" % getLogLevelString())
def testMaf(self):
    """Run the halMaf CuTests, fail if any of them fail.
    """
    system("halMafTests")
def run(self):
    """Start the output file with a tab-separated header row, then append
    the contents of every file under the input directory to it.
    """
    with open(self.outfile, 'w') as headerFh:
        headerFh.write("#Name\tLength\tMap\tIns\tDels\tOO\tInframe\n")
    # Shell glob concatenates all per-input result files after the header.
    system("cat %s/* >> %s" % (self.indir, self.outfile))
def tearDown(self):
    """Run the standard unittest teardown, then delete the test's temp directory."""
    unittest.TestCase.tearDown(self)
    cleanupCmd = "rm -rf %s" % self.tempDir
    system(cleanupCmd)
def run(self):
    """Aggregate the per-region genemap outputs: merge the homolog XML/txt
    files, map genes to chains, compare against multiz, and emit the LaTeX
    summary tables; finally schedule a Cleanup follow-on.

    NOTE(review): each stage is a shell pipeline whose inputs are the outputs
    of the previous one, so statement order matters throughout.
    """
    regions = getList(self.options.regions)
    genemapChainXmls = [] #list of all genemapChain output Xmls
    genemapHomologXmls = [] #list of all genemapHomology output Xmls
    for r in regions:
        genemapChainXmls.append( os.path.join(self.output, "%s-%s.xml" % ("genemapChain", r)))
        genemapHomologXmls.append( os.path.join(self.output, "%s-%s.xml" % ("genemapHomolog", r)))
    #Directory of more details information if interested
    extraInfoDir = os.path.join(self.output, "extraInfo")
    system("mkdir -p %s" % extraInfoDir)
    system("chmod ug+xrw %s" % extraInfoDir)
    #Merge homologXmls of all regions:
    allHomologXml = "%s/%s-all.xml" % (self.output, "genemapHomolog")
    mergeXmls(genemapHomologXmls, allHomologXml)
    #Concatenate the per-region homolog text files (shell glob) into one file:
    genemapHomolog = "%s/%s-*.txt" % (self.output, "genemapHomolog")
    allHomolog = "%s/%s-all.txt" % (self.output, "genemapHomolog")
    system("rm -f %s" % allHomolog) #remove any stale merged file first
    system("cat %s > %s" % (genemapHomolog, allHomolog))
    #geneToChain = "%s/%s" %(extraInfoDir, "gene2chain")
    geneToChain = "%s/%s" % (self.output, "gene2chain")
    genemapChainCommand = "genemapChain.py -o %s -c \"%s\" -i \"%s\" > %s" %(extraInfoDir, "cat",\
        " ".join(genemapChainXmls), geneToChain)
    system("%s" % genemapChainCommand)
    chainMergeHomolog = "%s/%s" % (extraInfoDir, "chainMergeHomolog")
    chainMergeHomologTex = "%s/%s" % (self.output, "chainVsDup.tex")
    #chainMergeHomologTex = chainMergeHomolog + ".tex"
    missedGenes = "%s/%s" % (extraInfoDir, "missedGenes")
    genemapMergeCommand = "genemapMerge.py -f c -n %s %s %s %s %s > %s" %(self.options.runName, \
        allHomolog, geneToChain, chainMergeHomolog, chainMergeHomologTex, missedGenes)
    system("%s" % genemapMergeCommand)
    #Compare the cactus homologies against the multiz alignment:
    homologCmp = "%s/%s" % (self.output, "homologCmp")
    homologCmpTex = "%s/%s" % (self.output, "homologCmp.tex")
    homologCmpV = "%s/%s" % (extraInfoDir, "homologCmpV")
    cactusVsMultizCommand = "genemapCactusVsMultiz.py -a %s -d %s %s %s %s > %s" %(extraInfoDir + "/perSpcDiff", \
        self.options.geneDir + "/all.tx", self.options.multiz, allHomologXml, homologCmp, homologCmpV)
    system("%s" % cactusVsMultizCommand)
    makeLatexTabCommand = "genemapMakeLatexTab.py -s \"%s\" -n %s %s %s" \
        %(self.species, self.options.runName, homologCmp, homologCmpTex)
    system("%s" % makeLatexTabCommand)
    #Cleanup now...
    self.setFollowOnTarget(Cleanup(self.output, extraInfoDir))
def runEvalMFAToMAF(mfa, maf):
    """Convert the multi-FASTA alignment *mfa* into MAF format at *maf*
    using the mfaToMaf tool.
    """
    system("mfaToMaf -b %s -d %s --logLevel DEBUG" % (mfa, maf))
    logger.info("Converted MFA %s to MAF %s\n" % (mfa, maf))
def run(self):
    """Delete the temporary directory so the next run starts from a clean slate."""
    removeCmd = "rm -rf %s" % self.dir
    system(removeCmd)
    logger.info("Clean up tempDir for next run\n")
def runEvalMAFComparator(mafFile1, mafFile2, outputFile, sampleNumber):
    """Compare two MAF files with mafComparator, sampling *sampleNumber*
    positions, and write the XML report to *outputFile*.
    """
    system("mafComparator -b %s -c %s -d %s -e %s" % (
        mafFile1, mafFile2, outputFile, sampleNumber))
    logger.info("Compared MAF %s with MAF %s\n" % (mafFile1, mafFile2))
def run(self):
    """Lift the query's bed records over to the target genome, then compile
    the result into bigBed format.
    """
    liftedBed = os.path.join(self.cladedir, "%s.bed" % self.target)
    system("halLiftover %s %s %s %s %s" % (
        self.halfile, self.query, self.queryBed, self.target, liftedBed))
    #Convert to big bed:
    liftedBigBed = os.path.join(self.cladedir, "%s.bb" % self.target)
    system("bedToBigBed %s %s %s" % (liftedBed, self.chrsizefile, liftedBigBed))
def run(self):
    """Place (or link) the hal file in the hub directory, then write the
    hub's genomes.txt file, scheduling a trackDb-writing child target per
    genome along the way.

    NOTE(review): the genomes.txt stanza layout follows the UCSC assembly-hub
    format (genome / twoBitPath / trackDb / groups / htmlPath / ...).
    """
    options = self.options
    localHalfile = os.path.join(self.outdir, os.path.basename(self.halfile))
    #Only copy/link when the hal file is not already at the hub location:
    if os.path.abspath(localHalfile) != os.path.abspath(self.halfile):
        if os.path.exists(localHalfile):
            system("rm %s" % localHalfile)
        if options.cpHal:
            system("cp %s %s" % (os.path.abspath(self.halfile), localHalfile))
        else:
            system("ln -s %s %s" % (os.path.abspath(self.halfile), localHalfile))
    #Create lod files if useLod is specified
    lodtxtfile, loddir = getLod(options, localHalfile, self.outdir)
    #Get the maximum window size to display SNPs
    if lodtxtfile:
        snpwidth = getLodLowestLevel(lodtxtfile) - 1
        if snpwidth > -1:
            options.snpwidth = snpwidth
    genomes = sortByProperName(self.genomes, self.options.properName)
    #Create documentation files:
    docdir = os.path.join(self.outdir, "documentation")
    system("mkdir -p %s" % docdir)
    writeDocFiles(docdir, self.options)
    #Create genomes.txt file
    filename = os.path.join(self.outdir, "genomes.txt")
    f = open(filename, 'w')
    #for genome in self.genomes:
    for genome in genomes:
        genomedir = os.path.join(self.outdir, genome)
        f.write("genome %s\n" % genome)
        f.write("twoBitPath %s/%s.2bit\n" % (genome, genome))
        #create trackDb for the current genome:
        #The big-data URL points at the lod text file when LOD was built,
        #otherwise at the hal file itself.
        if lodtxtfile == '':
            self.addChildTarget( WriteTrackDbFile(self.genomes, "../%s" % os.path.basename(self.halfile), genomedir, options))
        else:
            self.addChildTarget( WriteTrackDbFile(self.genomes, "../%s" % os.path.basename(lodtxtfile), genomedir, options))
        f.write("trackDb %s/trackDb.txt\n" % genome)
        #other info
        f.write("groups groups.txt\n")
        writeDescriptionFile(genome, genomedir)
        f.write("htmlPath %s/description.html\n" % genome)
        f.write("description %s\n" % getProperName(genome, self.options.properName))
        f.write("organism %s\n" % getProperName(genome, self.options.properName))
        f.write("orderKey 4800\n")
        f.write("scientificName %s\n" % genome)
        #Default browser position: start of the genome's longest sequence.
        seq2len = self.genome2seq2len[genome]
        (seq, l) = getLongestSeq(seq2len)
        f.write("defaultPos %s:1-%d\n" % (seq, min(l, 1000)))
        f.write("\n")
    f.close()
def tearDown(self):
    """Remove every registered temp file that still exists, run the standard
    teardown, then delete the temp directory wholesale.
    """
    for path in self.tempFiles:
        if os.path.exists(path):
            os.remove(path)
    unittest.TestCase.tearDown(self)
    system("rm -rf %s" % self.tempDir)
def run(self):
    """Schedule child targets that build all browser tracks (GC content,
    alignability, conservation, bed/bed2/wig annotations, clade-exclusive
    regions), then set the genomes.txt writer as the follow-on target.
    """
    #GC content & Alignability
    for genome in self.genomes:
        genomedir = os.path.join(self.outdir, genome)
        if self.options.gcContent:
            self.addChildTarget(GetGCpercent( genomedir, genome )) #genomedir/genome.gc.bw
        if self.options.alignability:
            self.addChildTarget( GetAlignability( genomedir, genome, self.halfile )) #genomedir/genome.alignability.bw
    #Compute conservation track:
    if self.options.conservation:
        #if self.options.conservation or self.options.conservationDir:
        conservationDir = os.path.join(self.outdir, "conservation")
        if not self.options.conservationDir:
            #No precomputed conservation supplied -- compute it fresh:
            system("mkdir -p %s" % conservationDir)
            self.addChildTarget( GetConservationFiles(self.halfile, conservationDir, self.options))
        else:
            #Reuse precomputed conservation files via a symlink:
            if os.path.abspath(self.options.conservationDir) != os.path.abspath(conservationDir):
                system("ln -s %s %s" % (os.path.abspath(self.options.conservationDir), conservationDir))
                #system("cp -r %s %s" %(self.options.conservationDir, conservationDir))
    #Make bed tracks:
    preprocessAnnotationInputs(self.options, self.outdir, "bed")
    self.addChildTarget( MakeAnnotationTracks(self.options, self.outdir, self.halfile, self.genome2seq2len, "bed"))
    #Make bed2 tracks:
    preprocessAnnotationInputs(self.options, self.outdir, "bed2")
    self.addChildTarget( MakeAnnotationTracks(self.options, self.outdir, self.halfile, self.genome2seq2len, "bed2"))
    #Make wig tracks:
    preprocessAnnotationInputs(self.options, self.outdir, "wig")
    self.addChildTarget( MakeAnnotationTracks(self.options, self.outdir, self.halfile, self.genome2seq2len, "wig"))
    #Make clade-exclusive tracks:
    if self.options.tree and self.options.cladeExclusive:
        self.addChildTarget( GetCladeExclusiveRegions( self.halfile, self.options.tree, os.path.join(self.outdir, "liftoverbeds"), self.options.maxOut, self.options.minIn))
        self.options.bigbeddirs.append( os.path.join(self.outdir, "liftoverbeds", "CladeExclusive"))
    #Get LOD if needed, and Write trackDb files
    self.setFollowOnTarget( WriteGenomesFile(self.genomes, self.genome2seq2len,
        self.halfile, self.options, self.outdir))
def testCuTest(self):
    """Run the matchingAndOrdering CuTest binary at the current log level;
    a non-zero exit fails the test.
    """
    cmd = "matchingAndOrderingTests %s" % getLogLevelString()
    system(cmd)
def testMarginStats(self):
    """Run marginStats on the first test alignment, requesting every
    per-read summary statistic.
    """
    statFlags = ("--readIdentity --alignmentIdentity --mismatchesPerAlignedBase "
                 "--readCoverage --deletionsPerReadBase --insertionsPerReadBase "
                 "--printValuePerReadAlignment")
    system("%s %s %s %s %s" % (self.marginStats, self.inputSamFile1,
                               self.readFastqFile1, self.referenceFastaFile1,
                               statFlags))
def tearDown(self):
    """Standard teardown plus removal of every file/dir the test generated."""
    unittest.TestCase.tearDown(self)
    # Clean up the sam/hmm/vcf outputs and the jobTree directory in one pass.
    generated = (self.outputSamFile, self.outputHmmFile,
                 self.outputVcfFile, self.jobTree)
    system("rm -rf %s %s %s %s" % generated)
def linkTwoBitSeqFile(genome, twobitdir, outdir):
    """Symlink <twobitdir>/<genome>.2bit into *outdir*, unless a file of
    that name already exists at the destination.
    """
    basename = "%s.2bit" % genome
    linkPath = os.path.join(outdir, basename)
    sourcePath = os.path.abspath(os.path.join(twobitdir, basename))
    if not os.path.exists(linkPath):
        system("ln -s %s %s" % (sourcePath, linkPath))
def testProgressiveOutgroupsVsAllOutgroups(self):
    """Tests the difference in outgroup coverage on an ingroup when
    running in "ingroups vs. outgroups" mode and "set against set"
    mode.
    """
    encodeRegion = "ENm001"
    ingroup = "human"
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
    #NOTE(review): relies on Python 2 map() returning a list -- the result is
    #passed to both blast runs below.
    outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
    # Run in "set against set" mode, aligning the entire ingroup
    # vs each outgroup
    runCactusBlast([ingroupPath], alignmentsFile=self.tempOutputFile,
                   toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                   chunkSize=500000, overlapSize=10000,
                   targetSequenceFiles=outgroupPaths)
    # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
    # the outgroups in order, trimming away sequence that's
    # already been aligned.
    runCactusBlastIngroupsAndOutgroups([ingroupPath], outgroupPaths,
                                       alignmentsFile=self.tempOutputFile2,
                                       toilDir=os.path.join(self.tempDir, "outgroupToil"))
    # Get the coverage on the ingroup, in bases, from each run.
    # (awk sums bed interval lengths: end - start per line.)
    coverageSetVsSetUnfiltered = getTempFile(rootDir=self.tempDir)
    calculateCoverage(sequenceFile=ingroupPath, cigarFile=self.tempOutputFile,
                      outputFile=coverageSetVsSetUnfiltered)
    coverageSetVsSet = int(popenCatch(
        "cat %s | awk '{ total += $3 - $2} END { print total }'" % coverageSetVsSetUnfiltered))
    coverageIngroupVsOutgroupsUnfiltered = getTempFile(rootDir=self.tempDir)
    calculateCoverage(sequenceFile=ingroupPath, cigarFile=self.tempOutputFile2,
                      outputFile=coverageIngroupVsOutgroupsUnfiltered)
    coverageIngroupVsOutgroups = int(popenCatch(
        "cat %s | awk '{ total += $3 - $2} END { print total }'" % coverageIngroupVsOutgroupsUnfiltered))
    print "total coverage on human (set vs set mode, %d outgroups): %d" % (
        len(outgroups), coverageSetVsSet)
    print "total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (
        len(outgroups), coverageIngroupVsOutgroups)
    # Make sure we're getting a reasonable fraction of the
    # alignments when using the trimming strategy.
    self.assertTrue(float(coverageIngroupVsOutgroups) / coverageSetVsSet >= 0.95)
    # Get the coverage on the ingroup, in bases, from just the
    # last outgroup. Obviously this should be much higher in set
    # vs set mode than in ingroup vs outgroup mode.
    outgroupAlignments = getTempFile(rootDir=self.tempDir)
    #grep for the last outgroup's name to isolate its alignment records:
    system("grep %s %s > %s" % (outgroups[-1], self.tempOutputFile, outgroupAlignments))
    coverageFileSetVsSet = getTempFile(rootDir=self.tempDir)
    calculateCoverage(sequenceFile=ingroupPath, cigarFile=outgroupAlignments,
                      outputFile=coverageFileSetVsSet)
    coverageFromLastOutgroupSetVsSet = int(popenCatch(
        "cat %s | awk '{ total += $3 - $2} END { print total }'" % coverageFileSetVsSet))
    outgroupAlignments = getTempFile(rootDir=self.tempDir)
    system("grep %s %s > %s" % (outgroups[-1], self.tempOutputFile2, outgroupAlignments))
    coverageFileInVsOut = getTempFile(rootDir=self.tempDir)
    calculateCoverage(sequenceFile=ingroupPath, cigarFile=outgroupAlignments,
                      outputFile=coverageFileInVsOut)
    coverageFromLastOutgroupInVsOut = int(popenCatch(
        "cat %s | awk '{ total += $3 - $2} END { print total }'" % coverageFileInVsOut))
    print "total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (
        outgroups[-1], coverageFromLastOutgroupSetVsSet)
    print "total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (
        outgroups[-1], coverageFromLastOutgroupInVsOut)
    #Trimming should have removed most of the last outgroup's redundant coverage:
    self.assertTrue(float(coverageFromLastOutgroupInVsOut) / coverageFromLastOutgroupSetVsSet <= 0.10)