def getAlignStats(self):
    """Open output files of alignment jobs and report on statistics. """
    # Bracket the stats stage with pipeline-report and status.xml entries.
    self.varsP.updatePipeReport(
        "Starting AlignModule Align Stats stage for %s\n" % self.stageName, printalso=True)
    util.LogStatus("progress", "stage_start", "%s_stats" % self.stageName)

    # Reference length in Mb: from the reference cmap when aligning to a
    # reference, otherwise the assembly total (MapClassesRev stores
    # totAssemblyLenMb).
    if self.doref:
        ref_len_mb = mc.multiCmap(self.varsP.ref, lengthonly=True).totalLength / 1e6
    else:
        ref_len_mb = self.varsP.totAssemblyLenMb

    # Note: this calls the module-level getAlignStats helper, not this method.
    getAlignStats(self.varsP, self.outFileList, ref_len_mb,
                  isref=self.doref, mergepath=self.mergedir)
    mergeMap(self.varsP, self.outFileList, mergepath=self.mergedir)

    split_mode = 2 if self.doref else 0  # see mergeRcmaps
    merge_stage = self.varsP.alignMolvrefName if self.doref else ""
    mergeRcmaps(self.outFileList, self.mergedir, self.varsP, split_mode, merge_stage)
    contig_xmaps = split_XMap_byContig_new(self.outFileList, self.mergedir,
                                           self.varsP, merge_stage)
    split_Qcmap_byContig_new(self.outFileList, self.mergedir, contig_xmaps,
                             self.varsP, merge_stage)

    self.varsP.updatePipeReport(
        "Finished AlignModule Align Stats stage for %s\n" % self.stageName, printalso=True)
    util.LogStatus("progress", "stage_complete", "%s_stats" % self.stageName)
def __init__(self, varsP) :
    """sortBNX.__init__: this class is for sorting the input bnx for
    subsequent splitting by the splitBNX class, and eventually easier
    processing with the Pairwise class. The constructor (this) will call
    varsP.runJobs and doAllPipeReport."""
    self.stageName="SortBNX"
    self.varsP = varsP #fewer code modifications below
    # Sorted output path is derived from the input bnx name.
    self.varsP.sorted_file = self.varsP.bnxFile.replace(".bnx", "_sorted")
    #replace this with checkMinMol; this needs to use sorted file which isn't yet made
    #calculateNPairwise(self.varsP, self.varsP.bnxFile.replace(".bnx",""))
    #run this here bc it contains check on N mol required to start pipeline
    checkMinMol(self.varsP, self.varsP.bnxFile)
    if self.generateJobList() : #return 0 for success, 1 for skip
        # Stage bypassed: the sorted bnx must already exist from a prior run.
        if not util.checkFile(self.varsP.sorted_file+".bnx") :
            #this happens when accidentally using bypass but no sorted bnx exists--log error
            err = "ERROR: no sorted bnx file found (%s) (check bypass (-B) argument to Pipeline)" % (self.varsP.sorted_file+".bnx")
            self.varsP.updatePipeReport(err+"\n")
            util.LogError("critical", err)
            util.LogStatus("progress", "pipeline", "failure")
            raise RuntimeError
        #calculateNPairwise(self.varsP, self.varsP.sorted_file) #correct varsP.nPairwiseJobs -- already above
        return
    util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage)
    self.varsP.runJobs(self, "SortBNX")
    self.doAllPipeReport()
    if not self.allResultsFound() :
        err = "ERROR: sortBNX failed. Check: "+self.varsP.bnxFile
        self.varsP.updatePipeReport(err+"\n")
        util.LogError("critical", err)
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError
    util.LogStatus("progress", "stage_complete", self.stageName)
def __init__(self, varsP, argset=-1):
    """Set up the characterize stage and build its job list.

    argset is toggle between CharacterizeDefault and CharacterizeFinal
    argument sets: -1 is default, 1 is final.  Any other value is an
    error in generateJobList, but not here.
    """
    self.varsP = varsP
    if argset == 1:
        self.argStr = "Final"
    else:
        self.argStr = "Default"
    self.stageName = 'Characterize' + self.argStr + ' ' + self.varsP.stageComplete
    util.LogStatus("progress", "stage_start", self.stageName)
    mthread.jobWrapper.__init__(self, varsP, self.stageName,
                                clusterArgs=varsP.getClusterArgs('characterizeDefault'))
    self.xmapTarget = None
    self.curCharacterizeFileRoots = []
    # Output goes under the contig folder, in 'alignref' (with a '_final'
    # suffix for the final argument set).
    subdir = self.varsP.characterizeDirName  # = 'alignref'
    if argset == 1:  # this is final
        subdir += '_final'
    varsP.contigAlignTarget = os.path.join(varsP.outputContigFolder, subdir)
    if not os.path.exists(varsP.contigAlignTarget):
        os.mkdir(varsP.contigAlignTarget)
    self.generateJobList(argset)
def isBadErrorParams(self, noise, stage):
    """Validate auto-detected noise parameters; raise RuntimeError if unusable.

    noise is the dict produced by readNoiseParameters (empty if the .err
    file could not be read); stage is 0 (autoNoise0) or 1 (autoNoise1).

    BAD means this:
      for both stages: sr > 0.1 or sd > 0.1 or sf > 0.5
      also for stage 0: sd > 0.1 and sf > 0.35
      also for stage 1: sd > 0  and sf > 0.25 (this used to be for both stages)
    """
    assert stage == 0 or stage == 1, "Error: invalid arg to autoNoise.isBadErrorParams"
    badparam = False
    if not noise:  # empty dict: readNoiseParameters failed to read the .err file
        badparam = True
    elif stage == 0 and (noise["sd"] > 0.1 and noise["sf"] > 0.35):
        badparam = True
    elif stage == 1 and (noise["sd"] > 0 and noise["sf"] > 0.25):
        badparam = True
    # badparam short-circuits the 'or', so indexing below is safe for {}.
    if badparam or noise["sr"] > 0.1 or noise["sd"] > 0.1 or noise["sf"] > 0.5:
        errstr = "Failed to find usable noise parameters. Try decreasing maprate parameter and/or find a better reference. You can also try disabling auto noise (no -y, or 'Rough assembly' profile) with nominal noise parameters;"
        # 'in' instead of dict.has_key: same behavior, and valid in Python 3
        # where has_key was removed.
        if "sf" in noise:
            errstr += " sf=%f" % noise["sf"]
        if "sd" in noise:
            errstr += " sd=%f" % noise["sd"]
        if "sr" in noise:
            errstr += " sr=%f" % noise["sr"]
        self.varsP.updatePipeReport(errstr + "\n")
        util.LogError("critical", errstr)
        # possibly redundant with DNPipeline.finalizePipeline
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError
def mergeComplete(self):
    """Test if merge possibilities are exhaused and increment names and counters.
    If RefAligner argument -pairmergeRepeat is used, always terminate.

    Returns 1 to terminate merging, 0 to continue with another round.
    """
    # Contig counts before/after this round decide whether merging converged.
    prevCount = self.countContigs(self.varsP.inputContigFolder, self.prevPrefix)
    curCount = self.countContigs(self.varsP.outputContigFolder, self.curPrefix)
    #self.varsP.stageComplete = 'Merge% 2d' % self.varsP.extensionCount
    self.varsP.stageComplete = self.stageName
    self.checkResults()
    contigCount = ' %s %d to %s %d' % (self.prevPrefix, prevCount, self.curPrefix, curCount)
    # This round's output becomes the next round's input.
    self.varsP.inputContigPrefix = self.curPrefix
    self.varsP.inputContigFolder = self.varsP.outputContigFolder
    self.varsP.outputContigPrefix = self.curPrefix
    utilities.LogStatus("progress", "stage_complete", self.groupName) #a stage is each merge (A, B, etc), not all
    # With -pairmergeRepeat, RefAligner repeats merges internally, so a
    # single round is always sufficient.
    term = "-pairmergeRepeat" in self.varsP.argsListed('merge')
    # Terminate when: forced, one-or-zero contigs left, no progress, or the
    # one-letter round names (A..Z) are exhausted.
    if term or curCount <= 1 or curCount >= prevCount or self.iterCount >= len(
            self.alphabet) - 1:
        # Terminate Merging
        contigCount += ' .. Terminate Merge ..'
        self.varsP.updatePipeReport(contigCount + '\n')
        if curCount == 0:
            # nothing produced this round; fall back to the previous prefix
            self.varsP.outputContigPrefix = self.prevPrefix
        self.varsP.mergeIntoSingleCmap()
        return 1
    else:
        contigCount += ' .. Continue Merge ..'
        self.varsP.updatePipeReport(contigCount + '\n')
        return 0
def __init__(self, varsP, splitname="SplitBNX"):
    """splitBNX.__init__: split the sorted bnx file into smaller chunks
    for easier processing with the Pairwise class.  Like the sortBNX
    class, the constructor itself submits the jobs (varsP.runJobs) and
    reports on them (doAllPipeReport).
    """
    util.LogStatus("progress", "stage_start", splitname)
    self.varsP = varsP  # fewer code modifications below
    self.stageName = splitname
    skipped = self.generateJobList()  # truthy return means the stage is bypassed
    if not skipped:
        self.varsP.runJobs(self, splitname)
        self.doAllPipeReport()
        if not self.allResultsFound():
            message = "ERROR: splitBNX failed. Check: " + self.varsP.sorted_file + ".bnx"
            self.varsP.updatePipeReport(message + "\n")
            util.LogError("critical", message)
            util.LogStatus("progress", "pipeline", "failure")
            raise RuntimeError
    util.LogStatus("progress", "stage_complete", splitname)
def endStage(self):
    """Call this in place of checkResults when this stage is bypassed."""
    # The *0 stages produce no merged cmap, so skip the merge for them.
    if self.refineStage not in ('refineB0', 'refineFinal0', 'extension0'):
        self.varsP.mergeIntoSingleCmap()
    # status.xml wants extension stages tagged with the extension round.
    if self.refineStage.startswith("extension"):
        stage_label = self.refineStage + "_%i" % self.varsP.extensionCount
    else:
        stage_label = self.refineStage
    self.varsP.stageComplete = stage_label
    util.LogStatus("progress", "stage_complete", stage_label)
def __init__(self, varsP):
    """Start a new extension round: bump the round counter, register the
    stage, point contig IO at this round's prefix, and build the job list."""
    self.varsP = varsP
    self.varsP.extensionCount += 1
    round_no = self.varsP.extensionCount
    self.stageName = 'Extension_' + str(round_no)
    utilities.LogStatus("progress", "stage_start", self.stageName)
    mthread.jobWrapper.__init__(self, varsP, self.stageName,
                                clusterArgs=varsP.getClusterArgs('extension'))
    prefix = self.varsP.expID + '_ext%s' % round_no
    varsP.prepareContigIO(prefix, self.stageName)
    self.generateJobList()
def checkResults(self, stageSuffix=""):
    """Call jobWrapper (self) .doAllPipeReport, and varsP.mergeIntoSingleCmap.

    stageSuffix, if supplied, is appended to varsP.stageComplete in order
    to fix the stage name reported by the CharacterizeModule in the
    informaticsReport.
    """
    self.doAllPipeReport()
    self.varsP.stageComplete = self.refineStage + stageSuffix
    # The *0 stages have no contigs to merge.
    skip_merge = self.refineStage in ('refineB0', 'refineFinal0', 'extension0')
    if not skip_merge:
        self.varsP.mergeIntoSingleCmap()
    # for status.xml only: tag extension stages with the round number
    if self.refineStage.startswith("extension"):
        suffix = "_%i" % self.varsP.extensionCount
    else:
        suffix = ""
    util.LogStatus("progress", "stage_complete", self.refineStage + suffix)
def generateJobList(self):
    """splitBNX.generateJobList: submit varsP.nPairwiseJobs number of split bnx jobs.

    Returns 1 (skip) when the stage is bypassed (varsP.executeCurrentStage
    is false) so __init__ does not run the jobs; otherwise queues the split
    job and returns None.  Raises RuntimeError if the sorted bnx is missing.
    """
    sorted_file = self.varsP.sorted_file
    if not util.checkFile(sorted_file + ".bnx"):
        err = "ERROR: splitBNX input file (%s) not found; exiting" % self.varsP.sorted_file
        self.varsP.updatePipeReport(err + "\n")
        util.LogError("critical", err)
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError

    # Must run here (not in sortBNX) because it needs the sorted bnx.
    N = calculateNPairwise(self.varsP, sorted_file)
    self.varsP.updatePipeReport('Splitting BNX\n')
    super(splitBNX, self).__init__(self.varsP, self.stageName,
                                   clusterArgs=self.varsP.getClusterArgs('splitting'))

    # Skip the rest when bypassed, like in sortBNX: tell self.__init__ not
    # to continue processing.
    if not self.varsP.executeCurrentStage:
        return 1

    self.varsP.updatePipeReport("Splitting" + (" scan-scaled" if self.varsP.doScanScale else "") + " bnx file: %s.bnx\n\n" % self.varsP.sorted_file)

    # Threads per job: used to be fixed at 1, now file size / 1.5 GB rounded
    # up.  That was too low, so add 1.
    threads = max(1, int(math.ceil(os.path.getsize(sorted_file + ".bnx") / 1.5e9))) + 1
    if threads > 1:
        self.varsP.updatePipeReport("Using %i threads per job\n" % threads)

    # A single RefAligner call with '-subsetbin 0 N' writes all N pieces.
    # (The old per-piece loop for pre-3995 RefAligner binaries was dead code
    # behind 'if False' and has been removed.)
    output_file = self.varsP.bnxFile.replace(".bnx", "")
    cargs = [self.varsP.RefAlignerBin, '-f', '-i', sorted_file + ".bnx",
             "-maxthreads", str(threads), "-merge", "-subsetbin", "0", str(N),
             "-bnx", "-o", output_file]
    if self.varsP.stdoutlog:
        cargs.extend(['-stdout', '-stderr'])
    self.addJob(mthread.singleJob(cargs, self.stageName, output_file + ".bnx",
                                  self.stageName, maxThreads=threads,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=output_file + ".stdout"))
def generateJobList(self):
    """Defines job parameters for merge. Updates variables for
    subsequent completion test in mergeComplete()
    """
    self.clearJobs()
    # Round naming: previous round's output prefix is this round's input;
    # the new prefix appends the next letter (A, B, C, ...).
    self.prevPrefix = self.varsP.inputContigPrefix
    #self.curPrefix = self.prefixIter.next()
    self.curPrefix = self.stagePrefix + self.alphabet[self.iterCount]
    self.groupName = self.stageName + self.alphabet[self.iterCount] #jobWrapper data member
    utilities.LogStatus("progress", "stage_start", self.groupName)
    self.varsP.updatePipeReport(' PREV PREFIX %s, CUR PREFIX %s' %
                                (self.prevPrefix, self.curPrefix))
    self.iterCount += 1
    outputString = os.path.join(self.varsP.outputContigFolder, self.curPrefix)
    currentArgs = [self.varsP.RefAlignerBin, '-o', outputString]
    #if self.varsP.stdoutlog : #always use this here bc it's the only output which should always be there
    currentArgs.extend(['-f', '-stdout', '-stderr'])
    currentArgs += self.varsP.argsListed('merge')
    currentArgs += ['-maxthreads', str(self.varsP.nThreads)]
    contigsTextFile = os.path.join(self.varsP.inputContigFolder, 'mergeContigs.txt')
    # findContigs creates the mergeContigs.txt file which is necessary for
    # this job (the contig list is passed to RefAligner via -if below).
    contigFiles, contigIDs = self.varsP.findContigs(
        self.varsP.inputContigFolder, self.prevPrefix, txtOutput=contigsTextFile)
    self.varsP.prefixUsed.append(self.curPrefix)
    fileArgs = ['-if', contigsTextFile]
    #expoutput = outputString+".align" #don't know which contigs will disappear, but should always get an align file -- with new arg 'pairmergeRepeat', there's no .align; use stdout
    expoutput = outputString + ".stdout"
    s1Job = mthread.singleJob(currentArgs + fileArgs,
                              self.groupName,
                              expoutput,
                              self.groupName,
                              maxThreads=self.varsP.nThreads,
                              clusterLogDir=self.varsP.clusterLogDir,
                              expectedStdoutFile=outputString + ".stdout")
    self.addJob(s1Job)
    self.logArguments()
def checkResults(self):
    """Report characterization statistics for this stage.

    With no reference, only contig stats are reported; otherwise the full
    job report plus reference-based characterization is written.
    """
    #old heading says complete here and then summary after contig list; new says summary here
    outstr = 'Stage Summary: %s\n' % self.stageName
    if not self.varsP.ref:  #still want contig stats
        infoReport = "Skipping Characterize because no reference (-r)\n"
        self.varsP.updatePipeReport(infoReport, printalso=False) #put this in pipereport just as an fyi
        infoReport += outstr
        infoReport += characterizeContigs(self.varsP)
        self.varsP.updateInfoReport(infoReport + '\n')
        # NOTE(review): this early return skips the stage_complete LogStatus
        # below -- confirm that is intended for the no-reference path.
        return
    self.doAllPipeReport()
    infoReport = characterizeContigs(self.varsP, self.xmapTarget)
    self.varsP.updateInfoReport(outstr + infoReport + '\n')
    util.LogStatus("progress", "stage_complete", self.stageName)
def __init__(self, refineStage, varsP):
    """Validate the requested refine stage, register it with status.xml,
    set up contig IO, and build the stage's job list."""
    # Reject unknown stage names: record the error on varsP and bail out.
    if refineStage not in ('refineA', 'refineB', 'refineNGS', 'refineFinal'):
        varsP.error += 1
        varsP.message += ' Error: Refine stage name invalid: ' + str(refineStage) + '\n'
        return
    self.refineStage = refineStage
    self.varsP = varsP
    utilities.LogStatus("progress", "stage_start", self.refineStage)
    # super is more pythonic than referring to the base class explicitly
    # (only matters for multiple inheritance)
    super(Refine, self).__init__(varsP, refineStage,
                                 clusterArgs=varsP.getClusterArgs(refineStage))
    contig_prefix = self.varsP.expID + self.refineStage.replace("refine", "_r")
    self.varsP.prepareContigIO(contig_prefix, refineStage)
    if self.refineStage == 'refineNGS':
        # special case: refineNGS reads its input contigs from the NGS
        # import location rather than the usual contig folder.
        self.varsP.inputContigPrefix = self.varsP.ngsContigPrefix
        self.varsP.inputContigFolder = self.varsP.ngsInDir
    self.generateJobList()
def checkMinMol(varsP, input_file, minmol=2):
    """Check that bnx file input_file contains at least minmol molecules.

    Simplified version of calculateNPairwise which only counts molecule
    ('0') records, stopping as soon as enough are seen.  On failure it
    logs via varsP/util and raises RuntimeError (caught in
    DNPipeline.constructData).
    """
    count = 0
    # 'with' guarantees the handle is closed on the early break and on
    # exceptions (the original left the file open on error paths).
    with open(input_file, "r") as f:
        for line in f:
            # Molecule records start with "0"; startswith also tolerates
            # blank lines, which line[0] would not.
            if line.startswith("0"):
                count += 1
            if count > minmol:  # this is all we need to check
                break
    # If there are fewer than minmol molecules there is nothing to
    # assemble, so exit.
    if count < minmol:
        err = "ERROR in calculateNPairwise: number of molecules (%i) is too few for assembly; check bnx: %s" % (count, input_file)
        varsP.updatePipeReport(err + "\n")
        util.LogError("critical", err)
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError  #will be caught in DNPipeline.constructData
def __init__(self, varsP):
    """autoNoise.__init__: run the two automatic noise-estimation passes
    (Autonoise0, then Autonoise1) against the reference, load the
    resulting noise parameters into the 'noise0' argument set, and
    switch to the scan-scaled bnx if one was produced.  Like sortBNX,
    the constructor itself calls varsP.runJobs and doAllPipeReport.
    """
    self.stageName = "Autonoise0"
    self.varsP = varsP #fewer code modifications below
    util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage)
    self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise")
    if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
        print "ERROR in autoNoise: bad dir:", self.output_folder
        raise RuntimeError
    # We use assembly section here because the memory usage is higher than
    # pairwise, while the jobs are quite short.
    #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
    super(autoNoise, self).__init__(self.varsP, self.stageName,
                                    clusterArgs=self.varsP.getClusterArgs("assembly"))
    # noiseOnly mode estimates from the raw bnx; otherwise use the sorted bnx.
    bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file+".bnx"
    #was return if generateJobListChar, but need to get readparameters if bypass
    if not self.generateJobListChar({}, bnxfile, "autoNoise0") : #return 0 for success, 1 for skip
        self.varsP.runJobs(self, "AutoNoise0")
        self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise0 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
    util.LogStatus("progress", "stage_complete", self.stageName)
    # First-pass noise parameters seed the second pass via -readparameters.
    self.varsP.noise0 = readNoiseParameters(self.output_file)
    self.isBadErrorParams(self.varsP.noise0, 0)
    self.stageName = "Autonoise1"
    self.groupName = self.stageName #fix so that LogStatus call in MultiThreading.multiThreadRunJobs
    util.LogStatus("progress", "stage_start", self.stageName)
    self.clearJobs()
    self.varsP.replaceParam("noise0", "-readparameters", self.output_file+".errbin")
    #need to call again to set self.output_file
    if not self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1") : #return 0 for success, 1 for skip
        self.varsP.runJobs(self, "AutoNoise1")
        self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise1 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
    self.varsP.noise1 = readNoiseParameters(self.output_file)
    infoReport="Automatically determined noise parameters:\n"
    # hardcoding parameters is kind of bad, but it fixes the order without
    # using OrderedDict.
    klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"]
    for v in klist :
        if not self.varsP.noise1.has_key(v) :
            continue
        param=str(self.varsP.noise1[v])
        util.LogStatus("parameter", "auto_"+v, param)
        infoReport+=v+":"+param+"\n"
        # final values are installed into the noise0 argument set
        self.varsP.replaceParam("noise0", "-"+v, param)
    self.varsP.updateInfoReport(infoReport + '\n')
    self.isBadErrorParams(self.varsP.noise1, 1)
    if self.varsP.doScanScale : #change the sorted_file to the rescaled bnx file
        rescaledbnx = self.output_file + self.varsP.rescaleSuffix #no ".bnx" in suffix
        if not util.checkFile(rescaledbnx+".bnx") : #not found--not an error if bnx 0.1 is used
            err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input"
            self.varsP.updatePipeReport( err+"\n\n" )
            util.LogError("warning", err)
            self.varsP.doScanScale = False
        else : #log that scan scaling is used
            self.varsP.updatePipeReport( "Using scan scaled bnx: "+rescaledbnx+".bnx\n\n" )
            util.LogStatus("parameter", "scanscaled_bnx", rescaledbnx+".bnx")
            self.varsP.sorted_file = rescaledbnx #this variable is used in splitBNX (PairwiseModule.py)
    util.LogStatus("progress", "stage_complete", self.stageName)
# NOTE(review): the first part of this chunk is a fragment of an __init__
# using the 'for case in switch(StageName)' idiom; the enclosing def and the
# earlier case branches are above this chunk and not visible here.
            break
        if case():
            # fall-through of the stage-name switch: unknown stage name
            #varsP.error += 1 #these don't do anything
            #varsP.message += ' Error: Refine stage name invalid: '+str(StageName)+'\n'
            self.varsP.updatePipeReport("Internal error: unknown stage %s" % StageName)
            return
    #get arguments before changing StageName, then add suffix
    clusargs = varsP.getClusterArgs( StageName )
    # status.xml wants extension stages tagged with the extension round
    StageName += ( ("_%i" % self.varsP.extensionCount) if StageName.startswith("extension") else "") #for status.xml only
    self.varsP.stageName = StageName
    util.LogStatus("progress", "stage_start", StageName)
    #super is more pythonic than referring to the base class explicitly (only matters for multiple inheritance)
    super(Refine, self).__init__(varsP, StageName, clusterArgs=clusargs)
    #intermediateContigPrefix = self.varsP.expID + self.StageName.replace("refine", "_r")
    self.varsP.prepareContigIO(ContigPrefix, StageName)
    #modify results of varsP.prepareContigIO for special case of refineNGS
    self.generateJobList()

def runJobs(self):
    """Run all queued jobs with the pipeline-wide thread budget."""
    self.multiThreadRunJobs(self.varsP.nThreads, sleepTime=0.2)

def writeIDFile(self, nJobs):
    """Write the job count (nJobs) to varsP.idFile."""
    f1 = open(self.varsP.idFile, 'wb')
    f1.write(str(nJobs))
    f1.close()
def checkResults(self):
    """Record stage completion, merge contigs into a single cmap, and emit
    the job report (see Multithreading.jobWrapper) and status.xml entry."""
    completed = 'Extension% 2d' % self.varsP.extensionCount
    self.varsP.stageComplete = completed
    self.varsP.mergeIntoSingleCmap()
    self.doAllPipeReport()
    utilities.LogStatus("progress", "stage_complete", self.stageName)
def __init__(self, varsP):
    """referenceProcess.__init__: de-res the reference with RefAligner -mres
    for SV detect.

    Uses the 'referenceSvdetect' optArguments section when present (it
    must contain -mres); otherwise falls back to -mres 2.9.  On success
    the deresed cmap path is stored in varsP.refDeresed; on job failure
    processing continues without SV detect.
    """
    jobName = "reference_process"
    opta_section = "referenceSvdetect"
    default_mres = "2.9"
    mres = "-mres"
    self.varsP = varsP
    usedefault = False
    if self.varsP.argData.has_key(opta_section): #check if in optargs
        opta = self.varsP.argsListed(opta_section)
        if not mres in opta: #must have mres
            self.varsP.updatePipeReport(
                "Warning in referenceProcess: " + mres +
                " missing in optArguments section " + opta_section + "\n")
            usedefault = True
    else:
        self.varsP.updatePipeReport(
            "Warning in referenceProcess: optArguments section " +
            opta_section + " missing\n")
        usedefault = True
    if usedefault:
        opta = [mres, default_mres]
    mresstr = opta[opta.index(mres) + 1] #get string for mres value for output name
    mresstr = mresstr.replace(".", "")
    if not util.checkDir(self.varsP.refFolder):
        self.varsP.updatePipeReport(
            "ERROR in referenceProcess: could not make output dir %s\n" %
            self.varsP.refFolder)
        return None
    # Output name embeds the mres value, e.g. <ref>_res29.cmap.
    refpref = os.path.basename(
        self.varsP.ref[:self.varsP.ref.rfind(".")]) + "_res" + mresstr
    outarg = os.path.join(self.varsP.refFolder, refpref) #refFolder is new output folder for this job
    expectedResultFile = outarg + ".cmap" #if ref is spots, is this spots?
    args = [
        self.varsP.RefAlignerBin, '-f', '-o', outarg, '-i', self.varsP.ref, '-merge'
    ] + opta
    stdoutf = None
    if self.varsP.stdoutlog:
        args.extend(['-stdout', '-stderr'])
        stdoutf = outarg + ".stdout"
    args += ['-maxthreads', str(self.varsP.nThreads)]
    super(referenceProcess, self).__init__(
        self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    job = mthread.singleJob(args,
                            jobName,
                            expectedResultFile,
                            jobName,
                            maxThreads=self.varsP.nThreads,
                            clusterLogDir=self.varsP.clusterLogDir,
                            expectedStdoutFile=stdoutf)
    self.addJob(job)
    util.LogStatus("progress", "stage_start", jobName)
    self.varsP.runJobs(self, "referenceProcess")
    self.doAllPipeReport()
    if not self.allResultsFound(): #this is an error, but we'll continue processing without SV detect
        err = "ERROR in referenceProcess: job failed, disabling SV detect"
        self.varsP.updatePipeReport(err + "\n")
        util.LogError("error", err)
        #self.varsP.runSV = False #no need since this class is used in SVModule
    else:
        self.varsP.refDeresed = expectedResultFile #store good result for SV detect
        self.varsP.updatePipeReport(
            "referenceProcess: using reference %s for svdetect\n" %
            self.varsP.refDeresed)
        util.LogStatus("progress", "stage_complete", jobName)
def endStage(self): #same as GroupedRefinementModule.Refine.endStage
    """Log stage completion when the stage is bypassed (no results to check)."""
    utilities.LogStatus("progress", "stage_complete", self.refineStage)
def runAlignMol(): parser = argparse.ArgumentParser(description=description) parser.add_argument( '-q', dest='queryDir', help= 'Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str) parser.add_argument( '-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str) #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx parser.add_argument( '-a', dest='optArguments', help= 'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str) parser.add_argument( '-r', help= 'If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true') parser.add_argument( '-o', dest='outputDir', help= 'output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str) parser.add_argument( '-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) parser.add_argument( '-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int) parser.add_argument( '-j', dest='maxthreads', help= 'Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int) parser.add_argument( '-e', dest='errFile', help= '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument( '-E', dest='errbinFile', help= '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be 
supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument( '-p', dest='pipelineDir', help= 'Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str) result = parser.parse_args() outprefix = "exp_refineFinal1" #this is the default; assume for now #check all Pipeline dependencies if result.pipelineDir: cwd = result.pipelineDir else: cwd = os.path.split( os.path.realpath(__file__))[0] #this is path of this script if not os.path.isfile(os.path.join( cwd, "utilities.py")): #if still not here, last try is actual cwd cwd = os.getcwd() #still check this below #this is the only one imported here and in runCharacterize if not os.path.isfile(os.path.join(cwd, "utilities.py")): print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not os.path.isfile(os.path.join(cwd, "AlignModule.py")): print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import AlignModule as alignmod if not util.checkFile(os.path.join(cwd, "Pipeline.py")): print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline if not util.checkFile(os.path.join(cwd, "mapClasses.py")): print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import mapClasses as mc #input dir if not result.queryDir: print "ERROR: Query (-q) argument not supplied." sys.exit(1) qrypath = os.path.realpath(result.queryDir) if util.checkDir( qrypath, checkWritable=False, makeIfNotExist=False): #output elsewhere so not writeable is ok runaligns = False elif util.checkCmap(qrypath): runaligns = True else: print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument." 
sys.exit(1) #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py #if not os.path.split(qrypath)[1].endswith("alignmol") : # print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n" #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args rabin = "" #need empty string for generateJobList even though no jobs are run if runaligns: rabin = result.RefAligner #replicate Pipeline behavior: RefAligner is always required if os.path.isdir(rabin): rabin = os.path.join(rabin, "RefAligner") if not util.checkExecutable(rabin): print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg." sys.exit(1) #optargs file optargs = None if runaligns and result.optArguments: #supplied on command line optargs = result.optArguments if not util.checkFile(optargs, ".xml"): print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument." sys.exit(1) elif runaligns: #load from Pipeline dir if running alignments optargs = os.path.join(cwd, "optArguments_human.xml") if not util.checkFile(optargs): print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a." 
sys.exit(1) #output dir if not result.outputDir: outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule else: outdir = os.path.realpath(result.outputDir) if os.path.isdir(outdir): if not util.checkDir(outdir): #check writeable print "\nERROR: Output dir is not writeable:\n", outdir, "\n" sys.exit(1) #this is ok here #elif outdir == contigdir : # print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n" # sys.exit(1) print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n" elif not util.checkDir( outdir ): #does not exist, make, if False, can't make or not writeable print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n" sys.exit(1) #bnx file bnxfile = result.bnx if bnxfile: #must check for empty string BEFORE you do realpath, or it returns cwd bnxfile = os.path.realpath(bnxfile) if not util.checkFile(bnxfile, ".bnx"): print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile sys.exit(1) elif runaligns: print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument" sys.exit(1) #nthreads nthreads = result.numThreads if nthreads <= 0: print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads sys.exit(1) #maxthreads maxthreads = result.maxthreads if maxthreads <= 0: print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads sys.exit(1) elif nthreads < maxthreads: print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % ( nthreads, maxthreads) nthreads = maxthreads #.errbin file errbinfile = result.errbinFile if errbinfile: errbinfile = os.path.realpath(result.errbinFile) if not util.checkFile(errbinfile, ".errbin"): print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile sys.exit(1) #.err file errfile = result.errFile if errfile and errbinfile: print "Warning: .err and .errbin arguments supplied; ignoring .err 
file" errfile = "" elif errfile: errfile = os.path.realpath(result.errFile) if not util.checkFile(errfile, ".err"): print "err file supplied but not found or incorrect suffix:", errfile sys.exit(1) if errfile and not util.checkFile(os.path.join(cwd, "SampleCharModule.py")): print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) elif errfile: import SampleCharModule as scm doref = result.ref #DONE checking arguments print "Using output dir", outdir if runaligns: print "Aligning", bnxfile, "\nTo", qrypath, "\n" else: print "Merging", qrypath, "\n" startTime = time.time() #time since Epoch memory_log = os.path.join(outdir, "memory_log.txt") util.initMemoryLog(memory_log) varsP = Pipeline.varsPipeline() varsP.RefAlignerBin = rabin varsP.contigFolder = "" #not used but needs to be an attr varsP.outputContigFolder = "" #not used but needs to be a string attr varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt") varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt") util.InitStatus(os.path.join(outdir, "status.xml")) if runaligns: varsP.optArgumentsFileIn = optargs varsP.latestMergedCmap = qrypath #if !doref, need this one varsP.ref = qrypath #and if doref, need this one varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node varsP.maxthreads = maxthreads #threads per job p = os.path.split(qrypath)[1] varsP.outputContigPrefix = p[:p.rfind(".")] #filename prefix varsP.stdoutlog = True #use -stdout -stderr varsP.sorted_file = bnxfile[:bnxfile.rfind( ".")] #enables the mol fraction align in AlignModule.getAlignStats if qrypath.endswith(".cmap"): #enable the mol stats varsP.totAssemblyLenMb = mc.multiCmap( qrypath, lengthonly=True).totalLength / 1e6 varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt") varsP.parseArguments() #parses optArgumentsFile varsP.checkDependencies() varsP.RefAlignerBinOrig = rabin varsP.prerunLog( ) #general information in log 
-- needed for refaligner_version noisep = {} if errbinfile: noisep = {"readparameters": errbinfile} #print "Using noise parameters from "+errbinfile+"\n" #move below elif errfile: noisep = scm.readNoiseParameters(errfile.replace(".err", "")) if noisep.has_key( 'readparameters' ): #remove this because it's redundant, and it can cause problems with RefAligner compatibility del noisep['readparameters'] if not noisep: #readNoiseParameters returns empty dict on failure print "ERROR reading noise parameters, check .err file:", errfile sys.exit(1) #redundant with below? print "Using noise parameters from " + errfile + ":\n" + " ".join( ["-" + str(k) + " " + str(v) for k, v in noisep.iteritems()]) + "\n" #some code from SampleCharModule to load args into noise0 infoReport = "Loaded noise parameters:\n" klist = [ "FP", "FN", "sf", "sd", "sr", "bpp", "readparameters" ] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict. #noiseargs = self.varsP.argsListed('noise0') #not necessary for v in klist: if not noisep.has_key(v): continue param = str(noisep[v]) util.LogStatus("parameter", "auto_" + v, param) infoReport += v + ":" + param + "\n" varsP.replaceParam("noise0", "-" + v, param) varsP.updateInfoReport(infoReport + '\n', printalso=True) else: print "Getting file list from", qrypath outFileList = getOutFileList(util, qrypath) if not outFileList: print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument." 
sys.exit(1) else: print "Found", len(outFileList), "alignment results" #end if runaligns amod = alignmod.AlignModule( varsP, doref, outdir, bnxfile) #constructor will call generateJobList if runaligns: amod.runJobs() amod.checkResults() else: amod.outFileList = outFileList p = os.path.split(outFileList[0])[1] if p.count("_") > 1: #expect something like "EXP_REFINEFINAL1_4" #p = p[:p.rfind("_")+1] #remove integer suffix p = p[:p.rfind("_")] #remove integer suffix (and underscore) #else : # p += "_" #because mrgstr is appended varsP.outputContigPrefix = p if not runaligns or len(amod.jobList) > 0: amod.getAlignStats() if runaligns: print #copy from Pipeline.py if util.SummarizeErrors(varsP=varsP) == 0: varsP.updatePipeReport("Pipeline has successfully completed\n") util.LogStatus("progress", "pipeline", "success") else: varsP.updatePipeReport("Pipeline has completed with errors\n") util.LogStatus("progress", "pipeline", "failure") #BELOW OLD CODE return #in Pipeline, this is called first #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) print "Calling mergeMap" print outFileList[0] #, "\n", outputdir #moved above util.logMemory(memory_log, startTime, "mergeMap_start") #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional alignmod.mergeMap(None, outFileList, outputdir) util.logMemory(memory_log, startTime, "mergeMap_end") print "Calling mergeRcmaps" util.logMemory(memory_log, startTime, "mergeRcmaps_start") #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") : alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix) util.logMemory(memory_log, startTime, "mergeRcmaps_end") print "Calling split_XMap_byContig" 
#split_XMapQcmap_byContig" util.logMemory(memory_log, startTime, "split_XMap_byContig_start") #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix) util.logMemory(memory_log, startTime, "split_XMap_byContig_end") print "Calling split_Qcmap_byContig" util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start") #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old alignmod.split_Qcmap_byContig_new( outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end") print "AlignMerge successfully completed"
def multiThreadRunJobs(self, nActiveThreads, sleepTime=0.01, threadControl=False, background=False, callLogStatus=True): """Main Queue script, start jobs, check for completion """ #this is useful as a generic way to skip running a module--no jobs are submitted if len(self.jobList) == 0: self.varsP.updatePipeReport( " Warning in multiThreadRunJobs: number of jobs is 0, skipping stage: " + self.groupName + "\n") return if nActiveThreads == 0: self.varsP.updatePipeReport( " Error in multiThreadRunJobs: nActiveThreads must be > 0, skipping stage: " + self.groupName + "\n") return if background: start_new_thread(self.multiThreadRunJobs, (nActiveThreads, sleepTime, threadControl, False)) return utilities.logMemory( self.varsP.memoryLogpath, self.varsP.startTime, self.groupName) #call at start and end of this method jobw = 30 #width of job name in printout print ' Starting Multi-Threaded Process:' print ' ' + self.groupName self.nThreads = nActiveThreads availableThreads = nActiveThreads startTime = time.time() activeJobList = [] nActiveJobs = 0 nFinishedJobs = 0 nActiveThrottle = 0 nJobs = len(self.jobList) nRemainingJobs = nJobs global cSession if self.onCluster and cSession == None: cSession = drmaa.Session() cSession.initialize() print ' Running ' + str(nJobs) + ' jobs with ' + str( nActiveThreads) + ' threads' if callLogStatus: utilities.LogStatus("progress", "jobs_outstanding", str(nJobs), self.groupName) utilities.LogStatus("progress", "stage_pct_done", "0.0", self.groupName) job_status = (0, nJobs) while True: if nRemainingJobs > 0: for i, sJob in enumerate(self.jobList): if sJob.jobStarted or sJob.isRunning or sJob.isComplete: continue if sJob.hasContingentJob: if not sJob.contingentJob.isComplete: continue if not (sJob.onCluster): if nActiveJobs >= nActiveThreads: continue if not (sJob.onCluster): if availableThreads < sJob.maxThreads: continue if self.throttle and sJob.throttleClass: if nActiveThrottle >= self.throttleMax: continue nActiveThrottle += 1 
activeJobList.append(sJob) nActiveJobs += 1 sJob.startJob(cSession=cSession, clusterArgs=self.clusterArgs) availableThreads -= sJob.maxThreads nRemainingJobs -= 1 statusString = (' START% 4d: % ' + str(jobw) + 's,% 3dThr,% 4dR,% 4dT,% 4dF,% 4dQ') % ( sJob.jobNum, sJob.jobName[:jobw], nActiveThreads, nActiveJobs, nJobs, nFinishedJobs, nRemainingJobs) print statusString sys.stdout.flush() time.sleep( sleepTime ) #sleep between job submission, but wait to check status #The block below is error prone in the case of multiple jobWrapper objects running simultaneously, # which we have implemented for CharacterizeModule using threading. The problem is that the # characterize os.wait call can steal the pid of another job, say, refinement, and then the # refinement job will never be marked completed. Simplest is to just wait, and inside # CheckIfRunning, the poll will take care of each job individually. time.sleep(sleepTime) ''' (pid, rc)=(-1, -1) # Defaults so the statement works for cluster jobs if self.onCluster : #if on cluster, no os.wait call is needed; sleep instead, then check all jobs time.sleep(sleepTime) else : try : #if not on cluster, use os.wait to wait for child process to finish global my_wait (pid, rc)=my_wait() #any child? except OSError, e : time.sleep(sleepTime) #print e #Set the return code of the job which was stolen by the wait call above (see comment below). for sJob in activeJobList: if sJob.markCompleted(pid, rc) : break #skip rest once correct one found ''' #The old version of this loop was dangerous because it popped from the list being iterated over. #So, if you skip a job due to this and that job's return code was stolen by the wait above, # then the job is never marked complete. #Though the below fix is probably sufficient, do the above also just to be safe. 
#If you iterate backwards, using reversed, removing an element will not affect the loop on the remaining elements for sJob in reversed(activeJobList): #sJob.markCompleted(pid, rc) #this call moved into loop above (see above comments) if sJob.CheckIfRunning(cSession=cSession): continue else: #activeJobList.pop(i) activeJobList.remove(sJob) nActiveJobs -= 1 nFinishedJobs += 1 availableThreads += sJob.maxThreads availableThreads = min(nActiveThreads, availableThreads) if self.throttle and sJob.throttleClass: nActiveThrottle -= 1 statusString = (' STOP % 4d: % ' + str(jobw) + 's,% 3dThr,% 4dR,% 4dT,% 4dF,% 4dQ') % ( sJob.jobNum, sJob.jobName[:jobw], nActiveThreads, nActiveJobs, nJobs, nFinishedJobs, nRemainingJobs) statusString += ' ' + timeFormat1(sJob.runTime) print statusString #log status after above loop to calculate nFinishedJobs pct_done = (nFinishedJobs * 100.0 / nJobs if nJobs > 0 else 0) njr = nJobs - nFinishedJobs #num jobs remaining new_status = (pct_done, njr) if job_status != new_status and callLogStatus: utilities.LogStatus("progress", "jobs_outstanding", "%d" % njr, self.groupName) utilities.LogStatus("progress", "stage_pct_done", "%.01f" % pct_done, self.groupName) job_status = new_status if nActiveJobs == 0 and nRemainingJobs == 0: break elif nActiveJobs < 0 or nRemainingJobs < 0: print "ERROR in multithreading: invalid: nActiveJobs:", nActiveJobs, "nRemainingJobs:", nRemainingJobs break #Note: you cannot check len(activeJobList) here becuase if one job takes all the threads, # it can finish, causing the list to be empty, but there are still more jobs to submit. # This is not an error. sys.stdout.flush() #end job submission - check loop #if self.onCluster: # cSession.exit() self.elapsedTime = time.time() - startTime self.cpuTime = 0. 
for sJob in self.jobList: self.cpuTime += sJob.runTime print ' Finished Multi-Threaded Process:' print ' ' + self.groupName print #extra newline for readability sys.stdout.flush() utilities.logMemory( self.varsP.memoryLogpath, self.varsP.startTime, self.groupName) #call at start and end of this method
def checkResults(self):
    """Mark the refine stage finished: record it on varsP, merge contigs into a
    single cmap, emit the pipeline report, and log stage completion."""
    stage = self.refineStage
    pipeVars = self.varsP
    pipeVars.stageComplete = stage
    pipeVars.mergeIntoSingleCmap()
    self.doAllPipeReport() #see Multithreading.jobWrapper
    utilities.LogStatus("progress", "stage_complete", stage)