def joinBnxFiles(varsP, bnxFiles):
    """After image processing, merge results into all.bnx.
    """
    #The old way was to use molecule.joinBnxFiles, which simply copies lines;
    # while that is fine most of the time, RefAligner is more sophisticated,
    # so it is more robust to use RefAligner here.
    #molecule.joinBnxFiles(bnxFiles, self.bnxFile)

    #this used to be called writeIntermediate
    varsP.writeListToFile(bnxFiles, varsP.bnxTarget)

    # args, jobName, expectedResultFile, uniqueString
    args = [varsP.RefAlignerBin, "-if", varsP.bnxTarget, "-merge", "-bnx", "-o", varsP.bnxFile.replace(".bnx",""), "-f"]
    if varsP.stdoutlog :
        args.extend( ['-stdout', '-stderr'] )
    #print "joinBnxFiles: args:", args

    jobwrapper = mthread.jobWrapper(varsP, "joinBnxFiles")
    jobwrapper.addJob( mthread.singleJob(args, "joinBnxFiles", varsP.bnxFile, "joinBnxFiles") )
    jobwrapper.multiThreadRunJobs(1)
    jobwrapper.doAllPipeReport()

    success = jobwrapper.allResultsFound()
    if not success :
        varsP.updatePipeReport("ERROR in performImageAnalysis: joinBnxFiles failed. Check: "+varsP.bnxTarget+"\n")

    # The disabled code below would replace the list in bnxTarget with the
    # single merged bnxFile path; without it, SampleCharModule runs on each
    # bnx individually.
    #if success :
    #    varsP.writeListToFile([varsP.bnxFile], varsP.bnxTarget)

    #Note the sense of allResultsFound is the opposite of the return value of
    # performImageAnalysis: allResultsFound is True if all jobs succeeded and
    # False if any failed, whereas performImageAnalysis returns 1 on failure.
    return not success
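A minimal sketch of the list-file convention assumed by the -if flag above: writeListToFile presumably writes one input path per line, which RefAligner then merges. write_input_list below is a hypothetical stand-in, not the Pipeline's actual helper.

#Hypothetical stand-in for varsP.writeListToFile: one path per line.
def write_input_list(paths, target):
    with open(target, "w") as f:
        for p in paths:
            f.write(p + "\n")

#write_input_list(["scan1.bnx", "scan2.bnx"], "all_inputs.txt")
#RefAligner then reads the list via: -if all_inputs.txt -merge -bnx -o all -f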
Example #2
    def generateJobList(self) :
        """splitBNX.generateJobList: submit varsP.nPairwiseJobs number of split bnx jobs. """

        sorted_file = self.varsP.sorted_file
        if not util.checkFile(sorted_file+".bnx") :
            err = "ERROR: splitBNX input file (%s) not found; exiting" % self.varsP.sorted_file
            self.varsP.updatePipeReport(err+"\n")
            util.LogError("critical", err)
            util.LogStatus("progress", "pipeline", "failure")
            raise RuntimeError

        N = calculateNPairwise(self.varsP, sorted_file) #move back here (not sortBNX) bc needs to use sorted bnx
        #N = self.varsP.nPairwiseJobs

        self.varsP.updatePipeReport('Splitting BNX\n')
        #splitJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('splitting'))
        super(splitBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs('splitting'))

        #should skip the rest and return 1, like in sortBNX, here:
        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing

        self.varsP.updatePipeReport("Splitting"+(" scan-scaled" if self.varsP.doScanScale else "")+" bnx file: %s.bnx\n\n" % self.varsP.sorted_file)

        #calculate threads per job: formerly fixed at 1; now file size / 1.5 GB, rounded up, plus 1 (the rounded value alone was too low)
        threads = max(1, int(math.ceil( os.path.getsize(sorted_file+".bnx")/1.5e9 ))) + 1
        if threads > 1 :
            self.varsP.updatePipeReport("Using %i threads per job\n" % threads)

        #the change in job partitioning breaks backward compatibility and was causing too many problems; make it conditional on refaligner version
        #this is now obsolete: assume binaries are up-to-date
        if False : #self.varsP.refaligner_version < 3995 :
            for partial in range(1,N + 1):
                output_file=self.varsP.bnxFile.replace(".bnx", "_%s_of_%s" %(partial, self.varsP.nPairwiseJobs))
                cargs=[self.varsP.RefAlignerBin, '-f', '-i', sorted_file+".bnx", "-maxthreads", str(threads), "-merge", "-subsetbin", str(partial), str(N), "-bnx", "-o",  output_file]
                if self.varsP.stdoutlog :
                    cargs.extend( ['-stdout', '-stderr'] )
                #print('%d/%d' % (partial, N), cargs)
                expectedResultFile=output_file+".bnx"
                self.addJob(mthread.singleJob(cargs, self.stageName + str(partial), expectedResultFile, self.stageName + str(partial), maxThreads=threads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=output_file+".stdout"))

        else :
            #change above to single command with -subsetbin 0 N
            output_file=self.varsP.bnxFile.replace(".bnx", "")
            cargs=[self.varsP.RefAlignerBin, '-f', '-i', sorted_file+".bnx", "-maxthreads", str(threads), "-merge", "-subsetbin", "0", str(N), "-bnx", "-o",  output_file]
            if self.varsP.stdoutlog :
                cargs.extend( ['-stdout', '-stderr'] )
            self.addJob(mthread.singleJob(cargs, self.stageName, output_file+".bnx", self.stageName, maxThreads=threads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=output_file+".stdout"))
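A quick worked example of the threads-per-job heuristic above; the 4.2 GB input size is hypothetical.

import math
size_bytes = 4.2e9  #hypothetical size of sorted_file + ".bnx"
threads = max(1, int(math.ceil(size_bytes / 1.5e9))) + 1  #ceil(2.8) + 1 = 4
assert threads == 4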
 def getDetectJobs(self, contingentJob=None):
     self.molFile = self.localTiff + '.mol'
     #if the molFile is already there, the image processing is already done; do not repeat
     if os.path.exists(self.molFile): 
         return []
     #print "remoteTiff:", self.remoteTiff #debug
     #print "localTiff:", self.localTiff, os.path.exists(self.localTiff) #debug
     #print "expectedOverlap", self.curExp.ExpectedOverlap, "minOverlap", minOverlap #debug
     #there was an issue with self.curExp.ExpectedOverlap being incorrectly computed due to a bad value in the xml (see manageTargetLocation.py)
     #hopefully this will work
     if self.curExp.ExpectedOverlap > 100 :
         oldoverlap = self.curExp.ExpectedOverlap
         self.curExp.ExpectedOverlap = 15
         #print "Warning: calculated expectedOverlap", oldoverlap, "too large; defaulting to", self.curExp.ExpectedOverlap
         self.varsP.updateInfoReport("Warning: "+self.nameStr()+": calculated expectedOverlap %i too large; defaulting to %i\n" % (oldoverlap, self.curExp.ExpectedOverlap), printalso=True)
     expolap = (self.curExp.ExpectedOverlap - 10 if self.curExp.ExpectedOverlap >= 10 else 0) # ExpectedOverlap - 10
     minOverlap = '%d' % (expolap)
     maxOverlap = '%d' % (self.curExp.ExpectedOverlap + 10) # ExpectedOverlap + 10
     dmOverlapArgs = ['-o', minOverlap, '-O', maxOverlap]
     dmArgs = self.varsP.argsListed('imgDetection')
     dmArgs = util.argumentReplaceList(dmArgs, ['-x', str(self.curExp.ScanColumnCount)])
     dmArgs = util.argumentReplaceList(dmArgs, ['-y', str(self.curExp.ScanRowCount)])
     dmArgs = util.argumentReplaceList(dmArgs, ['-p', str(self.curExp.Pitch)])
     nchan = (self.curExp.nColors - 1 if self.curExp.nColors >= 2 else 1) #must be at least 1
     colorArgs = ['-n', str(nchan)]
     sJobCpName = 'cp ' + shorten(self.remoteTiff) + ' to ' + shorten(self.localTiff)
     #print "cp\n"+self.remoteTiff, "\n"+self.localTiff #debug
     sJobCp = mthread.singleJob(['cp', self.remoteTiff, self.localTiff], sJobCpName, self.localTiff, 'cpTiff', throttleClass = True)
     if contingentJob:
         sJobCp.addContingentJob(contingentJob)
     sJobDMName = 'Detect ' + shorten(self.localTiff)
     curArgs = [self.varsP.DMstaticBin] + dmOverlapArgs + dmArgs + colorArgs + [self.localTiff]
     argumentString = " ".join(curArgs) + '\n'
     print " ".join(curArgs) #debug
     sJobDM = mthread.singleJob(curArgs, sJobDMName, self.molFile, 'Detect')
     sJobDM.addContingentJob(sJobCp)
     sJobDM.bpp = self.curExp.basesPerPixel
     sJobDM.molTag = self.molTag
     sJobDM.numLabelChannels = self.numLabelChannels
     #inputMoleculesReport += '   ' + self.molTag + '  ' + self.remoteTiff + '\n'
     dorm = True #default True (False for debug)
     joblist = [sJobCp, sJobDM]
     if dorm :
         sJobRmImgName = 'Detect Complete, rm ' + shorten(self.localTiff)
         sJobRmImg = mthread.singleJob(['rm', self.localTiff], sJobRmImgName, '', 'rmFile')
         sJobRmImg.addContingentJob(sJobDM)
         joblist.append( sJobRmImg )
     return joblist
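The detect jobs above form a cp -> Detect -> rm chain via addContingentJob. A minimal sketch of that dependency pattern, using a stub class rather than the real mthread.singleJob:

class StubJob(object):
    """Stub with the contingent-job interface assumed above (hypothetical)."""
    def __init__(self, name):
        self.name = name
        self.contingentJobs = []
    def addContingentJob(self, job):
        #this job should start only after `job` completes successfully
        self.contingentJobs.append(job)

cp, dm, rm = StubJob('cpTiff'), StubJob('Detect'), StubJob('rmFile')
dm.addContingentJob(cp)  #detection waits for the local copy
rm.addContingentJob(dm)  #image cleanup waits for detection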
    def generateJobListLinear(self):
        """Pairwise.generateJobListLinear: This method is the old way of doing pairwise
        comparison of all molecules. It uses the -partial option to RefAligner. This
        option is _incompatible_ with the various hashing options to RefAligner.
        """
        baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('pairwise') 
        ct = 0
        outputTarget = os.path.join(self.varsP.alignFolder, 'exp')

        cArgs = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
        for jobNum in range(1,self.varsP.nPairwiseJobs + 1):
            jobName = 'Pairwise %d of %d' % (jobNum, self.varsP.nPairwiseJobs)
            outputString = 'pairwise%dof%d' % (jobNum,self.varsP.nPairwiseJobs)
            expectedResultFile = outputTarget + outputString + '.align'
            partialArgs = ['-partial', str(jobNum), str(self.varsP.nPairwiseJobs)]
            currentArgs = cArgs + baseArgs + ['-o' , outputTarget + outputString]
            if self.varsP.stdoutlog :
                currentArgs.extend( ['-stdout', '-stderr'] )
            if self.varsP.nPairwiseJobs > 1:
                currentArgs += partialArgs
            currentArgs += ['-maxthreads', str(self.varsP.maxthreads)]
            if self.varsP.bnxStatsFile!=None:
                currentArgs += ['-XmapStatRead', self.varsP.bnxStatsFile]
            sJob = mthread.singleJob(currentArgs, 
                                     jobName, 
                                     expectedResultFile, 
                                     outputString,
                                     maxThreads=self.varsP.maxthreads,
                                     clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=outputTarget + outputString+".stdout")
            ct += 1
            self.addJob(sJob)
        self.logArguments()
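The -partial jobNum N arguments above fan one all-vs-all pairwise run out over N jobs, each writing its own .align file. A sketch of the expected output names, assuming nPairwiseJobs = 3:

n_jobs = 3  #hypothetical varsP.nPairwiseJobs
outputs = ['pairwise%dof%d.align' % (j, n_jobs) for j in range(1, n_jobs + 1)]
assert outputs == ['pairwise1of3.align', 'pairwise2of3.align', 'pairwise3of3.align']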
 def generateJobList(self):
     """Defines job parameters for merge. Updates variables for subsequent
     completion test in mergeComplete()
     """
     self.clearJobs()
     self.prevPrefix = self.varsP.inputContigPrefix
     #self.curPrefix = self.prefixIter.next()
     self.curPrefix = self.stagePrefix + self.alphabet[self.iterCount]
     self.groupName = self.stageName + self.alphabet[self.iterCount] #jobWrapper data member
     utilities.LogStatus("progress", "stage_start", self.groupName)
     self.varsP.updatePipeReport('   PREV PREFIX %s, CUR PREFIX %s' % (self.prevPrefix, self.curPrefix))
     self.iterCount += 1
     outputString = os.path.join(self.varsP.outputContigFolder, self.curPrefix)
     currentArgs = [self.varsP.RefAlignerBin, '-o', outputString]
     #if self.varsP.stdoutlog : #always use this here bc it's the only output which should always be there
     currentArgs.extend( ['-f', '-stdout', '-stderr'] )
     currentArgs += self.varsP.argsListed('merge') 
     currentArgs += ['-maxthreads', str(self.varsP.nThreads)]
     contigsTextFile = os.path.join(self.varsP.inputContigFolder, 'mergeContigs.txt')
     contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.prevPrefix, txtOutput=contigsTextFile) #this method creates the mergeContigs.txt file which is necessary for this job
     self.varsP.prefixUsed.append(self.curPrefix)
     fileArgs = ['-if', contigsTextFile]
     #expoutput = outputString+".align" #don't know which contigs will disappear, but should always get an align file -- with new arg 'pairmergeRepeat', there's no .align; use stdout
     expoutput = outputString+".stdout"
     s1Job = mthread.singleJob(currentArgs + fileArgs, self.groupName, expoutput, self.groupName, maxThreads=self.varsP.nThreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile = outputString + ".stdout")
     self.addJob(s1Job)
     self.logArguments() 
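Merge passes above are named by appending letters from self.alphabet to the stage prefix on each iteration. A sketch of that naming, assuming alphabet is simply A-Z and a hypothetical prefix:

alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  #assumed contents of self.alphabet
stagePrefix = "mrg"                      #hypothetical stage prefix
prefixes = [stagePrefix + alphabet[i] for i in range(3)]
assert prefixes == ["mrgA", "mrgB", "mrgC"]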
 def generateJobList(self):
     baseArgs1 = self.varsP.argsListed(self.refineStage)
     if self.refineStage != 'refineNGS' : #noise args are in refineNGS
         baseArgs1 += self.varsP.argsListed('noise0')
     contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.varsP.inputContigPrefix)
     #nJobs = len(contigFiles)
     bnx = self.varsP.sorted_file+".bnx" #was self.varsP.bnxFile, but need sorted bc ids are different after sorting
     if self.refineStage == 'refineA' : #refineA uses assembler, all others use refaligner
         r1args = [self.varsP.AssemblerBin, '-i', bnx] #need this before -contigs
         r1args += ['-contigs', os.path.join(self.varsP.inputContigFolder, self.varsP.inputContigPrefix) + '.contigs']
     else : #should be same for refineB/NGS/Final
         r1args = [self.varsP.RefAlignerBin, '-i', bnx]
         self.writeIDFile(len(contigFiles)) #nJobs) 
     output1String = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)
     for contigID in contigIDs :
         expectedOutputString = self.varsP.outputContigPrefix + '_contig' + contigID
         expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap') #refineB
         jobName = self.refineStage + ' %5s' % contigID
         if self.refineStage == 'refineA' : 
             currentArgs = 2*[str(contigID)] #this must come after r1args because it's actually an argument to -contigs
         else : #should be same for refineB/NGS/Final
             r1_cmapFile = self.varsP.inputContigPrefix + '_contig' + str(contigID) + '.cmap'
             r1_cmapFile = os.path.join(self.varsP.inputContigFolder, r1_cmapFile)
             currentArgs = ['-maxthreads', str(self.varsP.maxthreads), '-ref', r1_cmapFile, '-id', contigID]
         currentArgs = r1args + currentArgs + baseArgs1 + ['-o', output1String]
         if self.varsP.stdoutlog :
             currentArgs.extend( ['-stdout', '-stderr'] )
         s1Job = mthread.singleJob(currentArgs, 
                                 jobName, 
                                 expectedResultFile, 
                                 expectedOutputString,
                                 maxThreads=self.varsP.maxthreads,
                                 clusterLogDir=self.varsP.clusterLogDir)
         self.addJob(s1Job)
     self.logArguments()
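Per-contig outputs above follow the prefix + '_contig' + id + '.cmap' convention inside the contig folder; a short check with hypothetical names:

import os.path
prefix, contigID = "exp_refineB", "12"  #hypothetical prefix and contig id
expected = os.path.join("contigs", prefix + "_contig" + contigID + ".cmap")
assert expected.endswith("exp_refineB_contig12.cmap")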
 def generateJobList(self):
     contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.varsP.inputContigPrefix)
     curargs = [self.varsP.RefAlignerBin, '-i', self.varsP.sorted_file+".bnx"] #was bnxFile
     baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('extension')
     nJobs = len(contigFiles)
     ct = 0
     logArguments = "" #just in case the following loop isn't entered
     for jobNum in range(1,nJobs + 1):
         contigID = contigIDs[jobNum - 1]
         #jobName = 'Extend ' + contigID + ', Job ' + str(jobNum) + ' of ' + str(nJobs)
         expContigString = self.varsP.outputContigPrefix + '_contig' + contigID
         outputString = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)
         expectedResultFile = os.path.join(self.varsP.outputContigFolder, expContigString + '.cmap')# '_refined.cmap')
         jobName = 'Ext %s' % expContigString# + ', Job ' + str(jobNum) + ' of ' + str(nJobs)
         currentContig = contigFiles[jobNum - 1]
         currentArgs = curargs + baseArgs 
         currentArgs += ['-maxthreads', str(self.varsP.maxthreads), '-o', outputString, '-id', contigID, '-ref', currentContig]
         if self.varsP.stdoutlog :
             currentArgs.extend( ['-stdout', '-stderr'] )
         s1Job = mthread.singleJob(currentArgs, 
                                     jobName, 
                                     expectedResultFile, 
                                     expContigString,
                                     maxThreads=self.varsP.maxthreads, 
                                     forceForward = currentContig, 
                                     clusterLogDir=self.varsP.clusterLogDir)
         self.addJob(s1Job)
         ct += 1
     self.logArguments()
Example #9
    def generateJobListSubsample(self) :

        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing
	    
        sorted_file = self.varsP.sorted_file
        nmols = 1000
        seed = 1
        self.subsampled=sorted_file+"_subsampled"

        self.varsP.updatePipeReport('Subsampling %s\n' % (sorted_file))
        
        jobName="SubsamplingBNX"
        expectedResultFile=self.subsampled+".bnx"
        # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
        #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
        super(autoNoise, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        cargs=[self.varsP.RefAlignerBin, '-f', '-i', sorted_file+".bnx", "-maxthreads", str(self.varsP.maxthreads),  "-merge", "-minlen", "200", "-randomize", str(seed), "-subset", "1", str(nmols), "-bnx", "-o", self.subsampled] + self.varsP.argsListed('bnx_sort')
        if self.varsP.bnxStatsFile!=None:
            cargs += ['-XmapStatRead', self.varsP.bnxStatsFile]
        if self.varsP.stdoutlog :
            cargs.extend( ['-stdout', '-stderr'] )
        self.addJob(mthread.singleJob(cargs, jobName, expectedResultFile, jobName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=self.subsampled+".stdout"))

        return 0 #success
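The -randomize/-subset arguments above ask RefAligner for 1000 randomly chosen molecules; a pure-Python sketch of the same idea (the actual selection is internal to the RefAligner binary, so this is only an analogy):

import random
random.seed(1)                        #analogous to -randomize 1
molecule_ids = list(range(1, 10001))  #hypothetical molecule ids
random.shuffle(molecule_ids)
subsampled = molecule_ids[:1000]      #analogous to -subset 1 1000
assert len(subsampled) == 1000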
    def generateJobListChar(self, noise_in, input_file, optSection) :

        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing
	    
        self.varsP.updatePipeReport('%s\n' % (optSection))
        
        self.output_folder=os.path.join(self.varsP.contigFolder, "auto_noise")
        if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
            print "ERROR in autoNoise.varsPipeline.prepareContigIO: bad dir:", self.output_folder

        self.output_file=os.path.join(self.output_folder, optSection)
	    
        expectedResultFile=self.output_file+".err"
        # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
        #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
        super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        #cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-maxthreads", str(self.varsP.maxthreads), "-o", self.output_file] 
        cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-o", self.output_file] #remove maxthreads bc this is always running on its own
        if self.varsP.stdoutlog :
            cargs.extend( ['-stdout', '-stderr'] )
        for v in noise_in.keys():
            cargs.extend(["-"+v, str(noise_in[v])])

        cargs.extend(self.varsP.argsListed(optSection))
        if self.varsP.bnxStatsFile!=None:
            cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
        self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=self.output_file+".stdout"))

        return 0 #success
Example #12
    def generateJobList(self):
        """ Instantiate job wrapper class with queue of single jobs for assembly
        
        """
        
        if self.varsP.pairwiseTriangleMode :
            AssemblerInputFlag="-if"
            AssemblerInputFile=self.varsP.bnxFileList
        else:
            AssemblerInputFlag="-i"
            AssemblerInputFile=self.varsP.bnxFile
        cargs = [self.varsP.AssemblerBin, AssemblerInputFlag, AssemblerInputFile, '-af', self.varsP.alignTarget]
        if self.varsP.bnxStatsFile!=None:
            cargs += ['-XmapStatRead', self.varsP.bnxStatsFile]
        baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('assembly')
        
        logFile = os.path.join(self.varsP.localRoot, 'AssemblyLog.txt')
        errFile = os.path.join(self.varsP.localRoot, 'AssemblyLog_stderr.txt')
        
        outFile = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix) #no suffix for -o arg of Assembler
        self.contigsFile = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix+".contigs") #Assembler will append this suffix
        currentArgs = cargs + baseArgs + ['-o', outFile]
        if self.varsP.stdoutlog :
            currentArgs.extend( ['-stdout', '-stderr'] )
        logArguments = "   ".join(currentArgs) + 2 * '\n'
        jobName = 'Assembly'
        #sJob = mthread.singleJob(currentArgs, jobName, self.contigsFile, jobName, stdOutFile=logFile, stdErrOutFile=errFile,clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=outFile+".stdout")
        sJob = mthread.singleJob(currentArgs, jobName, self.contigsFile, jobName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=outFile+".stdout")
        self.addJob(sJob)
        self.logArguments()
Example #13
    def generateJobListChar(self, noise_in, input_file, optSection) :
                    
        self.output_file=os.path.join(self.output_folder, optSection) #must assign before return bc used in constructor

        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing
	    
        self.varsP.updatePipeReport('%s\n' % (optSection))

        #move to constructor
        #self.output_folder=os.path.join(self.varsP.contigFolder, "auto_noise")
        #if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
        #            print "ERROR in autoNoise.varsPipeline.prepareContigIO: bad dir:", self.output_folder
        
        expectedResultFile=self.output_file+".err"

        # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
        #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
        #move to constructor
        #super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        #cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-maxthreads", str(self.varsP.maxthreads), "-o", self.output_file] 
        cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-o", self.output_file] #remove maxthreads bc this is always running on its own
        if self.varsP.stdoutlog :
            cargs.extend( ['-stdout', '-stderr'] )
        cargs.extend( ['-output-veto-filter', 'intervals.txt$'] )
        for v in noise_in.keys():
            cargs.extend(["-"+v, str(noise_in[v])])

        cargs.extend(self.varsP.argsListed(optSection))
        if self.varsP.bnxStatsFile!=None:
            cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
        self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=self.output_file+".stdout"))

        return 0 #success
 def generateJobList(self,argset=-1):
     if not self.varsP.ref : #no jobs if no ref
         return
     jobargs = [self.varsP.RefAlignerBin, '-ref', self.varsP.ref]
     if argset == -1 and self.varsP.argData.has_key('characterizeDefault') : # don't use nominal default
         opta = self.varsP.argsListed('characterizeDefault')
     elif argset == 1 and self.varsP.argData.has_key('characterizeFinal') : #extend (on default) -- make this default
         opta = self.varsP.argsListed('characterizeFinal')
     else : #this is an error
         self.varsP.updatePipeReport("ERROR in CharacterizeModule.generateJobList: invalid argset %s\n" % str(argset))
         return
     
     for i, curCharacterizeCmap in enumerate(self.varsP.curCharacterizeCmaps):
         if self.varsP.numCharacterizeJobs == 1:
             jobName = 'Char'+self.argStr+'_%s' % self.varsP.stageComplete
         else:
             jobName = 'Char'+self.argStr+'_%s_%d' % (self.varsP.stageComplete, i+1)
         outFileName = os.path.split(curCharacterizeCmap)[-1].replace(".cmap", "")
         outfile = os.path.join(self.varsP.contigAlignTarget,outFileName)
         self.curCharacterizeFileRoots.append(outfile)
         expectedResultFile = outfile+".xmap"
         self.xmapTarget = expectedResultFile
         currentArgs = jobargs + ["-i", curCharacterizeCmap, "-o", outfile]
         stdoutf = None
         if self.varsP.stdoutlog :
             currentArgs.extend( ['-stdout', '-stderr'] )
             stdoutf = outfile+".stdout"
         currentArgs += ['-maxthreads', str(self.varsP.nThreads)]
         currentArgs += ['-output-veto-filter', '_intervals.txt$']
         currentArgs += opta
         s1Job = mthread.singleJob(currentArgs, jobName, expectedResultFile, jobName.replace(' ',''),maxThreads=self.varsP.nThreads,clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=stdoutf)
         self.addJob(s1Job)
         if i==0:
             self.logArguments()
    def generateJobList(self) :
        """splitBNX.generateJobList: submit varsP.nPairwiseJobs number of split bnx jobs. """

        sorted_file = self.varsP.sorted_file
        if not util.checkFile(sorted_file+".bnx") :
            err = "ERROR: splitBNX input file (%s) not found; exiting" % self.varsP.sorted_file
            self.varsP.updatePipeReport(err+"\n")
            util.LogError("critical", err)
            util.LogStatus("progress", "pipeline", "failure")
            raise RuntimeError

        N = calculateNPairwise(self.varsP, sorted_file) #move back here (not sortBNX) bc needs to use sorted bnx
        #N = self.varsP.nPairwiseJobs

        self.varsP.updatePipeReport('Splitting BNX\n')
        #splitJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('splitting'))
        super(splitBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs('splitting'))

        #should skip the rest and return 1, like in sortBNX, here:
        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing

        self.varsP.updatePipeReport("Splitting"+(" scan-scaled" if self.varsP.doScanScale else "")+" bnx file: %s.bnx\n\n" % self.varsP.sorted_file)

        #calculate threads per job: formerly fixed at 1; now file size / 1.5 GB, rounded up, plus 1 (the rounded value alone was too low)
        threads = max(1, int(math.ceil( os.path.getsize(sorted_file+".bnx")/1.5e9 ))) + 1
        if threads > 1 :
            self.varsP.updatePipeReport("Using %i threads per job\n" % threads)

        #the change in job partitioning breaks backward compatibility and was causing too many problems; make it conditional on refaligner version
        if self.varsP.refaligner_version < 3995 :
            for partial in range(1,N + 1):
                output_file=self.varsP.bnxFile.replace(".bnx", "_%s_of_%s" %(partial, self.varsP.nPairwiseJobs))
                cargs=[self.varsP.RefAlignerBin, '-f', '-i', sorted_file+".bnx", "-maxthreads", str(threads), "-merge", "-subsetbin", str(partial), str(N), "-bnx", "-o",  output_file]
                if self.varsP.stdoutlog :
                    cargs.extend( ['-stdout', '-stderr'] )
                #print('%d/%d' % (partial, N), cargs)
                expectedResultFile=output_file+".bnx"
                self.addJob(mthread.singleJob(cargs, self.stageName + str(partial), expectedResultFile, self.stageName + str(partial), maxThreads=threads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=output_file+".stdout"))

        else :
            #change above to single command with -subsetbin 0 N
            output_file=self.varsP.bnxFile.replace(".bnx", "")
            cargs=[self.varsP.RefAlignerBin, '-f', '-i', sorted_file+".bnx", "-maxthreads", str(threads), "-merge", "-subsetbin", "0", str(N), "-bnx", "-o",  output_file]
            if self.varsP.stdoutlog :
                cargs.extend( ['-stdout', '-stderr'] )
            self.addJob(mthread.singleJob(cargs, self.stageName, output_file+".bnx", self.stageName, maxThreads=threads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=output_file+".stdout"))
    def getLambdaMapJob(self, snrCutoff=0, verbose=False):
        #Note, verbose will print once per job, so use for debugging only
        # add lambda alignment band to this
        lambdaFilter = self.varsP.argData['lambdaFilter']
        lamMinLen = float(lambdaFilter[lambdaFilter.index('-minlen')  +1]) if '-minlen'   in lambdaFilter else 40.
        lamMaxLen = float(lambdaFilter[lambdaFilter.index('-maxlen')  +1]) if '-maxlen'   in lambdaFilter else 60.
        lamMinLab = float(lambdaFilter[lambdaFilter.index('-minsites')+1]) if '-minsites' in lambdaFilter else 6.
        lamMaxLab = float(lambdaFilter[lambdaFilter.index('-maxsites')+1]) if '-maxsites' in lambdaFilter else 10.
        #old format below (dict, not list)
        #lamMinLen = int(lambdaFilter['-minlen'  ]) # 40
        #lamMaxLen = int(lambdaFilter['-maxlen'  ]) # 60
        #lamMinLab = int(lambdaFilter['-minsites']) # 6
        #lamMaxLab = int(lambdaFilter['-maxsites']) # 10
        if verbose :
            self.varsP.updateInfoReport("lamMinLen = %.0f\n" % lamMinLen, printalso=True)
            self.varsP.updateInfoReport("lamMaxLen = %.0f\n" % lamMaxLen, printalso=True)
            self.varsP.updateInfoReport("lamMinLab = %.0f\n" % lamMinLab, printalso=True)
            self.varsP.updateInfoReport("lamMaxLab = %.0f\n" % lamMaxLab, printalso=True)
        
        #need mol file to do this; if doesn't exist, return with warning
        if not(os.path.exists(self.molFile)):
            print "Skipping map lambda job", self.molTag, "because mol file missing:", self.molFile
            self.lambdaErrFile = None
            return

        bnxFileLambda = '%s_lambda.bnx' % self.molTag
        bnxFileLambda = os.path.join(os.path.split(self.molFile)[0], bnxFileLambda)
        #if lambda bnx exists, skip the isolation step
        if os.path.exists(bnxFileLambda) :
            print "Using lambda bnx", bnxFileLambda
        else :
            print '  Isolating Lambda %s' % self.molTag
            lab2File = self.molFile.replace('.mol', '.0.lab')
            scanDset = molecule.moleculeDataset(self.curExp.basesPerPixel, molTag=int(self.molTag))
            scanDset.readMolFile(self.molFile)
            scanDset.annotateLabels(lab2File)
            # Introduce optArguments for Lambda Band
            scanDsetLambda = molecule.filteredSubset(scanDset,snrCutoff,lamMinLab,lamMaxLab,lamMinLen,lamMaxLen,True)
            scanDsetLambda.writeBnxFile(bnxFileLambda, quality=self.quality)

        self.lambdaBnx = bnxFileLambda
        baseArgs = self.varsP.argsListed('mapLambda')
        outputTarget = bnxFileLambda.replace('.bnx', '')
        curArgs = [self.varsP.RefAlignerBin, '-i', bnxFileLambda, '-o', outputTarget, '-ref', self.varsP.lambdaRef] + baseArgs
        if self.varsP.stdoutlog :
            curArgs.extend( ['-stdout', '-stderr'] )
        jobTag = self.molTag + '_lambda'
        self.lambdaErrFile = outputTarget + '.err'

        #if the err file exists, no need to process
        if os.path.exists(self.lambdaErrFile) :
            print "Skipping map lambda job ", jobTag, "because err file exists", self.lambdaErrFile
            return
        
        return mthread.singleJob(curArgs, jobTag, self.lambdaErrFile, jobTag)
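The repeated lambdaFilter.index(flag)+1 lookups above all follow one pattern: read a '-flag value' pair from an argsListed-style flat list, falling back to a default when the flag is absent. get_flag_value below is a hypothetical helper that captures it:

def get_flag_value(args, flag, default):
    #args is a flat list like ['-minlen', '40', '-maxsites', '10']
    return float(args[args.index(flag) + 1]) if flag in args else default

lambdaFilter = ['-minlen', '40', '-maxsites', '10']
assert get_flag_value(lambdaFilter, '-minlen', 40.) == 40.
assert get_flag_value(lambdaFilter, '-maxlen', 60.) == 60.  #default used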
 def getTargetJobs(self, dormdir=False):
     localDataLocation = os.path.join(self.varsP.localRoot, self.expTag + '/')
     #print "localDataLocation:", localDataLocation #debug
     if dormdir :
         sJobRmName = 'Pre-Remove Folder: ' + shorten(localDataLocation)
         sJobRm = mthread.singleJob(['rm', '-f', '-r', localDataLocation], sJobRmName, '', 'rmDir')
         sJobMkdirName = 'Make Folder: ' + shorten(localDataLocation)
         sJobMkdir = mthread.singleJob(['mkdir', localDataLocation], sJobMkdirName, localDataLocation, 'mkDir')
         sJobMkdir.addContingentJob(sJobRm)
         allJobs = [sJobRm, sJobMkdir]
         contingentjob = sJobMkdir
     else :
         util.checkDir(localDataLocation) #will make dir localDataLocation
         allJobs = []
         contingentjob = None
     for scan in self.scans:
         scanjobs = scan.getDetectJobs(contingentjob)
         if not scanjobs : #no scan jobs means the scan has already been processed--clear all jobs
             self.varsP.updatePipeReport("Device.getTargetJobs: skipping path "+scan.nameStr()+"\n") #localDataLocation
         else :
             allJobs += scanjobs
     return allJobs
    def __init__(self, varsP) :
        jobName = "reference_process"
        opta_section = "referenceSvdetect"
        default_mres = "2.9"
        mres = "-mres"
        self.varsP = varsP
        usedefault = False
        if self.varsP.argData.has_key(opta_section) : #check if in optargs
            opta = self.varsP.argsListed(opta_section)
            if not mres in opta : #must have mres
                self.varsP.updatePipeReport("Warning in referenceProcess: "+mres+" missing in optArguments section "+opta_section+"\n")
                usedefault = True
        else :
            self.varsP.updatePipeReport("Warning in referenceProcess: optArguments section "+opta_section+" missing\n")
            usedefault = True
        if usedefault :
            opta = [mres, default_mres]

        mresstr = opta[opta.index(mres)+1] #get string for mres value for output name
        mresstr = mresstr.replace(".","")

        if not util.checkDir(self.varsP.refFolder) :
            self.varsP.updatePipeReport( "ERROR in referenceProcess: could not make output dir %s\n" % self.varsP.refFolder )
            return None
        refpref = os.path.basename(self.varsP.ref[:self.varsP.ref.rfind(".")]) + "_res" + mresstr
        outarg = os.path.join(self.varsP.refFolder, refpref) #refFolder is new output folder for this job
        expectedResultFile = outarg+".cmap" #if ref is spots, is this spots?
        args = [self.varsP.RefAlignerBin, '-o', outarg, '-i', self.varsP.ref, '-f', '-merge'] + opta
        stdoutf = None
        if self.varsP.stdoutlog :
            args.extend( ['-stdout', '-stderr'] )
            stdoutf = outarg+".stdout"
        args += ['-maxthreads', str(self.varsP.nThreads)]

        super(referenceProcess, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        job = mthread.singleJob(args, jobName, expectedResultFile, jobName, maxThreads=self.varsP.nThreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=stdoutf)
        self.addJob(job)

        util.LogStatus("progress", "stage_start", jobName)
        self.varsP.runJobs(self, "referenceProcess")
        self.doAllPipeReport()
        if not self.allResultsFound() : #this is an error, but we'll continue processing without SV detect
            err = "ERROR in referenceProcess: job failed, disabling SV detect"
            self.varsP.updatePipeReport( err+"\n" )
            util.LogError("error", err)
            #self.varsP.runSV = False #no need since this class is used in SVModule
        else :
            self.varsP.refDeresed = expectedResultFile #store good result for SV detect
            self.varsP.updatePipeReport( "referenceProcess: using reference %s for svdetect\n" % self.varsP.refDeresed )
        util.LogStatus("progress", "stage_complete", jobName)            
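The de-resed reference name above drops the extension and appends "_res" plus the -mres value with its dot removed; a short check with a hypothetical reference path:

import os.path
ref = "/refs/hg19.cmap"            #hypothetical varsP.ref
mresstr = "2.9".replace(".", "")   #mres value string, dot removed
refpref = os.path.basename(ref[:ref.rfind(".")]) + "_res" + mresstr
assert refpref == "hg19_res29"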
Example #21
    def generateJobListTriangle(self):
		baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('pairwise')

		cArgs = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
		ct = 0
		outputTarget = os.path.join(self.varsP.alignFolder, 'exp')
		njobs=self.varsP.nPairwiseJobs*(self.varsP.nPairwiseJobs+1)/2
		BNX_list=[]
		for i in range(1,self.varsP.nPairwiseJobs + 1):
			file1=self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" %(i, self.varsP.nPairwiseJobs))
			BNX_list.append(file1+"\n")
			for j in range(i,self.varsP.nPairwiseJobs + 1):
				file2=self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" %(j, self.varsP.nPairwiseJobs))
				jobName = 'Pairwise %d of %d' % (ct+1, njobs)
				outputString = 'pairwise%dof%d' % (ct+1, njobs)
				expectedResultFile = outputTarget + outputString + '.align'
				if i==j :
					currentArgs = [self.varsP.RefAlignerBin, '-i', file1] + ['-o' , outputTarget + outputString] + baseArgs
				else :
					currentArgs = [self.varsP.RefAlignerBin, "-first", "-1", "-i", file1, "-i", file2] + ['-o' , outputTarget + outputString] + baseArgs
				if self.varsP.stdoutlog :
					currentArgs.extend( ['-stdout', '-stderr'] )
				#if self.varsP.nPairwiseJobs > 1:
					#currentArgs += partialArgs
				currentArgs += ['-maxthreads', str(self.varsP.maxthreads)]
				if self.varsP.bnxStatsFile!=None:
					currentArgs += ['-XmapStatRead', self.varsP.bnxStatsFile]
				#if ct == 0: #redundant with logArguments below
				#	self.pipeReport += " ".join(currentArgs) + 2 * '\n'
				sJob = mthread.singleJob(currentArgs, 
							jobName, 
							expectedResultFile, 
							outputString,
							maxThreads=self.varsP.maxthreads,
							clusterLogDir=self.varsP.clusterLogDir,
							expectedStdoutFile=outputTarget + outputString+".stdout",
							)#, shell=True)
				ct += 1
				self.addJob(sJob)
		self.varsP.bnxFileList=self.varsP.bnxFile.replace(".bnx", ".list")
		f=open(self.varsP.bnxFileList, "w")
		f.writelines(BNX_list)
		f.close()
		self.logArguments()
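A sanity check of the job count used above: iterating all i <= j pairs of nPairwiseJobs splits yields N*(N+1)/2 pairwise jobs, matching the loop structure:

N = 4  #hypothetical nPairwiseJobs
pairs = [(i, j) for i in range(1, N + 1) for j in range(i, N + 1)]
assert len(pairs) == N * (N + 1) // 2 == 10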
Example #25
 def generateJobList(self):
     curArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('sampleChar')
     if util.checkFile(self.varsP.bnxTarget) : #file exists only if image processing was run
         bnxFiles = parseExperimentFile(self.varsP.bnxTarget)
         if not bnxFiles : #check that you got at least one
             errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: "+self.varsP.bnxTarget
             print errstr
             self.varsP.updatePipeReport(errstr+"\n\n")
             return
         basepath = "" #os.path.split(bnxFiles[0])[0] #don't use basepath for this case
     else : #otherwise, assume this is the only bnx file
         bnxFiles = [self.varsP.bnxFile]
         #here, make a dir for the results--should really check results of checkEmptyDir for errors
         basepath = os.path.join(self.varsP.localRoot, "sampleChar")
         if self.varsP.wipe and os.path.isdir(basepath) :
             shutil.rmtree(basepath)
             #util.checkEmptyDir(basepath) #will make if not exist, but if it does, will remove and re-make -- this fn doesn't exist...
         #else :
         util.checkDir(basepath) #will make if not exist, but won't remove anything
     nJobs = len(bnxFiles)
     #for i, bnxFile in enumerate(bnxFiles):
     for bnxFile in bnxFiles :
         #bnxGroupName = '%02d' % (i+1) #get this from the path, ie, bnxFiles
         cargs = [self.varsP.RefAlignerBin, '-i', bnxFile]
         bnxname = os.path.split(bnxFile)[1].replace(".bnx","")
         jobname = 'Sample_Char_' + bnxname
         #outputTarget = os.path.join(basepath, bnxGroupName)
         if basepath : #bnx input
             outputTarget = os.path.join(basepath, bnxname)
         else : #image processing
             outputTarget = bnxFile.replace(".bnx","") + "_sampleChar"
         expectedResultFile = outputTarget + '.err' #this is used in checkResults
         currentArgs = cargs + ['-ref', self.varsP.ref, '-o' , outputTarget, '-f']
         if self.varsP.stdoutlog :
             currentArgs.extend( ['-stdout', '-stderr'] )
         currentArgs += ['-maxthreads', str(self.varsP.maxthreads)] + curArgs
         sJob = mthread.singleJob(currentArgs, jobname, expectedResultFile, jobname, clusterLogDir=self.varsP.clusterLogDir) # peStr is deprecated in favor of clusterargs
         #sJob.expTag = bnxGroupName #removed from checkResults
         self.addJob(sJob)
     self.logArguments()
Example #27
    def generateJobList(self) :

        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing

        sorted_file = self.varsP.sorted_file

        self.varsP.updatePipeReport('Sorting %s into %s\n' % (self.varsP.bnxFile, sorted_file))

        expectedResultFile = sorted_file+".bnx"
        # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
        #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
        super(sortBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        cargs = [self.varsP.RefAlignerBin, '-f', '-i', self.varsP.bnxFile, "-maxthreads", str(self.varsP.maxthreads), "-merge", "-sort-idinc", "-bnx", "-o", sorted_file] + self.varsP.argsListed('bnx_sort')
        if self.varsP.bnxStatsFile != None:
            cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
        if self.varsP.stdoutlog :
            cargs.extend( ['-stdout', '-stderr'] )
        self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=sorted_file+".stdout"))

        return 0 #success
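
# For illustration: the sort command assembled above, run standalone via
# subprocess. The binary path and bnx locations are assumptions; the Pipeline
# itself always submits this through mthread.singleJob.
import subprocess

def sort_bnx_standalone(refaligner, bnx_in, sorted_prefix, maxthreads=8):
    """Sketch: merge-sort a bnx by increasing molecule id, as in sortBNX."""
    cmd = [refaligner, '-f', '-i', bnx_in,
           '-maxthreads', str(maxthreads),
           '-merge', '-sort-idinc', '-bnx', '-o', sorted_prefix]
    ret = subprocess.call(cmd)
    if ret != 0:
        raise RuntimeError("RefAligner sort failed (exit %d)" % ret)
    return sorted_prefix + ".bnx"  # matches the expectedResultFile above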
Example #29
    def __init__(self, varsP):
        jobName = "reference_process"
        opta_section = "referenceSvdetect"
        default_mres = "2.9"
        mres = "-mres"
        self.varsP = varsP
        usedefault = False
        if self.varsP.argData.has_key(opta_section):  #check if in optargs
            opta = self.varsP.argsListed(opta_section)
            if not mres in opta:  #must have mres
                self.varsP.updatePipeReport(
                    "Warning in referenceProcess: " + mres +
                    " missing in optArguments section " + opta_section + "\n")
                usedefault = True
        else:
            self.varsP.updatePipeReport(
                "Warning in referenceProcess: optArguments section " +
                opta_section + " missing\n")
            usedefault = True
        if usedefault:
            opta = [mres, default_mres]

        mresstr = opta[opta.index(mres) + 1]  #get string for mres value for output name
        mresstr = mresstr.replace(".", "")

        if not util.checkDir(self.varsP.refFolder):
            self.varsP.updatePipeReport(
                "ERROR in referenceProcess: could not make output dir %s\n" %
                self.varsP.refFolder)
            return None
        refpref = os.path.basename(self.varsP.ref[:self.varsP.ref.rfind(".")]) + "_res" + mresstr
        outarg = os.path.join(self.varsP.refFolder, refpref)  #refFolder is new output folder for this job
        expectedResultFile = outarg + ".cmap"  #if ref is spots, is this spots?
        args = [
            self.varsP.RefAlignerBin, '-f', '-o', outarg, '-i', self.varsP.ref,
            '-merge'
        ] + opta
        stdoutf = None
        if self.varsP.stdoutlog:
            args.extend(['-stdout', '-stderr'])
            stdoutf = outarg + ".stdout"
        args += ['-maxthreads', str(self.varsP.nThreads)]

        super(referenceProcess,
              self).__init__(self.varsP,
                             jobName,
                             clusterArgs=self.varsP.getClusterArgs("assembly"))

        job = mthread.singleJob(args,
                                jobName,
                                expectedResultFile,
                                jobName,
                                maxThreads=self.varsP.nThreads,
                                clusterLogDir=self.varsP.clusterLogDir,
                                expectedStdoutFile=stdoutf)
        self.addJob(job)

        util.LogStatus("progress", "stage_start", jobName)
        self.varsP.runJobs(self, "referenceProcess")
        self.doAllPipeReport()
        if not self.allResultsFound():  #this is an error, but we'll continue processing without SV detect
            err = "ERROR in referenceProcess: job failed, disabling SV detect"
            self.varsP.updatePipeReport(err + "\n")
            util.LogError("error", err)
            #self.varsP.runSV = False #no need since this class is used in SVModule
        else:
            self.varsP.refDeresed = expectedResultFile  #store good result for SV detect
            self.varsP.updatePipeReport(
                "referenceProcess: using reference %s for svdetect\n" %
                self.varsP.refDeresed)
        util.LogStatus("progress", "stage_complete", jobName)
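
# The output naming above reduces to: reference basename + "_res" + the -mres
# value with its dot stripped. A minimal standalone sketch of that derivation
# (hypothetical helper name, for testing only):
import os

def deres_output_prefix(ref_path, mres_value):
    """e.g. deres_output_prefix('/data/hg19.cmap', '2.9') -> 'hg19_res29'"""
    base = os.path.basename(ref_path[:ref_path.rfind(".")])
    return base + "_res" + mres_value.replace(".", "")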
    def checkResults(self, stageSuffix=""):
        '''Call jobWrapper (self) .doAllPipeReport, and varsP.mergeIntoSingleCmap.
        stageSuffix, if supplied, is appended to varsP.stageComplete in order to
        fix the stage name reported by the CharacterizeModule in the informaticsReport.
        '''
        self.doAllPipeReport()
        self.varsP.stageComplete = self.refineStage + stageSuffix
        if self.refineStage not in ['refineB0', 'refineFinal0', 'extension0']:
Example #31
    def generateJobList(self):
        baseArgs1 = self.varsP.argsListed(self.refineStage)
        
        for case in util.switch(self.refineStage):
            if case("refine(B1|Final1)", regexp=True):
                baseArgs1 += self.varsP.argsListed('noise0')
                ContigGroupList = self.findGroupedContigs()
                r1args = [self.varsP.RefAlignerBin]
                break
            if case("refine(B0|Final0)", regexp=True):
                baseArgs1 += self.varsP.argsListed('noise0')
                ContigGroupListFull = self.groupContigs()
                setattr(self.varsP, "count_"+self.varsP.outputContigPrefix, ContigGroupListFull)
                #print self.varsP.outputContigPrefix, getattr(self.varsP, "count_"+self.varsP.outputContigPrefix)
                #r1args = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
                #InputFileList=[self.varsP.bnxFile]
                r1args = [self.varsP.RefAlignerBin]
                ContigGroupList = zip(range(1, self.varsP.nPairwiseJobs + 1),
                                      range(1, self.varsP.nPairwiseJobs + 1),
                                      [self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" % (x, self.varsP.nPairwiseJobs)) for x in range(1, self.varsP.nPairwiseJobs + 1)],
                                      [1]*self.varsP.nPairwiseJobs)
                break
            if case("refineA"):
                baseArgs1 += self.varsP.argsListed('noise0')
                ContigGroupList = self.groupContigs()
                #print("Found %d groups for refineA" % (len(ContigGroupList)))
                #r1args = [self.varsP.AssemblerBin, '-i', self.varsP.bnxFile.replace(".bnx", "_sorted.bnx")] #need this before -contigs -- can no longer use all_sorted.bnx due to scan scaling: must refer to varsP.sorted_file
                #r1args = [self.varsP.AssemblerBin, '-i', self.varsP.sorted_file+".bnx"] #need this before -contigs
                r1args = [self.varsP.AssemblerBin, '-if', self.varsP.bnxFileList] #need this before -contigs; use split files in case splitting changed (eg due to scan scaling producing labels at < 20 bp)
                r1args += ['-contigs', os.path.join(self.varsP.inputContigFolder, self.varsP.inputContigPrefix) + '.contigs']
                break
            if case("refineNGS"):
                r1args = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
                ContigGroupList = self.groupContigs()
                break
            if case("extension0"):
                baseArgs1 += self.varsP.argsListed('noise0')
                ContigGroupList = self.groupContigs()
                setattr(self.varsP, "count_"+self.varsP.outputContigPrefix, ContigGroupList)
                #print self.varsP.outputContigPrefix, getattr(self.varsP, "count_"+self.varsP.outputContigPrefix), self.varsP.inputContigFolder, self.varsP.inputContigPrefix
                #r1args = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
                #InputFileList=[self.varsP.bnxFile]
                r1args = [self.varsP.RefAlignerBin]
                ContigGroupList = zip(range(1, self.varsP.nPairwiseJobs + 1),
                                      range(1, self.varsP.nPairwiseJobs + 1),
                                      [self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" % (x, self.varsP.nPairwiseJobs)) for x in range(1, self.varsP.nPairwiseJobs + 1)],
                                      [1]*self.varsP.nPairwiseJobs)
                break
            if case("extension1"):
                baseArgs1 += self.varsP.argsListed('noise0')
                ContigGroupList = self.findGroupedContigs()
                r1args = [self.varsP.RefAlignerBin]
                break
            if case():
                self.varsP.error += 1
                self.varsP.message += '  Error: Refine stage name invalid: '+str(self.refineStage)+'\n'
                return


        stdarg = []
        if self.varsP.stdoutlog : #this is the same for all cases below
            stdarg = ['-stdout', '-stderr'] 

        #contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.varsP.inputContigPrefix)
        #nJobs = len(contigFiles)
        output1String = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)
        #for jobNum in range(1,nJobs + 1):
            #contigID = contigIDs[jobNum - 1]
        for m in range(0, len(ContigGroupList)):
            contigID = ContigGroupList[m][0]
            rawContigID = ContigGroupList[m][1]
            contig = ContigGroupList[m][2]

            # Figure out desired number of threads to use
            threadBoost = ceil(ContigGroupList[m][3])
            if threadBoost < 1:
                threadBoost = 1
            minthreads = self.varsP.getClusterArgs(self.refineStage, category="MinThreads")
            if minthreads:
                minthreads = Template(minthreads).substitute(maxthreads=self.varsP.maxthreads)
            else:
                minthreads = self.varsP.maxthreads
            nthreads = float(minthreads)
            nthreads = int(round(nthreads*threadBoost))
            if nthreads > self.varsP.maxthreads:
                nthreads = self.varsP.maxthreads
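            # Worked example (values assumed): with maxthreads=16, a
            # MinThreads template of "$maxthreads" and threadBoost=1, this
            # gives nthreads = int(round(16*1)) = 16; with MinThreads "4" and
            # threadBoost=2 it gives int(round(4*2)) = 8, under the 16 cap.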
#        for contigID, contig in ContigGroupList :
            jobName = self.refineStage + ' %5s' % contigID
            for case in util.switch(self.refineStage):
                if case("refineA"):
                    endId = int(rawContigID)+self.bunching-1
                    if m+1 < len(ContigGroupList) :
                        endId = int(ContigGroupList[m+1][1])-1
                    currentArgs = [str(rawContigID), str(endId), '-maxthreads', str(nthreads)] #this must come after r1args because it's actually an argument to -contigs
                    #currentArgs = r1args + currentArgs + baseArgs1 + ['-id', str(contigID), '-i', contig+"_mapped.bnx", '-o', output1String]
                    currentArgs = r1args + currentArgs + ['-o', output1String] + stdarg + baseArgs1
                    expectedOutputString = self.varsP.outputContigPrefix + '_contig' + str(rawContigID)
                    expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap') #refineB
                    expectedStdoutFile = output1String + "_id"+str(rawContigID)+".stdout"
                    break

                #if case("refineB1|refineFinal1|extension1", regexp=True):
                    ## TODO: make thread number configurable from clusterArgs
                    #currentArgs = ['-maxthreads', str(16), self.ref_arg, contig]
                    #currentArgs = r1args + currentArgs + baseArgs1 + ['-id', str(contigID), '-i', contig+"_mapped.bnx", '-o', output1String]
                    #expectedOutputString = self.varsP.outputContigPrefix + '_contig' + str(contigID)
                    #expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap') #refineB
                    #break

                if case("refineB1|refineFinal1|extension1", regexp=True):
                    Inputs = zip(["-i"]*self.varsP.nPairwiseJobs, [contig.replace("_group", "_group"+str(i)+"_mapped_group")+".bnx" for i in range(1, self.varsP.nPairwiseJobs + 1)])
                    Inputs = [x for t in Inputs for x in t]
                    #-id must come before -o, otherwise expectedStdoutFile is wrong
                    currentArgs = ['-maxthreads', str(nthreads), '-id', str(contigID), '-o', output1String, self.ref_arg, contig]
                    currentArgs = r1args + currentArgs + stdarg + baseArgs1 + Inputs
                    expectedOutputString = self.varsP.outputContigPrefix + '_contig' + str(rawContigID)
                    expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap') #refineB
                    expectedStdoutFile = output1String + "_id"+str(contigID)+".stdout"
                    break

                #if case("refineB0|refineFinal0|extension0", regexp=True):
                    #currentArgs = ['-maxthreads', str(self.varsP.maxthreads), self.ref_arg, contig]
                    #currentArgs = r1args + currentArgs + baseArgs1 + ['-mapped-unsplit', '1', '-refine', '0', '-mapped', contig+"_mapped", "-o", "/dev/null"]
                    #expectedOutputString =  self.refineStage + "contig"+str(contigID) + "_mapped.bnx"
                    #expectedResultFile = contig + "_mapped.bnx" #refineB
                    #break

                if case("refineB0|refineFinal0|extension0", regexp=True):
                    currentArgs = ['-maxthreads', str(nthreads), "-ref", os.path.join(self.varsP.inputContigFolder, util.uniquifyContigName(self.varsP.inputContigPrefix)+".cmap")]
                    outputfile = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix+'_group'+str(contigID))
                    #-id must come before -o, otherwise expectedStdoutFile is wrong
                    currentArgs = r1args + ['-i', contig, '-id', str(contigID), '-o', outputfile] + stdarg + currentArgs + baseArgs1
                    currentArgs += ['-refine', '0', '-grouped', os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix+'_group_manifest'), '-mapped', os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix+'_group'+str(contigID)+"_mapped"), '-output-filter', ".*.bnx"]
                    expectedOutputString = self.varsP.outputContigPrefix+'_group'+str(contigID)+"_mapped.bnx"
                    expectedResultFile = outputfile + "_mapped_group1.bnx"
                    expectedStdoutFile = outputfile + "_id"+str(contigID)+".stdout"
                    break

                if case():
                    self.varsP.updatePipeReport("Internal error: cannot handle stage %s" % (self.refineStage))
                    raise ValueError

            if self.varsP.bnxStatsFile != None:
                currentArgs.extend(['-XmapStatRead', self.varsP.bnxStatsFile])

            s1Job = mthread.singleJob(currentArgs, 
                                    jobName, 
                                    expectedResultFile, 
                                    expectedOutputString,
                                    maxThreads=nthreads,
                                    clusterLogDir=self.varsP.clusterLogDir,
                                    expectedStdoutFile=expectedStdoutFile,
                                    )
            self.addJob(s1Job)
        self.logArguments()
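
# Aside: util.switch used throughout generateJobList is the classic Python
# switch/case recipe. A minimal sketch of such a helper is below; the
# Pipeline's own util.switch may differ (modeling the regexp keyword with
# re.match is an assumption).
import re

class switch(object):
    """Minimal switch/case: 'for case in switch(x):' yields one matcher."""
    def __init__(self, value):
        self.value = value
    def __iter__(self):
        yield self.match
    def match(self, *args, **kwargs):
        if not args:  # a bare case() acts as the default branch
            return True
        if kwargs.get("regexp"):
            return any(re.match(p + "$", self.value) for p in args)
        return self.value in args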
    def generateJobList(self):
        """AlignModule.generateJobList: create RefAligner jobs for aligning molecules to contigs.
        """
        #for runAlignMol, this method is called but not used: exit if RefAlignerBin is empty
        if not self.varsP.RefAlignerBin :
            return

        #the contigs are obtained from varsP.latestMergedCmap--check its validity, a return will mean no jobs, and no jobs is now handled in multiThreadRunJobs.
        if not self.doref and ( not self.varsP.latestMergedCmap or
                                not util.checkCmap(self.varsP.latestMergedCmap) ) :
            err = "Error in AlignModule.generateJobList: varsP.latestMergedCmap is not set or not valid cmap; skipping %s" % self.stageName
            self.varsP.updatePipeReport(err+"\n")
            util.LogError("error", err)
            return

        #Note: noise parameters should be fixed because when bnx is split, -M
        # would find different parameters for different contigs. Use noise0.

        baseargs = [self.varsP.RefAlignerBin]
        if not self.doref :
            baseargs += ['-ref', self.varsP.latestMergedCmap] #reference is latest merged cmap
            mappref = os.path.split(self.varsP.latestMergedCmap)[1]
            mappref = mappref[:mappref.find(".")]
        else :
            baseargs += ['-ref', self.varsP.ref] 
            mappref = self.stageName #use stageName also for output filename

        noiseargs = self.varsP.argsListed('noise0')
        haverefargs = False
        try : #argsListed does not check key
            refargs = self.varsP.argsListed(self.stageName) #'alignmolvref'
            haverefargs = True
        except KeyError : #this is same as old behavior
            #refargs = self.varsP.argsListed('noise0') + self.varsP.argsListed(self.argStageName) #old
            refargs = self.varsP.argsListed(self.argStageName) #new
        #refargs = noiseargs + refargs

        if haverefargs :
            self.jobargs = refargs

        #single job with bnxin (constructor)
        if self.bnxin :
            outarg = os.path.join(self.alignTarget, mappref)
            self.outFileList.append( outarg ) #file prefixes
            jobargs = baseargs + ['-o', outarg]
            jobargs += ['-i', self.bnxin]

            stdoutf = None
            if self.varsP.stdoutlog : #remember, these must be after -o
                jobargs.extend( ['-f', '-stdout', '-stderr'] )
                stdoutf = outarg+".stdout"
            jobargs += ['-maxthreads', str(self.varsP.maxthreads)]
            #add noise0 before alignmol (stageName) so that the latter can override the former
            jobargs += noiseargs
            jobargs.extend( ['-output-veto-filter', 'intervals.txt$'] ) #this feature not in old RefAligner
            jobargs += refargs

            s1Job = mthread.singleJob(jobargs, self.stageName, outarg+".xmap", self.stageName, maxThreads=self.varsP.maxthreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=stdoutf)
            self.addJob(s1Job)
            return #and this is the only job

        #loop over the split bnxs, make one job per bnx
        for idx in range(1,self.varsP.nPairwiseJobs+1) :

            outarg = os.path.join(self.alignTarget, mappref+"_"+str(idx))
            self.outFileList.append( outarg ) #file prefixes
            jobargs = baseargs + ['-o', outarg]
            idxstr = "_%s_of_%s" % (idx, self.varsP.nPairwiseJobs)
            jobargs += ['-i', self.varsP.bnxFile.replace(".bnx", idxstr+".bnx")]

            stdoutf = None
            if self.varsP.stdoutlog : #remember, these must be after -o
                jobargs.extend( ['-f', '-stdout', '-stderr'] )
                stdoutf = outarg+".stdout"
            jobargs += ['-maxthreads', str(self.varsP.maxthreads)]
            #add noise0 before alignmol (stageName) so that the latter can override the former
            jobargs += noiseargs
            #if idx != 1 : #keep _r for first job only -- copied from SVModule
            #    jobargs.extend( ['-output-veto-filter', '_r.cmap$'] ) #need this for copy number; do NOT veto
            jobargs.extend( ['-output-veto-filter', 'intervals.txt$'] ) #this feature not in old RefAligner
            jobargs += refargs

            s1Job = mthread.singleJob(jobargs, self.stageName+idxstr, outarg+".xmap", self.stageName+idxstr, maxThreads=self.varsP.maxthreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=stdoutf)
            self.addJob(s1Job)
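
# For reference, the "_<i>_of_<N>.bnx" inputs consumed above follow the
# naming that splitBNX produces; a small, purely illustrative helper:
def split_bnx_names(bnx_file, n_jobs):
    """e.g. split_bnx_names("all.bnx", 3) ->
    ['all_1_of_3.bnx', 'all_2_of_3.bnx', 'all_3_of_3.bnx']"""
    return [bnx_file.replace(".bnx", "_%s_of_%s.bnx" % (i, n_jobs))
            for i in range(1, n_jobs+1)]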
def performImageAnalysis(varsP, bypass=False, quality=True, forceonecolor=False):
    """Top level function for instrument scaling, image handling, bnx encoding
    
    """
    #print "bypass = "******"ERROR in performImageAnalysis: no images found in paths in "+varsP.imgFile+"\n")
        return 1 #this is an error--new convention is to return 1 on error
    
    processImagesJobSet = mthread.jobWrapper(varsP, groupName = 'Image Transfer and Processing', throttle=8)
    expID = 1
    devices = []
    allJobs = []
    bnxFiles = [] #only used if bypass--see below
    for remoteDataLocation in remoteDataLocations:
        expTag = '%02d' % expID
        expID += 1
        localPath = os.path.join(varsP.localRoot, expTag + '/')
        #if expTag == "04" : #debug
        #print "data\n:", remoteDataLocation, "\n" #debug
        curDevice = Device(varsP, expTag, remoteDataLocation, localPath, bypass, quality, forceonecolor)
        for sJob in curDevice.getTargetJobs():
            if sJob : #empty list is returned if target mol file already exists
                processImagesJobSet.addJob(sJob)
        devices.append(curDevice)
        if bypass :
            bnxFiles.append(curDevice.bnxFile)
     
    if bypass:
        return #bnxFiles #no longer need to return anything--this is not an error

    processImagesJobSet.multiThreadRunJobs(varsP.nThreads, sleepTime=0.25)
    #pipeReport += processImagesJobSet.makeRunReport()
    #pipeReport += processImagesJobSet.makeParseReport()
    #varsP.updatePipeReport(pipeReport, printalso=False)
    processImagesJobSet.doAllPipeReport()
    
    if varsP.lambdaRef:
        mapLambdaJobSet = mthread.jobWrapper(varsP, groupName = 'Map Lambda')
    for device in devices:
        device.findSNRCutoff()
        if varsP.lambdaRef:
            for sJob in device.getLambdaMapJobs():
                if sJob :
                    mapLambdaJobSet.addJob(sJob)
                
    if varsP.lambdaRef:
        mapLambdaJobSet.multiThreadRunJobs(varsP.nThreads, sleepTime=0.25)
        pipeReport =  mapLambdaJobSet.makeRunReport()
        pipeReport += mapLambdaJobSet.makeParseReport()
        for device in devices:
            device.processLambdaMapResults()
        varsP.updatePipeReport(pipeReport, printalso=False)    

    #still need to writeCorrectedBnx
    targetLog = ''
    scanLog = ''
    deviceLog = ''
    bnxFiles = []
    for i,device in enumerate(devices):
        device.writeCorrectedBnx()
        if device.bnxFile : #this is nulled if above failed
            bnxFiles.append(device.bnxFile)
        if i == 0:
            #targetLog += device.getTargetReport(headerOnly=True) + '\n'
            deviceLog += device.getDeviceReport(headerOnly=True) + '\n'
        scanLog   += device.getScanReport() + '\n'
        #targetLog += device.getTargetReport() + '\n'
        deviceLog += device.getDeviceReport() + '\n'

    #remove targetLog here; put it in pipeReport also, 
    # and do it at the beginning, not the end, of processing
    #varsP.updateInfoReport(targetLog + '\n' + scanLog + '\n' + deviceLog + '\n')
    varsP.updateInfoReport(scanLog + '\n' + deviceLog + '\n')

    #return bnxFiles #instead of returning here, merge here, then return
    return joinBnxFiles(varsP, bnxFiles) #see return of this fn
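
# Usage sketch for the error convention above: performImageAnalysis returns 1
# on failure (0 or None otherwise). 'varsP' stands for the Pipeline's shared
# state object; this wrapper is an assumption, not Pipeline code.
def run_image_stage(varsP):
    rc = performImageAnalysis(varsP)
    if rc:
        varsP.updatePipeReport("Image analysis failed; aborting stage\n")
        return 1
    return 0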