def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Run the X-then-Y overlap trimming pipeline from inpfile to outfile.

    Stages, each writing into a fresh MyFile.myfile() scratch file except
    the last, which writes into outfile:
      1. MatchRecord.sortInXorderAP
      2. coalesceMatches (its flag is true only when trim_subtype is
         'x' or 'u')
      3. trimMatchOverlapsInX
      4. MatchRecord.sortInYorderAP
      5. trimMatchOverlapsInY

    inpfile      -- match file handed to the first sort.
    outfile      -- match file handed to the final trimming stage.
    trim_subtype -- forwarded to both trimming stages; also selects the
                    coalesceMatches flag.

    Returns None.
    """
    coalesceFlag = trim_subtype in ('x', 'u')

    # Two scratch handles are rebound alternately: every stage reads what
    # the previous stage wrote.
    ping = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, ping)

    # The following coalescing assumes perfect runs.
    pong = MyFile.myfile()
    coalesceMatches(ping, pong, coalesceFlag)

    ping = MyFile.myfile()
    trimMatchOverlapsInX(pong, ping, trim_subtype)

    pong = MyFile.myfile()
    MatchRecord.sortInYorderAP(ping, pong)

    trimMatchOverlapsInY(pong, outfile, trim_subtype)
def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Sort, coalesce and trim the matches of inpfile, writing to outfile.

    The work is done as a chain of five calls; the output of each call is
    the input of the next:
      sortInXorderAP -> coalesceMatches -> trimMatchOverlapsInX
      -> sortInYorderAP -> trimMatchOverlapsInY

    Intermediate results live in scratch files from MyFile.myfile().

    inpfile      -- source match file.
    outfile      -- destination match file (written by the last call).
    trim_subtype -- given to both trimming calls; the coalesceMatches
                    flag is true exactly when it equals 'x' or 'u'.

    Returns None.
    """
    wantCoalesceFlag = trim_subtype in ('x', 'u')

    # Only two scratch names are used and they are rebound in turn, so a
    # scratch file is released as soon as its consumer has finished.
    scratchA = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, scratchA)

    # The following coalescing assumes perfect runs.
    scratchB = MyFile.myfile()
    coalesceMatches(scratchA, scratchB, wantCoalesceFlag)

    scratchA = MyFile.myfile()
    trimMatchOverlapsInX(scratchB, scratchA, trim_subtype)

    scratchB = MyFile.myfile()
    MatchRecord.sortInYorderAP(scratchA, scratchB)

    trimMatchOverlapsInY(scratchB, outfile, trim_subtype)
示例#3
0
def _dumpDebugFile(srcfile, debugnum, rewind):
    """Copy every line of srcfile into the file "debugfile.<debugnum>".

    srcfile  -- iterable file-like object to dump.
    debugnum -- integer used to build the debug file name.
    rewind   -- if true, srcfile.seek(0) is called before reading.

    The debug file is always closed, even when the dump fails part way.
    """
    if rewind:
        srcfile.seek(0)
    debugfile = open("debugfile.%d" % debugnum, "w")
    try:
        for line in srcfile:
            # Trailing comma: do not append an extra newline to each line.
            print >> debugfile, line,
    finally:
        debugfile.close()


def applyBothKeepMasks(inpfile, outfile):
    """Apply a keep mask on the first axis, then on the second axis.

    For each axis in turn:
      1. findCoverageIntervals(inpfile, ...) fills a keep-mask file,
      2. the current matches are sorted (sortInXorderAP for the first
         axis, sortInYorderAP for the second),
      3. applyOneKeepMask writes the masked matches.
    The first-axis result is the input of the second-axis pass, and the
    second-axis result is written to outfile.

    inpfile -- match file; rewound with seek(0) on entry.
    outfile -- match file receiving the final result; rewound on entry.

    Returns None.

    Design notes kept from the original author:
    Maybe we can think of a masking implementation where each ATAC match
    is treated atomically.  Assume that the keep mask intervals are sorted
    by start position.  Assume that the ATAC matches are sorted by start
    position.  Assert that all keep mask intervals are non-overlapping and
    were cut from only one ATAC match.  Thus the mapping from keep mask
    intervals is a function.  Note that this requires that we do not
    coalesce abutting keep mask intervals that originate from multiple
    matches.  Note this still allows an ATAC match to overlap more than
    one keep mask interval.  Ignore all keep mask intervals with zero
    length; their creation has tie breaking problems.  See notes on 2003
    Jul 29.
    """

    # Set debug to a true value to dump every intermediate file to
    # debugfile.1, debugfile.2, ... in the current directory.
    debug = 0
    debugnum = 0
    inpfile.seek(0)
    outfile.seek(0)

    # Apply the keepMask for the first axis.
    # Make the sorted keep mask intervals for the first axis.
    processFirstAxis = 1
    keepMaskFile = MyFile.myfile()
    tmpfile2 = inpfile
    tmpfile3 = MyFile.myfile()
    tmpfile4 = MyFile.myfile()

    findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        # NOTE(review): the keep mask is dumped without rewinding, exactly
        # as before -- confirm findCoverageIntervals leaves it at offset 0.
        _dumpDebugFile(keepMaskFile, debugnum, 0)

    MatchRecord.sortInXorderAP(tmpfile2, tmpfile3)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile3, debugnum, 1)

    applyOneKeepMask(tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile4, debugnum, 1)

    # Apply the keepMask for the second axis.
    # Make the sorted keep mask intervals for the second axis.
    processFirstAxis = 0
    keepMaskFile = MyFile.myfile()
    tmpfile2 = tmpfile4
    tmpfile3 = MyFile.myfile()
    tmpfile4 = outfile

    # The second-axis keep mask is computed from the original inpfile, not
    # from the first-axis result held in tmpfile2.
    findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        _dumpDebugFile(keepMaskFile, debugnum, 0)

    MatchRecord.sortInYorderAP(tmpfile2, tmpfile3)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile3, debugnum, 1)

    applyOneKeepMask(tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile4, debugnum, 1)
示例#4
0
    def runOld(self):
        """Run the older ATAC post-processing pipeline over self.matches.

        The pipeline is a fixed sequence of numbered steps.  Each step reads
        self.matches, produces a new match file and rebinds self.matches to
        it; most steps then call self.checkpoint() with the accumulated
        output prefix.  The RunsAsMatches step stores its result in
        self.runs instead of self.matches.

        Values read from self.globals:
          required -- globalMatchMinSize, globalPerfectRunMinLen,
                      globalPerfectRunMaxGapLen, assemblyId1, assemblyId2,
                      assemblyFile1, assemblyFile2
          optional -- boxRecoveryOn, ckpKeep, uniqueFilterOn,
                      fillIntraRunGapsOn, fillIntraRunGapsErate,
                      fillIntraRunGapsMaxGap, and the presence of the key
                      "AllDone"
        self.globals['atacAlgorithmVersion'] is overwritten with "17".

        Checkpoint gating: a step runs when `redo` is set, or when its step
        number is greater than ckpKeep (default 0) and the key "AllDone" is
        absent from self.globals.  The first step that runs sets `redo`, so
        every later step runs as well.

        Progress messages are printed to STDERR.  Returns None.
        """
        self.globals['atacAlgorithmVersion'] = str(17)
        print >>STDERR, "runName = %s\n" % self.runName

        # The ATAC globals used by this script:
        opt_t = int(self.globals['globalMatchMinSize'])
        opt_l = int(self.globals['globalPerfectRunMinLen'])
        maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])

        assemblyId1 = self.globals['assemblyId1']
        assemblyId2 = self.globals['assemblyId2']

        assemblyFile1 = self.globals['assemblyFile1']
        assemblyFile2 = self.globals['assemblyFile2']

        boxRecoveryOn = 0  # Deprecated for same species comparisons 2003/09/09.
        if(self.globals.has_key("boxRecoveryOn")):
            boxRecoveryOn = int(self.globals['boxRecoveryOn'])
            
        # Reference time for the 'Time elapsed=' reports below.
        t0 = time.time()

        # Sequence indexes; passed later to squeezeIntraRunGaps.mainLoop and
        # fillIntraRunGaps.mainLoop.
        assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
        assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
        # Only assigned a real value when boxRecoveryOn == 1 (see below).
        rawfile = None
        
        ###################################################################
        # Setup for checkpointing scheme.
        # redo -- becomes 1 once any step has run; forces all later steps.
        # keep -- steps numbered <= keep are skipped (unless redo is set).
        # step -- running step counter, incremented before each step.
        redo = 0
        keep = 0
        step = 0
        if(self.globals.has_key("ckpKeep")):
            keep = int(self.globals['ckpKeep'])
        # Name of the globals key whose presence suppresses the step test.
        ckpName = "AllDone"
        ###################################################################

        print >>STDERR, 'Keep step=' + str(keep)
        print >>STDERR, 'At step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)

        # Grows by one suffix per executed step; used for checkpoint names.
        outprefix = self.runName

        # --- Step: unique filter -------------------------------------------
        step += 1
        print >>STDERR, 'At uniqueFilter, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            # redo is set even when the filter itself is switched off below.
            redo = 1
            # The filter runs unless uniqueFilterOn is present and equals "0".
            if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
                print >>STDERR, 'Running UniqueFilter'
                outfile = MyFile.myfile()
                UniqueFilter.main( self.matches, outfile)
                self.matches = outfile
                outprefix += '.uniq'
                self.checkpoint(outprefix)

        # --- Step: filter by match length (threshold opt_t) ----------------
        step += 1
        print >>STDERR, 'At filterByMatchLength, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'Running filterByMatchLength'
            outfile = MyFile.myfile()
            filterByMatchLength( self.matches, outfile, opt_t)
            self.matches = outfile
            outprefix += '.t' + str(opt_t)
            self.checkpoint(outprefix)

        # --- Step: trim match overlaps (subtype 'u') -----------------------
        step += 1
        print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "Start trimming for bp one-to-one-ness"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            print >>STDERR, "Finished trimming for bp one-to-one-ness"
            outprefix += '.trim'
            self.checkpoint(outprefix)

        if( boxRecoveryOn == 1 ):
            # For box recovery later ... but what if we start from a checkpoint?
            # rawfile is whatever self.matches holds at this point, whether
            # or not the steps above actually ran.
            rawfile = self.matches

        # --- Step: form perfect runs (sort functions given as X, then Y) ---
        step += 1
        print >>STDERR, 'At formPerfectRuns, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
            tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                                   MatchRecord.sortInXorderAP,
                                                   MatchRecord.sortInYorderAP,
                                                   maxdiff,
                                                   'r')
            self.matches = tempdata
            outprefix += ".p6"
            # NOTE(review): no self.checkpoint() call in this step, unlike
            # most other steps -- confirm this is intentional.
        # end if

        # --- Step: keep only long runs (threshold opt_l) -------------------
        step += 1
        print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
            # onlyKeepLongRuns receives outprefix before the '.l' suffix is
            # appended.
            tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
            self.matches = tempdata
            outprefix += '.l' + str(opt_l)
            self.checkpoint(outprefix)

        # --- Step: form perfect runs again (sort functions Y, then X) ------
        step += 1
        print >>STDERR, 'At formPerfectRuns, step=' + str(step) 
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'Heal the perfect runs'
            # The sort functions are passed in the opposite order from the
            # first formPerfectRuns call.
            tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                       MatchRecord.sortInYorderAP,
                                       MatchRecord.sortInXorderAP, maxdiff, 'r')
            self.matches = tempdata
            outprefix += '.pr'
            self.checkpoint(outprefix)

        # The two steps in this branch only exist (and only advance the step
        # counter) when box recovery is enabled, so the numbers of all later
        # steps depend on boxRecoveryOn.
        if(boxRecoveryOn == 1): 

            # This is a box recovery step.
            step += 1
            print >>STDERR, 'At boxRecovery, step=' + str(step) 
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
                print >>STDERR, "Make sorted raw matches"
                outfile = MyFile.myfile()
                MatchRecord.sortInXorderAP( rawfile, outfile)
                rawfile = outfile
                print >>STDERR, "perform box recovery"
                tempdata = boxRecovery( self.matches, rawfile, outprefix)
                self.matches = tempdata
                outprefix += '.br'
                self.checkpoint(outprefix)
            # end if

            step += 1
            print >>STDERR, 'At formPerfectRuns, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
                print >>STDERR, "form perfect runs"
                redo = 1
                # NOTE(review): the message below announces a '.p6' suffix
                # but the suffix actually appended is '.pr' -- confirm which
                # one is intended.
                print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
                tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                       MatchRecord.sortInXorderAP,
                                       MatchRecord.sortInYorderAP, maxdiff, 'r')
                self.matches = tempdata
                outprefix += '.pr'
                self.checkpoint(outprefix)

        # --- Step: squeeze intra-run gaps, then coalesce -------------------
        step += 1
        print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
            tempdata = MyFile.myfile()
            squeezeIntraRunGaps.mainLoop(
                self.matches,
                tempdata,
                assemblyIdx1, assemblyIdx2)
            tempy = MyFile.myfile()
            # Beware the current match subtypes are 'x', 'L', and 'R'!
            coalesceMatches( tempdata, tempy, 1)
            self.matches = tempy
            outprefix += '.sq'
            self.checkpoint(outprefix)

        # --- Step: trim match overlaps again (subtype 'u') -----------------
        step += 1
        print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "Start trimming for bp one-to-one-ness"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            outprefix += '.trim'
            print >>STDERR, "Finished trimming for bp one-to-one-ness"
            # NOTE(review): no self.checkpoint() call in this step either --
            # confirm this is intentional.

        # --- Step: runs as matches -----------------------------------------
        step += 1
        print >>STDERR, 'At RunsAsMatches, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            # The result goes to self.runs; self.matches is left unchanged.
            self.runs = PerfectRuns.runsAsMatches( self.matches)
            outprefix += '.runs'
            self.checkpoint(outprefix)
        # end if

        # The remaining two steps run only when fillIntraRunGapsOn is
        # present in the globals and equals the string "1".
        if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
        
            # Next comes the DNA sequence dependent stuff.
            step += 1
            print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, "fill the intrarun gaps"
                # Missing settings get defaults written back into the
                # globals as numbers (0.10 and 100000), not as strings.
                if(not self.globals.has_key('fillIntraRunGapsErate')):
                    self.globals['fillIntraRunGapsErate'] = 0.10
                if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
                    self.globals['fillIntraRunGapsMaxGap'] = 100000
                fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
                fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
                tempdata = MyFile.myfile()
                fillIntraRunGaps.mainLoop(self.matches, tempdata,
                                          assemblyIdx1, assemblyIdx2,
                                          fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
                self.matches = tempdata
                outprefix += '.fill'
                self.checkpoint(outprefix)

            step += 1
            print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, "trim the overlaps"
                tempdata = MyFile.myfile()
                TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
                self.matches = tempdata
                outprefix += '.trim'
                self.checkpoint(outprefix)
def _dumpDebugFile(srcfile, debugnum, rewind):
    """Copy every line of srcfile into the file "debugfile.<debugnum>".

    srcfile  -- iterable file-like object to dump.
    debugnum -- integer used to build the debug file name.
    rewind   -- if true, srcfile.seek(0) is called before reading.

    The debug file is always closed, even when the dump fails part way.
    """
    if rewind:
        srcfile.seek(0)
    debugfile = open("debugfile.%d" % debugnum, "w")
    try:
        for line in srcfile:
            # Trailing comma: do not append an extra newline to each line.
            print >>debugfile, line,
    finally:
        debugfile.close()


def applyBothKeepMasks( inpfile, outfile ):
    """Apply a keep mask on the first axis, then on the second axis.

    For each axis in turn:
      1. findCoverageIntervals(inpfile, ...) fills a keep-mask file,
      2. the current matches are sorted (sortInXorderAP for the first
         axis, sortInYorderAP for the second),
      3. applyOneKeepMask writes the masked matches.
    The first-axis result is the input of the second-axis pass, and the
    second-axis result is written to outfile.

    inpfile -- match file; rewound with seek(0) on entry.
    outfile -- match file receiving the final result; rewound on entry.

    Returns None.

    Design notes kept from the original author:
    Maybe we can think of a masking implementation where each ATAC match
    is treated atomically.  Assume that the keep mask intervals are sorted
    by start position.  Assume that the ATAC matches are sorted by start
    position.  Assert that all keep mask intervals are non-overlapping and
    were cut from only one ATAC match.  Thus the mapping from keep mask
    intervals is a function.  Note that this requires that we do not
    coalesce abutting keep mask intervals that originate from multiple
    matches.  Note this still allows an ATAC match to overlap more than
    one keep mask interval.  Ignore all keep mask intervals with zero
    length; their creation has tie breaking problems.  See notes on 2003
    Jul 29.
    """

    # Set debug to a true value to dump every intermediate file to
    # debugfile.1, debugfile.2, ... in the current directory.
    debug = 0
    debugnum = 0
    inpfile.seek(0)
    outfile.seek(0)

    # Apply the keepMask for the first axis.
    # Make the sorted keep mask intervals for the first axis.
    processFirstAxis = 1
    keepMaskFile = MyFile.myfile()
    tmpfile2 = inpfile
    tmpfile3 = MyFile.myfile()
    tmpfile4 = MyFile.myfile()

    findCoverageIntervals( inpfile, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        # NOTE(review): the keep mask is dumped without rewinding, exactly
        # as before -- confirm findCoverageIntervals leaves it at offset 0.
        _dumpDebugFile(keepMaskFile, debugnum, 0)

    MatchRecord.sortInXorderAP(tmpfile2,tmpfile3)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile3, debugnum, 1)

    applyOneKeepMask( tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile4, debugnum, 1)

    # Apply the keepMask for the second axis.
    # Make the sorted keep mask intervals for the second axis.
    processFirstAxis = 0
    keepMaskFile = MyFile.myfile()
    tmpfile2 = tmpfile4
    tmpfile3 = MyFile.myfile()
    tmpfile4 = outfile

    # The second-axis keep mask is computed from the original inpfile, not
    # from the first-axis result held in tmpfile2.
    findCoverageIntervals( inpfile, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        _dumpDebugFile(keepMaskFile, debugnum, 0)

    MatchRecord.sortInYorderAP(tmpfile2,tmpfile3)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile3, debugnum, 1)

    applyOneKeepMask( tmpfile3, tmpfile4, keepMaskFile, processFirstAxis)
    if debug:
        debugnum += 1
        _dumpDebugFile(tmpfile4, debugnum, 1)