def getParms ():
    """Parse the command line (default input: sys.argv[1:]).

    Returns the (opt, args) pair produced by optparse.  If --linehelp
    and/or --filehelp was given, the requested help is printed and the
    program exits.  For backward compatibility, a second positional
    argument is moved into opt.aln (with a warning).
    """

    parser = optparse.OptionParser(usage='%prog [options] <bas_file>')

    # (flag, keyword arguments), in the order they are registered
    optionTable = (
        ('--linehelp', dict(action='store_true', help='show line-specific help and exit')),
        ('--filehelp', dict(action='store_true', help='show help for file-related parameters, and exit')),
        ('--ccs',      dict(help='directory containing ccs.h5 files for CCS reads, post-2.1.0')),
        ('--aln',      dict(help='cmp.h5 file for subread alignments')),
        ('--alnccs',   dict(help='cmp.h5 file for CCS alignments')),
        ('--score',    dict(type='int', help='minimum HQ region score (def: %default)')),
        ('--length',   dict(type='int', help='minimum HQ region length (def: %default)')),
        ('--adapter',  dict(type='int', help='expected adapter length (def: %default)')),
        ('--noadapt',  dict(action='store_true', help='do not print adapter lines (for brevity)')),
        ('--nocons',   dict(action='store_true', help='do not print consensus passes lines')),
    )
    for flag, keywords in optionTable:
        parser.add_option (flag, **keywords)

    parser.set_defaults (score=DEF_SCORE_THRESHOLD,
                         length=DEF_HQ_LENGTH,
                         adapter=DEF_ADAPTER_LENGTH)

    opt, args = parser.parse_args()

    # Both kinds of help may be requested together: show each, then quit.
    helpShown = False
    if opt.linehelp:
        lineHelp()
        helpShown = True
    if opt.filehelp:
        fileHelp()
        helpShown = True
    if helpShown:
        sys.exit()

    if len(args) >= 2:
        # old calling convention: the cmp.h5 file was the last positional argument
        logger.warning ('WARNING: alignments cmp.h5 file should now be specified with --aln keyword')
        opt.aln = args.pop()             # put it where it belongs

    return opt, args
def findCCSFile(self):
    """Locate the consensus (CCS) datasets that belong to this bax file.

    If the bax file itself holds a PulseData/ConsensusBaseCalls group,
    that group is used.  Otherwise, when a CCS directory was supplied,
    a file named like this bax file (with "bax" replaced by "ccs") is
    looked for there and opened read-only.

    Always sets self._hasConsensus.  When consensus data is found, also
    sets self._consBasecalls, self._consZMW and self._consPasses (and
    self._CCSFile when a separate ccs file was opened).
    """

    self._hasConsensus = False                 # until proven otherwise

    if "PulseData/ConsensusBaseCalls" in self._top:
        # an older bax file: it contains its own CCS data
        container = self._top
        fqCCSFilename = None
    elif self._CCSDir is None:
        logger.info("BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs" % self._shortName)
        return
    else:
        CCSFilename = os.path.basename(self._filename).replace("bax", "ccs")
        fqCCSFilename = os.path.join(self._CCSDir, CCSFilename)

        if not os.path.exists(fqCCSFilename):
            logger.warning("%s: no CCS file found corresponding to %s" % (self._shortName, self._filename))
            return

        self._CCSFile = h5py.File(fqCCSFilename, "r")
        container = self._CCSFile

    self._consBasecalls = container["PulseData/ConsensusBaseCalls"]
    self._consZMW = container["PulseData/ConsensusBaseCalls/ZMW"]
    self._consPasses = container["PulseData/ConsensusBaseCalls/Passes"]
    self._hasConsensus = True

    if fqCCSFilename is not None:              # only report when an external file was used
        logger.debug("BaxFile %s found CCS file %s" % (self._shortName, fqCCSFilename))
def makeTempDir (dir):
    """Make sure the directory dir exists.

    Creates it (including any missing parent directories) when it is
    absent; logs a warning and leaves it untouched when it is already
    there.  Returns None.
    """

    alreadyThere = os.path.isdir (dir)

    if not alreadyThere:
        os.makedirs (dir)
        return

    logger.warning('WARNING: temp directory %s already exists' % dir)
def makeTempDir(dir):
    """Create the directory dir unless it already exists.

    Missing parent directories are created as well.  If the directory
    is already present, nothing is created and a warning is logged.
    Returns None.
    """
    if os.path.isdir(dir):
        logger.warning('WARNING: temp directory %s already exists' % dir)
        return

    os.makedirs(dir)
def getParms():
    """Parse the command line (default input: sys.argv[1:]).

    Returns the (opt, args) pair produced by optparse.  If --linehelp
    and/or --filehelp was given, the requested help is printed and the
    program exits.  For backward compatibility, a second positional
    argument is moved into opt.aln (with a warning).
    """
    parser = optparse.OptionParser(usage='%prog [options] <bas_file>')

    # switches that take no value
    switches = (
        ('--linehelp', 'show line-specific help and exit'),
        ('--filehelp', 'show help for file-related parameters, and exit'),
    )
    for flag, helpText in switches:
        parser.add_option(flag, action='store_true', help=helpText)

    # file / directory locations
    locations = (
        ('--ccs', 'directory containing ccs.h5 files for CCS reads, post-2.1.0'),
        ('--aln', 'cmp.h5 file for subread alignments'),
        ('--alnccs', 'cmp.h5 file for CCS alignments'),
    )
    for flag, helpText in locations:
        parser.add_option(flag, help=helpText)

    # integer thresholds
    thresholds = (
        ('--score', 'minimum HQ region score (def: %default)'),
        ('--length', 'minimum HQ region length (def: %default)'),
        ('--adapter', 'expected adapter length (def: %default)'),
    )
    for flag, helpText in thresholds:
        parser.add_option(flag, type='int', help=helpText)

    # output-suppressing switches
    suppressors = (
        ('--noadapt', 'do not print adapter lines (for brevity)'),
        ('--nocons', 'do not print consensus passes lines'),
    )
    for flag, helpText in suppressors:
        parser.add_option(flag, action='store_true', help=helpText)

    parser.set_defaults(score=DEF_SCORE_THRESHOLD,
                        length=DEF_HQ_LENGTH,
                        adapter=DEF_ADAPTER_LENGTH)

    opt, args = parser.parse_args()

    # Both kinds of help may be requested together: show each, then quit.
    helpShown = False
    if opt.linehelp:
        lineHelp()
        helpShown = True
    if opt.filehelp:
        fileHelp()
        helpShown = True
    if helpShown:
        sys.exit()

    if len(args) >= 2:
        # old calling convention: the cmp.h5 file was the last positional argument
        logger.warning(
            'WARNING: alignments cmp.h5 file should now be specified with --aln keyword'
        )
        opt.aln = args.pop()  # put it where it belongs

    return opt, args
def getGeneFromAnnotation (opt, tranList, exonList):
    '''Add to lists of transcripts and exons: annotations for gene of interest.

    opt supplies gtf (annotation file, or None to do nothing), omit
    (comma-separated transcript names to skip, or None), annotations
    (an already-loaded annotation list, used when truthy), format
    ('pickle', 'alt' or anything else for the standard format) and
    gene (the gene name to look up).

    One Transcript is appended to tranList for every transcript of the
    gene that is not omitted, and one Exon to exonList for each of its
    exons.  Returns the same (tranList, exonList) that were passed in.

    Raises RuntimeError if opt.gene is not in the annotations.
    '''

    if opt.gtf is None:
        return tranList, exonList

    omits = [] if opt.omit is None else opt.omit.split(',')            # transcripts which must not be included

    if opt.annotations:
        annotList = opt.annotations                                    # caller already loaded the annotations
    elif opt.format == 'pickle':
        annotList = anno.AnnotationList.fromPickle (opt.gtf)
    elif opt.format == 'alt':
        annotList = anno.AnnotationList (opt.gtf, altFormat=True)
    else:      # standard format
        annotList = anno.AnnotationList (opt.gtf)

    allGenes = annotList.getGeneDict()
    if opt.gene not in allGenes:
        raise RuntimeError ('gene %s is not in the annotation file' % opt.gene)
    geneList = allGenes[opt.gene]                # a list of Annotation objects
    if len(geneList) > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted' \
                           % (opt.gene, len(geneList)))
    myGene = geneList[0]

    for tran in myGene.getChildren():                       # tran is an Annotation object

        if tran.name in omits:                              # skip transcripts on the ignore list
            continue

        myTran = Transcript(tran.name, start=tran.start, end=tran.end, annot=True, ID=tran.ID)

        # codon positions are optional: copy them only when present
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon

        for exon in tran.getChildren():                     # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end, exon.strand)    # no Q score
            if hasattr (exon, 'polyAs'):
                # (a leftover debugging print of exon.name was removed here)
                myExon.polyAs = exon.polyAs
            exonList.append (myExon)
            myTran.exons.append(myExon)

        tranList.append (myTran)

    return tranList, exonList
def getGeneFromAnnotation (opt, tranList, exonList):
    '''Add to lists of transcripts and exons: annotations for gene of interest.

    opt supplies gtf (annotation file, or None to do nothing), omit
    (comma-separated transcript names to skip, or None), format
    ('pickle', 'alt' or anything else for the standard format) and
    gene (the gene name to look up).

    One Transcript is appended to tranList for every transcript of the
    gene that is not omitted, and one Exon to exonList for each of its
    exons.  Returns the same (tranList, exonList) that were passed in.

    Raises RuntimeError if opt.gene is not in the annotations.
    '''

    if opt.gtf is None:
        return tranList, exonList

    omits = [] if opt.omit is None else opt.omit.split(',')            # transcripts which must not be included

    if opt.format == 'pickle':
        annotList = anno.AnnotationList.fromPickle (opt.gtf)
    elif opt.format == 'alt':
        annotList = anno.AnnotationList (opt.gtf, altFormat=True)
    else:      # standard format
        annotList = anno.AnnotationList (opt.gtf)

    allGenes = annotList.getGeneDict()
    if opt.gene not in allGenes:
        raise RuntimeError ('gene %s is not in the annotation file' % opt.gene)
    geneList = allGenes[opt.gene]                # a list of Annotation objects
    if len(geneList) > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted' \
                           % (opt.gene, len(geneList)))
    myGene = geneList[0]

    for tran in myGene.getChildren():                       # tran is an Annotation object

        if tran.name in omits:                              # skip transcripts on the ignore list
            continue

        myTran = Transcript(tran.name, annot=True)

        # codon positions are optional: copy them only when present
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon

        for exon in tran.getChildren():                     # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end, exon.strand)    # no Q score
            if hasattr (exon, 'polyAs'):
                # (a leftover debugging print of exon.name was removed here)
                myExon.polyAs = exon.polyAs
            exonList.append (myExon)
            myTran.exons.append(myExon)

        tranList.append (myTran)

    return tranList, exonList
def getGeneFromAnnotation(opt, tranList, exonList):
    """Append the annotated transcripts and exons of opt.gene to the lists.

    Does nothing when opt.gtf is None.  Otherwise the annotations come
    from opt.annotations when that is truthy, or are loaded from
    opt.gtf according to opt.format ('pickle', 'alt', or standard).
    Upper-cased copies of all gene names are added to the gene
    dictionary before the lookup.

    Returns the same (tranList, exonList) that were passed in.
    Raises RuntimeError if opt.gene is not found.
    """
    if opt.gtf is None:
        return tranList, exonList

    annotations = opt.annotations
    if not annotations:
        # nothing pre-loaded: read the annotation file in the requested format
        if opt.format == 'pickle':
            annotations = anno.AnnotationList.fromPickle(opt.gtf)
        elif opt.format == 'alt':
            annotations = anno.AnnotationList(opt.gtf, altFormat=True)
        else:
            annotations = anno.AnnotationList(opt.gtf)

    genesByName = annotations.getGeneDict()
    upperCased = dict((name.upper(), entries)
                      for name, entries in genesByName.iteritems())
    genesByName.update(upperCased)

    if opt.gene not in genesByName:
        raise RuntimeError('gene %s is not in the annotation file' % opt.gene)

    candidates = genesByName[opt.gene]  # a list of Annotation objects
    howMany = len(candidates)
    if howMany > 1:
        logger.warning(
            'gene %s appears %d times in annotations, first occurrence plotted'
            % (opt.gene, howMany))
    chosenGene = candidates[0]

    for annotTran in chosenGene.getChildren():  # each is an Annotation object
        newTran = Transcript(annotTran.name,
                             start=annotTran.start,
                             end=annotTran.end,
                             annot=True,
                             ID=annotTran.ID,
                             source=(0, opt.gtf))

        # codon positions are optional attributes
        for codonAttr in ('startcodon', 'stopcodon'):
            if hasattr(annotTran, codonAttr):
                setattr(newTran, codonAttr, getattr(annotTran, codonAttr))

        for annotExon in annotTran.getChildren():  # each is an Annotation object
            newExon = Exon(newTran, annotExon.name, annotExon.start,
                           annotExon.end, annotExon.strand)  # no Q score
            if hasattr(annotExon, 'polyAs'):
                newExon.polyAs = annotExon.polyAs
            exonList.append(newExon)
            newTran.exons.append(newExon)

        tranList.append(newTran)

    return tranList, exonList
def findCCSFile(self):
    '''Locate the consensus (CCS) datasets that belong to this bax file.

    If the bax file itself holds a PulseData/ConsensusBaseCalls group,
    that group is used.  Otherwise, when a CCS directory was supplied,
    a file named like this bax file (with 'bax' replaced by 'ccs') is
    looked for there and opened read-only.

    Always sets self._hasConsensus.  When consensus data is found, also
    sets self._consBasecalls, self._consZMW and self._consPasses (and
    self._CCSFile when a separate ccs file was opened).
    '''
    self._hasConsensus = False  # until proven otherwise

    if "PulseData/ConsensusBaseCalls" in self._top:
        # an older bax file: it contains its own CCS data
        container = self._top
        fqCCSFilename = None
    elif self._CCSDir is None:
        logger.info(
            'BaxFile %s does not contain CCS data (rel 2.1.0 and later). Use --ccs'
            % self._shortName)
        return
    else:
        CCSFilename = os.path.basename(self._filename).replace('bax', 'ccs')
        fqCCSFilename = os.path.join(self._CCSDir, CCSFilename)

        if not os.path.exists(fqCCSFilename):
            logger.warning('%s: no CCS file found corresponding to %s' %
                           (self._shortName, self._filename))
            return

        self._CCSFile = h5py.File(fqCCSFilename, 'r')
        container = self._CCSFile

    self._consBasecalls = container["PulseData/ConsensusBaseCalls"]
    self._consZMW = container["PulseData/ConsensusBaseCalls/ZMW"]
    self._consPasses = container["PulseData/ConsensusBaseCalls/Passes"]
    self._hasConsensus = True

    if fqCCSFilename is not None:  # only report when an external file was used
        logger.debug('BaxFile %s found CCS file %s' %
                     (self._shortName, fqCCSFilename))
def getParms ():
    """Parse the command line (default input: sys.argv[1:]).

    Returns the (opt, args) pair produced by optparse.  For backward
    compatibility, a second positional argument is moved into opt.aln
    (with a warning).
    """

    parser = optparse.OptionParser(
        usage='%prog [options] <bas_file> [<cmp_file>]',
        description='Print (to stdout) summary information about the contents of a bas.h5 file.')

    # file / directory locations
    locations = (
        ('--ccs', 'directory containing ccs.h5 files for CCS reads, post-2.1.0'),
        ('--aln', 'cmp.h5 file for subread alignments'),
    )
    for flag, helpText in locations:
        parser.add_option (flag, help=helpText)

    # integer thresholds
    thresholds = (
        ('--score',  'Minimum HQ region score (def: %default)'),
        ('--length', 'Minimum HQ region length (def: %default)'),
        ('--insert', 'Minimum average insert length (def: %default)'),
    )
    for flag, helpText in thresholds:
        parser.add_option (flag, type='int', help=helpText)

    parser.set_defaults (score=DEF_SCORE_THRESHOLD,
                         length=DEF_HQ_LENGTH,
                         insert=DEF_INSERT_THRESHOLD)

    opt, args = parser.parse_args()

    if len(args) >= 2:
        # old calling convention: the cmp.h5 file was the last positional argument
        logger.warning ('WARNING: alignments cmp.h5 file should now be specified with --aln keyword')
        opt.aln = args.pop()             # put it where it belongs

    return opt, args
def getGeneFromAnnotation(opt, tranList, exonList):
    """Append the annotated transcripts and exons of opt.gene to the lists.

    Does nothing when opt.gtf is None.  Otherwise the annotations come
    from opt.annotations when that is truthy, or are loaded from
    opt.gtf according to opt.format ('pickle', 'alt', or standard).
    Upper-cased copies of all gene names are added to the gene
    dictionary before the lookup.

    Returns the same (tranList, exonList) that were passed in.
    Raises RuntimeError if opt.gene is not found.
    """
    if opt.gtf is None:
        return tranList, exonList

    annotations = opt.annotations
    if not annotations:
        # nothing pre-loaded: read the annotation file in the requested format
        if opt.format == 'pickle':
            annotations = anno.AnnotationList.fromPickle(opt.gtf)
        elif opt.format == 'alt':
            annotations = anno.AnnotationList(opt.gtf, altFormat=True)
        else:
            annotations = anno.AnnotationList(opt.gtf)

    genesByName = annotations.getGeneDict()
    upperCased = dict((name.upper(), entries)
                      for name, entries in genesByName.iteritems())
    genesByName.update(upperCased)

    if opt.gene not in genesByName:
        raise RuntimeError('gene %s is not in the annotation file' % opt.gene)

    candidates = genesByName[opt.gene]  # a list of Annotation objects
    howMany = len(candidates)
    if howMany > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted' % (opt.gene, howMany))
    chosenGene = candidates[0]

    for annotTran in chosenGene.getChildren():  # each is an Annotation object
        newTran = Transcript(annotTran.name,
                             start=annotTran.start,
                             end=annotTran.end,
                             annot=True,
                             ID=annotTran.ID,
                             source=(0, opt.gtf))

        # codon positions are optional attributes
        for codonAttr in ('startcodon', 'stopcodon'):
            if hasattr(annotTran, codonAttr):
                setattr(newTran, codonAttr, getattr(annotTran, codonAttr))

        for annotExon in annotTran.getChildren():  # each is an Annotation object
            newExon = Exon(newTran, annotExon.name, annotExon.start,
                           annotExon.end, annotExon.strand)  # no Q score
            if hasattr(annotExon, 'polyAs'):
                newExon.polyAs = annotExon.polyAs
            exonList.append(newExon)
            newTran.exons.append(newExon)

        tranList.append(newTran)

    return tranList, exonList
def main ():
    """Print (to stdout) one line per region for every ZMW in a bas.h5 file.

    The bas file is the first positional argument.  A subread cmp.h5
    (--aln) and a CCS cmp.h5 (--alnccs) are optional.  Every line
    starts with the hole number, the region's bases/second, the hole
    status string and the productivity, followed by a flag:

        'A ' / 'a '   adapter region that does / does not overlap the HQ region
        'H+'          HQ region with score >= opt.score
        'H '          HQ region below that score
        'h '          HQ region of a hole that is not productivity 1, is not
                      a sequencing ZMW, or whose HQ length is < opt.length
        'I+'          subread for which an alignment was found
        'I ' / 'i '   unaligned subread longer than 2 * opt.adapter,
                      overlapping / not overlapping the HQ region
        'Is' / 'is'   unaligned subread shorter than opt.adapter,
                      overlapping / not overlapping the HQ region
        'ia' / 'is'   remaining subreads: aligned against the adapter
                      sequence, 'ia' if the score reaches minAdapterScore

    Raises ValueError on an unrecognised region type.
    """

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile (basFilename, CCSDir=opt.ccs)

    # Consensus lines can only be printed if CCS data is actually available.
    if not opt.nocons:
        if not bf.hasConsensus():
            logger.warning('no ccs data found: turning on --nocons')
            opt.nocons = True

    cmp = None                                  # no cmp file?

    if opt.aln is not None:                     # was a subread cmp.h5 file specified?
        cmpFilename = opt.aln
        logger.debug("cmp file: %s" % cmpFilename)
        cf = H5CmpFile.CmpFile (fileName=cmpFilename)
        cmp = H5CmpFile.CmpMovie (cmpObject=cf, movieName=bf.movieName(), maxHole=bf.maxZMW())

    cmpCCS = None

    if opt.alnccs is not None:                  # was a CCS cmp.h5 file specified?
        cmpCCSFilename = opt.alnccs
        logger.debug("CCS cmp file: %s" % cmpCCSFilename)
        cfCCS = H5CmpFile.CmpFile (fileName=cmpCCSFilename)
        cmpCCS = H5CmpFile.CmpMovie (cmpObject=cfCCS, movieName=bf.movieName(), maxHole=bf.maxZMW())

    aln = SWAligner.Aligner()                   # we'll use this in the loop below for finding adapters
    aln.setRead (H5BasFile.ADAPTER)             # adapter sequence is query
    # NOTE(review): presumably getPenalties()[0] is the per-base match score,
    # making this half of a perfect adapter match -- confirm against SWAligner.
    minAdapterScore = opt.adapter * aln.getPenalties()[0] / 2

    # Column headings.  A trailing comma on a print statement suppresses the
    # newline, so these two prints produce a single heading line.
    print " ZMW b/s stat prod tp start end+1 len aln chr st",
    print " from to off astart aend+1 mm ins del Q"
    print

    for hole in bf.holeNumbers():               # main loop!

        numBases = bf.readLen(hole)
        zStat = bf.holeStatusStr(hole)          # this is a string, not a number
        zProd = bf.productivity(hole)

        numGoodInserts = 0                      # passed to printCCSDataForHole below

        HQStart, HQEnd, HQScore = bf.HQregion(hole)[2:5]

        for region in bf.holeRegions(hole):

            regionHole, regionType, start, end, score = region

            inHQ = end > HQStart and start < HQEnd          # does region overlap HQ?

            # bases per second for this region; 0 if it has no duration
            regionDuration = float(bf.elapsedFrames(hole, start, end)) / H5BasFile.frameRate
            regionBps = (end-start) / regionDuration if regionDuration > 0 else 0

            if regionType == 0:                 # an adapter region?

                if not opt.noadapt:             # if we are printing adapter lines
                    print "%6d %6.3f %-5s %d" % (hole, regionBps, zStat, zProd),     # these appear in every line
                    flag = 'A ' if inHQ else 'a '
                    print "%-2s %5d %5d" % (flag, start, end)

            elif regionType == 2:               # a HQ region?

                print "%6d %6.3f %-5s %d" % (hole, regionBps, zStat, zProd),         # these appear in every line

                if zProd != 1 or not bf.isSequencingZMW(hole) or (HQEnd-HQStart) < opt.length:
                    flag = 'h '
                else:
                    flag = 'H+' if score >= opt.score else 'H '

                print "%-2s %5d %5d" % (flag, start, end),

                # whole-read duration and rate are reported on the HQ line
                readDuration = float(bf.elapsedFrames(hole)) / H5BasFile.frameRate
                readBps = numBases/readDuration if readDuration > 0 else 0

                print " score: %3d HQ: %5d read: %5d dur: %8.3f b/sec: %6.3f" \
                    % (score, HQEnd-HQStart, numBases, readDuration, readBps)

            elif regionType == 1:               # a subread?

                print "%6d %6.3f %-5s %d" % (hole, regionBps, zStat, zProd),         # these appear in every line

                insSize = end - start

                align = None
                if cmp is not None:             # if a cmp.h5 was supplied
                    align = cmp.getAlignmentByPosition (hole, start, end)    # alignment record for this region

                if align is not None:           # if the region aligned

                    numGoodInserts += 1
                    flag = 'I+'
                    print "%-2s %5d %5d %5d" % (flag, start, end, insSize),

                    rStart, rEnd = align['rStart'], align['rEnd']            # fetch once, used many times
                    alnLen = rEnd-rStart
                    nMM, nIns, nDel = align['nMM'], align['nIns'], align['nDel']

                    print "%5d %2d %1s %9d %9d %4d %6d %6d %3d %4d %3d %4.1f" % \
                        (alnLen,                                   # length of aligned portion of read
                         align['contig'],                          # chr/contig id (see H5CmpFile)
                         '-' if align['RCRefStrand'] else '+',     # strand
                         align['tStart'], align['tEnd'],           # reference offset of start/end of alignment
                         rStart-start,                             # offset of alignment start into insert
                         rStart, rEnd,
                         nMM, nIns, nDel,                          # # of mismatches, insertions, deletions
                         getQ (align)),                            # read quality Q score for insert

                elif insSize > opt.adapter * 2:       # if it's a non-descript, non-aligned region
                                                      # TODO: Make the '2' a parameter
                    numGoodInserts += 1
                    flag = 'I ' if inHQ else 'i '
                    print "%-2s %5d %5d %5d" % (flag, start, end, insSize),

                elif insSize < opt.adapter:           # if it's too short to be an adapter

                    flag = 'Is' if inHQ else 'is'
                    print "%-2s %5d %5d %5d" % (flag, start, end, insSize),

                else:                                 # see if it's really an adapter that wasn't called

                    sequence = bf.getSequence(hole, start, end)
                    aln.setRef (sequence)
                    alnScore = aln.fillMatrix()       # align it to adapter

                    flag = 'ia' if alnScore >= minAdapterScore else 'is'
                    print "%-2s %5d %5d %5d %2d" % (flag, start, end, insSize, alnScore),

                    if alnScore >= minAdapterScore:
                        # show the adapter alignment itself on two lines
                        peaks = aln.peakPosits()
                        print " %2d" % len(peaks),
                        refString, readString = aln.alignmentStrings()
                        print ' ', refString          # EOL here
                        print " i2 ",                 # new line
                        print readString,

                print                                 # terminate the subread line

            else:
                raise ValueError ("unrecognised region type %d in ZMW %d" % (regionType, hole))

        if not opt.nocons:          # note that opt.nocons gets turned on if no CCS data is found
            if zProd == 1 and bf.isSequencingZMW(hole):
                printCCSDataForHole (bf, hole, numGoodInserts, cmpCCS)       # process consensus read passes

    logger.debug("complete")
def main ():
    """Print (to stdout) summary statistics for the subreads of a bas.h5 file.

    The bas file is the first positional argument; a subread cmp.h5
    file (--aln) is optional.  For every ZMW the subreads are counted
    and measured, then tallied into a series of Counter objects, each
    of which applies one more acceptance criterion than the one before
    (sequencing ZMW, productivity, HQ length, HQ score, average insert
    length, alignment, consensus).  The counters are printed at the end.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source, guided by the dash prefixes of the
    Counter labels -- confirm it against the original file.
    """

    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile (basFilename, CCSDir=opt.ccs)

    if bf.hasConsensus():                 # don't go looking for CCS data if it's not there
        nocons = False
    else:
        logger.warning('no ccs data found: point to it with --ccs if desired')
        nocons = True

    cmp = None                            # no cmp file?

    if opt.aln is not None:               # was a subread cmp.h5 file specified?
        cmpFilename = opt.aln
        logger.debug("cmp file: %s" % cmpFilename)
        cf = H5CmpFile.CmpFile (fileName=cmpFilename)
        cmp = H5CmpFile.CmpMovie (cmpObject=cf, movieName=bf.movieName(), maxHole=bf.maxZMW())

    # One counter per acceptance level.  The leading dashes in each label
    # show how deeply the criterion is nested.
    totalC = Counter('Total')
    seqC = Counter('--Sequencing')
    prod0C = Counter('----Productivity-0')
    prod1C = Counter('----Productivity-1')
    HQLenC = Counter('------HQ Len >= %s' % opt.length)
    HQScoreC = Counter('--------HQ Score >= %s' % opt.score)
    adaptC = Counter('----------Avg Insert >= %s' % opt.insert)
    HQBasesC = Counter('----------HQ Bases')
    alignC = Counter('------------Aligned')
    consC = Counter('------------Consensus Reads')
    prod2C = Counter('----Productivity-2')

    # longest read over all holes, and the hole it came from
    longest = 0
    longestZMW = None

    for hole in bf.holeNumbers():

        numBases = bf.readLen(hole)
        zProd = bf.productivity(hole)

        if numBases > longest:
            longest = numBases
            longestZMW = hole

        HQStart, HQEnd, HQScore = bf.HQregion(hole)[2:5]
        HQLen = HQEnd - HQStart

        # per-hole accumulators
        numSubreads = 0
        numHQSubreads = 0
        maxSubreadLen = 0
        cumSubreadLen = 0

        alignedSubreads = 0
        alignedTotBases = 0               # total aligned bases in all inserts
        alignedMaxBases = 0               # longest alignment in single insert

        for region in bf.holeRegions(hole):

            regionHole, regionType, start, end, score = region
            inHQ = end > HQStart and start < HQEnd     # does region overlap HQ?

            if regionType == 1:                        # if insert

                numSubreads += 1
                maxSubreadLen = max (end-start, maxSubreadLen)
                cumSubreadLen += max (end-start, 0)    # clip negative lengths to zero

                if inHQ:
                    numHQSubreads += 1

                # NOTE(review): assumed to be a sibling of "if inHQ", not nested in it -- confirm
                if cmp is not None:

                    align = cmp.getAlignmentByPosition (hole, start, end)    # alignment record for this region

                    if align is not None:              # if the region aligned
                        alignedSubreads += 1
                        alignedBases = align['rEnd'] - align ['rStart']
                        alignedTotBases += alignedBases
                        alignedMaxBases = max (alignedBases, alignedMaxBases)

        # What follows is a series of increasingly restrictive
        # criteria for a useful subread. Keep track of the number of
        # ZMWs, subreads, and bases which pass the successive tests,
        # and the length and ZMW provenance of the longest accepted
        # subread.

        totalC.incr (1, numSubreads, numBases)
        totalC.longest (hole, maxSubreadLen)

        if bf.isSequencingZMW(hole):                   # sequencing ZMW?

            seqC.incr (1, numSubreads, numBases)
            seqC.longest (hole, maxSubreadLen)

            if zProd == 0:
                prod0C.incr (1, numSubreads, numBases)
                prod0C.longest (hole, maxSubreadLen)

            elif zProd == 2:
                prod2C.incr (1, numSubreads, numBases)
                prod2C.longest (hole, maxSubreadLen)

            elif zProd == 1:                           # productivity 1 gets broken down further

                prod1C.incr (1, numSubreads, numBases)
                prod1C.longest (hole, maxSubreadLen)

                if HQLen >= opt.length:

                    HQLenC.incr (1, numSubreads, numBases)
                    HQLenC.longest (hole, maxSubreadLen)

                    if HQScore >= opt.score:

                        HQScoreC.incr (1, numSubreads, numBases)
                        HQScoreC.longest (hole, maxSubreadLen)

                        # A very short average insert size probably indicates an adapter dimer.
                        # (The test compares the summed subread length with
                        # numSubreads * opt.insert, so it needs no division.)

                        if cumSubreadLen >= numSubreads * opt.insert:

                            adaptC.incr (1, numSubreads, numBases)
                            adaptC.longest (hole, maxSubreadLen)

                            # NOTE(review): assumed to sit inside the average-insert
                            # test, as do the two blocks that follow -- confirm
                            HQBasesC.incr (1, numHQSubreads, HQLen)
                            # HQBasesC.longest (hole, HQLen)

                            if alignedSubreads > 0:
                                alignC.incr (1, alignedSubreads, alignedTotBases)     # total aligned bases
                                alignC.longest (hole, alignedMaxBases)                # longest single alignment

                            if not nocons:
                                consLen = bf.consReadLen(hole)
                                if consLen > 0:
                                    consC.incr (1, bf.numConsensusPasses(hole), consLen)
                                    consC.longest (hole, consLen)

    print
    print "file: ", basFilename
    print
    print "longest read was ZMW %d at %d bases" % (longestZMW, longest)
    print
    print "statistics for subreads:"
    print
    # NOTE(review): assumes Counter.title() prints the heading itself, as
    # longPrint() appears to; the collapsed source could also be read as
    # "print Counter.title();" -- confirm
    Counter.title();

    if cmp is not None:                   # if we processed a .cmp.h5 file
        for cntr in (totalC, seqC, prod0C, prod2C, prod1C, HQLenC, HQScoreC, adaptC, HQBasesC, consC, alignC):
            cntr.longPrint()
    else:
        # no alignments: leave out the alignment counter
        for cntr in (totalC, seqC, prod0C, prod2C, prod1C, HQLenC, HQScoreC, adaptC, HQBasesC, consC):
            cntr.longPrint()

    print

    logger.debug("complete")
def main():
    """Print (to stdout) one line per region for every ZMW in a bas.h5 file.

    The bas file is the first positional argument.  A subread cmp.h5
    (--aln) and a CCS cmp.h5 (--alnccs) are optional.  Every line
    starts with the hole number, the region's bases/second, the hole
    status string and the productivity, followed by a flag:

        'A ' / 'a '   adapter region that does / does not overlap the HQ region
        'H+'          HQ region with score >= opt.score
        'H '          HQ region below that score
        'h '          HQ region of a hole that is not productivity 1, is not
                      a sequencing ZMW, or whose HQ length is < opt.length
        'I+'          subread for which an alignment was found
        'I ' / 'i '   unaligned subread longer than 2 * opt.adapter,
                      overlapping / not overlapping the HQ region
        'Is' / 'is'   unaligned subread shorter than opt.adapter,
                      overlapping / not overlapping the HQ region
        'ia' / 'is'   remaining subreads: aligned against the adapter
                      sequence, 'ia' if the score reaches minAdapterScore

    Raises ValueError on an unrecognised region type.
    """
    logger.debug("%s starting" % sys.argv[0])

    opt, args = getParms()

    basFilename = args[0]
    logger.debug("bas file: %s" % basFilename)
    bf = H5BasFile.BasFile(basFilename, CCSDir=opt.ccs)

    # Consensus lines can only be printed if CCS data is actually available.
    if not opt.nocons:
        if not bf.hasConsensus():
            logger.warning('no ccs data found: turning on --nocons')
            opt.nocons = True

    cmp = None  # no cmp file?

    if opt.aln is not None:  # was a subread cmp.h5 file specified?
        cmpFilename = opt.aln
        logger.debug("cmp file: %s" % cmpFilename)
        cf = H5CmpFile.CmpFile(fileName=cmpFilename)
        cmp = H5CmpFile.CmpMovie(cmpObject=cf,
                                 movieName=bf.movieName(),
                                 maxHole=bf.maxZMW())

    cmpCCS = None

    if opt.alnccs is not None:  # was a CCS cmp.h5 file specified?
        cmpCCSFilename = opt.alnccs
        logger.debug("CCS cmp file: %s" % cmpCCSFilename)
        cfCCS = H5CmpFile.CmpFile(fileName=cmpCCSFilename)
        cmpCCS = H5CmpFile.CmpMovie(cmpObject=cfCCS,
                                    movieName=bf.movieName(),
                                    maxHole=bf.maxZMW())

    aln = SWAligner.Aligner(
    )  # we'll use this in the loop below for finding adapters
    aln.setRead(H5BasFile.ADAPTER)  # adapter sequence is query
    # NOTE(review): presumably getPenalties()[0] is the per-base match score,
    # making this half of a perfect adapter match -- confirm against SWAligner.
    minAdapterScore = opt.adapter * aln.getPenalties()[0] / 2

    # Column headings.  A trailing comma on a print statement suppresses the
    # newline, so these two prints produce a single heading line.
    print " ZMW b/s stat prod tp start end+1 len aln chr st",
    print " from to off astart aend+1 mm ins del Q"
    print

    for hole in bf.holeNumbers():  # main loop!

        numBases = bf.readLen(hole)
        zStat = bf.holeStatusStr(hole)  # this is a string, not a number
        zProd = bf.productivity(hole)

        numGoodInserts = 0  # passed to printCCSDataForHole below

        HQStart, HQEnd, HQScore = bf.HQregion(hole)[2:5]

        for region in bf.holeRegions(hole):

            regionHole, regionType, start, end, score = region

            inHQ = end > HQStart and start < HQEnd  # does region overlap HQ?

            # bases per second for this region; 0 if it has no duration
            regionDuration = float(bf.elapsedFrames(hole, start,
                                                    end)) / H5BasFile.frameRate
            regionBps = (end -
                         start) / regionDuration if regionDuration > 0 else 0

            if regionType == 0:  # an adapter region?

                if not opt.noadapt:  # if we are printing adapter lines
                    print "%6d %6.3f %-5s %d" % (
                        hole, regionBps, zStat,
                        zProd),  # these appear in every line
                    flag = 'A ' if inHQ else 'a '
                    print "%-2s %5d %5d" % (flag, start, end)

            elif regionType == 2:  # a HQ region?

                print "%6d %6.3f %-5s %d" % (
                    hole, regionBps, zStat,
                    zProd),  # these appear in every line

                if zProd != 1 or not bf.isSequencingZMW(hole) or (
                        HQEnd - HQStart) < opt.length:
                    flag = 'h '
                else:
                    flag = 'H+' if score >= opt.score else 'H '

                print "%-2s %5d %5d" % (flag, start, end),

                # whole-read duration and rate are reported on the HQ line
                readDuration = float(
                    bf.elapsedFrames(hole)) / H5BasFile.frameRate
                readBps = numBases / readDuration if readDuration > 0 else 0

                print " score: %3d HQ: %5d read: %5d dur: %8.3f b/sec: %6.3f" \
                    % (score, HQEnd-HQStart, numBases, readDuration, readBps)

            elif regionType == 1:  # a subread?

                print "%6d %6.3f %-5s %d" % (
                    hole, regionBps, zStat,
                    zProd),  # these appear in every line

                insSize = end - start

                align = None
                if cmp is not None:  # if a cmp.h5 was supplied
                    align = cmp.getAlignmentByPosition(
                        hole, start, end)  # alignment record for this region

                if align is not None:  # if the region aligned

                    numGoodInserts += 1
                    flag = 'I+'
                    print "%-2s %5d %5d %5d" % (flag, start, end, insSize),

                    rStart, rEnd = align['rStart'], align[
                        'rEnd']  # fetch once, used many times
                    alnLen = rEnd - rStart
                    nMM, nIns, nDel = align['nMM'], align['nIns'], align[
                        'nDel']

                    print "%5d %2d %1s %9d %9d %4d %6d %6d %3d %4d %3d %4.1f" % \
                        (alnLen,                                   # length of aligned portion of read
                         align['contig'],                          # chr/contig id (see H5CmpFile)
                         '-' if align['RCRefStrand'] else '+',     # strand
                         align['tStart'], align['tEnd'],           # reference offset of start/end of alignment
                         rStart-start,                             # offset of alignment start into insert
                         rStart, rEnd,
                         nMM, nIns, nDel,                          # # of mismatches, insertions, deletions
                         getQ (align)),                            # read quality Q score for insert

                elif insSize > opt.adapter * 2:  # if it's a non-descript, non-aligned region
                    # TODO: Make the '2' a parameter
                    numGoodInserts += 1
                    flag = 'I ' if inHQ else 'i '
                    print "%-2s %5d %5d %5d" % (flag, start, end, insSize),

                elif insSize < opt.adapter:  # if it's too short to be an adapter

                    flag = 'Is' if inHQ else 'is'
                    print "%-2s %5d %5d %5d" % (flag, start, end, insSize),

                else:  # see if it's really an adapter that wasn't called

                    sequence = bf.getSequence(hole, start, end)
                    aln.setRef(sequence)
                    alnScore = aln.fillMatrix()  # align it to adapter

                    flag = 'ia' if alnScore >= minAdapterScore else 'is'
                    print "%-2s %5d %5d %5d %2d" % (flag, start, end, insSize,
                                                    alnScore),

                    if alnScore >= minAdapterScore:
                        # show the adapter alignment itself on two lines
                        peaks = aln.peakPosits()
                        print " %2d" % len(peaks),
                        refString, readString = aln.alignmentStrings()
                        print ' ', refString  # EOL here
                        print " i2 ",  # new line
                        print readString,

                print  # terminate the subread line

            else:
                raise ValueError("unrecognised region type %d in ZMW %d" %
                                 (regionType, hole))

        if not opt.nocons:  # note that opt.nocons gets turned on if no CCS data is found
            if zProd == 1 and bf.isSequencingZMW(hole):
                printCCSDataForHole(bf, hole, numGoodInserts,
                                    cmpCCS)  # process consensus read passes

    logger.debug("complete")