def get_hn_range(self):
    """Record the first and last sequencing hole number of every input file.

    Each path in ``self.files`` is opened with ``BasH5Reader``; the first and
    last entries of its ``sequencingZmws`` are appended to ``self.hn_range``
    as a ``(lo, hi)`` tuple. One progress line per file is written to stderr.

    Raises whatever ``BasH5Reader`` raises for an unreadable file, and
    ``IndexError`` if a file has no sequencing ZMWs.
    """
    for path in self.files:
        print >> sys.stderr, "getting holeNumber range for", path
        bas = BasH5Reader(path)
        try:
            zmws = bas.sequencingZmws
            # Treating the first/last entries as bounds presumes the array
            # is ordered by hole number -- TODO confirm.
            bounds = (zmws[0], zmws[-1])
        finally:
            # The reader is only needed for this lookup; close it so one
            # handle is not leaked per input file.
            bas.close()
        self.hn_range.append(bounds)
def emitNoBCFastqs(inputFofn_filename, barcodeFofn_filename, outDir, outFile):
    """Write the subreads of ZMWs missing from the barcode calls to a FASTQ.

    The two FOFNs are read line by line and paired positionally. For each
    bas.h5 / barcode.h5 pair, every sequencing ZMW whose hole number does
    not appear in the first column of the barcode file's ``bestDS`` is
    visited and its subreads are collected. All collected records are then
    written to ``<outDir>/<outFile>.fastq``.

    Parameters:
        inputFofn_filename: path of a text file listing bas.h5 files.
        barcodeFofn_filename: path of a text file listing barcode.h5 files,
            in the same order as the input FOFN.
        outDir: directory receiving the FASTQ file.
        outFile: output file name without the ``.fastq`` extension.
    """
    # Read both FOFNs through context managers so the handles are released.
    with open(inputFofn_filename) as handle:
        inputFofn = handle.read().splitlines()
    with open(barcodeFofn_filename) as handle:
        barcodeFofn = handle.read().splitlines()

    outFastq = []
    for basFile, barcodeFile in zip(inputFofn, barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)
        # True where a sequencing ZMW has no row in bestDS. The mask is
        # inverted with `~`; unary minus on a boolean array is not supported
        # by current NumPy.
        msk = ~np.in1d(basH5.sequencingZmws, bcH5.bestDS[:, 0],
                       assume_unique=True)
        for hn in basH5.sequencingZmws[msk]:
            zmw = basH5[hn]
            if not zmw:
                continue
            reads = zmw.subreads()
            # Emit every subread of the ZMW as soon as at least one of them
            # is truthy (same rule as before).
            if any(reads):
                outFastq.extend(
                    FastqRecord(read.readName, read.basecalls(),
                                read.QualityValue())
                    for read in reads)

    with FastqWriter("%s/%s.fastq" % (outDir, outFile)) as w:
        for e in outFastq:
            w.writeRecord(e)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5, ccs.bam or ccs FOFN.

    Parameters:
        in_fa: FASTA (or contig set) whose records need quality values.
        ccs_fofn: a ccs.h5 / ccs.bam file or a FOFN of such files.
        out_fq: path of the FASTQ file to write.

    Raises:
        IOError: if the input format is neither H5 nor BAM, or if a read
            cannot be located in the H5 inputs.
        ValueError: if a record's sequence and QVs differ in length.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    # One open BasH5Reader per bas file, created lazily (H5 input only).
    bas_handlers = {}
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                # The read id is the first space-delimited token of the name.
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".format(
                            s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # fmt was validated above, so this is the BAM case.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        # Release the QV sources even when conversion fails part-way;
        # previously they were closed only after a fully successful run.
        if fmt == FILE_FORMATS.H5:
            for bas_file, bas_handler in bas_handlers.items():
                logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
                bas_handler.close()
        else:
            qver.close()
def getUnlabeledZmws():
    """Return FASTQ records for ZMWs which do not have a barcode label.

    bas.h5 and barcode.h5 files are paired through ``zipFofns`` using the
    runner's ``inputFofn`` and ``barcodeFofn`` arguments. Every sequencing
    ZMW whose hole number is not a key of ``labeledZmws`` is converted with
    ``getFastqRecords`` and the per-ZMW results are concatenated with ``+``.

    Returns an empty list when there are no unlabeled ZMWs (previously this
    raised ``TypeError`` from ``reduce`` on an empty sequence).
    """
    unlabeledZmws = []
    for basFile, barcodeFile in zipFofns(runner.args.inputFofn,
                                         runner.args.barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)
        # Materialise the keys so in1d always receives a real sequence.
        labeled = list(bcH5.labeledZmws.keys())
        sdiff = basH5.sequencingZmws[~n.in1d(basH5.sequencingZmws, labeled)]
        unlabeledZmws.extend(basH5[hn] for hn in sdiff)

    recordLists = [getFastqRecords(unlabeledZmw)
                   for unlabeledZmw in unlabeledZmws]
    if not recordLists:
        # Nothing to concatenate: reduce() without an initial value would
        # raise on an empty sequence.
        return []
    return reduce(lambda x, y: x + y, recordLists)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5 or ccs FOFN.

    Parameters:
        in_fa: FASTA file whose read ids look like ``movie/hn/s_e``.
        ccs_fofn: a single ``.h5`` file, or a FOFN listing ccs.h5 files.
        out_fq: path of the FASTQ file to write.

    Raises:
        ValueError: if a read id does not split into movie/holeNumber/range
            with an integer hole number, or if the sequence and QV lengths
            differ.
        IOError: if a read cannot be located in the ccs inputs.
    """
    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):
        # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:
        # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    # One open BasH5Reader per bas file, created lazily.
    bas_handlers = {}
    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                # The read id is the first space-delimited token of the name.
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(seqid=seqid))
                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(
                            s=seqid))
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn, s_e=s_e,
                    qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Close every reader opened so far, including when a bad record
        # aborts the conversion; previously an error leaked all of them.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
def StoreMapped(fileNames, alnMap, stats):
    """Append per-subread statistics from bas.h5 files to ``stats.data``.

    For every subread of every sequencing ZMW the lists in ``stats.data``
    are extended as follows:

    * ``"rs"`` gets the ZMW's ``readScore`` and ``"rl"`` gets
      ``readEnd - readStart``;
    * if the subread's ``readName`` is in ``alnMap``: its index goes to
      ``"m"``, the alignment's ``length``/``identity`` go to
      ``"ml"``/``"mi"`` and the subread itself goes to ``"s"``;
    * otherwise its index goes to ``"um"``, zeros go to ``"ml"``/``"mi"``
      and the subread goes to ``"us"``.

    Parameters:
        fileNames: iterable of bas.h5 paths.
        alnMap: mapping from read name to an object with ``length`` and
            ``identity`` attributes.
        stats: object whose ``data`` maps the keys above to lists.
    """
    for fileName in fileNames:
        # The reader is deliberately left open: subread objects are stored
        # in stats.data and presumably still refer to it -- TODO confirm
        # before adding a close().
        reader = BasH5Reader(fileName)
        for holeNumber in reader.sequencingZmws:
            # Index the reader once per ZMW instead of once per subread.
            zmw = reader[holeNumber]
            for s in zmw.subreads:
                stats.data["rs"].append(zmw.readScore)
                stats.data["rl"].append(s.readEnd - s.readStart)
                # Position of this subread in the "rs"/"rl" lists.
                index = len(stats.data["rs"]) - 1
                if s.readName in alnMap:
                    aln = alnMap[s.readName]
                    stats.data["m"].append(index)
                    stats.data["ml"].append(aln.length)
                    stats.data["mi"].append(aln.identity)
                    stats.data["s"].append(s)
                else:
                    stats.data["um"].append(index)
                    stats.data["ml"].append(0)
                    stats.data["mi"].append(0)
                    stats.data["us"].append(s)
def run(self):
    """Emit reads from the input bas.h5 file as FASTA or FASTQ.

    The input is checked for the requested read type (the process exits
    with status -1 and a message if it is absent), an emitter is opened on
    ``<outFilePrefix or movieName>.<outType>``, and every read produced by
    ``self.zmwReads`` that passes the filters is emitted:

    * CCS reads need ``numPasses >= minPasses``;
    * other reads need ``readScore >= minReadScore``;
    * all reads need ``len(read) >= minLength``.

    An empty ``readType`` argument selects subreads when raw basecalls are
    present, otherwise CCS.
    """
    opts = self.args
    inBasH5 = BasH5Reader(opts.inFile)

    if not inBasH5.hasConsensusBasecalls and opts.readType == "ccs":
        print("Input file %s contains no CCS reads." % opts.inFile)
        sys.exit(-1)
    if not inBasH5.hasRawBasecalls and opts.readType in ("unrolled",
                                                        "subreads"):
        print("Input file %s contains no %s reads" % (opts.inFile,
                                                      opts.readType))
        sys.exit(-1)

    movieName = inBasH5.movieName
    outFilename = "%s.%s" % (opts.outFilePrefix or movieName, opts.outType)

    # Only "fasta" and "fastq" create a sink; any other outType leaves it
    # unbound, exactly as the if/elif chain did.
    emitterByType = {"fasta": FastaEmitter, "fastq": FastqEmitter}
    emitterClass = emitterByType.get(opts.outType)
    if emitterClass is not None:
        sink = emitterClass(outFilename)

    if opts.readType != '':
        readType = opts.readType
    elif inBasH5.hasRawBasecalls:
        # No explicit read type: choose based on file.
        readType = 'subreads'
    elif inBasH5.hasConsensusBasecalls:
        readType = 'ccs'
    else:
        print("Input bas.h5 file has neither CCS nor subread data")
        sys.exit(-1)

    for zmwRead in self.zmwReads(inBasH5, readType):
        zmw = zmwRead.zmw
        # Read-type specific quality gate, then the common length gate.
        if readType == "ccs":
            goodEnough = zmw.numPasses >= opts.minPasses
        else:
            goodEnough = zmw.readScore >= opts.minReadScore
        if goodEnough and len(zmwRead) >= opts.minLength:
            sink.emit(zmwRead)
def main(parser):
    """Write CCS reads whose barcode score is below a threshold.

    Parses the command line with ``parser``, collects ``movie/holeNumber``
    names from the barcode files for rows where ``row[3] / row[1]`` is below
    ``minAvgBarcodeScore``, then writes the CCS reads with those names and
    at least ``minMaxInsertLength`` bases as FASTA (``--fasta``) or FASTQ.

    The output name defaults to ``nobarcode.fasta`` / ``nobarcode.fastq``
    when ``outFile`` is not given.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files: one path per line, trailing newline removed.
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): presumably total score / number scored; this is
            # integer division under Python 2 unless the module enables true
            # division, and row[1] == 0 is not handled -- confirm.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    outh = FastaWriter(outfile) if args.fasta else FastqWriter(outfile)
    # The context manager closes the writer even if reading a CCS file
    # fails; previously an exception skipped outh.close().
    with outh:
        for ccsFile in ccsFofn:
            ccsH5 = BasH5Reader(ccsFile)
            for ccsRead in ccsH5.ccsReads():
                zmwName = ccsRead.zmw.zmwName
                if zmwName not in no_barcode:
                    continue
                # Fetch the basecalls once and reuse them for the record.
                basecalls = ccsRead.basecalls()
                if len(basecalls) < args.minMaxInsertLength:
                    continue
                if args.fasta:
                    outh.writeRecord(FastaRecord(zmwName, basecalls))
                else:
                    outh.writeRecord(
                        FastqRecord(zmwName, basecalls,
                                    ccsRead.QualityValue()))
def getZmwsForBarcodes(labels=None):
    """Group (Zmw, LabeledZmw) pairs by barcode label.

    bas.h5 and barcode.h5 files are paired through ``zipFofns`` using the
    runner's ``inputFofn`` and ``barcodeFofn`` arguments. When ``labels`` is
    given (and non-empty) only barcode labels contained in it are processed.

    Returns a dict mapping each barcode label to a list of
    ``(zmw, labeledZmw)`` tuples; labels without any ZMW get no entry.
    """
    grouped = {}
    fofnPairs = zipFofns(runner.args.inputFofn, runner.args.barcodeFofn)
    for basFile, barcodeFile in fofnPairs:
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)

        wanted = bcH5.barcodeLabels
        if labels:
            wanted = [lab for lab in wanted if lab in labels]
            logging.info("Processing only: %s" % ",".join(wanted))

        for label in wanted:
            for labeled in bcH5.labeledZmwsFromBarcodeLabel(label):
                zmw = basH5[labeled.holeNumber]
                # Create the per-label list on first use.
                grouped.setdefault(label, []).append((zmw, labeled))
    return grouped
def main(parser):
    """Write subreads of ZMWs whose barcode score is below a threshold.

    Parses the command line with ``parser``, collects per movie the hole
    numbers from the barcode files for rows where ``row[3] / row[1]`` is
    below ``minAvgBarcodeScore``, then visits those ZMWs in the bax files.
    A ZMW is used only if it has subreads and its longest subread has at
    least ``minMaxInsertLength`` bases; from such a ZMW every subread with at
    least ``minSubreadLength`` bases is written as FASTA (``--fasta``) or
    FASTQ.

    The output name defaults to ``nobarcode.fasta`` / ``nobarcode.fastq``
    when ``outFile`` is not given.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files: one path per line, trailing newline removed.
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # Get the hole numbers that are not barcoded, keyed by movie name
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): presumably total score / number scored; this is
            # integer division under Python 2 unless the module enables true
            # division, and row[1] == 0 is not handled -- confirm.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    outh = FastaWriter(outfile) if args.fasta else FastqWriter(outfile)
    # The context manager closes the writer even if reading a bax file
    # fails; previously an exception skipped outh.close().
    with outh:
        for baxFile in baxFofn:
            baxH5 = BasH5Reader(baxFile)
            unbarcoded = no_barcode[baxH5.movieName]
            for holeNum in baxH5.sequencingZmws:
                if holeNum not in unbarcoded:
                    continue
                zmw = baxH5[holeNum]
                # Pull each subread's basecalls once; they were previously
                # recomputed for the max(), the length test and the record.
                called = [(sr, sr.basecalls()) for sr in zmw.subreads]
                if not called:
                    continue
                longest = max(len(bases) for _, bases in called)
                if longest < args.minMaxInsertLength:
                    continue
                for subread, bases in called:
                    if len(bases) < args.minSubreadLength:
                        continue
                    if args.fasta:
                        outh.writeRecord(
                            FastaRecord(subread.readName, bases))
                    else:
                        outh.writeRecord(
                            FastqRecord(subread.readName, bases,
                                        subread.QualityValue()))
def __init__(self):
    """Open the stitched-BAM and bax.h5 readers and fetch ZMW 1650 from each.

    ``V``/``VZ`` hold the ``ZmwReadStitcher`` and its entry for the hole
    number; ``B``/``BZ`` hold the ``BasH5Reader`` and its entry.
    """
    # Hole number looked up in both readers.
    hole_number = 1650
    self.V = ZmwReadStitcher(getUnalignedBam())
    self.B = BasH5Reader(getBaxForBam())
    self.VZ = self.V[hole_number]
    self.BZ = self.B[hole_number]
maxLen = l maxS = s return (float(read.subreads[maxS].basecalls().count('G') + read.subreads[maxS].basecalls().count("C"))) / len( read.subreads[maxS].basecalls()) #dh5 = "/net/eichler/vol20/projects/pacbio/backups/incoming/130625_MYD_eee_20kb_368/D01_1/Analysis_Results/m130626_034031_42134_c100534392550000001823079711101324_s1_p0.bas.h5" #dsam = "/net/eichler/vol20/projects/pacbio/nobackups/results/130625_MYD_eee_20kb_368/D01_1/D.sam" dh5 = "/mnt/pacbio/D01_1/Analysis_Results/m130626_034031_42134_c100534392550000001823079711101324_s1_p0.bas.h5" dsam = "/mnt/pacbio_analysis/D01_1/D.sam" from pbcore.io import BasH5Reader dReader = BasH5Reader(dh5) # # key: # rs read score # rl read length # mi mapped identity # ml mapped length # m indices of mapped reads # um indices of unmapped reads # s mapped subreads # us unmapped subreads class Count: def __init__(self):
def open_base_file(basefile_path):
    """Open ``basefile_path`` with ``BasH5Reader`` and return the reader.

    Meant for the legacy cmp.h5 format, when a basefile path was provided.
    """
    return BasH5Reader(basefile_path)
from operator import itemgetter,attrgetter
from itertools import imap, starmap, repeat,izip,ifilter
from pbcore.io import BasH5Reader
from collections import Counter
import matplotlib as mpl
# The backend is selected before pyplot is imported; keep this order.
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Exactly one argument is expected: the input bas.h5 file.
if not len(sys.argv) == 2:
    sys.exit("zmwProductivityHeatmap.py input.bas.h5\n")
infile = sys.argv[1]
cell = BasH5Reader(infile)

# Accessor for a ZMW's "Productivity" metric.
get_prod = lambda o : getattr(o, "zmwMetric")("Productivity")

# One itemgetter per hole number; applying each to `cell` yields cell[hn],
# so all_seq_zmws is the list of ZMW objects for cell.allSequencingZmws.
zmwgetters = map(itemgetter, cell.allSequencingZmws)
all_seq_zmws = list(starmap(apply,zip(zmwgetters, repeat([cell]))))

# Productivity value of every ZMW, in the same order as all_seq_zmws.
zmw_prods = map(get_prod, all_seq_zmws)
# (productivity, read length) pairs.
prod_lens = zip(zmw_prods, imap(lambda z: len(z.read()), all_seq_zmws))
# Read lengths of the ZMWs with productivity 1 and 2 respectively.
prod1_lens = map(itemgetter(1), ifilter(lambda (p,l): p==1, prod_lens))
prod2_lens = map(itemgetter(1), ifilter(lambda (p,l): p==2, prod_lens))

# holeXY of every ZMW, and the same values converted to lists.
xy = map(attrgetter("holeXY"), all_seq_zmws)
xyl = map(list,xy)
def mpWrapper(f):
    """Open ``f`` with ``BasH5Reader`` and pass the reader to
    ``makeBarcodeH5FromBasH5``, returning its result.
    """
    reader = BasH5Reader(f)
    return makeBarcodeH5FromBasH5(reader)
#!/usr/bin/env python # makes fastq files from bas.h5 files import sys import os from pbcore.io import BasH5Reader, FastqWriter if len(sys.argv) != 3: print "Usage: {:s} bas.h5_file output_prefix".format(sys.argv[0]) exit(1) input_filename = sys.argv[1] output_prefix = sys.argv[2] bas = BasH5Reader(input_filename) filenames = {} writers = {} filenames['raw'] = output_prefix + ".fastq" filenames['subread'] = output_prefix + ".subreads.fastq" filenames['ccs'] = output_prefix + ".ccs.fastq" for filetype in filenames: if os.path.isfile(filenames[filetype]): exit("Error: file {:s} exists!".format(filenames[filetype])) else: writers[filetype] = FastqWriter(filenames[filetype]) for zmw in bas: if len(zmw.read()) > 0: writers['raw'].writeRecord(zmw.read().readName,
def callConsensus():
    """Build a consensus sequence per barcode and write ``consensus.fa``.

    Steps, all driven by ``runner.args``:

    1. Collect ZMWs per barcode (optionally restricted to ``barcode``),
       subsample them with ``subsampleReads`` and filter them with
       ``filterZmws``.
    2. If ``ccsFofn`` is given, attach the matching entry from the CCS
       readers to every ZMW; otherwise attach ``None``.
    3. Choose a seed read and the subread set for each barcode and drop
       barcodes lacking either.
    4. For each remaining barcode write ``seed_read.fasta``,
       ``subreads.fasta``, one ``*.rgn.h5`` per input file and a
       ``region.fofn`` into ``<outDir>/<barcode>``.
    5. Run ``gconFunc`` on every barcode directory (serially when
       ``nProcs == 1``, otherwise through a ``Pool``) and write the
       non-empty results to ``<outDir>/consensus.fa``.
    6. Remove the per-barcode directories unless ``keepTmpDir`` is set.

    Raises ``ImportError`` if ``pbtools.pbdagcon`` cannot be imported.
    """
    def makeReadAndReads(zmwsForBC):
        """Return ``(seedRead, subreads)`` for one barcode.

        ``zmwsForBC`` is a list of 3-tuples whose first element supplies the
        subreads and whose third element supplies the CCS read. Returns
        ``(None, None)`` when neither subreads nor CCS reads are available.
        """
        # Truthy CCS reads, taken from the third tuple element.
        ccsData = filter(lambda x: x,
                         [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        # All subreads, concatenated, taken from the first tuple element.
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

        def getSeedRead(reads, lq=80, uq=90,
                        sLambda=lambda x: -x.zmw.readScore):
            """Return the best read whose length lies between the ``lq``
            and ``uq`` percentiles (inclusive), or ``None`` if there is
            none; "best" is the smallest ``sLambda`` value, by default the
            highest ``readScore``.
            """
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq))
            pfReads = [
                read for read, l in zip(reads, lens)
                if l >= candidateRange[0] and l <= candidateRange[1]
            ]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData, lq=30, uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                # No CCS read in the percentile window: fall back to the
                # subreads.
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)
        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # i.e., hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}
    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)
    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: This part depends on the filenames of the ccs and input
        # fofns; it is essentially a workaround for the fact that the
        # part isn't part of the API.
        # NOTE(review): the FOFN handle opened here is never closed.
        ccsReaders = {
            movieNameFromFile(l): BasH5Reader(l)
            for l in open(runner.args.ccsFofn).read().splitlines()
        }
        # fill in the CCS spot.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add None to the CCS spot.
        zmwsForBCs = {
            k: [(zmw, lZmw, None) for zmw, lZmw in v]
            for k, v in zmwsForBCs.iteritems()
        }
    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have a seed read and a set of usable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir
    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seeds to separate files
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        # zipFofns yields 1-tuples here, hence the trailing comma.
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            # Subreads of this barcode that belong to the current movie.
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName,
                             subreads)
            # Rows of the region table whose first column is one of the
            # selected hole numbers.
            holes = n.in1d(reg[:, 0], n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                # No rows selected: write an empty table of the same width.
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions', data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        ofile = open('%s/region.fofn' % bcdir, 'w')
        # NOTE(review): writelines() receives one joined string and so
        # iterates it character by character; the file content is the same
        # as write() would produce.
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        # NOTE(review): the pool is not closed or joined in this function.
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)