class FastqEmitter(object):
    """Emit reads as FASTQ records into a single output file.

    The emitter owns a ``FastqWriter`` opened on ``filename``.  Call
    :meth:`close` when finished, or use the emitter as a context manager,
    so that the underlying writer is closed.
    """

    def __init__(self, filename):
        """Open a ``FastqWriter`` on ``filename``."""
        self.writer = FastqWriter(filename)

    def emit(self, zmwRead):
        """Write one read (name, basecalls, quality values) as a record."""
        self.writer.writeRecord(zmwRead.readName,
                                zmwRead.basecalls(),
                                zmwRead.QualityValue())

    def close(self):
        """Close the underlying writer."""
        self.writer.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
def combine_polished_isoforms(split_indices, split_hq_fns, split_lq_fns,
                              combined_hq_fa, combined_hq_fq,
                              combined_lq_fa, combined_lq_fq,
                              hq_lq_prefix_dict_pickle, sample_name):
    """Combine split hq (lq) FASTQ files into combined FASTA and FASTQ files.

    Every record is renamed with ``combined_cid_hq_name`` /
    ``combined_cid_lq_name`` before being written.  A dictionary
    ``{'HQ': {prefix: dir}, 'LQ': {prefix: dir}}`` mapping each combined
    prefix to the directory of its split file is pickled to
    ``hq_lq_prefix_dict_pickle``.  Returns None.

    Parameters:
      split_indices -- indices of splitted cluster bins.
      split_hq_fns -- hq files, #['*/all_quivered_hq.100_30_0.99.fastq', ...]
      split_lq_fns -- lq files, #['all_quivered_lq.fastq', ...]
      combined_hq_fa, combined_hq_fq -- output FASTA/FASTQ for HQ isoforms.
      combined_lq_fa, combined_lq_fq -- output FASTA/FASTQ for LQ isoforms.
      hq_lq_prefix_dict_pickle -- output pickle of the prefix dictionaries.
      sample_name -- sample name passed through to the naming helpers.
    """
    assert len(split_indices) == len(split_hq_fns)
    assert len(split_indices) == len(split_lq_fns)
    assert all([f.endswith(".fastq") for f in split_hq_fns + split_lq_fns])

    hq_pre_dict, lq_pre_dict = {}, {}

    # The with-statement guarantees all four writers are closed even if a
    # split file is unreadable half way through.
    with FastaWriter(combined_hq_fa) as hq_fa_writer, \
            FastqWriter(combined_hq_fq) as hq_fq_writer, \
            FastaWriter(combined_lq_fa) as lq_fa_writer, \
            FastqWriter(combined_lq_fq) as lq_fq_writer:
        for i, split_hq, split_lq in zip(split_indices, split_hq_fns,
                                         split_lq_fns):
            logging.debug("Adding prefix i%s_| to %s, %s",
                          str(i), split_hq, split_lq)
            hq_prefix = combined_prefix(cluster_bin_index=i,
                                        isoform_type="HQ",
                                        sample_name=sample_name)
            lq_prefix = combined_prefix(cluster_bin_index=i,
                                        isoform_type="LQ",
                                        sample_name=sample_name)

            hq_pre_dict[hq_prefix] = op.dirname(op.abspath(split_hq))
            lq_pre_dict[lq_prefix] = op.dirname(op.abspath(split_lq))

            with FastqReader(split_hq) as reader:
                for read in reader:
                    name = combined_cid_hq_name(cluster_bin_index=i,
                                                name=read.name,
                                                sample_name=sample_name)
                    hq_fa_writer.writeRecord(name, read.sequence[:])
                    hq_fq_writer.writeRecord(name, read.sequence[:],
                                             read.quality)

            with FastqReader(split_lq) as reader:
                for read in reader:
                    name = combined_cid_lq_name(cluster_bin_index=i,
                                                name=read.name,
                                                sample_name=sample_name)
                    lq_fa_writer.writeRecord(name, read.sequence[:])
                    lq_fq_writer.writeRecord(name, read.sequence[:],
                                             read.quality)

    logging.info("HQ polished output combined to:%s", combined_hq_fq)
    logging.info("LQ polished output combined to:%s", combined_lq_fq)

    logging.info("Dumping hq|lq prefix dictionary to:%s",
                 hq_lq_prefix_dict_pickle)
    with open(hq_lq_prefix_dict_pickle, 'wb') as writer:
        cPickle.dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, writer)
def emitNoBCFastqs(inputFofn_filename, barcodeFofn_filename, outDir, outFile):
    """Write the subreads of ZMWs absent from the barcode calls to a FASTQ.

    The two FOFNs are read line by line and paired up in order (bas.h5 file
    with barcode.h5 file).  For each pair, sequencing ZMWs whose hole number
    does not occur in column 0 of ``bestDS`` are selected and all of their
    subreads are written to ``<outDir>/<outFile>.fastq``.
    """
    # step through the bas.h5 and barcode.h5 files and emit
    # reads for each of these.
    with open(inputFofn_filename) as handle:
        inputFofn = handle.read().splitlines()
    with open(barcodeFofn_filename) as handle:
        barcodeFofn = handle.read().splitlines()

    outFastq = []
    for basFile, barcodeFile in zip(inputFofn, barcodeFofn):
        basH5 = BasH5Reader(basFile)
        bcH5 = BarcodeH5Reader(barcodeFile)

        # `~` is the logical NOT of the membership mask; unary minus on a
        # boolean array is rejected by current NumPy releases.
        msk = ~np.in1d(basH5.sequencingZmws,
                       bcH5.bestDS[:, 0],
                       assume_unique=True)

        for hn in basH5.sequencingZmws[msk]:
            zmw = basH5[hn]
            if zmw:
                reads = zmw.subreads()
                if any(reads):
                    for read in reads:
                        outFastq.append(
                            FastqRecord(read.readName,
                                        read.basecalls(),
                                        read.QualityValue()))

    with FastqWriter("%s/%s.fastq" % (outDir, outFile)) as w:
        for e in outFastq:
            w.writeRecord(e)
def split_laa_fastq(input_file_name, output_file_base, subreads_file_name,
                    bio_samples_by_bc=None):
    """
    Distribute the records of an LAA FASTQ file over one file per barcode.

    The barcode ID of a record is the part of its ID before the first
    underscore, with a leading "Barcode" removed.  When no barcode-to-sample
    mapping is supplied it is looked up from the SubreadSet if that dataset
    is barcoded; barcodes without a mapping are labelled "unknown".  Returns
    the list of files written (empty when the input file has size zero).
    """
    if op.getsize(input_file_name) == 0:
        return []

    by_barcode = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for record in fastq_in:
            id_prefix = record.id.split("_")[0]
            by_barcode[re.sub("^Barcode", "", id_prefix)].append(record)

    if bio_samples_by_bc is None:
        bio_samples_by_bc = {}
        with SubreadSet(subreads_file_name, strict=True) as ds:
            if ds.isBarcoded:  # pylint: disable=no-member
                bio_samples_by_bc = get_barcode_sample_mappings(ds)

    written = []
    for barcode in sorted(by_barcode):
        sample = bio_samples_by_bc.get(barcode, "unknown")
        out_name = "{b}.{s}.{i}.fastq".format(b=output_file_base,
                                              s=sample,
                                              i=barcode)
        with FastqWriter(out_name) as fastq_out:
            for record in by_barcode[barcode]:
                fastq_out.writeRecord(record)
        written.append(out_name)
    return written
def _write_fastx_file(cls, header, seq):
    """Write a single FASTQ record to a new temporary file; return its path.

    The record name is ``header`` followed by "|arrow" and every base gets
    a quality value of 35.
    """
    # delete=False keeps the reserved path on disk after the handle is
    # closed.  Taking `.name` from a throw-away NamedTemporaryFile instead
    # relies on the file being deleted by garbage collection before the
    # writer opens the same path.
    tmp = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False)
    tmp.close()
    fn = tmp.name
    suffix = "|arrow"
    with FastqWriter(fn) as f:
        f.writeRecord("{h}{s}".format(h=header, s=suffix), seq,
                      [35] * len(seq))
    return fn
def onStart(self):
    """Reset the per-reference bookkeeping and open the requested writers.

    A FASTA, FASTQ or GFF writer is created only when the matching output
    filename in ``options`` is truthy; otherwise the attribute stays None.
    """
    self.referenceBasesProcessedById = OrderedDict(
        (refId, 0) for refId in reference.byName)
    self.variantsByRefId = defaultdict(list)
    self.consensusChunksByRefId = defaultdict(list)

    # open file writers
    self.fastaWriter = None
    self.fastqWriter = None
    self.gffWriter = None

    fastaName = options.fastaOutputFilename
    if fastaName:
        self.fastaWriter = FastaWriter(fastaName)

    fastqName = options.fastqOutputFilename
    if fastqName:
        self.fastqWriter = FastqWriter(fastqName)

    gffName = options.gffOutputFilename
    if gffName:
        self.gffWriter = VariantsGffWriter(gffName,
                                           vars(options),
                                           reference.byName.values())
def make_fastq_inputs(records=None, ofn=None):
    """Write FASTQ records to a file and return the file name.

    records -- records to write; defaults to ``_get_fastq_records()``.
    ofn -- output path; defaults to a newly created temporary ".fastq" file.
    """
    if records is None:
        records = _get_fastq_records()
    if ofn is None:
        # delete=False keeps the reserved path valid after the handle is
        # closed, rather than reusing the name of a temp file that is
        # removed whenever the temporary object is garbage collected.
        tmp = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False)
        tmp.close()
        ofn = tmp.name
    with FastqWriter(ofn) as fastq_out:
        for rec in records:
            fastq_out.writeRecord(rec)
    return ofn
def combine_amplicon_analysis_files(directory):
    """Concatenate the two amplicon analysis FASTQ files of ``directory``.

    Records from 'amplicon_analysis.fastq' followed by
    'amplicon_analysis_chimeras_noise.fastq' are written to
    'amplicon_analysis.all.fastq' in the same directory.  Returns the path
    of the combined file.
    """
    output_file = os.path.join(directory, 'amplicon_analysis.all.fastq')
    input_files = ('amplicon_analysis.fastq',
                   'amplicon_analysis_chimeras_noise.fastq')
    with FastqWriter(output_file) as handle:
        for input_file in input_files:
            input_path = os.path.join(directory, input_file)
            # Open each reader in a with-block so it is closed once drained.
            with FastqReader(input_path) as reader:
                for record in reader:
                    handle.writeRecord(record)
    return output_file
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.

    Raises IOError when the input format is unsupported or a read cannot be
    located, and ValueError when a sequence and its QVs differ in length.
    The QV sources (bas.h5 handlers or the BAM collection) are closed even
    when one of these errors is raised.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    bas_handlers = {}  # bas.h5 file name --> open BasH5Reader (H5 only)
    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    try:
        with ContigSetReaderWrapper(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                seqid = r.name.split(' ')[0]
                parsed_read_name = _Parsed_Read_Name(seqid)
                if fmt == FILE_FORMATS.H5:
                    try:
                        bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError("Could not read {s} from {f}.".format(
                            s=seqid, f=ccs_fofn))
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        parsed_read_name=parsed_read_name,
                        qv_name="QualityValue")
                else:
                    # fmt was validated above, so this is the BAM case.
                    qvs = get_qvs_from_bam(reader=qver,
                                           parsed_read_name=parsed_read_name,
                                           qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence[:], qvs)
    finally:
        if fmt == FILE_FORMATS.H5:
            for bas_file, bas_handler in bas_handlers.items():
                logging.debug(
                    "Closing {bas_file} ...".format(bas_file=bas_file))
                bas_handler.close()
        else:
            qver.close()
def write_fastq(fasta_records, output_file):
    """
    Write a FastqRecord, or list of records, out to file

    Raises TypeError when ``fasta_records`` is neither a FastqRecord nor a
    list.  The input type is checked before the output file is opened, so
    a rejected call does not create or truncate ``output_file``.
    """
    if isinstance(fasta_records, FastqRecord):
        records = [fasta_records]
    elif isinstance(fasta_records, list):
        records = fasta_records
    else:
        msg = "Input Record(s) type not recognized"
        log.error(msg)
        raise TypeError(msg)

    with FastqWriter(output_file) as handle:
        for record in records:
            handle.writeRecord(record)
    check_output_file(output_file)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5 or ccs FOFN.

    Raises ValueError when a read name is not of the form
    "movie/holeNumber/start_end" or when a sequence and its QVs differ in
    length, and IOError when a read cannot be located.  Every bas.h5
    handler opened along the way is closed, also on error.
    """
    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):  # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:  # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    bas_handlers = {}  # bas.h5 file name --> open BasH5Reader
    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(seqid=seqid))
                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(
                            s=seqid))
                logging.debug("Getting QVs for {name} ...".format(name=r.name))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn, s_e=s_e,
                    qv_name="QualityValue")
                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
def writeSequenceRecords(filename, records, filetype=None):
    """
    Write the given records to a FASTA or FASTQ file and return its name.

    The format is ``filetype`` when given, otherwise whatever
    ``getFileType(filename)`` reports; any value other than 'fasta' or
    'fastq' is logged and raised as a TypeError.
    """
    fileType = filetype or getFileType(filename)
    if fileType not in ('fasta', 'fastq'):
        msg = 'Output filetype must be either FASTA or FASTQ'
        log.error(msg)
        raise TypeError(msg)

    writerClass = FastaWriter if fileType == 'fasta' else FastqWriter
    with writerClass(filename) as writer:
        for record in records:
            writer.writeRecord(record)
    return filename
def split_laa_fastq(input_file_name, output_file_base):
    """
    Distribute the records of an LAA FASTQ file over one file per barcode.

    The barcode ID is the part of the record ID before the first
    underscore.  Returns the list of files written, ordered by barcode ID,
    or an empty list when the input file has size zero.
    """
    if op.getsize(input_file_name) == 0:
        return []

    by_barcode = defaultdict(list)
    with FastqReader(input_file_name) as fastq_in:
        for record in fastq_in:
            by_barcode[record.id.split("_")[0]].append(record)

    written = []
    for barcode in sorted(by_barcode):
        out_name = "{b}.{i}.fastq".format(b=output_file_base, i=barcode)
        with FastqWriter(out_name) as fastq_out:
            for record in by_barcode[barcode]:
                fastq_out.writeRecord(record)
        written.append(out_name)
    return written
def main(parser):
    """Write CCS reads of ZMWs with a low average barcode score to a file.

    A ZMW counts as "not barcoded" when ``row[3] / row[1]`` of its
    ``bestDS`` row is below ``args.minAvgBarcodeScore``.  CCS reads of those
    ZMWs that are at least ``args.minMaxInsertLength`` bases long are
    written as FASTA (``args.fasta``) or FASTQ records.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): if both columns are integers this is floor
            # division under Python 2 -- confirm that is intended.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    writerClass = FastaWriter if args.fasta else FastqWriter
    # The with-statement closes the output even if a ccs file fails to load.
    with writerClass(outfile) as outh:
        for ccsFile in ccsFofn:
            ccsH5 = BasH5Reader(ccsFile)
            for ccsRead in ccsH5.ccsReads():
                if ccsRead.zmw.zmwName not in no_barcode:
                    continue
                basecalls = ccsRead.basecalls()
                if len(basecalls) < args.minMaxInsertLength:
                    continue
                if args.fasta:
                    outh.writeRecord(
                        FastaRecord(ccsRead.zmw.zmwName, basecalls))
                else:
                    outh.writeRecord(
                        FastqRecord(ccsRead.zmw.zmwName,
                                    basecalls,
                                    ccsRead.QualityValue()))
def split(self, first_split=None):
    """Split `input_fastq` into smaller files each containing
    `reads_per_split` reads. Return splitted fastq.

    first_split -- read index at which the first output file ends;
                   defaults to `reads_per_split`.  Later files start at
                   every read index that is a multiple of `reads_per_split`.

    The first output file is always created, even for an empty input.
    """
    split_index = 0
    self.out_fns = []
    if first_split is None:
        first_split = self.reads_per_split

    writer = FastqWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))
    try:
        with FastqReader(self.input_fastq) as reader:
            for ridx, r in enumerate(reader):
                starts_new_file = ridx != 0 and (
                    (split_index == 0 and ridx == first_split) or
                    (split_index > 0 and
                     ridx % self.reads_per_split == 0))
                if starts_new_file:
                    split_index += 1
                    writer.close()
                    writer = FastqWriter(self._out_fn(split_index))
                    self.out_fns.append(self._out_fn(split_index))
                writer.writeRecord(r.name, r.sequence, r.quality)
    finally:
        # Close the current output even when reading or writing fails.
        writer.close()
    return list(self.out_fns)
def main(parser):
    """Write subreads of ZMWs with a low average barcode score to a file.

    A ZMW counts as "not barcoded" when ``row[3] / row[1]`` of its
    ``bestDS`` row is below ``args.minAvgBarcodeScore``.  For such a ZMW,
    if its longest subread has at least ``args.minMaxInsertLength`` bases,
    every subread of at least ``args.minSubreadLength`` bases is written as
    a FASTA (``args.fasta``) or FASTQ record.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # Get the read names that are not barcoded
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): if both columns are integers this is floor
            # division under Python 2 -- confirm that is intended.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    writerClass = FastaWriter if args.fasta else FastqWriter
    # The with-statement closes the output even if a bax file fails to load.
    with writerClass(outfile) as outh:
        for baxFile in baxFofn:
            baxH5 = BasH5Reader(baxFile)
            for holeNum in baxH5.sequencingZmws:
                if holeNum not in no_barcode[baxH5.movieName]:
                    continue
                zmw = baxH5[holeNum]
                if len(zmw.subreads) and max(
                        len(sr.basecalls())
                        for sr in zmw.subreads) >= args.minMaxInsertLength:
                    for subread in zmw.subreads:
                        if len(subread.basecalls()) >= args.minSubreadLength:
                            if args.fasta:
                                outh.writeRecord(
                                    FastaRecord(subread.readName,
                                                subread.basecalls()))
                            else:
                                outh.writeRecord(
                                    FastqRecord(subread.readName,
                                                subread.basecalls(),
                                                subread.QualityValue()))
def consolidate(self, out_prefix):
    """Consolidate ContigSet to FASTA/FASTQ file, return path to output file.

    The type of the first record decides the output format:
    ``out_prefix + ".fasta"`` for FastaRecord/IndexedFastaRecord and
    ``out_prefix + ".fastq"`` for FastqRecord.

    Raises ValueError when there are no records, when record types are
    mixed, or when the first record is neither FASTA nor FASTQ.
    """
    try:
        r0 = self.next()
    except StopIteration:
        raise ValueError("No records to consolidate")

    fasta_types = (FastaRecord, IndexedFastaRecord)
    if isinstance(r0, fasta_types):
        out_fn = out_prefix + ".fasta"
        with FastaWriter(out_fn) as writer:
            writer.writeRecord(r0.name, r0.sequence[:])
            while True:
                try:
                    r = self.next()
                except StopIteration:
                    break
                if not isinstance(r, fasta_types):
                    raise ValueError(
                        "Not able to consolidate records of mixed types.")
                # Slice like the first record so every sequence reaches
                # the writer in the same form.
                writer.writeRecord(r.name, r.sequence[:])
        return out_fn
    elif isinstance(r0, FastqRecord):
        out_fn = out_prefix + ".fastq"
        with FastqWriter(out_fn) as writer:
            writer.writeRecord(r0)
            while True:
                try:
                    r = self.next()
                except StopIteration:
                    break
                if not isinstance(r, FastqRecord):
                    raise ValueError(
                        "Not able to consolidate records of mixed types.")
                writer.writeRecord(r)
        return out_fn
    else:
        raise ValueError("Files must only contain FASTA/FASTQ records.")
def main(parser):
    """Write one FASTQ file per (forward, reverse) barcode pair of a CCS BAM.

    Output files are named "<outDir>/<forward>--<reverse>.fastq" using the
    barcode names from ``args.barcodeFasta``; index -1 is named 'NoBC'.
    Unless ``args.noBcQual`` is set, " bq=<bcQual>" is appended to each
    record header.
    """
    args = parser.parse_args()

    bcNames = dict((idx, rec.name)
                   for idx, rec in enumerate(FastaReader(args.barcodeFasta)))
    bcNames[-1] = 'NoBC'

    def makeFqName(bcPair):
        pairNames = [bcNames[i] for i in bcPair]
        return '{}/{}--{}.fastq'.format(args.outDir, *pairNames)

    bam = IndexedBamReader(args.ccsBAM)
    for bcPair in set(zip(bam.bcForward, bam.bcReverse)):
        with FastqWriter(makeFqName(bcPair)) as writer:
            selected = (bam.bcForward == bcPair[0]) & \
                       (bam.bcReverse == bcPair[1])
            for rec in bam[selected]:
                header = rec.readName
                if not args.noBcQual:
                    header = header + ' bq=%i' % rec.bcQual
                writer.writeRecord(header,
                                   rec.read(aligned=False),
                                   rec.peer.query_qualities)
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms.

    Copies to the three output files only those GFF records (by ``seqid``),
    representative sequences (by the part of the name before the first '|')
    and abundance records (by ``pbid``) whose ID is in ``good``.

    Raises ValueError when the input and output representative files differ
    in format or are neither FASTA nor FASTQ.
    """
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input %s and output %s must be either FASTA or FASTQ."
            % (in_rep_filename, out_rep_filename))

    # then read gff, and write good gff record.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    if in_suffix == "fasta":
        reader_class, writer_class = FastaReader, FastaWriter
    else:
        reader_class, writer_class = FastqReader, FastqWriter
    # Both handles are closed on exit so the output is flushed to disk.
    with reader_class(in_rep_filename) as rep_reader, \
            writer_class(out_rep_filename) as rep_writer:
        for r in rep_reader:
            # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
            if r.name.split('|')[0] in good:
                rep_writer.writeRecord(r)

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
            AbundanceWriter(out_abundance_filename,
                            comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
class ResultCollector(object):
    """
    Drains the results queue and writes the consensus and variant outputs.
    """

    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        """Consume results until ``options.numWorkers`` None sentinels arrive."""
        self.onStart()

        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is not None:
                self.onResult(result)
                continue
            sentinelsReceived += 1

        self.onFinish()

    def run(self):
        """Run the collector loop, under cProfile when profiling is enabled."""
        if not options.doProfiling:
            self._run()
            return
        cProfile.runctx("self._run()",
                        globals=globals(),
                        locals=locals(),
                        filename=os.path.join(options.temporaryDirectory,
                                              "profile-%s.out" % (self.name)))

    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        """Reset per-reference bookkeeping and open the requested writers."""
        self.referenceBasesProcessedById = OrderedDict(
            (refId, 0) for refId in reference.byName)
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # open file writers
        self.fastaWriter = None
        self.fastqWriter = None
        self.gffWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        """Record one (window, (consensus, variants)) result; flush if done."""
        window, (css, variants) = result
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        """Close whichever writers were opened."""
        logging.info("Analysis completed.")
        for writer in (self.fastaWriter, self.fastqWriter, self.gffWriter):
            if writer:
                writer.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        """Store the chunk and variants and count the window's bases."""
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        """Write and drop a reference's data once all its bases are seen."""
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId,
                                                    options.referenceWindows)
        if basesProcessed != requiredBases:
            return

        # This contig is done, so we can dump to file and delete
        # the data structures.
        if self.gffWriter:
            self.gffWriter.writeVariants(sorted(self.variantsByRefId[refId]))
        del self.variantsByRefId[refId]

        #
        # If the user asked to analyze a window or a set of
        # windows, we output a FAST[AQ] contig per analyzed
        # window.  Otherwise we output a fasta contig per
        # reference contig.
        #
        # We try to be intelligent about naming the output
        # contigs, to include window information where applicable.
        #
        for span in reference.enumerateSpans(refId, options.referenceWindows):
            _, s, e = span
            coversWholeContig = (s == 0) and (e == refEntry.length)
            if coversWholeContig:
                spanName = refName
            else:
                spanName = refName + "_%d_%d" % (s, e)
            cssName = consensus.consensusContigName(spanName,
                                                    self._algorithmName)
            # Gather just the chunks pertaining to this span
            chunksThisSpan = [
                c for c in self.consensusChunksByRefId[refId]
                if windows.windowsIntersect(c.refWindow, span)
            ]
            css = consensus.join(chunksThisSpan)

            if self.fastaWriter:
                self.fastaWriter.writeRecord(cssName, css.sequence)
            if self.fastqWriter:
                self.fastqWriter.writeRecord(cssName, css.sequence,
                                             css.confidence)

        del self.consensusChunksByRefId[refId]
def pick_rep(isoform_filename, gff_filename, group_filename, output_filename,
             pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of
          expected base errors (sum of 10**(-QV/10) over the read)
          Else, pick the longest one

    Coordinates of each group are looked up in gff_filename and, when given,
    bad_gff_filename.  Output goes to FASTA, FASTQ, or (for a
    "contigset.xml" output name) FASTA plus a ContigSet wrapping it.

    Raises IOError for unrecognized input/output file types and ValueError
    when a group has no coordinates or FASTQ output is requested for a
    non-FASTQ input.
    """
    fd = None
    is_fq = False
    _, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        # Exactly one external file AND it is FASTQ.  Without the grouping,
        # `and` binds tighter than `or`, so any first file ending in
        # ".fastq" would be accepted regardless of the file count.
        if len(_fns) == 1 and _fns[0].endswith((".fq", ".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        elif not fd.isIndexed:
            # Must be indexed FASTA, or exactly contains one FASTQ file
            raise IOError(
                "%s must contain either indexed FASTA files or " %
                isoform_filename + "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." %
                      output_filename)

    fa_writer = None
    fq_writer = None
    try:
        if fa_out_fn is not None:
            fa_writer = FastaWriter(fa_out_fn)
        if fq_out_fn is not None:
            fq_writer = FastqWriter(fq_out_fn)

        # transcript id --> "seqid:start-end(strand)"
        coords = {}
        gff_filenames = [gff_filename]
        if bad_gff_filename is not None:
            gff_filenames.append(bad_gff_filename)
        for fn in gff_filenames:
            for r in CollapseGffReader(fn):
                coords[r.transcript_id] = "{0}:{1}-{2}({3})".format(
                    r.seqid, r.start, r.end, r.strand)

        use_least_err = is_fq and pick_least_err_instead
        for group in GroupReader(group_filename):
            pb_id, members = group.name, group.members
            if pb_id not in coords:
                raise ValueError("Could not find %s in %s and %s" %
                                 (pb_id, gff_filename, bad_gff_filename))
            best_id = None
            best_seq = None
            best_qual = None
            best_err = 9999999
            err = 9999999
            max_len = 0

            for x in members:
                rec = fd[x]
                if use_least_err:
                    # Expected number of errors: each Phred QV q maps to
                    # an error probability of 10**(-q/10).
                    err = sum(10 ** -(q / 10.) for q in rec.quality)
                    is_better = err < best_err
                else:
                    is_better = len(rec.sequence) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.sequence
                    if is_fq:
                        best_qual = rec.quality
                    best_err = err
                    max_len = len(rec.sequence)

            _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
            _seq_ = best_seq
            if fq_writer is not None:
                fq_writer.writeRecord(_id_, _seq_, best_qual)
            if fa_writer is not None:
                fa_writer.writeRecord(_id_, _seq_)
    finally:
        # Close the outputs even when a group or record lookup fails.
        if fa_writer is not None:
            fa_writer.close()
        if fq_writer is not None:
            fq_writer.close()

    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
input_filename = sys.argv[1] output_prefix = sys.argv[2] bas = BasH5Reader(input_filename) filenames = {} writers = {} filenames['raw'] = output_prefix + ".fastq" filenames['subread'] = output_prefix + ".subreads.fastq" filenames['ccs'] = output_prefix + ".ccs.fastq" for filetype in filenames: if os.path.isfile(filenames[filetype]): exit("Error: file {:s} exists!".format(filenames[filetype])) else: writers[filetype] = FastqWriter(filenames[filetype]) for zmw in bas: if len(zmw.read()) > 0: writers['raw'].writeRecord(zmw.read().readName, zmw.read().basecalls(), zmw.read().QualityValue()) for subread in zmw.subreads: if len(subread) > 0: writers['subread'].writeRecord(subread.readName, subread.basecalls(), subread.QualityValue()) if zmw.ccsRead is not None: writers['ccs'].writeRecord(zmw.ccsRead.readName,
def pickup_best_clusters(self):
    """Pick up hiqh QV clusters.

    Reads the polished FASTQ files in ``self.fq_filenames``, computes an
    expected accuracy per cluster from its quality values, and writes each
    cluster to the "good" FASTA/FASTQ pair when its accuracy is at least
    ``self.hq_arrow_min_accuracy`` and it has at least
    ``self.hq_min_full_length_reads`` members in ``uc``; all other clusters
    go to the "bad" pair.
    """
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(self.fq_filenames)))
    with open(self.final_pickle_fn) as handle:
        a = load(handle)
    uc = a['uc']
    # check if the uc cids are integers
    # (an empty uc yields False here rather than raising an IndexError)
    uc_keys_are_int = bool(uc) and type(next(iter(uc))) is int

    polished = {}  # cid --> FastqRecord

    for fq in self.fq_filenames:
        self.add_log("Looking at arrowed fq {f}".format(f=fq))
        with FastqReader(fq) as reader:
            for r in reader:
                # possible ID #1: c0|arrow (a single Ice2 directory)
                # possible ID #2: b112_c0|arrow (after collecting several Ice2 directory)
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                i = cid.find('/')
                if i > 0:
                    cid = cid[:i]
                if uc_keys_are_int:
                    # only convert in the case where uc keys are integers (ex: is c10, but 10)
                    # becuz possible ID #2, dont convert to int
                    cid = int(cid[1:])
                polished[cid] = r

    expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
    good = set()  # contains all the cids that are HQ

    # calculate expected QV given 5'/3' trimming
    # for sequences that are shorter than the trimming, use the length itself
    for cid, r in polished.items():
        # NOTE(review): for non-negative trims this max() is always
        # len(r.quality), so the trimmed error sum is divided by the
        # untrimmed length -- confirm that is the intent.
        qv_len = max(len(r.quality),
                     len(r.quality) - self.qv_trim_5 - self.qv_trim_3)
        q = [phred_to_qv(x) for x in r.quality]
        # Use an explicit end index: q[a:-0] would be an empty slice when
        # no 3' trimming is requested.
        trim_end = max(0, len(q) - self.qv_trim_3)
        err_sum = sum(q[self.qv_trim_5:trim_end])
        expected_acc_dict[cid] = 1.0 - (err_sum / float(qv_len))
        if expected_acc_dict[cid] >= self.hq_arrow_min_accuracy and \
                len(uc[cid]) >= self.hq_min_full_length_reads:
            good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    partial_uc2 = defaultdict(lambda: [])
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc,
                          partial_uc=partial_uc2)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
        f=self.arrowed_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.arrowed_bad_fa))
    with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
            FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
            FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
            FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
        for cid in polished:
            r = polished[cid]
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                format(cid=cid,
                       flnc_num=len(uc[cid]),
                       nfl_num=len(partial_uc2[cid]),
                       read_len=len(r.sequence))
            newname = cid_with_annotation2(
                newname, expected_acc=expected_acc_dict[cid])

            if cid in good:
                self.add_log(
                    "processing arrowed cluster {c} --> good.".format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence[:])
                good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
            else:
                self.add_log(
                    "processing arrowed cluster {c} --> bad.".format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence[:])
                bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log(
        "High-quality Arrowed consensus written " +
        "to:\n{0}\n{1}".format(self.arrowed_good_fa, self.arrowed_good_fq),
        level=logging.INFO)
    self.add_log(
        "Low-quality Arrowed consensus written " +
        "to:\n{0}\n{1}".format(self.arrowed_bad_fa, self.arrowed_bad_fq),
        level=logging.INFO)
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self, fq_filenames):
    """Pick up hiqh QV clusters.

    Reads the quivered FASTQ files in ``fq_filenames`` and writes each
    cluster to the "good" FASTA/FASTQ pair when its expected accuracy over
    the trimmed quality values is at least ``self.hq_quiver_min_accuracy``;
    all other clusters (including those no longer than the combined 5'/3'
    trimming) go to the "bad" pair.
    """
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(fq_filenames)))
    with open(self.final_pickle_fn) as handle:
        a = load(handle)
    uc = a['uc']

    quivered = {}  # cid --> FastqRecord
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        with FastqReader(fq) as reader:
            for r in reader:
                # possible ID: c0/0_1611|quiver
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                i = cid.find('/')
                if i > 0:
                    cid = cid[:i]
                cid = int(cid[1:])
                quivered[cid] = r

    good = set()  # cids that pass the accuracy cutoff
    for cid, r in quivered.items():
        qv_len = max(0, len(r.quality) - self.qv_trim_5 - self.qv_trim_3)
        if qv_len != 0:
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: q[a:-0] would be an empty slice
            # when no 3' trimming is requested.
            err_sum = sum(q[self.qv_trim_5:len(q) - self.qv_trim_3])
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy:
                good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    partial_uc2 = defaultdict(lambda: [])
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc,
                          partial_uc=partial_uc2)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
        f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.quivered_bad_fa))
    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid in quivered:
            r = quivered[cid]
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                format(cid=cid,
                       flnc_num=len(uc[cid]),
                       nfl_num=len(partial_uc2[cid]),
                       read_len=len(r.sequence))
            newname = cid_with_annotation(newname)

            if cid in good:
                self.add_log(
                    "processing quivered cluster {c} --> good.".format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence)
                good_fq_writer.writeRecord(newname, r.sequence, r.quality)
            else:
                self.add_log(
                    "processing quivered cluster {c} --> bad.".format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence)
                bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                        self.quivered_good_fq))
    self.add_log(
        "Low-qulality Quivered consensus written " +
        "to:\n{0}\n{1}".format(self.quivered_bad_fa, self.quivered_bad_fq))
    self.add_log("-" * 60, level=logging.INFO)
def __init__(self, filename):
    """Open a FastqWriter on ``filename`` and keep it as ``self.writer``."""
    fastq_writer = FastqWriter(filename)
    self.writer = fastq_writer
class ResultCollector(object):
    """
    Drains the results queue and writes the consensus and variant outputs
    (FASTA, FASTQ, GFF and VCF).
    """

    def __init__(self, resultsQueue, algorithmName, algorithmConfig):
        self._resultsQueue = resultsQueue
        self._algorithmName = algorithmName
        self._algorithmConfig = algorithmConfig

    def _run(self):
        """Consume results until ``options.numWorkers`` None sentinels arrive."""
        self.onStart()

        sentinelsReceived = 0
        while sentinelsReceived < options.numWorkers:
            result = self._resultsQueue.get()
            if result is not None:
                self.onResult(result)
                continue
            sentinelsReceived += 1

        self.onFinish()

    def run(self):
        """Run the collector loop, under cProfile when profiling is enabled."""
        if not options.doProfiling:
            self._run()
            return
        cProfile.runctx("self._run()",
                        globals=globals(),
                        locals=locals(),
                        filename=os.path.join(options.temporaryDirectory,
                                              "profile-%s.out" % (self.name)))

    # ==================================
    # Overridable interface begins here.
    #

    def onStart(self):
        """Reset per-reference bookkeeping and open the requested writers."""
        self.referenceBasesProcessedById = OrderedDict(
            (refId, 0) for refId in reference.byName)
        self.variantsByRefId = defaultdict(list)
        self.consensusChunksByRefId = defaultdict(list)

        # open file writers
        self.fastaWriter = None
        self.fastqWriter = None
        self.gffWriter = None
        self.vcfWriter = None
        if options.fastaOutputFilename:
            self.fastaWriter = FastaWriter(options.fastaOutputFilename)
        if options.fastqOutputFilename:
            self.fastqWriter = FastqWriter(options.fastqOutputFilename)
        if options.gffOutputFilename:
            self.gffWriter = VariantsGffWriter(options.gffOutputFilename,
                                               vars(options),
                                               reference.byName.values())
        if options.vcfOutputFilename:
            self.vcfWriter = VariantsVcfWriter(options.vcfOutputFilename,
                                               vars(options),
                                               reference.byName.values())

    def onResult(self, result):
        """Record one (window, (consensus, variants)) result; flush if done."""
        window, (css, variants) = result
        self._recordNewResults(window, css, variants)
        self._flushContigIfCompleted(window)

    def onFinish(self):
        """Close whichever writers were opened."""
        logging.info("Analysis completed.")
        for writer in (self.fastaWriter, self.fastqWriter,
                       self.gffWriter, self.vcfWriter):
            if writer:
                writer.close()
        logging.info("Output files completed.")

    def _recordNewResults(self, window, css, variants):
        """Store the chunk and variants and count the window's bases."""
        refId, refStart, refEnd = window
        self.consensusChunksByRefId[refId].append(css)
        self.variantsByRefId[refId] += variants
        self.referenceBasesProcessedById[refId] += (refEnd - refStart)

    def _flushContigIfCompleted(self, window):
        """Write and drop a reference's data once all its bases are seen."""
        refId, _, _ = window
        refEntry = reference.byName[refId]
        refName = refEntry.fullName
        basesProcessed = self.referenceBasesProcessedById[refId]
        requiredBases = reference.numReferenceBases(refId,
                                                    options.referenceWindows)
        if basesProcessed != requiredBases:
            return

        # This contig is done, so we can dump to file and delete
        # the data structures.
        variantWriters = [w for w in (self.gffWriter, self.vcfWriter) if w]
        if variantWriters:
            variants = sorted(self.variantsByRefId[refId])
            for variantWriter in variantWriters:
                variantWriter.writeVariants(variants)
        del self.variantsByRefId[refId]

        #
        # If the user asked to analyze a window or a set of
        # windows, we output a FAST[AQ] contig per analyzed
        # window.  Otherwise we output a fasta contig per
        # reference contig.
        #
        # We try to be intelligent about naming the output
        # contigs, to include window information where applicable.
        #
        for span in reference.enumerateSpans(refId, options.referenceWindows):
            _, s, e = span
            coversWholeContig = (s == 0) and (e == refEntry.length)
            if coversWholeContig:
                spanName = refName
            else:
                spanName = refName + "_%d_%d" % (s, e)
            cssName = consensus.consensusContigName(spanName,
                                                    self._algorithmName)
            # Gather just the chunks pertaining to this span
            chunksThisSpan = [
                c for c in self.consensusChunksByRefId[refId]
                if windows.windowsIntersect(c.refWindow, span)
            ]
            css = consensus.join(chunksThisSpan)

            if self.fastaWriter:
                self.fastaWriter.writeRecord(cssName, css.sequence)
            if self.fastqWriter:
                self.fastqWriter.writeRecord(cssName, css.sequence,
                                             css.confidence)

        del self.consensusChunksByRefId[refId]
def pick_rep(isoform_filename, gff_filename, group_filename, output_filename,
             pick_least_err_instead=False, bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative
    record and write it to `output_filename`.

    If input is FASTA -- always pick the longest one
    If input is FASTQ --
        If pick_least_err_instead is True, pick the one w/ least number of
        expected base errors
        Else, pick the longest one

    Parameters:
      isoform_filename -- input isoforms; suffix must be fasta, fastq or
                          contigset.xml
      gff_filename -- collapsed GFF giving coordinates of each transcript id
      group_filename -- group file listing the members of each collapsed id
      output_filename -- output; suffix must be fasta, fastq or
                         contigset.xml. For contigset.xml a sibling .fasta
                         (and .fastq when the input is FASTQ) is written too.
      pick_least_err_instead -- see above; only honoured for FASTQ input
      bad_gff_filename -- optional second GFF whose transcript coordinates
                          are also loaded

    Raises:
      IOError -- input or output file type is not recognized, or a contigset
                 is neither indexed FASTA nor exactly one FASTQ file
      ValueError -- FASTQ output requested for non-FASTQ input, or a group
                    id has no coordinates in the GFF file(s)
    """
    fd = None
    is_fq = False
    _in_prefix, in_suffix = parse_ds_filename(isoform_filename)
    if in_suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif in_suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif in_suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        # `and` binds tighter than `or`, so the extension test must be a
        # single expression; otherwise any first file ending in .fastq is
        # accepted regardless of how many files there are, and an empty
        # list raises IndexError.
        if len(_fns) == 1 and _fns[0].endswith((".fq", ".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        elif not fd.isIndexed:
            # Must be indexed FASTA, or exactly contains one FASTQ file
            raise IOError("%s must contain either indexed FASTA files or " %
                          isoform_filename +
                          "contain exactly one FASTQ file!")
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    out_prefix, out_suffix = parse_ds_filename(output_filename)
    if out_suffix == "fasta":
        fa_out_fn = output_filename
    elif out_suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        fq_out_fn = output_filename
    elif out_suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = out_prefix + ".fasta"
        if is_fq:
            fq_out_fn = out_prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." %
                      output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    use_least_err = is_fq and pick_least_err_instead

    try:
        # Load coordinates from the main GFF and, when given, from the bad
        # GFF as well (the latter was previously never actually read).
        gff_sources = [gff_filename]
        if bad_gff_filename is not None:
            gff_sources.append(bad_gff_filename)

        coords = {}
        for gff_fn in gff_sources:
            for r in CollapseGffReader(gff_fn):
                coords[r.transcript_id] = "{0}:{1}-{2}({3})".format(
                    r.seqid, r.start, r.end, r.strand)

        for group in GroupReader(group_filename):
            pb_id, members = group.name, group.members
            if pb_id not in coords:
                raise ValueError("Could not find %s in %s and %s" %
                                 (pb_id, gff_filename, bad_gff_filename))
            best_id = None
            best_seq = None
            best_qual = None
            best_err = 9999999
            max_len = 0
            for x in members:
                # Fetch the record once instead of on every attribute use.
                rec = fd[x]
                if use_least_err:
                    # Expected number of base errors: a Phred score Q
                    # corresponds to an error probability of 10^(-Q/10).
                    err = sum(10 ** -(q / 10.) for q in rec.quality)
                    is_better = err < best_err
                else:
                    err = best_err
                    # `>=` keeps the last of several equally long members.
                    is_better = len(rec.sequence) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.sequence
                    if is_fq:
                        best_qual = rec.quality
                    best_err = err
                    max_len = len(rec.sequence)

            _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
            _seq_ = best_seq
            if fq_writer is not None:
                fq_writer.writeRecord(_id_, _seq_, best_qual)
            if fa_writer is not None:
                fa_writer.writeRecord(_id_, _seq_)
    finally:
        # Close the outputs even when reading or picking fails.
        if fa_writer is not None:
            fa_writer.close()
        if fq_writer is not None:
            fq_writer.close()

    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
def pickup_best_clusters(self, fq_filenames):
    """Pick up high QV clusters.

    Reads every record of the given quivered FASTQ files, keyed by the
    integer cluster id parsed from the record name. A cluster is "good"
    when the sum of its per-base `phred_to_qv` values, after trimming
    `self.qv_trim_5` bases from the 5' end and `self.qv_trim_3` bases from
    the 3' end, is at most `self.qv_max_err`. Good clusters are written to
    the `quivered_good_fa`/`quivered_good_fq` files and all others to the
    `quivered_bad_fa`/`quivered_bad_fq` files.

    Parameters:
      fq_filenames -- list of quivered FASTQ files to read
    """
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(fq_filenames)))

    # NOTE(review): pickles are presumably produced by earlier steps of
    # this pipeline; never point these at untrusted files.
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    quivered = {}
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        with FastqReader(fq) as reader:
            for r in reader:
                # Names look like "c<id>|..." with an optional "_ref"
                # suffix on the first field; keep only the integer id.
                cid = r.name.split('|')[0]
                if cid.endswith('_ref'):
                    cid = cid[:-4]
                cid = int(cid[1:])
                quivered[cid] = r

    # A set gives O(1) membership tests in the write loop below.
    good = set()
    for cid, r in quivered.items():
        q = [phred_to_qv(x) for x in r.quality]
        # Compute the 3' cut explicitly: `q[a:-n]` is empty when n == 0,
        # which would classify every cluster as good.
        trim_end = max(len(q) - self.qv_trim_3, 0)
        if sum(q[self.qv_trim_5:trim_end]) <= self.qv_max_err:
            good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    # Clusters without any partial reads count as having zero of them.
    partial_uc2 = defaultdict(list)
    partial_uc2.update(partial_uc)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
        f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.quivered_bad_fa))
    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid in quivered:
            r = quivered[cid]
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                format(cid=cid,
                       flnc_num=len(uc[cid]),
                       nfl_num=len(partial_uc2[cid]),
                       read_len=len(r.sequence))

            if cid in good:
                self.add_log(
                    "processing quivered cluster {c} --> good.".format(
                        c=cid))
                good_fa_writer.writeRecord(newname, r.sequence)
                good_fq_writer.writeRecord(newname, r.sequence, r.quality)
            else:
                self.add_log(
                    "processing quivered cluster {c} --> bad.".format(
                        c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence)
                bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}\n".format(self.quivered_good_fa,
                                          self.quivered_good_fq))
    self.add_log(
        "Low-qulality Quivered consensus written " +
        "to:\n{0}\n{1}".format(self.quivered_bad_fa,
                               self.quivered_bad_fq))
    self.add_log("-" * 60, level=logging.INFO)