def __init__(self, input_fofn, fasta_filename=None, prob_threshold=0.03,
             window_size=5):
    """Populate the QV cache from a FOFN and optionally load FASTA reads.

    Args:
        input_fofn: Path to a text file listing one file per line. Each
            non-empty (whitespace-stripped) line is passed to
            ``self.qver.add_bash5``.
        fasta_filename: Optional FASTA path. When it is not ``None`` it is
            passed to ``self.add_seqs_from_fasta`` once the cache has been
            filled.
        prob_threshold: Stored unchanged on ``self.prob_threshold``.
        window_size: Stored unchanged on ``self.window_size``.
    """
    self.qver = basQVcacher()
    self.input_fofn = input_fofn
    self.seqids = []
    self.prob_threshold = prob_threshold
    self.window_size = window_size
    self.full_prob = None

    with open(self.input_fofn) as fofn:
        for line in fofn:
            bas_fn = line.strip()
            # A blank or whitespace-only line is not a file name; skip it
            # instead of handing an empty path to the cache.
            if bas_fn:
                self.qver.add_bash5(bas_fn)

    if fasta_filename is not None:
        self.add_seqs_from_fasta(fasta_filename)
def __init__(self, input_fofn, fasta_filename=None, prob_threshold=0.03,
             window_size=5):
    """Build the QV cacher from ``input_fofn`` and optionally add FASTA reads.

    Args:
        input_fofn: Path to a file-of-file-names, one entry per line. Every
            entry that is non-empty after stripping whitespace is
            registered through ``self.qver.add_bash5``.
        fasta_filename: Optional FASTA path; if given (not ``None``) it is
            forwarded to ``self.add_seqs_from_fasta`` after the FOFN has
            been processed.
        prob_threshold: Kept as ``self.prob_threshold``.
        window_size: Kept as ``self.window_size``.
    """
    self.qver = basQVcacher()
    self.input_fofn = input_fofn
    self.seqids = []
    self.prob_threshold = prob_threshold
    self.window_size = window_size
    self.full_prob = None

    with open(self.input_fofn) as handle:
        # Strip each line and drop the ones that end up empty, so stray
        # blank lines in the FOFN never reach add_bash5 as an empty path.
        entries = (raw.strip() for raw in handle)
        for entry in entries:
            if not entry:
                continue
            self.qver.add_bash5(entry)

    if fasta_filename is not None:
        self.add_seqs_from_fasta(fasta_filename)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5 or ccs FOFN.

    Args:
        in_fa: Path of the FASTA file to read.
        ccs_fofn: Either a path ending in ".h5" (treated as a single
            ccs.h5 file), any other path (treated as a FOFN and expanded
            with ``get_files_from_fofn``), or a falsy value. When falsy, no
            QV lookup is done and every base gets a quality value of 60.
        out_fq: Path of the FASTQ file to write.

    Raises:
        ValueError: If the first space-delimited token of a record name is
            not of the form ``movie/hn/s_e`` with an integer ``hn``, or if
            the number of QVs differs from the sequence length.
        IOError: If a read cannot be looked up in the QV cache.
    """
    qver = basQVcacher()
    if ccs_fofn:
        if ccs_fofn.endswith(".h5"):
            # Input is a ccs.h5 file not a FOFN.
            qver.add_bash5(ccs_fofn)
        else:
            # Input is a ccs FOFN containing multiple ccs.h5 files.
            for ccs_fn in get_files_from_fofn(ccs_fofn):
                qver.add_bash5(ccs_fn)

    # One reader per bas file, opened lazily and reused across reads.
    bas_handlers = {}
    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError("{seqid} is not a valid CCS read".
                                     format(seqid=seqid))

                if ccs_fofn:
                    try:
                        bas_file = qver.bas_files[movie][seqid]
                        if bas_file not in bas_handlers:
                            bas_handlers[bas_file] = BasH5Reader(bas_file)
                    except KeyError:
                        raise IOError(
                            "Could not read {s} from input ccs fofn.".
                            format(s=seqid))
                    logging.debug("Getting QVs for %s ...", r.name)
                    qvs = get_qv_from_bas_handler(
                        bas_handler=bas_handlers[bas_file],
                        hn=hn, s_e=s_e,
                        qv_name="QualityValue")
                else:
                    # No quality values provided to pbtranscript.py cluster.
                    # No information given, have strong belief in the base
                    # calls.
                    qvs = [60] * len(r.sequence)

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".
                        format(r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Close every reader that was opened, including when the conversion
        # aborts with an exception part-way through the input.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing %s ...", bas_file)
            bas_handler.close()
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5 or ccs FOFN.

    Args:
        in_fa: Path of the FASTA file to read.
        ccs_fofn: A path ending in ".h5" (treated as a single ccs.h5 file)
            or any other path (treated as a FOFN and expanded with
            ``get_files_from_fofn``). Must be a string; it is required.
        out_fq: Path of the FASTQ file to write.

    Raises:
        ValueError: If the first space-delimited token of a record name is
            not of the form ``movie/hn/s_e`` with an integer ``hn``, or if
            the number of QVs differs from the sequence length.
        IOError: If a read cannot be looked up in the QV cache.
    """
    qver = basQVcacher()
    if ccs_fofn.endswith(".h5"):
        # Input is a ccs.h5 file not a FOFN.
        qver.add_bash5(ccs_fofn)
    else:
        # Input is a ccs FOFN containing multiple ccs.h5 files.
        for ccs_fn in get_files_from_fofn(ccs_fofn):
            qver.add_bash5(ccs_fn)

    # Readers are opened on first use and shared by all reads of a bas file.
    bas_handlers = {}
    try:
        with FastaReader(in_fa) as reader, \
                FastqWriter(out_fq) as writer:
            for r in reader:
                seqid = r.name.split(' ')[0]
                try:
                    movie, hn, s_e = seqid.split('/')
                    hn = int(hn)
                except ValueError:
                    raise ValueError(
                        "{seqid} is not a valid CCS read".format(seqid=seqid))

                try:
                    bas_file = qver.bas_files[movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError(
                        "Could not read {s} from input ccs fofn.".format(
                            s=seqid))

                logging.debug("Getting QVs for %s ...", r.name)
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    hn=hn, s_e=s_e,
                    qv_name="QualityValue")

                if len(r.sequence) != len(qvs):
                    raise ValueError(
                        "Sequence and QVs of {r} should be the same!".format(
                            r=r.name))
                writer.writeRecord(r.name, r.sequence, qvs)
    finally:
        # Release every opened reader even if an error interrupted the loop;
        # previously they were only closed on the success path.
        for bas_file, bas_handler in bas_handlers.items():
            logging.debug("Closing %s ...", bas_file)
            bas_handler.close()