def pickup_best_clusters(self):
    """Pick up high-QV clusters.

    Read the arrowed consensus FASTQ files listed in ``self.fq_filenames``,
    compute each cluster's expected accuracy from its per-base QVs
    (ignoring the first ``qv_trim_5`` and last ``qv_trim_3`` bases), then
    write HQ clusters to ``arrowed_good_fa|fq`` and the rest to
    ``arrowed_bad_fa|fq``.  A cluster is HQ when its expected accuracy is
    at least ``hq_arrow_min_accuracy`` and it is supported by at least
    ``hq_min_full_length_reads`` full-length reads.
    """
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(self.fq_filenames)))
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    # uc cids may be ints (a single Ice2 directory, e.g. c10 -> 10) or
    # strings (e.g. b112_c0 after collecting several Ice2 directories);
    # only convert parsed cids to int when the pickled keys are ints.
    uc_keys_are_int = isinstance(next(iter(uc)), int)

    polished = {}  # cid --> FastqRecord
    for fq in self.fq_filenames:
        self.add_log("Looking at arrowed fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # possible ID #1: c0|arrow (a single Ice2 directory)
            # possible ID #2: b112_c0|arrow (after collecting several
            #                 Ice2 directories)
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            i = cid.find('/')
            if i > 0:
                cid = cid[:i]
            if uc_keys_are_int:
                cid = int(cid[1:])  # strip the leading 'c'
            polished[cid] = r

    expected_acc_dict = {}  # cid --> expected accuracy (ex: 0.99)
    good = set()  # cids that are HQ; a set makes the test below O(1)
    # Expected accuracy is computed over the QVs remaining after 5'/3'
    # trimming; reads shorter than the trim amounts use the whole read.
    # Fixes two bugs in the previous version:
    #   * max(full_len, full_len - t5 - t3) always returned full_len,
    #     so the trimmed error sum was divided by the wrong length;
    #   * q[t5:-t3] is the empty slice whenever qv_trim_3 == 0.
    for cid, r in polished.iteritems():
        full_len = len(r.quality)
        trim_len = full_len - self.qv_trim_5 - self.qv_trim_3
        q = [phred_to_qv(x) for x in r.quality]
        if trim_len > 0:
            qv_len = trim_len
            err_sum = sum(q[self.qv_trim_5:full_len - self.qv_trim_3])
        else:
            qv_len = full_len
            err_sum = sum(q)
        if qv_len == 0:  # guard against an empty quality string
            expected_acc_dict[cid] = 0.
        else:
            expected_acc_dict[cid] = 1.0 - (err_sum / float(qv_len))
        if expected_acc_dict[cid] >= self.hq_arrow_min_accuracy and \
                len(uc[cid]) >= self.hq_min_full_length_reads:
            good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    partial_uc2 = defaultdict(lambda: [])
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc, partial_uc=partial_uc2)

    self.add_log("Writing high-quality isoforms to {f}|fq".format(
        f=self.arrowed_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.arrowed_bad_fa))

    with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \
            FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \
            FastqWriter(self.arrowed_good_fq) as good_fq_writer, \
            FastqWriter(self.arrowed_bad_fq) as bad_fq_writer:
        for cid, r in polished.iteritems():
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".format(
                cid=cid,
                flnc_num=len(uc[cid]),
                nfl_num=len(partial_uc2[cid]),
                read_len=len(r.sequence))
            newname = cid_with_annotation2(
                newname, expected_acc=expected_acc_dict[cid])
            if cid in good:
                self.add_log(
                    "processing arrowed cluster {c} --> good.".format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence[:])
                good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
            else:
                self.add_log(
                    "processing arrowed cluster {c} --> bad.".format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence[:])
                bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log(
        "High-quality Arrowed consensus written " +
        "to:\n{0}\n{1}".format(self.arrowed_good_fa, self.arrowed_good_fq),
        level=logging.INFO)
    self.add_log(
        "Low-quality Arrowed consensus written " +
        "to:\n{0}\n{1}".format(self.arrowed_bad_fa, self.arrowed_bad_fq),
        level=logging.INFO)
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self, fq_filenames):
    """Pick up high-QV clusters.

    Read the quivered consensus FASTQ files in `fq_filenames`, compute
    each cluster's expected accuracy from its per-base QVs (ignoring the
    first ``qv_trim_5`` and last ``qv_trim_3`` bases), then write HQ
    clusters to ``quivered_good_fa|fq`` and the rest to
    ``quivered_bad_fa|fq``.

    Parameters:
      fq_filenames -- list of quivered consensus FASTQ file paths.
    """
    self.add_log("Picking up the best clusters according to QVs from {fs}.".
                 format(fs=", ".join(fq_filenames)))
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    quivered = {}  # cid --> FastqRecord
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # possible ID: c0/0_1611|quiver
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            i = cid.find('/')
            if i > 0:
                cid = cid[:i]
            cid = int(cid[1:])  # strip the leading 'c', e.g. c10 -> 10
            quivered[cid] = r

    good = set()  # HQ cids; a set makes the membership test below O(1)
    for cid, r in quivered.iteritems():
        qv_len = len(r.quality) - self.qv_trim_5 - self.qv_trim_3
        if qv_len > 0:
            q = [phred_to_qv(x) for x in r.quality]
            # Slice to an explicit end index: q[a:-0] would be the empty
            # slice whenever qv_trim_3 == 0, zeroing the error sum.
            err_sum = sum(q[self.qv_trim_5:len(q) - self.qv_trim_3])
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy and \
                    len(uc[cid]) >= self.hq_min_full_length_reads:
                good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    partial_uc2 = defaultdict(lambda: [])
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc, partial_uc=partial_uc2)

    self.add_log("Writing high-quality isoforms to {f}|fq".
                 format(f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".
                 format(f=self.quivered_bad_fa))

    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid, r in quivered.iteritems():
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".format(
                cid=cid,
                flnc_num=len(uc[cid]),
                nfl_num=len(partial_uc2[cid]),
                read_len=len(r.sequence))
            newname = cid_with_annotation(newname)
            if cid in good:
                self.add_log("processing quivered cluster {c} --> good.".
                             format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence[:])
                good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
            else:
                self.add_log("processing quivered cluster {c} --> bad.".
                             format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence[:])
                bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                        self.quivered_good_fq),
                 level=logging.INFO)
    self.add_log("Low-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_bad_fa,
                                        self.quivered_bad_fq),
                 level=logging.INFO)
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self, fq_filenames):
    """Pick up high-QV clusters.

    Read the quivered consensus FASTQ files in `fq_filenames`, compute
    each cluster's expected accuracy from its per-base QVs (ignoring the
    first ``qv_trim_5`` and last ``qv_trim_3`` bases), then write HQ
    clusters to ``quivered_good_fa|fq`` and the rest to
    ``quivered_bad_fa|fq``.

    Parameters:
      fq_filenames -- list of quivered consensus FASTQ file paths.
    """
    self.add_log("Picking up the best clusters according to QVs from {fs}.".
                 format(fs=", ".join(fq_filenames)))
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    quivered = {}  # cid --> FastqRecord
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # possible ID: c0/0_1611|quiver
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            i = cid.find('/')
            if i > 0:
                cid = cid[:i]
            cid = int(cid[1:])  # strip the leading 'c', e.g. c10 -> 10
            quivered[cid] = r

    good = set()  # HQ cids; a set makes the membership test below O(1)
    for cid, r in quivered.iteritems():
        qv_len = len(r.quality) - self.qv_trim_5 - self.qv_trim_3
        if qv_len > 0:
            q = [phred_to_qv(x) for x in r.quality]
            # Slice to an explicit end index: q[a:-0] would be the empty
            # slice whenever qv_trim_3 == 0, zeroing the error sum.
            err_sum = sum(q[self.qv_trim_5:len(q) - self.qv_trim_3])
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy and \
                    len(uc[cid]) >= self.hq_min_full_length_reads:
                good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    partial_uc2 = defaultdict(lambda: [])
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc, partial_uc=partial_uc2)

    self.add_log("Writing high-quality isoforms to {f}|fq".
                 format(f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".
                 format(f=self.quivered_bad_fa))

    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid, r in quivered.iteritems():
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".format(
                cid=cid,
                flnc_num=len(uc[cid]),
                nfl_num=len(partial_uc2[cid]),
                read_len=len(r.sequence))
            newname = cid_with_annotation(newname)
            if cid in good:
                self.add_log("processing quivered cluster {c} --> good.".
                             format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence[:])
                good_fq_writer.writeRecord(newname, r.sequence[:], r.quality)
            else:
                self.add_log("processing quivered cluster {c} --> bad.".
                             format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence[:])
                bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    # Explicit INFO level added for consistency with the surrounding
    # summary log calls.
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                        self.quivered_good_fq),
                 level=logging.INFO)
    self.add_log("Low-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_bad_fa,
                                        self.quivered_bad_fq),
                 level=logging.INFO)
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self): """Pick up hiqh QV clusters.""" self.add_log("Picking up the best clusters according to QVs from {fs}.". format(fs=", ".join(self.fq_filenames))) a = load(open(self.final_pickle_fn)) uc = a['uc'] # check if the uc cids are integers uc_keys_are_int = type(uc.keys()[0]) is int polished = {} # cid --> FastqRecord for fq in self.fq_filenames: self.add_log("Looking at arrowed fq {f}".format(f=fq)) for r in FastqReader(fq): # possible ID #1: c0|arrow (a single Ice2 directory) # possible ID #2: b112_c0|arrow (after collecting several Ice2 directory) cid = r.name.split('|')[0] if cid.endswith('_ref'): cid = cid[:-4] i = cid.find('/') if i > 0: cid = cid[:i] if uc_keys_are_int: # only convert in the case where uc keys are integers (ex: is c10, but 10) cid = int(cid[1:]) #becuz possible ID #2, dont convert to int polished[cid] = r expected_acc_dict = {} # cid --> expected accuracy (ex: 0.99) good = [] # contains all the cids that are HQ # calculate expected QV given 5'/3' trimming # for sequences that are shorter than the trimming, use the length itself for cid, r in polished.iteritems(): qv_len = max(len(r.quality), len(r.quality) - self.qv_trim_5 - self.qv_trim_3) q = [phred_to_qv(x) for x in r.quality] err_sum = sum(q[self.qv_trim_5: -self.qv_trim_3]) expected_acc_dict[cid] = 1.0 - (err_sum / float(qv_len)) if expected_acc_dict[cid] >= self.hq_arrow_min_accuracy and \ len(uc[cid]) >= self.hq_min_full_length_reads : good.append(cid) partial_uc = load(open(self.nfl_all_pickle_fn))['partial_uc'] partial_uc2 = defaultdict(lambda: []) partial_uc2.update(partial_uc) if self.report_fn is not None: self.write_report(report_fn=self.report_fn, uc=uc, partial_uc=partial_uc2) self.add_log("Writing hiqh-quality isoforms to {f}|fq". format(f=self.arrowed_good_fa)) self.add_log("Writing low-quality isoforms to {f}|fq". 
format(f=self.arrowed_bad_fa)) with FastaWriter(self.arrowed_good_fa) as good_fa_writer, \ FastaWriter(self.arrowed_bad_fa) as bad_fa_writer, \ FastqWriter(self.arrowed_good_fq) as good_fq_writer, \ FastqWriter(self.arrowed_bad_fq) as bad_fq_writer: for cid in polished: r = polished[cid] newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\ format(cid=cid, flnc_num=len(uc[cid]), nfl_num=len(partial_uc2[cid]), read_len=len(r.sequence)) newname = cid_with_annotation2(newname, expected_acc=expected_acc_dict[cid]) if cid in good: self.add_log("processing arrowed cluster {c} --> good.". format(c=cid)) good_fa_writer.writeRecord(newname, r.sequence[:]) good_fq_writer.writeRecord(newname, r.sequence[:], r.quality) else: self.add_log("processing arrowed cluster {c} --> bad.". format(c=cid)) bad_fa_writer.writeRecord(newname, r.sequence[:]) bad_fq_writer.writeRecord(newname, r.sequence[:], r.quality) self.add_log("-" * 60, level=logging.INFO) self.add_log("High-quality Arrowed consensus written " + "to:\n{0}\n{1}".format(self.arrowed_good_fa, self.arrowed_good_fq), level=logging.INFO) self.add_log("Low-quality Arrowed consensus written " + "to:\n{0}\n{1}".format(self.arrowed_bad_fa, self.arrowed_bad_fq), level=logging.INFO) self.add_log("-" * 60, level=logging.INFO)