def map_isoforms_to_reference_transcripts(self):
    """Map isoforms to reference transcripts with BLASR.

    Builds and runs a ``blasr`` command (``--bestn 1 -m 5``) that takes
    ``self.isoseq_output_fa`` and ``self.reference_transcripts_fn`` as its
    two inputs and writes to ``<self.output_analysis_fn>.blasr.out.m5``,
    then reads that m5 file back.

    Returns:
        list: every record produced by ``BLASRM5Reader`` for the m5 file.
    """
    m5out = self.output_analysis_fn + ".blasr.out.m5"
    cmd = 'blasr %s %s --bestn 1 -m 5 --out %s' % \
        (self.isoseq_output_fa, self.reference_transcripts_fn, m5out)
    execute(cmd)
    # Use the reader as a context manager (as the rest of this module does)
    # so it is exited once all records have been materialized, instead of
    # being left open until garbage collection.
    with BLASRM5Reader(m5out) as reader:
        return list(reader)
def blasr_against_ref2(output_filename, is_FL, sID_starts_with_c,
                       qver_get_func, qvmean_get_func, qv_prob_threshold=.03,
                       ece_penalty=1, ece_min_len=20, same_strand_only=True,
                       max_missed_start=200, max_missed_end=50,
                       full_missed_start=50, full_missed_end=30):
    """
    Read a BLASR m5 file and yield exactly one HitItem per alignment record.

    A rejected record is reported as HitItem(qID=..., cID=...) with no
    alignment details; an accepted record additionally carries qStart, qEnd,
    missed_q, missed_t, fakecigar and ece_arr.

    Excluding criteria, checked in this order:
    (1) self hit (cID == qID)
    (2) opposite strand hit (should already be in the same orientation;
        can override with <same_strand_only> set to False)
    (3) query not fully mapped: more than <full_missed_start> bp unaligned
        at the query start or more than <full_missed_end> bp unaligned at
        the query end (applied regardless of <is_FL>)
    (4) if <is_FL>: alignment_missed_start_end_less_than_threshold() is
        false for <max_missed_start>, <max_missed_end>,
        <full_missed_start>, <full_missed_end>
    (5) alignment_has_large_nonmatch() is true for the ECE array computed by
        eval_blasr_alignment(), given <ece_penalty> and <ece_min_len>

    output_filename --- BLASR m5 output file, read with BLASRM5Reader
    is_FL --- whether to apply criterion (4); the non-full-length case is
              not really tested...don't use
    sID_starts_with_c --- if True, subject IDs must look like
                          c<cluster_index>, optionally followed by '/...'
                          or '_ref'; cID is then the integer cluster index.
                          Otherwise cID is the subject ID as is.
    qver_get_func --- should be basQV.basQVcacher.get() or
                      .get_smoothed(), or can just pass in
                      lambda (x, y): 1. to ignore QV
    qvmean_get_func, qv_prob_threshold --- passed through to
                      eval_blasr_alignment()
    """
    with BLASRM5Reader(output_filename) as reader:
        for r in reader:
            if sID_starts_with_c:
                # because all consensus should start with
                # c<cluster_index>
                assert r.sID.startswith('c')
                # The record itself is updated (not just a local copy)
                # because it is handed to eval_blasr_alignment() below.
                if r.sID.find('/') > 0:
                    r.sID = r.sID.split('/')[0]
                if r.sID.endswith('_ref'):
                    # probably c<cid>_ref
                    cID = int(r.sID[1:-4])
                else:
                    cID = int(r.sID[1:])
            else:
                cID = r.sID

            # self hit, useless!
            # opposite strand not allowed!
            # NOTE(review): with sID_starts_with_c set, cID is an int, so the
            # self-hit test only fires if r.qID compares equal to an int --
            # confirm against the reader's qID type.
            if cID == r.qID or (r.strand == '-' and same_strand_only):
                yield HitItem(qID=r.qID, cID=cID)
                continue

            # regardless of whether it is full-length (is_FL),
            # the query MUST be mapped fully (based on full_missed_start/end)
            if (r.qStart > full_missed_start or
                    (r.qLength - r.qEnd) > full_missed_end):
                yield HitItem(qID=r.qID, cID=cID)
                # Stop here: without this the record fell through and a
                # second HitItem (possibly an accepted one) was yielded for
                # the same alignment, defeating this filter.
                continue

            # full-length case: allow up to max_missed_start bp of 5' not
            # aligned and max_missed_end bp of 3' not aligned
            # non-full-length case: not really tested...don't use
            if is_FL and not alignment_missed_start_end_less_than_threshold(
                    r, max_missed_start, max_missed_end,
                    full_missed_start, full_missed_end):
                yield HitItem(qID=r.qID, cID=cID)
                continue

            cigar_str, ece_arr = eval_blasr_alignment(
                record=r,
                qver_get_func=qver_get_func,
                qvmean_get_func=qvmean_get_func,
                sID_starts_with_c=sID_starts_with_c,
                qv_prob_threshold=qv_prob_threshold)

            if alignment_has_large_nonmatch(ece_arr, ece_penalty,
                                            ece_min_len):
                yield HitItem(qID=r.qID, cID=cID)
                continue

            # Accepted hit: report the missed amounts as fractions of the
            # query / subject lengths (the "* 1." forces float division).
            missed_q = r.qStart + r.qLength - r.qEnd
            missed_t = r.sStart + r.sLength - r.sEnd
            yield HitItem(qID=r.qID, cID=cID,
                          qStart=r.qStart, qEnd=r.qEnd,
                          missed_q=missed_q * 1. / r.qLength,
                          missed_t=missed_t * 1. / r.sLength,
                          fakecigar=cigar_str,
                          ece_arr=ece_arr)