def merge_reads(reads):
    """ Yield every read pair along with the result of merging it.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge
        fails):
        (pair_name,
         (read1_name, bases, quality),
         (read2_name, bases, quality),
         merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        if seq1 and seq2:
            # Read 2 is reverse-complemented before it is aligned to read 1.
            aligned1, aligned2, score = align_it(
                seq1,
                reverse_and_complement(seq2),
                GAP_OPEN_COST,
                GAP_EXTEND_COST,
                USE_TERMINAL_COST)
        else:
            # Nothing to align: use a sentinel score and no alignment.
            aligned1, aligned2, score = None, None, -1

        can_merge = (score >= MIN_PAIR_ALIGNMENT_SCORE and
                     aligned1[0] != '-')
        merged = None
        if can_merge:
            # Read 2's qualities are reversed to match its reversed bases.
            merged = merge_pairs(aligned1,
                                 aligned2,
                                 align_quality(aligned1, qual1),
                                 align_quality(aligned2, reversed(qual2)),
                                 q_cutoff=Q_CUTOFF)

        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
def merge_reads(reads):
    """ Generator over merged reads.

    A pair is only merged when both reads have bases, the alignment score
    reaches MIN_PAIR_ALIGNMENT_SCORE, and the aligned first read does not
    start with a gap. In every other case merged_bases is None.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge
        fails):
        (pair_name,
         (read1_name, bases, quality),
         (read2_name, bases, quality),
         merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        # Decide the "no merge" outcome up front. An empty read then never
        # reaches the alignment variables, which would otherwise be unbound
        # on the first pair or left over from the previous pair.
        merged = None
        if seq1 and seq2:
            seq2_rev = reverse_and_complement(seq2)
            aligned1, aligned2, score = align_it(seq1,
                                                 seq2_rev,
                                                 GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
            if score >= MIN_PAIR_ALIGNMENT_SCORE and aligned1[0] != '-':
                aligned_qual1 = align_quality(aligned1, qual1)
                # Read 2 was reverse-complemented, so reverse its qualities.
                aligned_qual2 = align_quality(aligned2, reversed(qual2))
                merged = merge_pairs(aligned1,
                                     aligned2,
                                     aligned_qual1,
                                     aligned_qual2,
                                     q_cutoff=Q_CUTOFF)
        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
def testLowQualityInSecondRead(self):
    # Read 2 is padded with '-' so its bases start where read 1 ends.
    # The read 2 base whose quality is '*' is expected to come out as N.
    fwd_bases, rev_bases = 'AGT', '---GCA'
    fwd_quality, rev_quality = 'JJJ', '!!!J*J'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('AGTGNA', merged)
def testGap(self):
    # Read 2's bases start three positions after read 1 ends; the
    # positions covered by neither read are expected as lower-case n.
    fwd_bases, rev_bases = 'AGT', '------GCA'
    fwd_quality, rev_quality = 'JJJ', '!!!!!!JJJ'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('AGTnnnGCA', merged)
def testDisagreementWithLowQuality(self):
    # The reads disagree at the second base (G vs C), with qualities
    # '!' and '*' there; that position is expected to come out as N.
    fwd_bases, rev_bases = 'AGTGCA', 'ACTGCA'
    fwd_quality, rev_quality = 'J!JJJJ', 'J*JJJJ'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('ANTGCA', merged)
def testReverseDeletion(self):
    # Read 2 carries a '-' between its last two bases, beyond the end of
    # read 1; the merged sequence is expected to keep that '-'.
    fwd_bases, rev_bases = 'CTGCA', '--GCAT-T'
    fwd_quality, rev_quality = 'JJJJJ', '!!JJJJ!J'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('CTGCAT-T', merged)
def testForwardDeletion(self):
    # Read 1 carries a '-' at its second position, before read 2's bases
    # begin; the merged sequence is expected to keep that '-'.
    fwd_bases, rev_bases = 'C-GCA', '---CATCT'
    fwd_quality, rev_quality = 'J!JJJ', '!!!JJJJJ'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('C-GCATCT', merged)
def testOffset(self):
    # Both reads begin with '-' padding (one position for read 1, three
    # for read 2); the leading '-' is expected to stay in the result.
    fwd_bases, rev_bases = '-CTGCA', '---GCATCT'
    fwd_quality, rev_quality = '!JJJJJ', '!!!JJJJJJ'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('-CTGCATCT', merged)
def testDifferentLength(self):
    # Read 2 is a prefix of read 1; the merged sequence is expected to
    # equal the longer read.
    fwd_bases, rev_bases = 'ACTGCATCT', 'ACTGCA'
    fwd_quality, rev_quality = 'JJJJJJJJJ', 'JJJJJJ'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('ACTGCATCT', merged)
def testSimple(self):
    # Two identical reads with identical qualities are expected to merge
    # into that same sequence.
    fwd_bases, rev_bases = 'ACTGCA', 'ACTGCA'
    fwd_quality, rev_quality = 'JJJJJJ', 'JJJJJJ'

    merged = merge_pairs(fwd_bases, rev_bases, fwd_quality, rev_quality)

    self.assertEqual('ACTGCA', merged)
def testLowQualityInSecondRead(self):
    # The second read's bases start right after the first read's; its
    # middle base has quality '*' and is expected to be reported as N.
    read1_bases = 'AGT'
    read1_quality = 'JJJ'
    read2_bases = '---GCA'
    read2_quality = '!!!J*J'
    expected = 'AGTGNA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testDisagreementWithLowQuality(self):
    # Second base differs between the reads (G vs C) and has qualities
    # '!' and '*'; N is expected at that position.
    read1_bases = 'AGTGCA'
    read1_quality = 'J!JJJJ'
    read2_bases = 'ACTGCA'
    read2_quality = 'J*JJJJ'
    expected = 'ANTGCA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testGap(self):
    # Three positions lie between the end of read 1 and the first base
    # of read 2; they are expected to be filled with lower-case n.
    read1_bases = 'AGT'
    read1_quality = 'JJJ'
    read2_bases = '------GCA'
    read2_quality = '!!!!!!JJJ'
    expected = 'AGTnnnGCA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testDisagreementWithDifferentQualityFirstHigher(self):
    # Second base differs (G vs C) with qualities 'J' for read 1 and 'E'
    # for read 2; read 1's base is expected in the result.
    read1_bases = 'AGTGCA'
    read1_quality = 'JJJJJJ'
    read2_bases = 'ACTGCA'
    read2_quality = 'JEJJJJ'
    expected = 'AGTGCA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testReverseDeletion(self):
    # Read 2 has a '-' between its final two bases, past the end of
    # read 1; the result is expected to retain it.
    read1_bases = 'CTGCA'
    read1_quality = 'JJJJJ'
    read2_bases = '--GCAT-T'
    read2_quality = '!!JJJJ!J'
    expected = 'CTGCAT-T'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testOffset(self):
    # Read 1 starts one position in and read 2 three positions in; the
    # leading '-' is expected to survive the merge.
    read1_bases = '-CTGCA'
    read1_quality = '!JJJJJ'
    read2_bases = '---GCATCT'
    read2_quality = '!!!JJJJJJ'
    expected = '-CTGCATCT'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testForwardDeletion(self):
    # Read 1 has a '-' at its second position, ahead of where read 2's
    # bases start; the result is expected to retain it.
    read1_bases = 'C-GCA'
    read1_quality = 'J!JJJ'
    read2_bases = '---CATCT'
    read2_quality = '!!!JJJJJ'
    expected = 'C-GCATCT'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testSimple(self):
    # Identical bases and qualities in both reads: the merge is expected
    # to return the same sequence.
    read1_bases = 'ACTGCA'
    read1_quality = 'JJJJJJ'
    read2_bases = 'ACTGCA'
    read2_quality = 'JJJJJJ'
    expected = 'ACTGCA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testDifferentLength(self):
    # Read 2 stops three bases before read 1 does; the merge is expected
    # to equal the longer read 1.
    read1_bases = 'ACTGCATCT'
    read1_quality = 'JJJJJJJJJ'
    read2_bases = 'ACTGCA'
    read2_quality = 'JJJJJJ'
    expected = 'ACTGCATCT'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testDisagreementWithCloseQualitySecondHigher(self):
    # Second base differs (G vs C) with qualities 'F' for read 1 and 'J'
    # for read 2; N is expected at that position.
    read1_bases = 'AGTGCA'
    read1_quality = 'JFJJJJ'
    read2_bases = 'ACTGCA'
    read2_quality = 'JJJJJJ'
    expected = 'ANTGCA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality)

    self.assertEqual(expected, actual)
def testTwoInsertions(self):
    # Each read brings one insertion, keyed by position: CCC at 2 from
    # read 1 and TTT at 5 from read 2. Both are expected in the result.
    fwd_bases, rev_bases = 'AGT', '---GCA'
    fwd_quality, rev_quality = 'JJJ', '!!!JJJ'
    fwd_inserts = {2: ('CCC', 'JJJ')}
    rev_inserts = {5: ('TTT', 'JJJ')}

    merged = merge_pairs(fwd_bases,
                         rev_bases,
                         fwd_quality,
                         rev_quality,
                         fwd_inserts,
                         rev_inserts)

    self.assertEqual('AGCCCTGCTTTA', merged)
def merge_reads(quality_cutoff, read_pair):
    """ Merge a pair of reads.

    Also skip reads that don't meet certain criteria.
    @param quality_cutoff: minimum quality score for a base to be counted
    @param read_pair: a sequence of two sequences, each with fields from a
        SAM file record
    @return: (rname, mseq, merged_inserts, qual1, qual2) or None to skip the
        pair
    """
    first, second = read_pair
    if second and first[2] != second[2]:
        # region mismatch, ignore the read pair.
        return None

    rname = None
    # One (cigar, seq, qual, pos - 1) entry for every mapped read.
    mapped = []
    for sam_fields in read_pair:
        if not sam_fields:
            continue
        # Only the first eleven fields are used; optional ones are ignored.
        (_qname,
         flag,
         rname,
         refpos_str,
         _mapq,
         cigar,
         _rnext,
         _pnext,
         _tlen,
         seq,
         qual) = sam_fields[:11]
        if is_unmapped_read(flag):
            continue
        mapped.append((cigar, seq, qual, int(refpos_str) - 1))

    if not mapped:
        return None

    seq1, qual1, ins1 = apply_cigar(*mapped[0])
    if len(mapped) > 1:
        seq2, qual2, ins2 = apply_cigar(*mapped[1])
    else:
        # Only one mapped read: merge it with an empty partner.
        seq2 = qual2 = ''
        ins2 = None

    mseq = merge_pairs(seq1, seq2, qual1, qual2, q_cutoff=quality_cutoff)
    merged_inserts = merge_inserts(ins1, ins2, quality_cutoff)
    return rname, mseq, merged_inserts, qual1, qual2
def testConflictingInsertions(self):
    # Both reads insert at position 2 but disagree on the middle base
    # (CCC with 'JJJ' vs CTC with 'JAJ'); read 1's CCC is expected.
    fwd_bases, rev_bases = 'AGTGCA', 'AGTGCA'
    fwd_quality, rev_quality = 'JJJJJJ', 'JJJJJJ'
    fwd_inserts = {2: ('CCC', 'JJJ')}
    rev_inserts = {2: ('CTC', 'JAJ')}

    merged = merge_pairs(fwd_bases,
                         rev_bases,
                         fwd_quality,
                         rev_quality,
                         fwd_inserts,
                         rev_inserts)

    self.assertEqual('AGCCCTGCA', merged)
def testConflictingInsertions(self):
    # The two reads carry different insertions at the same position 2
    # (CCC/'JJJ' and CTC/'JAJ'); the result is expected to contain CCC.
    read1_bases = 'AGTGCA'
    read1_quality = 'JJJJJJ'
    read1_inserts = {2: ('CCC', 'JJJ')}
    read2_bases = 'AGTGCA'
    read2_quality = 'JJJJJJ'
    read2_inserts = {2: ('CTC', 'JAJ')}
    expected = 'AGCCCTGCA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality,
                         read1_inserts,
                         read2_inserts)

    self.assertEqual(expected, actual)
def testTwoInsertions(self):
    # Read 1 contributes CCC at position 2 and read 2 contributes TTT at
    # position 5; both insertions are expected in the merged sequence.
    read1_bases = 'AGT'
    read1_quality = 'JJJ'
    read1_inserts = {2: ('CCC', 'JJJ')}
    read2_bases = '---GCA'
    read2_quality = '!!!JJJ'
    read2_inserts = {5: ('TTT', 'JJJ')}
    expected = 'AGCCCTGCTTTA'

    actual = merge_pairs(read1_bases,
                         read2_bases,
                         read1_quality,
                         read2_quality,
                         read1_inserts,
                         read2_inserts)

    self.assertEqual(expected, actual)
def merge_reads(quality_cutoff, read_pair):
    """ Merge a pair of reads.

    Also skip reads that don't meet certain criteria.
    @param quality_cutoff: minimum quality score for a base to be counted
    @param read_pair: a sequence of two sequences, each with fields from a
        SAM file record
    @return: (rname, mseq, merged_inserts, qual1, qual2) or None to skip the
        pair
    """
    head, tail = read_pair
    regions_differ = tail and head[2] != tail[2]
    if regions_differ:
        # region mismatch, ignore the read pair.
        return None

    kept = []
    for record in read_pair:
        if record:
            # ignore optional fields
            core_fields = record[:11]
            (_qname, flag, rname, refpos_str, _mapq,
             cigar, _rnext, _pnext, _tlen, seq, qual) = core_fields
            if not is_unmapped_read(flag):
                kept.append({'rname': rname,
                             'cigar': cigar,
                             'seq': seq,
                             'qual': qual,
                             'pos': int(refpos_str)})
    if len(kept) == 0:
        return None

    primary = kept[0]
    seq1, qual1, ins1 = apply_cigar(primary['cigar'],
                                    primary['seq'],
                                    primary['qual'],
                                    primary['pos'] - 1)
    if len(kept) != 1:
        mate = kept[1]
        seq2, qual2, ins2 = apply_cigar(mate['cigar'],
                                        mate['seq'],
                                        mate['qual'],
                                        mate['pos'] - 1)
    else:
        # A lone mapped read is merged against an empty second read.
        seq2 = qual2 = ''
        ins2 = None

    mseq = merge_pairs(seq1, seq2, qual1, qual2, q_cutoff=quality_cutoff)
    merged_inserts = merge_inserts(ins1, ins2, quality_cutoff)
    # rname still holds the value unpacked from the last non-empty record.
    return rname, mseq, merged_inserts, qual1, qual2
def sam_g2p(pssm, remap_csv, nuc_csv, g2p_csv, g2p_summary_csv=None, min_count=1):
    """ Merge paired reads clipped to V3LOOP and write one row per sequence.

    @param pssm: passed through to _build_row() for every reported sequence
    @param remap_csv: open file read with csv.DictReader; the columns used
        are qname, rname, pos, cigar, seq and qual
    @param nuc_csv: open file read with csv.DictReader; the columns used are
        seed, region and query.nuc.pos
    @param g2p_csv: open file that receives one row per merged sequence
    @param g2p_summary_csv: optional open file that receives a header row
        and one row of totals
    @param min_count: merged sequences seen fewer times than this are not
        reported individually; their counts are summed into a single row
    """
    unpaired = {}  # qname -> (seq, qual, inserts) of a read awaiting its mate
    merged_counts = Counter()  # { merged_nuc_seq: count }
    region_tracker = RegionTracker('V3LOOP')

    # look up clipping region for each read
    for nuc_row in csv.DictReader(nuc_csv):
        query_pos = nuc_row['query.nuc.pos']
        if query_pos == '':
            # skip deletions in query relative to reference
            continue
        region_tracker.add_nuc(nuc_row['seed'],
                               nuc_row['region'],
                               int(query_pos) - 1)

    # parse contents of remap CSV output
    for remap_row in csv.DictReader(remap_csv):
        clip_from, clip_to = region_tracker.get_range(remap_row['rname'])
        if clip_from is None or remap_row['cigar'] == '*':
            # uninteresting region
            continue
        seq2, qual2, ins2 = apply_cigar(remap_row['cigar'],
                                        remap_row['seq'],
                                        remap_row['qual'],
                                        int(remap_row['pos']) - 1,
                                        clip_from,
                                        clip_to)
        qname = remap_row['qname']
        mate = unpaired.pop(qname, None)
        if mate is None:
            # First read of the pair: hold it until the mate shows up.
            unpaired[qname] = (seq2, qual2, ins2)
            continue
        seq1, qual1, ins1 = mate
        mseq = merge_pairs(seq1, seq2, qual1, qual2, ins1, ins2)
        merged_counts[mseq] += 1

    # apply g2p algorithm to merged reads
    column_names = ['rank',
                    'count',
                    'g2p',
                    'fpr',
                    'call',
                    'seq',
                    'aligned',
                    'error',
                    'comment']
    g2p_writer = csv.DictWriter(g2p_csv,
                                column_names,
                                lineterminator=os.linesep)
    g2p_writer.writeheader()
    counts = Counter()
    skip_count = 0
    for merged_seq, count in merged_counts.most_common():
        if count >= min_count:
            # remove in-frame deletions
            seq = re.sub(pat, r'\g<1>\g<3>', merged_seq)
            g2p_writer.writerow(_build_row(seq, count, counts, pssm))
        else:
            skip_count += count

    if skip_count:
        # Report all the rare sequences together in one extra row.
        counts['mapped'] += skip_count
        skipped_row = dict(rank=counts['rank'] + 1,
                           count=skip_count,
                           error='count < {}'.format(min_count))
        g2p_writer.writerow(skipped_row)

    if g2p_summary_csv is None:
        return

    valid_count = counts['valid']
    if valid_count == 0:
        # No valid sequences: leave the percentage and call blank.
        x4_pct_display = final_call = ''
    else:
        x4_pct = 100.0 * counts['x4'] / valid_count
        final_call = 'X4' if x4_pct >= 2.0 else 'R5'
        x4_pct_display = '{:0.2f}'.format(x4_pct)
    summary_writer = csv.writer(g2p_summary_csv, lineterminator=os.linesep)
    summary_writer.writerow(['mapped', 'valid', 'X4calls', 'X4pct', 'final'])
    summary_writer.writerow([counts['mapped'],
                             valid_count,
                             counts['x4'],
                             x4_pct_display,
                             final_call])