def _do_trim(self, seq):
    '''It trims or masks the given seq following its trimming recommendations.

    The TRIMMING_RECOMMENDATIONS annotation is removed from the seq
    annotations as soon as it has been read.

    It returns:
        - the given seq, when it has no TRIMMING_RECOMMENDATIONS annotation
          or when the recommendations hold no segment for any kind,
        - the result of _mask_sequence, when self.mask is set,
        - the result of slice_seq, when there are limits to trim to,
        - None, when get_longest_complementary_segment returns None.
    '''
    annots = get_annotations(seq)
    if TRIMMING_RECOMMENDATIONS not in annots:
        return seq

    trim_rec = annots[TRIMMING_RECOMMENDATIONS]
    # the recommendations are used up here, so they are dropped from the seq
    del annots[TRIMMING_RECOMMENDATIONS]

    # the segments of all the trimming kinds are pooled together
    trim_segments = []
    for trim_kind in TRIMMING_KINDS:
        trim_segments.extend(trim_rec.get(trim_kind, []))

    # masking
    if self.mask:
        return _mask_sequence(seq, trim_segments)

    # trimming
    if not trim_segments:
        # nothing was recommended, so there is nothing to cut
        return seq

    trim_limits = get_longest_complementary_segment(trim_segments,
                                                    get_length(seq))
    if trim_limits is None:
        # there's no sequence left
        return None
    if trim_limits:
        # NOTE(review): the "+ 1" suggests that the limits are inclusive and
        # that slice_seq takes an exclusive end -- confirm
        seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)
    return seq
def test_blast_short_trimming(self):
    'Blast-short adds VECTOR recommendations to the reads with the oligos.'

    def vector_recommendations(trim_packet):
        'It lists the VECTOR segments recommended for every passed seq.'
        recommendations = []
        for seq_group in trim_packet[SEQS_PASSED]:
            for seq in seq_group:
                seq_recs = get_annotations(seq).get(TRIMMING_RECOMMENDATIONS,
                                                    {})
                recommendations.append(seq_recs.get(VECTOR, []))
        return recommendations

    # With SeqRecords
    record1 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG'))
    record2 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT'))
    record_oligos = [SeqWrapper(SEQRECORD, record1, None),
                     SeqWrapper(SEQRECORD, record2, None)]
    trimmer = TrimWithBlastShort(oligos=record_oligos)
    packets = read_seq_packets([StringIO(FASTQ4)],
                               prefered_seq_classes=[SEQRECORD])
    first_packet = list(seq_to_trim_packets(packets))[0]
    trimmed_packet = trimmer(first_packet)
    # It should trim the first and the second reads.
    found = vector_recommendations(trimmed_packet)
    assert found == [[(0, 29)], [(0, 29)], []]

    # With SeqItems
    item1 = SeqItem('oligo1', ['>oligo1\n',
                               'AAGCAGTGGTATCAACGCAGAGTACATGGG\n'])
    item2 = SeqItem('oligo2', ['>oligo2\n',
                               'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n'])
    item_oligos = [SeqWrapper(SEQITEM, item1, 'fasta'),
                   SeqWrapper(SEQITEM, item2, 'fasta')]
    trimmer = TrimWithBlastShort(oligos=item_oligos)
    packets = list(read_seq_packets([StringIO(FASTQ4)],
                                    prefered_seq_classes=[SEQITEM]))
    first_packet = list(seq_to_trim_packets(packets))[0]
    trimmed_packet = trimmer(first_packet)
    # It should trim the first and the second reads.
    found = vector_recommendations(trimmed_packet)
    assert found == [[(0, 29)], [(0, 29)], []]
def test_blast_short_trimming(self):
    'The oligos found by blast-short are recommended as VECTOR segments.'

    def trim_and_get_vector_segments(trimmer, seq_packets):
        'It trims the first trim packet and collects its VECTOR segments.'
        first_trim_packet = list(seq_to_trim_packets(seq_packets))[0]
        trimmed = trimmer(first_trim_packet)
        return [
            get_annotations(seq).get(TRIMMING_RECOMMENDATIONS,
                                     {}).get(VECTOR, [])
            for seq_group in trimmed[SEQS_PASSED]
            for seq in seq_group
        ]

    # The oligos given as SeqRecords
    first_oligo = SeqWrapper(
        SEQRECORD, SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG')), None)
    second_oligo = SeqWrapper(
        SEQRECORD, SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT')), None)
    blast_trim = TrimWithBlastShort(oligos=[first_oligo, second_oligo])
    fastq_fhand = StringIO(FASTQ4)
    seq_packets = read_seq_packets([fastq_fhand],
                                   prefered_seq_classes=[SEQRECORD])
    # Only the first and the second reads should get a recommendation.
    segments = trim_and_get_vector_segments(blast_trim, seq_packets)
    assert segments == [[(0, 29)], [(0, 29)], []]

    # The oligos given as SeqItems
    first_oligo = SeqWrapper(
        SEQITEM,
        SeqItem('oligo1', ['>oligo1\n', 'AAGCAGTGGTATCAACGCAGAGTACATGGG\n']),
        'fasta')
    second_oligo = SeqWrapper(
        SEQITEM,
        SeqItem('oligo2', ['>oligo2\n', 'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n']),
        'fasta')
    blast_trim = TrimWithBlastShort(oligos=[first_oligo, second_oligo])
    fastq_fhand = StringIO(FASTQ4)
    seq_packets = list(read_seq_packets([fastq_fhand],
                                        prefered_seq_classes=[SEQITEM]))
    # Only the first and the second reads should get a recommendation.
    segments = trim_and_get_vector_segments(blast_trim, seq_packets)
    assert segments == [[(0, 29)], [(0, 29)], []]
def test_trimming(self):
    'TrimOrMask cuts the seqs following their trimming recommendations.'
    raw_seq = 'gggtctcatcatcaggg'.upper()
    record = SeqRecord(Seq(raw_seq),
                       annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, record, None)
    recommendations = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    trimmer = TrimOrMask()

    def trim_with_recommendations():
        'It (re)attaches the recommendations to the seq and trims it.'
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = recommendations
        return trimmer([seq])

    # vector at the start and in the middle of the seq
    recommendations['vector'] = [(0, 3), (8, 13)]
    trimmed = trim_with_recommendations()
    assert get_str_seq(trimmed[0]) == 'CTCA'

    recommendations['vector'] = [(0, 0), (8, 13)]
    trimmed = trim_with_recommendations()
    assert get_str_seq(trimmed[0]) == 'GGTCTCA'

    # vector and quality together: no seq is returned
    recommendations['vector'] = [(0, 1), (8, 12)]
    recommendations['quality'] = [(1, 8), (13, 17)]
    trimmed = trim_with_recommendations()
    assert not trimmed

    recommendations['vector'] = [(0, 0), (8, 13)]
    recommendations['quality'] = []
    trimmed = trim_with_recommendations()
    assert get_str_seq(trimmed[0]) == 'GGTCTCA'
    # the trimmed seq no longer carries the recommendations
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(trimmed[0])
def __call__(self, seqs):
    '''It trims or masks the given seqs following their recommendations.

    For every seq:
        - without a TRIMMING_RECOMMENDATIONS annotation, a copy made with
          copy_seq is returned,
        - otherwise the annotation is removed from the seq annotations and
          the seq is masked (when self.mask is set) or sliced to the limits
          given by get_longest_complementary_segment.

    The seqs for which get_longest_complementary_segment returns None are
    left out of the result.

    It returns a list with the processed seqs.
    '''
    mask = self.mask
    processed_seqs = []
    for seq in seqs:
        annots = get_annotations(seq)
        if TRIMMING_RECOMMENDATIONS not in annots:
            processed_seqs.append(copy_seq(seq))
            continue

        trim_rec = annots[TRIMMING_RECOMMENDATIONS]
        # the recommendations are used up here, so they are dropped from
        # the seq
        del annots[TRIMMING_RECOMMENDATIONS]

        # the segments of all the trimming kinds are pooled together
        trim_segments = []
        for trim_kind in TRIMMING_KINDS:
            trim_segments.extend(trim_rec.get(trim_kind, []))

        if mask:
            # masking
            seq = _mask_sequence(seq, trim_segments)
        elif trim_segments:
            # trimming
            trim_limits = get_longest_complementary_segment(
                trim_segments, get_length(seq))
            if trim_limits is None:
                # there's no sequence left
                continue
            if trim_limits:
                # NOTE(review): the "+ 1" suggests that the limits are
                # inclusive and that slice_seq takes an exclusive end --
                # confirm
                seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)
        processed_seqs.append(seq)
    return processed_seqs
def test_trim_chimeric_region(self):
    'It checks the trimming recommendations made by TrimMatePairChimeras.'
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
    query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
    query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query = query1 + query2

    # both file handles are closed (and the temporary file removed) as soon
    # as the reads have been processed
    with NamedTemporaryFile() as fhand:
        fhand.write(query)
        # the reads have to be on disk before the file is opened again
        fhand.flush()
        trim_chimeras = TrimMatePairChimeras(index_fpath)
        with open(fhand.name) as query_fhand:
            # list() reads all the packets while the file is still open
            seq_packets = list(read_seq_packets([query_fhand]))
        trim_packets = list(seq_to_trim_packets(seq_packets))
        trim_packets2 = trim_chimeras(trim_packets[0])

    # Only the first read should get a recommendation: the segment that
    # starts at position 49, where its run of As begins.
    res = [get_annotations(s).get(TRIMMING_RECOMMENDATIONS, {}).get(OTHER, [])
           for seq_group in trim_packets2[SEQS_PASSED] for s in seq_group]
    assert res == [[(49, 105)], []]
def test_trim_chimeric_region(self):
    'It checks the trimming recommendations made by TrimMatePairChimeras.'
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
    query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
    query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query = query1 + query2

    # the context managers close both handles deterministically instead of
    # leaving them to the garbage collector
    with NamedTemporaryFile() as fhand:
        fhand.write(query)
        # flushed so that the second handle sees the whole content
        fhand.flush()
        trim_chimeras = TrimMatePairChimeras(index_fpath)
        with open(fhand.name) as reads_fhand:
            # the packets are fully read before the handle is closed
            seq_packets = list(read_seq_packets([reads_fhand]))
        trim_packets = list(seq_to_trim_packets(seq_packets))
        trim_packets2 = trim_chimeras(trim_packets[0])

    # It should recommend trimming only the first read; the second read
    # gets no recommendation.
    res = [
        get_annotations(s).get(TRIMMING_RECOMMENDATIONS, {}).get(OTHER, [])
        for seq_group in trim_packets2[SEQS_PASSED]
        for s in seq_group
    ]
    assert res == [[(49, 105)], []]
def test_trimming(self):
    'The sequences are trimmed according to the recommendations.'

    def passed_str_seqs(packet):
        'It returns the str seqs of all the seqs in SEQS_PASSED.'
        return [get_str_seq(s)
                for seq_group in packet[SEQS_PASSED] for s in seq_group]

    seq1 = 'gggtctcatcatcaggg'.upper()
    seq = SeqRecord(Seq(seq1), annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, seq, None)
    seqs = [seq]
    trim_packet = {SEQS_PASSED: [seqs], ORPHAN_SEQS: []}

    trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    seq_trimmer = TrimOrMask()

    # the recommendations are set on the seq again before every run
    trim_rec['vector'] = [(0, 3), (8, 13)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    assert passed_str_seqs(trim_packet2) == ['CTCA']

    trim_rec['vector'] = [(0, 0), (8, 13)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    assert passed_str_seqs(trim_packet2) == ['GGTCTCA']

    # with these vector and quality segments no seq passes
    trim_rec['vector'] = [(0, 1), (8, 12)]
    trim_rec['quality'] = [(1, 8), (13, 17)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    assert not trim_packet2[SEQS_PASSED]

    trim_rec['vector'] = [(0, 0), (8, 13)]
    trim_rec['quality'] = []
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    assert passed_str_seqs(trim_packet2) == ['GGTCTCA']

    # the trimmed seq should not carry the recommendations any more
    trimmed_seq = trim_packet2[SEQS_PASSED][0][0]
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(trimmed_seq)
def test_trimming(self):
    'The sequences are trimmed according to the recommendations.'
    seq1 = 'gggtctcatcatcaggg'.upper()
    seq = SeqRecord(Seq(seq1), annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, seq, None)
    seqs = [seq]
    # a trim packet with one group of seqs and no orphan seqs
    trim_packet = {SEQS_PASSED: [seqs], ORPHAN_SEQS: []}

    trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    seq_trimmer = TrimOrMask()

    # the recommendations are set on the seq again before every run
    trim_rec['vector'] = [(0, 3), (8, 13)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    res = [get_str_seq(s)
           for group in trim_packet2[SEQS_PASSED] for s in group]
    assert res == ['CTCA']

    trim_rec['vector'] = [(0, 0), (8, 13)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    res = [get_str_seq(s)
           for group in trim_packet2[SEQS_PASSED] for s in group]
    assert res == ['GGTCTCA']

    # with these vector and quality segments no seq passes
    trim_rec['vector'] = [(0, 1), (8, 12)]
    trim_rec['quality'] = [(1, 8), (13, 17)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    assert not trim_packet2[SEQS_PASSED]

    trim_rec['vector'] = [(0, 0), (8, 13)]
    trim_rec['quality'] = []
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    res = [get_str_seq(s)
           for group in trim_packet2[SEQS_PASSED] for s in group]
    assert res == ['GGTCTCA']

    # the recommendations should be gone from the trimmed seq
    first_trimmed_seq = trim_packet2[SEQS_PASSED][0][0]
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(first_trimmed_seq)