def test_trim_seqs():
    """Lowercase stretches are trimmed from single reads and from read pairs."""

    def passed_strs(packet):
        # Flatten the groups held under SEQS_PASSED into plain sequence strings.
        return [get_str_seq(seq)
                for group in packet[SEQS_PASSED]
                for seq in group]

    # Single SeqRecord-backed reads. Five go in and four are expected out:
    # the all-lowercase 'actg' read does not appear in the expected result.
    raw_reads = ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')
    groups = [[SeqWrapper(SEQRECORD, SeqRecord(Seq(read)), None)]
              for read in raw_reads]
    packet = {SEQS_PASSED: groups, ORPHAN_SEQS: []}
    lowercase_trimmer = TrimLowercasedLetters()
    trimmer = TrimOrMask()
    packet = trimmer(lowercase_trimmer(packet))
    assert passed_strs(packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

    # The same trimmers applied to a fasta SeqItem.
    fasta_item = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
    groups = [[SeqWrapper(SEQITEM, fasta_item, 'fasta')]]
    packet = {SEQS_PASSED: groups, ORPHAN_SEQS: []}
    packet = trimmer(lowercase_trimmer(packet))
    assert passed_strs(packet) == ['CTTTC']

    # with pairs: the second pair's forward read is only 'aa', and its mate
    # is expected to show up among the orphans.
    forward_1 = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
    reverse_1 = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
    forward_2 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
    reverse_2 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
    paired_groups = [
        [SeqWrapper(SEQITEM, forward_1, 'fasta'),
         SeqWrapper(SEQITEM, reverse_1, 'fasta')],
        [SeqWrapper(SEQITEM, forward_2, 'fasta'),
         SeqWrapper(SEQITEM, reverse_2, 'fasta')],
    ]
    packet = {SEQS_PASSED: paired_groups, ORPHAN_SEQS: []}
    lowercase_trimmer = TrimLowercasedLetters()
    trimmer = TrimOrMask()
    packet = trimmer(lowercase_trimmer(packet))
    passed = passed_strs(packet)
    orphans = [get_str_seq(seq) for seq in packet[ORPHAN_SEQS]]
    assert orphans == ['CTTTC']
    assert ['CTTTC', 'CTTTC'] == passed

    # no drag (label kept from the original test): the very same paired
    # groups go through freshly built trimmers, and this time the orphan is
    # identified by its name.
    packet = {SEQS_PASSED: paired_groups, ORPHAN_SEQS: []}
    lowercase_trimmer = TrimLowercasedLetters()
    trimmer = TrimOrMask()
    packet = trimmer(lowercase_trimmer(packet))
    passed = passed_strs(packet)
    orphan_names = [get_name(seq) for seq in packet[ORPHAN_SEQS]]
    assert orphan_names == ['s1.r']
    assert ['CTTTC', 'CTTTC'] == passed
def test_trimming(self):
    """The sequences are trimmed according to the recommendations.

    One upper-cased SeqRecord is run through TrimOrMask several times, each
    time with a different set of 'vector' / 'quality' segments stored under
    TRIMMING_RECOMMENDATIONS in its annotations.
    """
    seq1 = 'gggtctcatcatcaggg'.upper()
    seq = SeqRecord(Seq(seq1), annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, seq, None)
    seqs = [seq]
    trim_packet = {SEQS_PASSED: [seqs], ORPHAN_SEQS: []}

    # Keep a handle on the recommendations dict so it can be edited and
    # re-attached before every run.
    # NOTE(review): presumably TrimOrMask removes the recommendations from
    # the sequence it processes (the last assertion checks they are absent
    # from the output), which is why they are re-assigned each time -- confirm.
    trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    seq_trimmer = TrimOrMask()

    # Vector segments at both ends.
    trim_rec['vector'] = [(0, 3), (8, 13)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    res = [get_str_seq(s) for group in trim_packet2[SEQS_PASSED] for s in group]
    assert res == ['CTCA']

    # A shorter left segment leaves more of the sequence.
    trim_rec['vector'] = [(0, 0), (8, 13)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    res = [get_str_seq(s) for group in trim_packet2[SEQS_PASSED] for s in group]
    assert res == ['GGTCTCA']

    # Vector plus quality segments: nothing is expected to pass.
    trim_rec['vector'] = [(0, 1), (8, 12)]
    trim_rec['quality'] = [(1, 8), (13, 17)]
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    assert not trim_packet2[SEQS_PASSED]

    # An empty quality list behaves like the vector-only case above.
    trim_rec['vector'] = [(0, 0), (8, 13)]
    trim_rec['quality'] = []
    get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
    trim_packet2 = seq_trimmer(trim_packet)
    res = [get_str_seq(s) for group in trim_packet2[SEQS_PASSED] for s in group]
    assert res == ['GGTCTCA']

    # The trimmed sequence must no longer carry the recommendations.
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(
        trim_packet2[SEQS_PASSED][0][0])
def test_quality_trimming(self):
    """Low-quality edges are trimmed using a sliding window and a threshold."""
    trimmer = TrimOrMask()

    def wrap_record(bases, phreds):
        # Build a SeqRecord carrying phred qualities and wrap it.
        record = SeqRecord(Seq(bases))
        record.letter_annotations['phred_quality'] = phreds
        return SeqWrapper(SEQRECORD, record, None)

    def run_trim(wrapped, **quality_options):
        # Mark by quality, then trim; the resulting packet is returned.
        by_quality = TrimByQuality(**quality_options)
        packet = {SEQS_PASSED: [[wrapped]], ORPHAN_SEQS: []}
        return trimmer(by_quality(packet))

    def surviving_quals(wrapped, **quality_options):
        # Qualities of the first sequence that passed.
        trimmed = run_trim(wrapped, **quality_options)
        return get_int_qualities(trimmed[SEQS_PASSED][0][0])

    edge_phreds = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
    edge_seq = wrap_record('ACTGCTGCATAAAA', edge_phreds)
    assert surviving_quals(edge_seq, window=5, threshold=30) == [
        20, 30, 30, 30, 40, 40, 30, 30, 20
    ]

    # all bad: the same wrapped record, with a threshold nothing reaches
    all_bad = run_trim(edge_seq, window=5, threshold=60)
    assert not all_bad[SEQS_PASSED]

    # all OK: the same wrapped record, with a threshold everything reaches
    assert surviving_quals(edge_seq, window=5, threshold=5) == edge_phreds

    plateau_seq = wrap_record(
        'ACTGCTGCATAA', [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20, 20])
    assert surviving_quals(plateau_seq, window=5, threshold=50) == [
        20, 60, 60, 60, 60, 60, 20
    ]

    decaying_seq = wrap_record(
        'atatatatagatagatagatagatg',
        [40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11, 6, 5, 3, 20,
         10, 12, 8, 5, 4, 7, 1])
    assert surviving_quals(decaying_seq, window=5, threshold=25) == [
        40, 18, 10, 40, 40
    ]

    # The remaining SeqRecord cases share one read and one quality profile;
    # every record gets its own copy of the quality list.
    noisy_bases = 'atatatatatatatatatatatata'
    noisy_phreds = (40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40,
                    10, 10, 21, 3, 40, 9, 9, 12, 10, 9)

    both_sides = wrap_record(noisy_bases, list(noisy_phreds))
    expected = [40, 4, 27, 38, 40]
    assert surviving_quals(both_sides, window=5, threshold=25) == expected

    right_only = wrap_record(noisy_bases, list(noisy_phreds))
    assert surviving_quals(right_only, window=5, threshold=25,
                           trim_left=False) == [
        40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40
    ]

    left_only = wrap_record(noisy_bases, list(noisy_phreds))
    assert surviving_quals(left_only, window=5, threshold=25,
                           trim_right=False) == [
        40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12, 10, 9
    ]

    # With both sides disabled the qualities come back unchanged.
    untouched_phreds = list(noisy_phreds)
    untouched = wrap_record(noisy_bases, untouched_phreds)
    assert surviving_quals(untouched, window=5, threshold=25,
                           trim_right=False,
                           trim_left=False) == untouched_phreds

    # With SeqItems
    fastq_item = SeqItem('s', [
        '@s\n', 'atatatatatatatatatatatata\n', '\n',
        'II.,I*I%<GI%,II++6$I**-+*\n'
    ])
    fastq_seq = SeqWrapper(SEQITEM, fastq_item, 'fastq')
    trimmed = run_trim(fastq_seq, window=5, threshold=25, trim_right=True,
                       trim_left=False)
    survivor = trimmed[SEQS_PASSED][0][0]
    assert survivor.object.lines[3] == 'II.,I*I%<GI\n'
def test_edge_trimming(self):
    """Fixed numbers of edge bases are trimmed, or masked to lowercase."""

    def passed_strs(packet):
        # Flatten the groups held under SEQS_PASSED into plain sequence strings.
        return [get_str_seq(seq)
                for group in packet[SEQS_PASSED]
                for seq in group]

    def edge_trimmed(finisher, *edge_steps):
        # Apply the edge markers in order to a fresh fixture, then finish
        # with the given TrimOrMask instance.
        packet = self._some_seqs()
        for step in edge_steps:
            packet = step(packet)
        return passed_strs(finisher(packet))

    def single_fasta_packet():
        # A packet holding one fasta SeqItem with the read 'ACTTTC'.
        item = SeqItem('s', ['>s\n', 'ACTTTC\n'])
        return {SEQS_PASSED: [[SeqWrapper(SEQITEM, item, 'fasta')]],
                ORPHAN_SEQS: []}

    # Trimming: the marked edges are removed. Going by the masked
    # expectations below, the fixture reads look like ACCG and AAACCCGGG.
    hard_trimmer = TrimOrMask()
    trimming_cases = (
        ({'left': 1}, ['CCG', 'AACCCGGG']),
        ({'right': 1}, ['ACC', 'AAACCCGG']),
        ({'left': 1, 'right': 1}, ['CC', 'AACCCGG']),
        ({'left': 2, 'right': 2}, ['ACCCG']),
        ({'left': 3, 'right': 3}, ['CCC']),
    )
    for edges, expected in trimming_cases:
        assert edge_trimmed(hard_trimmer, TrimEdges(**edges)) == expected

    # Masking: the marked edges are lowercased and every read is kept.
    masker = TrimOrMask(mask=True)
    masking_cases = (
        ({'left': 1}, ['aCCG', 'aAACCCGGG']),
        ({'right': 1}, ['ACCg', 'AAACCCGGg']),
        ({'left': 1, 'right': 1}, ['aCCg', 'aAACCCGGg']),
        ({'left': 2, 'right': 2}, ['accg', 'aaACCCGgg']),
        ({'left': 3, 'right': 3}, ['accg', 'aaaCCCggg']),
    )
    for edges, expected in masking_cases:
        assert edge_trimmed(masker, TrimEdges(**edges)) == expected

    # test overlapping mask: two edge markers applied one after the other
    narrow_edges = TrimEdges(left=3, right=3)
    wide_edges = TrimEdges(left=4, right=4)
    assert edge_trimmed(masker, narrow_edges, wide_edges) == [
        'accg', 'aaacCcggg'
    ]

    # With a SeqItem: the same edge marker, first trimming and then masking
    item_trimmer = TrimOrMask(mask=False)
    both_edges = TrimEdges(left=1, right=1)
    trimmed_packet = item_trimmer(both_edges(single_fasta_packet()))
    assert passed_strs(trimmed_packet) == ['CTTT']

    item_masker = TrimOrMask(mask=True)
    masked_packet = item_masker(both_edges(single_fasta_packet()))
    assert passed_strs(masked_packet) == ['aCTTTc']