def test_str_qualities(self):
    'It tests get_str_qualities with fasta and fastq SeqItems'
    # A fasta record: the test expects a ValueError when qualities are asked
    fasta = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
        'fasta')
    try:
        quals = get_str_qualities(fasta, 'fasta')
    except ValueError:
        pass
    else:
        assert quals
        self.fail('ValueError expected')

    # fastq, qualities returned without an explicit output format
    sanger = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    assert get_str_qualities(sanger) == '!???'

    # fastq re-encoded as fastq-illumina
    sanger = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    assert get_str_qualities(sanger, ILLUMINA_QUALITY) == '@^^^'

    # fastq-illumina kept as fastq-illumina
    illumina = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n']),
        'fastq-illumina')
    assert get_str_qualities(illumina, ILLUMINA_QUALITY) == '@AAABBBB'

    # fastq-illumina re-encoded as fastq
    illumina = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n']),
        'fastq-illumina')
    assert get_str_qualities(illumina, 'fastq') == '!"""####'
def test_seqitem_pairs_equal(self):
    'It tests the comparison of groups of fastq SeqItems'

    def fastq(name, bases, quals):
        'It builds a wrapped four-line fastq SeqItem'
        lines = ['@' + name + '\n', bases + '\n', '+\n', quals + '\n']
        return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fastq')

    read1 = fastq('seq1', 'TAATAC', 'TTTDFG')
    read2 = fastq('seq2', 'TCATTA', 'ABCBEG')
    # read3 differs from read1 only in its name
    read3 = fastq('seq3', 'TAATAC', 'TTTDFG')
    read4 = fastq('seq4', 'ACGCGT', 'ABCBEG')

    pair_1_2 = (read1, read2)
    pair_2_4 = (read2, read4)
    pair_3_2 = (read3, read2)
    pair_2_1 = (read2, read1)

    # pairs
    assert _seqitem_pairs_equal(pair_1_2, pair_3_2)
    assert not _seqitem_pairs_equal(pair_1_2, pair_2_4)
    assert not _seqitem_pairs_equal(pair_1_2, pair_2_1)

    # single-read groups
    assert _seqitem_pairs_equal([read1], [read3])
    assert not _seqitem_pairs_equal([read1], [read2])

    # groups of different shapes
    assert not _seqitem_pairs_equal([read1], pair_1_2)
    assert not _seqitem_pairs_equal(pair_1_2, read2)
def _build_some_paired_seqs():
    'It returns a tuple with four wrapped fasta SeqItems: two s1 and two s2'
    # (SeqItem name, title written in the header line, one-base sequence)
    specs = (
        ('s1', 's1.f', 'A'),
        ('s1', 's1.r', 'C'),
        ('s2', 's2.f', 'T'),
        ('s2', 's2.r', 'G'),
    )
    return tuple(
        SeqWrapper(SEQITEM,
                   SeqItem(name, ['>' + title + '\n', base + '\n']),
                   'fasta')
        for name, title, base in specs
    )
def test_slice(self):
    'It tests slice_seq with fasta and fastq SeqItems'
    # with fasta
    fasta = SeqWrapper(
        SEQITEM, SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']), 'fasta')
    expected = SeqWrapper(
        SEQITEM, SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']), 'fasta')
    assert slice_seq(fasta, 1, 5) == expected

    # with fastq
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
        'fastq')
    sliced = slice_seq(fastq, 1, 3)
    assert list(get_int_qualities(sliced)) == [30, 0]
    assert get_str_seq(sliced) == 'at'
    assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # with fastq-illumina
    illumina = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaatcaaa\n', '+\n', '@AAABBBB\n']),
        'fastq-illumina')
    sliced = slice_seq(illumina, 1, 5)
    assert list(get_int_qualities(sliced)) == [1, 1, 1, 2]
    assert get_str_seq(sliced) == get_str_seq(illumina)[1:5]

    # It tests that either limit can be None
    bases = 'aCTG'
    open_ended = SeqWrapper(
        SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']), 'fasta')
    assert get_str_seq(slice_seq(open_ended, 1, None)) == bases[1:]
    assert get_str_seq(slice_seq(open_ended, None, 1)) == bases[:1]
def _itemize_fastx_multiline(fhand):
    '''It yields one SeqItem per record of a fasta or multiline fastq file.

    fhand -- an iterable of text lines (e.g. an open file).

    Fasta records are yielded as [title, sequence] and fastq records as
    [title, sequence, '+\\n', quality], with the sequence and the quality
    joined into a single line each.

    It raises MalformedFile when the quality of a fastq record is longer
    than its sequence or when the file ends before the quality is complete,
    IsSingleLineFastqError when the first 1000 records read are all fastq
    records with single-line qualities, and FileIsEmptyError when no record
    is found.
    '''
    # this is a generator function
    last_line = None  # this is a buffer keeping the last unprocessed line
    is_empty = True
    n_single_line_seqs = 0
    n_seqs_read = 0
    while True:
        if not last_line:  # the first record or a record following a fastq
            for line in fhand:  # search for the start of the next record
                if line[0] in '@>':  # fasta/q header line
                    last_line = line  # save this line
                    break
        if not last_line:
            break
        title = last_line
        seq_lines = []
        last_line = None
        # the name is the first word of the title, without the leading @ or >
        name = title[1:-1].partition(" ")[0]
        for line in fhand:  # read the sequence
            if line[0] in '@+>':
                last_line = line
                break
            seq_lines.append(line.rstrip())
        if not last_line or last_line[0] != '+':  # this is a fasta record
            yield SeqItem(name, [title, ''.join(seq_lines) + '\n'])
            n_seqs_read += 1
            is_empty = False
            if not last_line:
                break
        else:  # this is a fastq record
            seq = ''.join(seq_lines)
            length = 0
            qual_lines = []
            len_seq = len(seq)
            for line in fhand:  # read the quality
                qual_line = line.rstrip()
                qual_lines.append(qual_line)
                # Count the stripped text that is actually stored: a last
                # line without newline or a '\r\n' ending must not alter
                # the quality length.
                length += len(qual_line)
                if length >= len_seq:  # have read enough quality
                    if length != len_seq:
                        msg = 'Malformed fastq file: seq and quality lines '
                        msg += 'have different lengths'
                        raise MalformedFile(msg)
                    last_line = None
                    is_empty = False
                    yield SeqItem(name, [title, seq + '\n', '+\n',
                                         ''.join(qual_lines) + '\n'])
                    n_seqs_read += 1
                    if len(qual_lines) == 1:
                        n_single_line_seqs += 1
                    if n_seqs_read == 1000:
                        # every record so far had a one-line quality
                        if n_single_line_seqs == n_seqs_read:
                            raise IsSingleLineFastqError()
                    break
            if last_line:  # reach EOF before reading enough quality
                msg = 'Malformed fastq file: quality line missing'
                raise MalformedFile(msg)
    if is_empty:
        raise FileIsEmptyError('File is empty')
def test_str_seq(self):
    'It tests get_str_seq with fasta and fastq SeqItems'
    # with fasta
    fasta = SeqWrapper(
        SEQITEM, SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']), 'fasta')
    assert get_str_seq(fasta) == 'ACTGGTAC'

    # with fastq
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '????\n']),
        'fastq')
    assert get_str_seq(fastq) == 'aaaa'
def test_change_name(self):
    'It tests that copy_seq can rename fastq and fasta SeqItems'
    # fastq: the expected copy has a bare '+' line although the input
    # repeats the name in it
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+seq\n', '!???\n']),
        'fastq')
    renamed = copy_seq(fastq, name='seq2')
    expected = ('seq2', ['@seq2\n', 'aaaa\n', '+\n', '!???\n'], {})
    assert renamed.object == expected

    # fasta
    fasta = SeqWrapper(
        SEQITEM, SeqItem(name='seq', lines=['>seq\n', 'aaaa\n']), 'fasta')
    renamed = copy_seq(fasta, name='seq2')
    expected = ('seq2', ['>seq2\n', 'aaaa\n'], {})
    assert renamed.object == expected
def test_no_name(self):
    'It tests the grouping when an extra read is mixed with the pairs'
    extra = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

    def check_groups(seqs, expected_groups):
        'It compares the first groups yielded with the expected ones'
        groups = list(group_pairs_by_name(seqs))
        for index, expected in enumerate(expected_groups):
            assert [get_str_seq(s) for s in groups[index]] == expected

    # the extra read splits the second pair
    fwd1, rev1, fwd2, rev2 = _build_some_paired_seqs()
    check_groups((fwd1, rev1, fwd2, extra, rev2),
                 [['A', 'C'], ['T'], ['N'], ['G']])

    # the extra read splits the first pair
    fwd1, rev1, fwd2, rev2 = _build_some_paired_seqs()
    check_groups((fwd1, extra, rev1, fwd2, rev2),
                 [['A'], ['N'], ['C'], ['T', 'G']])

    # the extra read goes before both pairs
    fwd1, rev1, fwd2, rev2 = _build_some_paired_seqs()
    check_groups((extra, fwd1, rev1, fwd2, rev2),
                 [['N'], ['A', 'C'], ['T', 'G']])
def test_blast_short_trimming(self):
    'It trims oligos using blast-short'

    def vector_segments(packet):
        'It lists the vector trimming recommendations of every read'
        return [
            get_annotations(read).get(TRIMMING_RECOMMENDATIONS,
                                      {}).get(VECTOR, [])
            for group in packet[SEQS_PASSED] for read in group
        ]

    # With SeqRecords
    record1 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG'))
    record2 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT'))
    adaptors = [SeqWrapper(SEQRECORD, record1, None),
                SeqWrapper(SEQRECORD, record2, None)]
    blast_trim = TrimWithBlastShort(oligos=adaptors)

    fhand = StringIO(FASTQ4)
    seq_packets = read_seq_packets([fhand],
                                   prefered_seq_classes=[SEQRECORD])
    trim_packets = list(seq_to_trim_packets(seq_packets))
    trimmed = blast_trim(trim_packets[0])
    # It should trim the first and the second reads.
    assert vector_segments(trimmed) == [[(0, 29)], [(0, 29)], []]

    # With SeqItems
    item1 = SeqItem('oligo1',
                    ['>oligo1\n', 'AAGCAGTGGTATCAACGCAGAGTACATGGG\n'])
    item2 = SeqItem('oligo2',
                    ['>oligo2\n', 'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n'])
    adaptors = [SeqWrapper(SEQITEM, item1, 'fasta'),
                SeqWrapper(SEQITEM, item2, 'fasta')]
    blast_trim = TrimWithBlastShort(oligos=adaptors)

    fhand = StringIO(FASTQ4)
    seq_packets = list(
        read_seq_packets([fhand], prefered_seq_classes=[SEQITEM]))
    trim_packets = list(seq_to_trim_packets(seq_packets))
    trimmed = blast_trim(trim_packets[0])
    # It should trim the first and the second reads.
    assert vector_segments(trimmed) == [[(0, 29)], [(0, 29)], []]
def test_matching_segments(self):
    'It tests the detection of oligos in sequence files'
    seq_5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    mate_fhand = create_a_matepair_file()

    titanium = SeqItem('titan', ['>titan\n', TITANIUM_LINKER + '\n'])
    flx = SeqItem('flx', ['>flx\n', FLX_LINKER + '\n'])
    linkers = assing_kind_to_seqs(SEQITEM, [titanium, flx], 'fasta')

    # the expected match starts after seq_5 and spans the titanium linker
    # (the end position is inclusive)
    region_start = len(seq_5)
    region_end = len(seq_5 + TITANIUM_LINKER) - 1
    expected_region = (region_start, region_end)

    matcher = BlasterForFewSubjects(mate_fhand.name, linkers,
                                    program='blastn',
                                    elongate_for_global=True)
    segments = matcher.get_matched_segments_for_read('seq1')
    linker_region = segments[0]
    assert [expected_region] == linker_region
def test_int_qualities(self):
    'It tests get_int_qualities with fasta and fastq SeqItems'
    # with fasta: the test expects an AttributeError
    fasta = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
        'fasta')
    try:
        quals = get_int_qualities(fasta)
    except AttributeError:
        pass
    else:
        assert quals
        self.fail('AttributeError expected')

    # with fastq
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    assert list(get_int_qualities(fastq)) == [0, 30, 30, 30]

    # with fastq-illumina
    illumina = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n']),
        'fastq-illumina')
    assert list(get_int_qualities(illumina)) == [0, 1, 1, 1, 2, 2, 2, 2]
def __init__(self, linkers=None):
    '''The initiator.

    linkers -- the linker sequences to use. When it is None the linkers
    are taken from the 'LINKERS' setting and wrapped as fasta SeqItems
    named after their position in that setting.
    '''
    # NOTE(review): linkers given by the caller are stored as they are, so
    # they are presumably already wrapped sequences -- confirm with callers.
    if linkers is None:
        linkers = get_setting('LINKERS')
        # SeqItem lines have to be a list with one item per line (header
        # and sequence), not a single string with the whole record.
        linkers = [
            SeqItem(str(index), ['>%d\n' % index, '%s\n' % linker])
            for index, linker in enumerate(linkers)
        ]
        linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
    self.linkers = list(linkers)
def test_copy(self):
    'It tests copy_seq replacing the sequence of fasta and fastq SeqItems'
    # with fasta: the annotations are kept and nothing is shared with the
    # original
    original = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                annotations={'a': 'b'}),
        'fasta')
    copied = copy_seq(original, seq='ACTG')
    expected = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n'],
                       annotations={'a': 'b'})
    assert copied.object == expected
    assert original.object is not copied.object
    assert original.object.lines is not copied.object.lines

    # with fastq
    original = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    copied = copy_seq(original, seq='ACTG')
    expected = SeqItem(name='seq',
                       lines=['@seq\n', 'ACTG\n', '+\n', '!???\n'])
    assert copied.object == expected

    # with fastq-illumina
    original = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n']),
        'fastq-illumina')
    copied = copy_seq(original, seq='ACTGactg')
    expected = SeqItem(name='seq',
                       lines=['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n'])
    assert copied.object == expected
def alignedread_to_seqitem(aligned_read, start_pos=0, end_pos=None):
    '''It builds a wrapped SeqItem out of an aligned read.

    It returns None when there is no read or the read has no sequence.
    The sequence (and the quality, if any) is sliced with start_pos and
    end_pos and, for reverse reads, the sliced sequence is reverse
    complemented and the sliced quality reversed. Reads without quality
    become fasta SeqItems, the others fastq ones.
    '''
    if aligned_read is None:
        return None
    if aligned_read.seq is None:
        return None

    name = aligned_read.qname
    bases = aligned_read.seq[start_pos: end_pos]
    quals = aligned_read.qual

    if aligned_read.is_reverse:
        bases = _reverse(_complementary(bases))

    if quals is not None:
        quals = quals[start_pos: end_pos]
        if aligned_read.is_reverse:
            quals = _reverse(quals)
        file_format = 'fastq'
        lines = ['@' + name + '\n', bases + '\n', '+\n', quals + '\n']
    else:
        file_format = 'fasta'
        lines = ['>' + name + '\n', bases + '\n']

    item = SeqItem(name, lines)
    return SeqWrapper(SEQITEM, item, file_format)
def test_trim_seqs():
    'It tests the trim seq function'

    def passed_strs(packet):
        'It returns the sequences of all the reads that passed'
        return [get_str_seq(read)
                for group in packet[SEQS_PASSED] for read in group]

    # with SeqRecords; the all-lowercase read is not expected to pass
    seqs = [
        [SeqWrapper(SEQRECORD, SeqRecord(Seq(bases)), None)]
        for bases in ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')
    ]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    assert passed_strs(trim_packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

    # with a SeqItem, reusing the trimmers
    item = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
    seqs = [[SeqWrapper(SEQITEM, item, 'fasta')]]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    assert passed_strs(trim_packet) == ['CTTTC']

    # with pairs
    fwd = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
    rev = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
    fwd1 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
    rev1 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
    seqs = [
        [SeqWrapper(SEQITEM, fwd, 'fasta'),
         SeqWrapper(SEQITEM, rev, 'fasta')],
        [SeqWrapper(SEQITEM, fwd1, 'fasta'),
         SeqWrapper(SEQITEM, rev1, 'fasta')],
    ]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = passed_strs(trim_packet)
    orphan_res = [get_str_seq(read) for read in trim_packet[ORPHAN_SEQS]]
    assert orphan_res == ['CTTTC']
    assert ['CTTTC', 'CTTTC'] == res

    # no drag: the very same list of pairs goes through fresh trimmers
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = passed_strs(trim_packet)
    orphan_res = [get_name(read) for read in trim_packet[ORPHAN_SEQS]]
    assert orphan_res == ['s1.r']
    assert ['CTTTC', 'CTTTC'] == res
def test_edge_trimming(self):
    'It trims the edges'

    def passed_strs(packet):
        'It returns the sequences of all the reads that passed'
        return [get_str_seq(read)
                for group in packet[SEQS_PASSED] for read in group]

    # trimming: (TrimEdges arguments, expected sequences)
    trim = TrimOrMask()
    trimming_cases = [
        ({'left': 1}, ['CCG', 'AACCCGGG']),
        ({'right': 1}, ['ACC', 'AAACCCGG']),
        ({'left': 1, 'right': 1}, ['CC', 'AACCCGG']),
        ({'left': 2, 'right': 2}, ['ACCCG']),
        ({'left': 3, 'right': 3}, ['CCC']),
    ]
    for edges, expected in trimming_cases:
        trim_edges = TrimEdges(**edges)
        trim_packet = trim(trim_edges(self._some_seqs()))
        assert passed_strs(trim_packet) == expected

    # masking: the edges are lowercased and no read is dropped
    trim = TrimOrMask(mask=True)
    masking_cases = [
        ({'left': 1}, ['aCCG', 'aAACCCGGG']),
        ({'right': 1}, ['ACCg', 'AAACCCGGg']),
        ({'left': 1, 'right': 1}, ['aCCg', 'aAACCCGGg']),
        ({'left': 2, 'right': 2}, ['accg', 'aaACCCGgg']),
        ({'left': 3, 'right': 3}, ['accg', 'aaaCCCggg']),
    ]
    for edges, expected in masking_cases:
        trim_edges = TrimEdges(**edges)
        trim_packet = trim(trim_edges(self._some_seqs()))
        assert passed_strs(trim_packet) == expected

    # test overlapping mask
    inner_edges = TrimEdges(left=3, right=3)
    outer_edges = TrimEdges(left=4, right=4)
    trim_packet = trim(outer_edges(inner_edges(self._some_seqs())))
    assert passed_strs(trim_packet) == ['accg', 'aaacCcggg']

    # With a SeqItem, trimming
    trim = TrimOrMask(mask=False)
    trim_edges = TrimEdges(left=1, right=1)
    item = SeqItem('s', ['>s\n', 'ACTTTC\n'])
    trim_packet = {SEQS_PASSED: [[SeqWrapper(SEQITEM, item, 'fasta')]],
                   ORPHAN_SEQS: []}
    trim_packet = trim(trim_edges(trim_packet))
    assert passed_strs(trim_packet) == ['CTTT']

    # With a SeqItem, masking with the same edge trimmer
    trim = TrimOrMask(mask=True)
    item = SeqItem('s', ['>s\n', 'ACTTTC\n'])
    trim_packet = {SEQS_PASSED: [[SeqWrapper(SEQITEM, item, 'fasta')]],
                   ORPHAN_SEQS: []}
    trim_packet = trim(trim_edges(trim_packet))
    assert passed_strs(trim_packet) == ['aCTTTc']
def test_quality_trimming(self):
    'It trims by quality'
    trim = TrimOrMask()

    def build_record(bases, quals):
        'It wraps a SeqRecord with the given phred qualities'
        record = SeqRecord(Seq(bases))
        record.letter_annotations['phred_quality'] = quals
        return SeqWrapper(SEQRECORD, record, None)

    def quality_trim(trim_quality, seq):
        'It runs the quality trimmer and the trimming on a single read'
        packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
        return trim(trim_quality(packet))

    def first_passed(packet):
        'It returns the first read of the first passed group'
        return packet[SEQS_PASSED][0][0]

    quals = [10, 10, 20, 30, 30, 30, 40, 40, 30, 30, 20, 20, 10, 10]
    seq = build_record('ACTGCTGCATAAAA', quals)
    result = quality_trim(TrimByQuality(window=5, threshold=30), seq)
    assert get_int_qualities(first_passed(result)) == [
        20, 30, 30, 30, 40, 40, 30, 30, 20
    ]

    # all bad
    result = quality_trim(TrimByQuality(window=5, threshold=60), seq)
    assert not result[SEQS_PASSED]

    # all OK
    result = quality_trim(TrimByQuality(window=5, threshold=5), seq)
    assert get_int_qualities(first_passed(result)) == quals

    quals = [20, 20, 20, 60, 60, 60, 60, 60, 20, 20, 20, 20]
    seq = build_record('ACTGCTGCATAA', quals)
    result = quality_trim(TrimByQuality(window=5, threshold=50), seq)
    assert get_int_qualities(first_passed(result)) == [
        20, 60, 60, 60, 60, 60, 20
    ]

    quals = [
        40, 18, 10, 40, 40, 5, 8, 30, 14, 3, 40, 40, 40, 11, 6, 5, 3, 20,
        10, 12, 8, 5, 4, 7, 1
    ]
    seq = build_record('atatatatagatagatagatagatg', quals)
    result = quality_trim(TrimByQuality(window=5, threshold=25), seq)
    assert get_int_qualities(first_passed(result)) == [40, 18, 10, 40, 40]

    # The same noisy read is rebuilt for every combination of sides
    noisy_bases = 'atatatatatatatatatatatata'
    noisy_quals = (
        40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10,
        21, 3, 40, 9, 9, 12, 10, 9
    )

    # both sides
    seq = build_record(noisy_bases, list(noisy_quals))
    result = quality_trim(TrimByQuality(window=5, threshold=25), seq)
    expected = [40, 4, 27, 38, 40]
    assert get_int_qualities(first_passed(result)) == expected

    # only the right side
    seq = build_record(noisy_bases, list(noisy_quals))
    trim_quality = TrimByQuality(window=5, threshold=25, trim_left=False)
    result = quality_trim(trim_quality, seq)
    assert get_int_qualities(first_passed(result)) == [
        40, 40, 13, 11, 40, 9, 40, 4, 27, 38, 40
    ]

    # only the left side
    seq = build_record(noisy_bases, list(noisy_quals))
    trim_quality = TrimByQuality(window=5, threshold=25, trim_right=False)
    result = quality_trim(trim_quality, seq)
    assert get_int_qualities(first_passed(result)) == [
        40, 4, 27, 38, 40, 4, 11, 40, 40, 10, 10, 21, 3, 40, 9, 9, 12, 10,
        9
    ]

    # no side at all
    quals = list(noisy_quals)
    seq = build_record(noisy_bases, quals)
    trim_quality = TrimByQuality(window=5, threshold=25, trim_right=False,
                                 trim_left=False)
    result = quality_trim(trim_quality, seq)
    assert get_int_qualities(first_passed(result)) == quals

    # With SeqItems
    item = SeqItem('s', [
        '@s\n', 'atatatatatatatatatatatata\n', '\n',
        'II.,I*I%<GI%,II++6$I**-+*\n'
    ])
    seq = SeqWrapper(SEQITEM, item, 'fastq')
    trim_quality = TrimByQuality(window=5, threshold=25, trim_right=True,
                                 trim_left=False)
    result = quality_trim(trim_quality, seq)
    assert first_passed(result).object.lines[3] == 'II.,I*I%<GI\n'
def _itemize_fastq_singleline(fhand):
    'It returns a generator of SeqItems, one per four non-empty lines'
    non_empty_lines = ifilter(_line_is_not_empty, fhand)
    # group_in_packets_fill_last is faster than group_in_packets
    records = group_in_packets_fill_last(non_empty_lines, 4)
    return (SeqItem(_get_name_from_lines(record), record)
            for record in records)