def test_quality_filter(self):
    'It filters the reads given a quality threshold'
    # Both reads share the same bases; only the phred profile differs.
    quals_by_name = (('seq1', [42, 42, 40, 42, 40]),
                     ('seq2', [40, 40, 42, 40, 42]))
    pairs = []
    for name, quals in quals_by_name:
        record = SeqRecord(Seq('AAcTg'), id=name,
                           letter_annotations={'phred_quality': quals})
        pairs.append([SeqWrapper(object=record, kind=SEQRECORD,
                                 file_format=None)])
    seqs = {SEQS_PASSED: pairs, SEQS_FILTERED_OUT: []}

    # Every filter configuration is fed the very same packet.
    cases = (({'threshold': 41}, ['seq1']),
             ({'threshold': 41, 'reverse': True}, ['seq2']),
             ({'threshold': 41.5, 'ignore_masked': True}, ['seq1']))
    for kwargs, expected in cases:
        filter_ = FilterByQuality(**kwargs)
        passed = _seqs_to_names(filter_(seqs)[SEQS_PASSED])
        assert passed == expected
def test_dust_filter():
    'It tests the complexity filter'
    raw_seqs = (
        ('seq1',
         'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAA'),
        ('seq2',
         'CATCGATTGCGATCGATCTTGTTGCACGACTAGCTATCGATTGCTAGCTTAGCTAGCTAGTT'),
    )
    pairs = [[SeqWrapper(SEQRECORD, SeqRecord(Seq(bases), id=name), None)]
             for name, bases in raw_seqs]
    seqs = {SEQS_PASSED: pairs, SEQS_FILTERED_OUT: []}

    # (filter kwargs, name expected to pass, name expected to be dropped)
    cases = (({}, 'seq2', 'seq1'),
             # reverse
             ({'reverse': True}, 'seq1', 'seq2'))
    for kwargs, kept, dropped in cases:
        filter_dust = FilterDustComplexity(**kwargs)
        filter_packet = filter_dust(seqs)
        assert len(filter_packet[SEQS_PASSED]) == 1
        assert len(filter_packet[SEQS_FILTERED_OUT]) == 1
        assert _seqs_to_names(filter_packet[SEQS_PASSED])[0] == kept
        assert _seqs_to_names(filter_packet[SEQS_FILTERED_OUT])[0] == dropped
def test_filter_by_read_count(self):
    'It filters the seqs with FilterByRpkm given some read counts'
    seq1, seq2 = [
        SeqWrapper(SEQRECORD, SeqRecord(Seq(base * 1000), id=name), None)
        for name, base in (('seq1', 'T'), ('seq2', 'A'))
    ]
    seqs = {SEQS_PASSED: [[seq1], [seq2]], SEQS_FILTERED_OUT: []}
    read_counts = {
        'seq1': {'mapped_reads': 10,
                 'unmapped_reads': 999989,
                 'length': len(seq1.object)},
        'seq2': {'mapped_reads': 1,
                 'unmapped_reads': 0,
                 'length': len(seq2.object)},
    }

    # With a threshold of 2 only seq1 is expected to pass.
    filtered = FilterByRpkm(read_counts, 2)(seqs)
    assert _seqs_to_names(filtered[SEQS_FILTERED_OUT]) == ['seq2']
    assert _seqs_to_names(filtered[SEQS_PASSED]) == ['seq1']

    # With a threshold of 1 nothing is expected to be filtered out.
    filtered = FilterByRpkm(read_counts, 1)(seqs)
    assert not filtered[SEQS_FILTERED_OUT]

    # The reversed filter swaps the outcome of the first case.
    filtered = FilterByRpkm(read_counts, 2, reverse=True)(seqs)
    assert _seqs_to_names(filtered[SEQS_FILTERED_OUT]) == ['seq1']
    assert _seqs_to_names(filtered[SEQS_PASSED]) == ['seq2']
def test_ns_filter():
    'It tests the FilterAllNs filter, plain and reversed'
    raw_seqs = (
        # only Ns (both cases), gaps and stars
        ('seq1', 'N' * 50 + 'n' * 50 + '-' * 50 + '*' * 50),
        ('seq2',
         'CATCGATTGCGATCGATCTTGTTGCACGACTAGCTATCGATTGCTAGCTTAGCTAGCTAGTT'),
    )
    pairs = [[SeqWrapper(SEQRECORD, SeqRecord(Seq(bases), id=name), None)]
             for name, bases in raw_seqs]
    seqs = {SEQS_PASSED: pairs, SEQS_FILTERED_OUT: []}

    # (filter kwargs, name expected to pass, name expected to be dropped)
    cases = (({}, 'seq2', 'seq1'),
             # reverse
             ({'reverse': True}, 'seq1', 'seq2'))
    for kwargs, kept, dropped in cases:
        filter_ns = FilterAllNs(**kwargs)
        filter_packet = filter_ns(seqs)
        assert len(filter_packet[SEQS_PASSED]) == 1
        assert len(filter_packet[SEQS_FILTERED_OUT]) == 1
        assert _seqs_to_names(filter_packet[SEQS_PASSED])[0] == kept
        assert _seqs_to_names(filter_packet[SEQS_FILTERED_OUT])[0] == dropped
def test_str_qualities(self):
    'It gets the qualities as a str, recoding them when asked to'
    # with fasta there are no qualities, a ValueError is expected
    fasta = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
    fasta = SeqWrapper(SEQITEM, fasta, 'fasta')
    try:
        assert get_str_qualities(fasta, 'fasta')
        self.fail('ValueError expected')
    except ValueError:
        pass

    fastq_lines = ['@seq\n', 'aaaa\n', '+\n', '!???\n']
    illumina_lines = ['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n']
    # (lines, format of the seq, extra args for the call, expected str)
    cases = (
        # with fastq
        (fastq_lines, 'fastq', (), '!???'),
        # with fastq to fastq-illumina
        (fastq_lines, 'fastq', (ILLUMINA_QUALITY,), '@^^^'),
        # with fastq-illumina
        (illumina_lines, 'fastq-illumina', (ILLUMINA_QUALITY,), '@AAABBBB'),
        # with fastq-illumina to fastq
        (illumina_lines, 'fastq-illumina', ('fastq',), '!"""####'),
    )
    for lines, file_format, extra_args, expected in cases:
        # every case gets its own copy of the lines
        item = SeqItem(name='seq', lines=list(lines))
        seq = SeqWrapper(SEQITEM, item, file_format)
        assert get_str_qualities(seq, *extra_args) == expected
def test_copy(self):
    'It copies SeqItem based seqs replacing the sequence'
    # with fasta
    fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                         annotations={'a': 'b'})
    original = SeqWrapper(SEQITEM, fasta_item, 'fasta')
    duplicate = copy_seq(original, seq='ACTG')
    expected = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n'],
                       annotations={'a': 'b'})
    assert duplicate.object == expected
    # neither the item nor its lines should be shared with the original
    assert original.object is not duplicate.object
    assert original.object.lines is not duplicate.object.lines

    # with fastq
    fastq_item = SeqItem(name='seq',
                         lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
    original = SeqWrapper(SEQITEM, fastq_item, 'fastq')
    duplicate = copy_seq(original, seq='ACTG')
    expected = SeqItem(name='seq',
                       lines=['@seq\n', 'ACTG\n', '+\n', '!???\n'])
    assert duplicate.object == expected

    # with multiline fastq
    multiline_item = SeqItem(
        name='seq',
        lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n', 'BBBB\n'])
    original = SeqWrapper(SEQITEM, multiline_item,
                          'fastq-illumina-multiline')
    duplicate = copy_seq(original, seq='ACTGactg')
    expected = SeqItem(name='seq',
                       lines=['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n'])
    assert duplicate.object == expected
def test_slice(self):
    'It slices SeqItem based seqs in several formats'
    # with fasta
    fasta = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
        'fasta')
    expected = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
        'fasta')
    assert slice_seq(fasta, 1, 5) == expected

    # with fastq
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
        'fastq')
    sliced = slice_seq(fastq, 1, 3)
    assert list(get_qualities(sliced)) == [30, 0]
    assert get_str_seq(sliced) == 'at'
    assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # with multiline fastq
    multiline = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']),
        'fastq-illumina-multiline')
    sliced = slice_seq(multiline, 1, 5)
    assert list(get_qualities(sliced)) == [1, 1, 1, 2]
    assert get_str_seq(sliced) == get_str_seq(multiline)[1:5]

    # It tests that either limit of the slice can be None
    bases = 'aCTG'
    open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                            'fasta')
    assert get_str_seq(slice_seq(open_ended, 1, None)) == bases[1:]
    assert get_str_seq(slice_seq(open_ended, None, 1)) == bases[:1]
def _build_some_paired_seqs():
    'It returns a tuple with four fasta SeqItems: s1.f, s1.r, s2.f and s2.r'
    # (SeqItem name, fasta header line, sequence line)
    specs = (('s1', '>s1.f\n', 'A\n'),
             ('s1', '>s1.r\n', 'C\n'),
             ('s2', '>s2.f\n', 'T\n'),
             ('s2', '>s2.r\n', 'G\n'))
    return tuple(SeqWrapper(SEQITEM, SeqItem(name, [header, bases]), 'fasta')
                 for name, header, bases in specs)
def test_seqitem_pairs_equal(self):
    'It compares pairs of SeqItems with _seqitem_pairs_equal'

    def fastq_seq(name, bases, quals):
        'It builds a fastq SeqItem wrapped in a SeqWrapper'
        lines = ['@' + name + '\n', bases + '\n', '+\n', quals + '\n']
        return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fastq')

    # seq1 and seq3 only differ in the name
    seq1 = fastq_seq('seq1', 'TAATAC', 'TTTDFG')
    seq2 = fastq_seq('seq2', 'TCATTA', 'ABCBEG')
    seq3 = fastq_seq('seq3', 'TAATAC', 'TTTDFG')
    seq4 = fastq_seq('seq4', 'ACGCGT', 'ABCBEG')

    pair1 = (seq1, seq2)
    pair2 = (seq2, seq4)
    pair3 = (seq3, seq2)
    pair4 = (seq2, seq1)

    # pairs
    assert _seqitem_pairs_equal(pair1, pair3)
    assert not _seqitem_pairs_equal(pair1, pair2)
    assert not _seqitem_pairs_equal(pair1, pair4)
    # single reads
    assert _seqitem_pairs_equal([seq1], [seq3])
    assert not _seqitem_pairs_equal([seq1], [seq2])
    # mismatched shapes
    assert not _seqitem_pairs_equal([seq1], pair1)
    assert not _seqitem_pairs_equal(pair1, seq2)
def test_trim_seqs():
    'It tests the trim seq function'

    def passed_strs(packet):
        'It lists the str seq of every read that passed in the packet'
        return [get_str_seq(read)
                for pair in packet[SEQS_PASSED] for read in pair]

    # single SeqRecord based reads
    seqs = [[SeqWrapper(SEQRECORD, SeqRecord(Seq(bases)), None)]
            for bases in ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    assert passed_strs(trim_packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

    # a single SeqItem based read, reusing the trimmers built above
    item = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
    seqs = [[SeqWrapper(SEQITEM, item, 'fasta')]]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    assert passed_strs(trim_packet) == ['CTTTC']

    # with pairs
    pair_specs = ((('s.f', 'aaCTTTC'), ('s.r', 'aaCTTTC')),
                  (('s1.f', 'aa'), ('s1.r', 'aaCTTTC')))
    seqs = []
    for pair_spec in pair_specs:
        pair = []
        for name, bases in pair_spec:
            item = SeqItem(name, ['>' + name + '\n', bases + '\n'])
            pair.append(SeqWrapper(SEQITEM, item, 'fasta'))
        seqs.append(pair)
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = passed_strs(trim_packet)
    orphan_res = [get_str_seq(read) for read in trim_packet[ORPHAN_SEQS]]
    assert orphan_res == ['CTTTC']
    assert ['CTTTC', 'CTTTC'] == res

    # no drag
    # NOTE(review): the trimmers are built with the same arguments as in
    # the previous case and the very same seqs list is fed again.
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = passed_strs(trim_packet)
    orphan_res = [get_name(read) for read in trim_packet[ORPHAN_SEQS]]
    assert orphan_res == ['s1.r']
    assert ['CTTTC', 'CTTTC'] == res
def test_str_seq(self):
    'It gets the sequence as a str from fasta and fastq SeqItems'
    # (name, lines, file format, expected str seq)
    cases = (
        # with fasta
        ('s1', ['>s1\n', 'ACTGGTAC\n'], 'fasta', 'ACTGGTAC'),
        # with fastq
        ('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n'], 'fastq', 'aaaa'),
    )
    for name, lines, file_format, expected in cases:
        wrapped = SeqWrapper(SEQITEM, SeqItem(name=name, lines=lines),
                             file_format)
        assert get_str_seq(wrapped) == expected
def test_pair_grouper():
    'It groups four interleaved reads in two pairs'
    # (SeqItem name, fasta header line, sequence line)
    specs = (('s1', '>s1.f\n', 'A\n'),
             ('s1', '>s1.r\n', 'C\n'),
             ('s2', '>s2.f\n', 'T\n'),
             ('s2', '>s2.r\n', 'G\n'))
    seqs = tuple(SeqWrapper(SEQITEM, SeqItem(name, [header, bases]), 'fasta')
                 for name, header, bases in specs)

    paired_seqs = list(group_seqs_in_pairs(seqs))

    first_pair = [get_str_seq(read) for read in paired_seqs[0]]
    assert first_pair == ['A', 'C']
    second_pair = [get_str_seq(read) for read in paired_seqs[1]]
    assert second_pair == ['T', 'G']
    assert len(paired_seqs) == 2
def test_len(self):
    'It gets the length of fasta and fastq SeqItems'
    # (name, lines, file format, expected length)
    cases = (
        # with fasta, the sequence is split in two lines
        ('s1', ['>s1\n', 'ACTG\n', 'GTAC\n'], 'fasta', 8),
        # with fastq
        ('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n'], 'fastq', 4),
    )
    for name, lines, file_format, expected in cases:
        wrapped = SeqWrapper(SEQITEM, SeqItem(name=name, lines=lines),
                             file_format)
        assert get_length(wrapped) == expected
def _some_seqs(self):
    'It returns a trim packet with two single SeqRecord based seqs.'
    annotated = SeqRecord(Seq('ACCG'), letter_annotations={'dummy': 'dddd'})
    plain = SeqRecord(Seq('AAACCCGGG'))
    # every read goes alone in its own list
    seqs = [[SeqWrapper(SEQRECORD, record, None)]
            for record in (annotated, plain)]
    return {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
def test_change_name(self):
    'It renames fastq and fasta SeqItems with copy_seq'
    # with fastq, the name repeated in the + line is expected to be dropped
    fastq = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+seq\n', '!???\n']),
        'fastq')
    renamed = copy_seq(fastq, name='seq2')
    expected = ('seq2', ['@seq2\n', 'aaaa\n', '+\n', '!???\n'], {})
    assert renamed.object == expected

    # with fasta
    fasta = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['>seq\n', 'aaaa\n']),
        'fasta')
    renamed = copy_seq(fasta, name='seq2')
    expected = ('seq2', ['>seq2\n', 'aaaa\n'], {})
    assert renamed.object == expected
def test_filter_by_feat_type(self):
    'It filters the seqs with FilterByFeatureTypes'
    orf = SeqFeature(FeatureLocation(3, 4), type='ORF')
    # only seq1 carries a feature
    with_orf = SeqRecord(Seq('aaaa'), id='seq1', features=[orf])
    without_orf = SeqRecord(Seq('aaaa'), id='seq2')
    pairs = [[SeqWrapper(SEQRECORD, record, None)]
             for record in (with_orf, without_orf)]
    packet = {SEQS_PASSED: pairs, SEQS_FILTERED_OUT: []}

    filter_ = FilterByFeatureTypes(['ORF'])
    packet = filter_(packet)

    assert len(packet[SEQS_FILTERED_OUT]) == 1
    assert len(packet[SEQS_PASSED]) == 1
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    """It matches the seq pairs in a file and splits the orphan seqs.

    The file in seq_fpath is indexed with _index_seq_file and the titles
    are sorted into paired and orphan ones by _get_paired_and_orphan.
    The paired seqs are written to out_fhand and the orphan ones to
    orphan_out_fhand, both in out_format.
    """
    index_ = _index_seq_file(seq_fpath)
    paired, orphans = _get_paired_and_orphan(index_)

    # paired first, orphans afterwards
    destinations = ((paired, out_fhand), (orphans, orphan_out_fhand))
    for titles, fhand in destinations:
        # the seqs are wrapped lazily, while they are being written
        wrapped = (SeqWrapper(SEQRECORD, index_[title], None)
                   for title in titles)
        write_seqs(wrapped, fhand, out_format)
def test_with_pairs(self):
    'It filters a pair by id, with and without dragging the mate'
    pair = [SeqWrapper(object=SeqRecord(Seq('ACTG'), id=name),
                       kind=SEQRECORD, file_format=None)
            for name in ('seq1', 'seq2')]
    seqs = {SEQS_PASSED: [pair], SEQS_FILTERED_OUT: []}
    ids = ['seq1']

    # seq2 is not in the ids and the whole pair is expected to fail
    filter_by_id = FilterById(ids, failed_drags_pair=True)
    passed = _seqs_to_names(filter_by_id(seqs)[SEQS_PASSED])
    assert not passed

    # now both reads of the pair are expected to pass
    filter_by_id = FilterById(ids, failed_drags_pair=False)
    passed = _seqs_to_names(filter_by_id(seqs)[SEQS_PASSED])
    assert passed == ['seq1', 'seq2']
def test_no_name(self):
    'It groups by name with an extra read placed among the paired ones'
    extra = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

    # (order of the reads, expected groups); None marks where the extra
    # read goes, the numbers index the reads of _build_some_paired_seqs
    scenarios = (
        ((0, 1, 2, None, 3), (['A', 'C'], ['T'], ['N'], ['G'])),
        ((0, None, 1, 2, 3), (['A'], ['N'], ['C'], ['T', 'G'])),
        ((None, 0, 1, 2, 3), (['N'], ['A', 'C'], ['T', 'G'])),
    )
    for order, expected_groups in scenarios:
        built = _build_some_paired_seqs()
        seqs = tuple(extra if index is None else built[index]
                     for index in order)
        paired_seqs = list(group_pairs_by_name(seqs))
        for position, expected in enumerate(expected_groups):
            group = [get_str_seq(read) for read in paired_seqs[position]]
            assert group == expected
def test_blastmatch_filter():
    'it test filter by blast'
    blastdb = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
    match = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
    bases = 'ATCATGTAGTTACACATGAACACACACATG' + match
    query = SeqWrapper(object=SeqRecord(Seq(bases), id='seq'),
                       kind=SEQRECORD, file_format=None)
    seqs = {SEQS_PASSED: [[query]], SEQS_FILTERED_OUT: []}

    # several match filters, the seq is expected to be filtered out
    filters = [
        {'kind': 'score_threshold', 'score_key': 'expect',
         'max_score': 0.001},
        {'kind': 'score_threshold', 'score_key': 'identity',
         'min_score': 80},
        {'kind': 'min_length', 'min_percentage': 60,
         'length_in_query': True},
    ]
    filter_ = FilterBlastMatch(blastdb, 'blastn', filters=filters,
                               dbtype=NUCL)
    new_seqs = filter_(seqs)[SEQS_PASSED]
    assert new_seqs == []

    # a much lower expect threshold, the seq is expected to pass
    filters = [{'kind': 'score_threshold', 'score_key': 'expect',
                'max_score': 1e-28}]
    filter_ = FilterBlastMatch(blastdb, 'blastn', filters)
    new_seqs = filter_(seqs)[SEQS_PASSED]
    assert len(new_seqs) == 1

    # the same filter, but reversed
    filters = [{'kind': 'score_threshold', 'score_key': 'expect',
                'max_score': 1e-28}]
    filter_ = FilterBlastMatch(blastdb, 'blastn', filters, reverse=True)
    filter_packets = filter_(seqs)
    assert filter_packets[SEQS_PASSED] == []
    assert len(filter_packets[SEQS_FILTERED_OUT]) == 1
def test_seq_list_filter(self):
    'It filters the reads given a list of ids'
    pairs = [[SeqWrapper(object=SeqRecord(Seq('ACTG'), id=name),
                         kind=SEQRECORD, file_format=None)]
             for name in ('seq1', 'seq2')]
    seqs = {SEQS_PASSED: pairs, SEQS_FILTERED_OUT: []}
    ids = ['seq1']

    # with the ids in a list
    filter_by_id = FilterById(ids)
    passed = _seqs_to_names(filter_by_id(seqs)[SEQS_PASSED])
    assert passed == ['seq1']

    # with the ids in a set and the filter reversed
    filter_by_id = FilterById(set(ids), reverse=True)
    passed = _seqs_to_names(filter_by_id(seqs)[SEQS_PASSED])
    assert passed == ['seq2']
def test_blast_short_trimming(self):
    'It trims oligos using blast-short'
    oligo_bases = ('AAGCAGTGGTATCAACGCAGAGTACATGGG',
                   'AAGCAGTGGTATCAACGCAGAGTACTTTTT')

    def vector_recommendations(packet):
        'It lists the VECTOR trimming recommendations of every read'
        return [
            get_annotations(read).get(TRIMMING_RECOMMENDATIONS,
                                      {}).get(VECTOR, [])
            for pair in packet[SEQS_PASSED] for read in pair
        ]

    # With SeqRecords
    adaptors = [SeqWrapper(SEQRECORD, SeqRecord(Seq(bases)), None)
                for bases in oligo_bases]
    blast_trim = TrimWithBlastShort(oligos=adaptors)
    fhand = StringIO(FASTQ4)
    seq_packets = read_seq_packets([fhand],
                                   prefered_seq_classes=[SEQRECORD])
    trim_packets = list(seq_to_trim_packets(seq_packets))
    trim_packets2 = blast_trim(trim_packets[0])
    # It should trim the first and the second reads.
    assert vector_recommendations(trim_packets2) == [[(0, 29)], [(0, 29)],
                                                     []]

    # With SeqItems
    adaptors = []
    for number, bases in enumerate(oligo_bases, 1):
        name = 'oligo' + str(number)
        item = SeqItem(name, ['>' + name + '\n', bases + '\n'])
        adaptors.append(SeqWrapper(SEQITEM, item, 'fasta'))
    blast_trim = TrimWithBlastShort(oligos=adaptors)
    fhand = StringIO(FASTQ4)
    seq_packets = list(
        read_seq_packets([fhand], prefered_seq_classes=[SEQITEM]))
    trim_packets = list(seq_to_trim_packets(seq_packets))
    trim_packets2 = blast_trim(trim_packets[0])
    # It should trim the first and the second reads.
    assert vector_recommendations(trim_packets2) == [[(0, 29)], [(0, 29)],
                                                     []]
def create_seq(index):
    'It creates a random seq with a linker'

    def random_dna(length):
        'It returns a str with randomly chosen nucleotides'
        return ''.join(choice('ACTG') for _ in range(length))

    # 100 random nucleotides at each side of the linker
    left = random_dna(100)
    right = random_dna(100)
    record = SeqRecord(id='seq_' + str(index),
                       seq=Seq(left + linker + right))
    return SeqWrapper(SEQRECORD, record, None)
def xtest_blaster(self):
    'It looks for the matched segments of a seq with a remote blast'
    # NOTE(review): presumably the x prefix keeps the test runners from
    # collecting this test, because it requires a remote blast against nr
    # -- confirm.
    seq = ''.join([
        'GAGAAATTCCTTTGGAAGTTATTCCGTAGCATAAGAGCTGAAACTTCAGAGCAAGTTT',
        'TCATTGGGCAAAATGGGGGAACAACCTATCTTCAGCACTCGAGCTCATGTCTTCCAAATTGA',
        'CCCAAACACAAAGAAGAACTGGGTACCCACCAGCAAGCATGCAGTTACTGTGTCTTATTTCT',
        'ATGACAGCACAAGAAATGTGTATAGGATAATCAGTTTAGATGGCTCAAAGGCAATAATAAAT',
        'AGTACCATCACCCCAAACATGACA',
    ])
    seqrec = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None)
    blaster = Blaster([seqrec], 'nr', 'blastn', remote=True)
    # The segments are asked for only once and reused for the debug
    # output and for the check.
    segments = blaster.get_matched_segments('seq')
    # print with parentheses and one argument behaves the same in
    # Python 2 and 3, the old print statement was a SyntaxError in 3
    print(segments)
    assert segments == [(1, 1740)]
def test_int_qualities(self):
    'It gets the qualities as integers from SeqItems'
    # with fasta there are no qualities, an AttributeError is expected
    fasta = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
    fasta = SeqWrapper(SEQITEM, fasta, 'fasta')
    try:
        assert get_int_qualities(fasta)
        self.fail('AttributeError expected')
    except AttributeError:
        pass

    # (lines, file format, expected qualities)
    cases = (
        # with fastq
        (['@seq\n', 'aaaa\n', '+\n', '!???\n'], 'fastq', [0, 30, 30, 30]),
        # with fastq-illumina
        (['@seq\n', 'aaaaaaaa\n', '+\n', '@AAABBBB\n'], 'fastq-illumina',
         [0, 1, 1, 1, 2, 2, 2, 2]),
    )
    for lines, file_format, expected in cases:
        wrapped = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=lines),
                             file_format)
        assert list(get_int_qualities(wrapped)) == expected
def test_dustscore_calculation():
    'It calculates the dust score'
    seqs = ['TTTTTTTTTTTTTTTTTTTTTTTTTTTT', 'TATATATATATATATATATATATATATA',
            'GAAGAAGAAGAAGAAGAAGAAGAAGAAG', 'AACTGCAGTCGATGCTGATTCGATCGAT',
            'AACTGAAAAAAAATTTTTTTAAAAAAAA']
    # short sequences
    scores = [100, 48, 30.76, 4.31, 23.38]
    # the same sequences repeated three and four times
    scoresx3 = [100, 48.68, 28.65, 5.62, 27.53]
    scoresx4 = [100, 48.55, 28.25, 5.79, 28.00]
    for seq, score, scorex3, scorex4 in zip(seqs, scores, scoresx3,
                                            scoresx4):
        for repeats, expected in ((1, score), (3, scorex3), (4, scorex4)):
            seqrec = SeqRecord(Seq(seq * repeats))
            seqrec = SeqWrapper(SEQRECORD, seqrec, None)
            # The difference has to be checked in absolute value,
            # otherwise any score lower than the expected one would pass.
            assert abs(calculate_dust_score(seqrec) - expected) < 0.01
def test_bam_filter():
    'it test filter by being mapped in a BAM file'
    # reads named from seq16 to seq22, each one alone in its list
    names = ['seq{}'.format(number) for number in range(16, 23)]
    reads = [[SeqWrapper(SEQRECORD, SeqRecord(seq=Seq('aaa'), id=name),
                         None)]
             for name in names]
    bam_fpath = os.path.join(TEST_DATA_DIR, 'seqs.bam')

    filter_ = FilterByBam([bam_fpath])
    new_filterpacket = filter_({SEQS_PASSED: reads, SEQS_FILTERED_OUT: []})

    passed = _seqs_to_names(new_filterpacket[SEQS_PASSED])
    assert passed == ['seq16', 'seq17', 'seq18']
    filtered_out = _seqs_to_names(new_filterpacket[SEQS_FILTERED_OUT])
    assert filtered_out == ['seq19', 'seq20', 'seq21', 'seq22']
def test_blastmatch_filter():
    'it test filter by blast'
    # This test exercises FilterBlastShort with one oligo.
    insert_bases = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
    oligo_bases = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'

    oligo = SeqWrapper(SEQRECORD, SeqRecord(Seq(oligo_bases)), None)
    # one read without the oligo and another one with it at the end
    clean = SeqWrapper(object=SeqRecord(Seq(insert_bases), id='seq'),
                       kind=SEQRECORD, file_format=None)
    with_oligo = SeqWrapper(
        object=SeqRecord(Seq(insert_bases + oligo_bases), id='seq_oligo'),
        kind=SEQRECORD, file_format=None)
    seqs = {SEQS_PASSED: [[clean], [with_oligo]], SEQS_FILTERED_OUT: []}

    filter_ = FilterBlastShort([oligo])
    filt_packet = filter_(seqs)

    passed = [get_name(pair[0]) for pair in filt_packet[SEQS_PASSED]]
    fail = [get_name(pair[0]) for pair in filt_packet[SEQS_FILTERED_OUT]]
    assert passed == ['seq']
    assert fail == ['seq_oligo']
def __call__(self, seqs):
    """It orientates seqs, that should have a SeqRecord in it.

    The annotators configured in self._annotators are tried in order.
    Each one only gets the seqs that still have no orientation (None or
    0) and the strands guessed from its annotations are kept. The seqs
    guessed as reversed (strand -1) are reverse complemented and the
    reason is appended to their description.

    seqs: an iterable of seqs whose object has a reverse_complement
        method like the one of SeqRecord.

    It returns a list with the seqs in the given order. The reversed
    ones are new SEQRECORD SeqWrappers, the rest are returned as given.
    """
    # A list allows any iterable and lets us address the seqs by index.
    seqs = list(seqs)
    # Built up front, so having no annotators just returns the seqs.
    orientations = [None] * len(seqs)
    orientation_log = [None] * len(seqs)

    for annotator_conf in self._annotators:
        # The very same indexes are used to pick the seqs to analyze and
        # to store the results, so both can not get out of step.
        pending = [index for index, strand in enumerate(orientations)
                   if not strand]
        seqs_to_analyze = [seqs[index] for index in pending]

        annotator_name = annotator_conf['name']
        blastdb = annotator_conf.get('blastdb', None)
        annotator = self._get_annotator(annotator_name, blastdb)
        annot_seqrecords = annotator(seqs_to_analyze)
        annot_strands = self._guess_orientations(annot_seqrecords,
                                                 annotator_name,
                                                 blastdb=blastdb)
        if blastdb:
            # the database is part of the reason to be logged
            annotator_name += ' ' + os.path.basename(blastdb)

        for position, index in enumerate(pending):
            strand = annot_strands[position]
            orientations[index] = strand
            if strand == -1:
                # reverse
                orientation_log[index] = annotator_name

    # Now we reverse the seqs that we have guess that are reversed
    reorientated_seqrecords = []
    for orientation, seq, reason in zip(orientations, seqs,
                                        orientation_log):
        if orientation == -1:
            rev_seqrecord = seq.object.reverse_complement(id=True,
                                                          description=True,
                                                          annotations=True,
                                                          features=True,
                                                          dbxrefs=True,
                                                          name=True)
            seq = SeqWrapper(SEQRECORD, rev_seqrecord, None)
            # we mark the reason why it has been reversed
            text = '(reversed because of: {})'.format(reason)
            append_to_description(seq, text)
        reorientated_seqrecords.append(seq)
    return reorientated_seqrecords
def alignedread_to_seqitem(aligned_read, start_pos=0, end_pos=None):
    """It builds a SeqItem based SeqWrapper out of an aligned read.

    The sequence, and the qualities if there are any, are sliced with
    start_pos and end_pos before any reversal. For the reads flagged as
    reverse the sequence is reverse complemented and the qualities are
    reversed. The result is a fastq when the read has qualities and a
    fasta otherwise.

    It returns None when there is no read or the read has no sequence.
    """
    if aligned_read is None:
        return None
    if aligned_read.seq is None:
        return None

    name = aligned_read.qname
    window = slice(start_pos, end_pos)

    seq = aligned_read.seq[window]
    quals = aligned_read.qual
    if aligned_read.is_reverse:
        seq = _reverse(_complementary(seq))

    if quals is None:
        # no qualities, so a fasta item is all we can build
        fasta_lines = ['>' + name + '\n', seq + '\n']
        return SeqWrapper(SEQITEM, SeqItem(name, fasta_lines), 'fasta')

    quals = quals[window]
    if aligned_read.is_reverse:
        quals = _reverse(quals)
    fastq_lines = ['@' + name + '\n', seq + '\n', '+\n', quals + '\n']
    return SeqWrapper(SEQITEM, SeqItem(name, fastq_lines), 'fastq')