Exemplo n.º 1
0
def test_min_cutter_T_T():
    unconditional_before = UnconditionalCutter((2, -2))
    unconditional_after = UnconditionalCutter((1, -1))
    min_trimmer = MinCutter((5, -5), True, True)

    read1 = Sequence('read1', "CAATCGATCGAACGTACCGAT")
    assert read1.clipped == [0, 0, 0, 0], str(read1.clipped)
    read1 = unconditional_before(read1)
    assert read1.sequence == "ATCGATCGAACGTACCG"
    assert read1.clipped == [2, 2, 0, 0], str(read1.clipped)

    # test without adapter trimming
    assert min_trimmer(read1).sequence == "ATCGATCGAACGTACCG"

    # test with adapter trimming
    read2 = read1[:]
    read2.sequence = "ATCGAACGTACCG"
    read2.match, read2.match_info = front_match(read2)
    read3 = min_trimmer(read2)
    assert read3.sequence == "TCGAACGTACCG", read3.sequence
    assert read3.clipped == [2, 2, 1, 0]

    # test with subsequent clipping
    read4 = unconditional_after(read2)
    assert read4.sequence == "TCGAACGTACC", read4.sequence
    assert read4.clipped == [2, 2, 1, 1], read4.clipped
    read5 = min_trimmer(read4)
    assert read5.sequence == "TCGAACGTACC", read5.sequence
    assert read5.clipped == [2, 2, 1, 1], read5.clipped
Exemplo n.º 2
0
def test_ncontentfilter_paired():
    params = [
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    ]
    for seq1, seq2, count, expected in params:
        filter = NContentFilter(count=count)
        filter_legacy = SingleWrapper(filter)
        filter_both = PairedWrapper(filter)
        read1 = Sequence('read1', seq1, qualities='#' * len(seq1))
        read2 = Sequence('read1', seq2, qualities='#' * len(seq2))
        assert filter_legacy(read1, read2) == filter(read1)
        # discard entire pair if one of the reads fulfills criteria
        assert filter_both(read1, read2) == expected
Exemplo n.º 3
0
 def test(self):
     reads = [
         (Sequence('A/1 comment', 'TTA', '##H'),
         Sequence('A/2 comment', 'GCT', 'HH#')),
         (Sequence('B/1', 'CC', 'HH'),
         Sequence('B/2', 'TG', '#H'))
     ]
     fmt = InterleavedFormatter(FastqFormat(), "foo")
     result = defaultdict(lambda: [])
     for read1, read2 in reads:
         fmt.format(result, read1, read2)
     assert fmt.written == 2
     assert fmt.read1_bp == 5
     assert fmt.read2_bp == 5
     assert "foo" in result
     assert "".join(result["foo"]) == '@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
Exemplo n.º 4
0
def test_unconditional_cutter():
    uc = UnconditionalCutter(lengths=[5])
    s = Sequence("read1", 'abcdefg')
    assert UnconditionalCutter(lengths=[2])(s).sequence == 'cdefg'
    assert UnconditionalCutter(lengths=[-2])(s).sequence == 'abcde'
    assert UnconditionalCutter(lengths=[100])(s).sequence == ''
    assert UnconditionalCutter(lengths=[-100])(s).sequence == ''
Exemplo n.º 5
0
def test_linked_adapter():
    linked_adapter = LinkedAdapter('AAAA', 'TTTT')
    sequence = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    match = linked_adapter.match_to(sequence)
    trimmed = linked_adapter.trimmed(match)
    assert trimmed.name == 'seq'
    assert trimmed.sequence == 'CCCCC'
Exemplo n.º 6
0
def test_error_correction_no_insert_match_one_adapter_match():
    a1 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTC'
    a2 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTTA'
    a2_mod = 'ACATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTTA'
    frag = 'CCAAGCAGACATTCACTCAGATTGCA'
    correct_frag = 'CCAAGTAGACATTCGCTCAGATTGCA'

    r1 = list(frag)
    # C>T at pos 6
    r1[5] = 'T'
    q1 = ['#'] * 40
    # quality of read1 > quality of read2 at pos 6
    q1[5] = 'A'
    r1 = (''.join(r1) + a1)[0:40]
    q1 = ''.join(q1)

    r2 = list(frag)
    # A>G at pos 15
    r2[14] = 'G'
    q2 = ['#'] * 40
    # quality of read2 > quality of read1 at pos 11
    q2[len(frag) - 15] = 'A'
    r2 = reverse_complement(reverse_complement(a2_mod) + ''.join(r2))[0:40]
    q2 = ''.join(q2)

    read1 = Sequence('foo', r1, q1)
    read2 = Sequence('foo', r2, q2)

    parser1 = AdapterParser()
    adapter1 = parser1.parse(a1)
    # Allow zero mismatches to prevent adapter alignment
    parser2 = AdapterParser(max_error_rate=0)
    adapter2 = parser2.parse(a2)

    # Allow zero mismatches to prevent insert alignment
    cutter = InsertAdapterCutter(adapter1,
                                 adapter2,
                                 mismatch_action='liberal',
                                 max_insert_mismatch_frac=0)

    new_read1, new_read2 = cutter(read1, read2)
    assert len(new_read1) == 26
    assert not new_read1.insert_overlap
    assert new_read1.sequence == correct_frag
    assert len(new_read2) == 26
    assert not new_read2.insert_overlap
    assert new_read2.sequence == reverse_complement(correct_frag)
Exemplo n.º 7
0
def test_ncontentfilter():
    # third parameter is True if read should be discarded
    params = [('AAA', 0, KEEP), ('AAA', 1, KEEP), ('AAACCTTGGN', 1, KEEP),
              ('AAACNNNCTTGGN', 0.5, KEEP), ('NNNNNN', 1, DISCARD),
              ('ANAAAA', 1 / 6, KEEP), ('ANAAAA', 0, DISCARD)]
    for seq, count, expected in params:
        filter = NContentFilter(count=count)
        _seq = Sequence('read1', seq, qualities='#' * len(seq))
        assert filter(_seq) == expected
Exemplo n.º 8
0
def test_statistics():
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [Adapter('CCCC', BACK, 0.1)]
    cutter = AdapterCutter(adapters, times=3)
    trimmed_read = cutter(read)
    # TODO make this a lot simpler
    trimmed_bp = 0
    for adapter in adapters:
        for d in (adapter.lengths_front, adapter.lengths_back):
            trimmed_bp += sum(seqlen * count for (seqlen, count) in d.items())
    assert trimmed_bp <= len(read), trimmed_bp
Exemplo n.º 9
0
def test_min_cutter_T_F():
    unconditional_before = UnconditionalCutter((2, -2))
    min_trimmer = MinCutter((4, -4), True, False)

    read1 = Sequence('read1', "CAATCGATCGAACGTACCGAT")
    read1 = unconditional_before(read1)
    assert read1.sequence == "ATCGATCGAACGTACCG"
    assert read1.clipped == [2, 2, 0, 0]

    # test without adapter trimming
    assert min_trimmer(read1).sequence == "CGATCGAACGTAC"
Exemplo n.º 10
0
def test_Modifiers_single():
    m = Modifiers(paired=False)
    m.add_modifier(UnconditionalCutter, lengths=[5])
    mod1 = m.get_modifiers(read=1)
    mod2 = m.get_modifiers(read=2)
    assert len(mod1) == 1
    assert isinstance(mod1[0], UnconditionalCutter)
    assert len(mod2) == 0
    # test single-end
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
    mod_read, mod_bp = m.modify(read)
    assert mod_read[0].sequence == 'TACGTA'
Exemplo n.º 11
0
def test_overwrite_read():
    overwrite = OverwriteRead(20, 40, 10)
    lowseq = 'ACGT' * 5
    highseq = 'TCAG' * 5

    # mean lowq > 20, mean highq > 40
    lowq = (11, 31, 16, 24, 16, 20, 17, 19, 21, 28) * 2
    highq = (22, 62, 32, 48, 32, 40, 34, 38, 42, 56) * 2
    read1 = Sequence('foo', lowseq, ints2quals(lowq))
    read2 = Sequence('foo', highseq, ints2quals(highq))
    new_read1, new_read2 = overwrite(read1, read2)
    assert new_read1.sequence == lowseq
    assert new_read1.qualities == ints2quals(lowq)
    assert new_read2.sequence == highseq
    assert new_read2.qualities == ints2quals(highq)
    assert new_read1.corrected == new_read2.corrected == 0

    # mean lowq < 20, mean highq > 40
    lowq = tuple(i - 1 for i in lowq)
    read1 = Sequence('foo', lowseq, ints2quals(lowq))
    new_read1, new_read2 = overwrite(read1, read2)
    assert new_read1.sequence == rc(highseq)
    assert new_read1.qualities == ints2quals(reversed(highq))
    assert new_read2.sequence == highseq
    assert new_read2.qualities == ints2quals(highq)
    assert new_read1.corrected == new_read2.corrected == 1

    # mean lowq < 20, mean highq < 40
    highq = tuple(i - 1 for i in highq)
    read2 = Sequence('foo', highseq, ints2quals(highq))
    new_read1, new_read2 = overwrite(read1, read2)
    assert new_read1.sequence == lowseq
    assert new_read1.qualities == ints2quals(lowq)
    assert new_read2.sequence == highseq
    assert new_read2.qualities == ints2quals(highq)
    assert new_read1.corrected == new_read2.corrected == 0
Exemplo n.º 12
0
def test_issue_52():
    adapter = Adapter(sequence='GAACTCCAGTCACNNNNN',
                      where=BACK,
                      max_error_rate=0.12,
                      min_overlap=5,
                      read_wildcards=False,
                      adapter_wildcards=True)
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    am = Match(astart=0,
               astop=17,
               rstart=5,
               rstop=21,
               matches=15,
               errors=2,
               front=None,
               adapter=adapter,
               read=read)
    assert am.wildcards() == 'GGC'
    """
Exemplo n.º 13
0
def test_issue_80():
    # This issue turned out to not be an actual issue with the alignment
    # algorithm. The following alignment is found because it has more matches
    # than the 'obvious' one:
    #
    # TCGTATGCCGTCTTC
    # =========X==XX=
    # TCGTATGCCCTC--C
    #
    # This is correct, albeit a little surprising, since an alignment without
    # indels would have only two errors.

    adapter = Adapter(sequence="TCGTATGCCGTCTTC",
                      where=BACK,
                      max_error_rate=0.2,
                      min_overlap=3,
                      read_wildcards=False,
                      adapter_wildcards=False)
    read = Sequence(name="seq2", sequence="TCGTATGCCCTCC")
    result = adapter.match_to(read)
    assert read.original_length == 13, result
    assert result.errors == 3, result
    assert result.astart == 0, result
    assert result.astop == 15, result
Exemplo n.º 14
0
import sys
import os
from io import StringIO
import shutil
from textwrap import dedent
from tempfile import mkdtemp
from atropos.seqio import (Sequence, ColorspaceSequence, FormatError,
    FastaReader, FastqReader, FastaQualReader, InterleavedSequenceReader,
    FastaFormat, FastqFormat, InterleavedFormatter, get_format,
    open_reader as openseq, sequence_names_match, open_output)
from atropos.xopen import xopen, open_output
from .utils import temporary_path

# files tests/data/simple.fast{q,a}
simple_fastq = [
    Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"),
    Sequence("second_sequence", "SEQUENCE2", "83<??:(61")
    ]

simple_fasta = [ Sequence(x.name, x.sequence, None) for x in simple_fastq ]


class TestSequence:
    def test_too_many_qualities(self):
        with raises(FormatError):
            Sequence(name="name", sequence="ACGT", qualities="#####")

    def test_too_many_qualities_colorspace(self):
        with raises(FormatError):
            ColorspaceSequence(name="name", sequence="T0123", qualities="#####")
Exemplo n.º 15
0
def test_str():
    a = Adapter('ACGT', where=BACK, max_error_rate=0.1)
    str(a)
    str(a.match_to(Sequence(name='seq', sequence='TTACGT')))
    ca = ColorspaceAdapter('0123', where=BACK, max_error_rate=0.1)
    str(ca)
Exemplo n.º 16
0
def test_overlapping():
    trimmer = MergeOverlapping(min_overlap=10, error_rate=0.1)
    a1 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTC'
    a2 = reverse_complement('AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTTA')
    frag = 'CCAAGCAGACATTCACTCAGATTGCA'
    r1 = (frag + a1)[0:40]
    q1 = '#' * 40
    r2 = reverse_complement(a2 + frag)[0:40]
    q2 = '!' * 40
    parser = AdapterParser()
    adapter1 = parser.parse(a1)
    adapter2 = parser.parse(a2)
    cutter = AdapterCutter([adapter1, adapter2])
    read1 = Sequence('foo', r1, q1)
    read1 = cutter(read1)
    assert len(read1) == 26
    read2 = Sequence('foo', r2, q2)
    read2 = cutter(read2)
    assert len(read2) == 26

    # complete overlap
    read1_merged, read2_merged = trimmer(read1, read2)
    assert read1_merged.merged
    assert read2_merged is None
    assert read1 == read1_merged

    # partial overlap
    read1.merged = False
    read2 = read2.subseq(0, 24)[2]
    read1_merged, read2_merged = trimmer(read1, read2)
    assert read1_merged.merged
    assert read2_merged is None
    assert read1 == read1_merged

    # partial overlap r1, r2
    read1.merged = False
    read1 = read1.subseq(0, 24)[2]
    read1_merged, read2_merged = trimmer(read1, read2)
    assert read1_merged.merged
    assert read2_merged is None
    assert len(read1_merged) == 26
    assert read1_merged.sequence == 'CCAAGCAGACATTCACTCAGATTGCA'
    assert read1_merged.qualities == ('#' * 24) + ('!' * 2)

    # errors
    # round(0.1 * 24) = 2, so 2 errors should pass but 3 should not
    read1.merged = False
    r1_seq = list(read1.sequence)
    r1_seq[10] = reverse_complement(r1_seq[10])
    r1_seq[20] = reverse_complement(r1_seq[20])
    read1.sequence = "".join(r1_seq)
    read1_merged, read2_merged = trimmer(read1, read2)
    assert read1_merged.merged
    assert read2_merged is None
    assert len(read1_merged) == 26
    assert read1_merged.sequence == 'CCAAGCAGACTTTCACTCAGTTTGCA'
    assert read1_merged.qualities == ('#' * 24) + ('!' * 2)

    # too few overlapping bases
    read1.merged = False
    r1_seq[15] = reverse_complement(r1_seq[15])
    read1.sequence = "".join(r1_seq)
    read1_merged, read2_merged = trimmer(read1, read2)
    assert read1_merged.merged is False
    assert read2 is not None
Exemplo n.º 17
0
 def match(name1, name2):
     seq1 = Sequence(name1, 'ACGT')
     seq2 = Sequence(name2, 'AACC')
     return sequence_names_match(seq1, seq2)
Exemplo n.º 18
0
 def test_too_many_qualities(self):
     with raises(FormatError):
         Sequence(name="name", sequence="ACGT", qualities="#####")
Exemplo n.º 19
0
def test_TruSeq_trimmer():
    trimmer = TruSeqBisulfiteTrimmer()
    read1 = Sequence('read1', "CTATCGATCCACGAGACTAAC")
    assert trimmer(read1).sequence == "ATCCACGAGACTAAC"