示例#1
0
def test_adapter_cutter():
    """AdapterCutter.best_match must report the better of two back adapters.

    The two candidates differ in a single base; the read ends with a prefix
    of the second candidate, so that one is expected to be returned.
    """
    from cutadapt.adapters import BackAdapter

    candidates = [BackAdapter("GTAGTCCCGC"), BackAdapter("GTAGTCCCCC")]
    read = Sequence("name", "ATACCCCTGTAGTCCCC")
    best = AdapterCutter.best_match(candidates, read)
    assert best.adapter is candidates[1]
示例#2
0
def test_add_adapter_statistics():
    """In-place addition (``+=``) of two statistics objects merges counters.

    Both the ``adjacent_bases`` counts and the nested ``errors`` tables of
    the ``back`` end are expected to be summed entry by entry; entries that
    exist on only one side are carried over unchanged.
    """
    def new_statistics():
        # Both operands are created from identically configured adapters.
        return BackAdapter('A', name='name',
                           max_error_rate=0.1).create_statistics()

    def fill(end, adjacent, errors):
        # Same item assignments as spelling them out one by one, in order.
        for base, count in adjacent:
            end.adjacent_bases[base] = count
        for length, n_errors, count in errors:
            end.errors[length][n_errors] = count

    stats = new_statistics()
    fill(
        stats.back,
        adjacent=[('A', 7), ('C', 19), ('G', 23), ('T', 42), ('', 45)],
        errors=[
            (10, 0, 100), (10, 1, 11), (10, 2, 3),
            (20, 0, 600), (20, 1, 66), (20, 2, 6),
        ],
    )

    stats2 = new_statistics()
    fill(
        stats2.back,
        adjacent=[('A', 43), ('C', 31), ('G', 27), ('T', 8), ('', 5)],
        errors=[
            (10, 0, 234), (10, 1, 14), (10, 3, 5),
            (15, 0, 90), (15, 1, 17), (15, 2, 2),
        ],
    )

    stats += stats2
    merged = stats.back

    expected_adjacent = {'A': 50, 'C': 50, 'G': 50, 'T': 50, '': 50}
    expected_errors = {
        10: {0: 334, 1: 25, 2: 3, 3: 5},
        15: {0: 90, 1: 17, 2: 2},
        20: {0: 600, 1: 66, 2: 6},
    }
    assert merged.adjacent_bases == expected_adjacent
    assert merged.errors == expected_errors
示例#3
0
def test_paired_adapter_cutter_actions(action, expected_trimmed1,
                                       expected_trimmed2):
    """Apply PairedAdapterCutter with *action* and check both trimmed reads.

    The three arguments are supplied from outside this function (presumably
    by a pytest parametrization that is not visible here).
    """
    adapter_r1 = BackAdapter("GGTTAA")
    adapter_r2 = BackAdapter("AACCGG")
    read1 = Sequence("name", "CCCCGGTTAACCCC")
    read2 = Sequence("name", "TTTTAACCGGTTTT")

    cutter = PairedAdapterCutter([adapter_r1], [adapter_r2], action=action)
    result = cutter(read1, read2,
                    ModificationInfo(read1), ModificationInfo(read2))
    out1, out2 = result

    assert expected_trimmed1 == out1.sequence
    assert expected_trimmed2 == out2.sequence
示例#4
0
def test_random_match_probabilities():
    """Check random_match_probabilities() for back and front adapters.

    The method is called with 0.5 and 0.2 (presumably the assumed GC
    content) and must return exactly the lists spelled out below.
    """
    # Expected values for the four-base adapters; the products are written
    # exactly as in the comparison so the floats are bit-identical.
    at_half = [1, 0.25, 0.25**2, 0.25**3, 0.25**4]
    at_fifth = [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1]

    single = BackAdapter('A', max_errors=0.1).create_statistics()
    assert single.back.random_match_probabilities(0.5) == [1, 0.25]
    assert single.back.random_match_probabilities(0.2) == [1, 0.4]

    for bases in ('ACTG', 'XMWH'):
        back_stats = BackAdapter(bases, max_errors=0.1).create_statistics()
        assert back_stats.back.random_match_probabilities(0.5) == at_half
        assert back_stats.back.random_match_probabilities(0.2) == at_fifth

    front_stats = FrontAdapter('GTCA', max_errors=0.1).create_statistics()
    assert front_stats.front.random_match_probabilities(0.5) == at_half
    assert front_stats.front.random_match_probabilities(0.2) == at_fifth
示例#5
0
def test_action_retain():
    """With action="retain" the read is expected to keep the adapter itself.

    Only the bases following the adapter occurrence should be removed.
    """
    adapter = BackAdapter("AACCGG")
    cutter = AdapterCutter([adapter], action="retain")
    read = Sequence("r1", "ATTGCCAACCGGTATATAT")
    result = cutter(read, ModificationInfo(read))
    assert "ATTGCCAACCGG" == result.sequence
def test_back_adapter_absolute_number_of_errors():
    """An integer ``max_errors`` must be exposed as a rate over the length.

    One allowed error on a ten-base adapter has to yield a
    ``max_error_rate`` of 1/10.
    """
    settings = dict(sequence="GATCGGAAGA", max_errors=1, min_overlap=3)
    back = BackAdapter(**settings)
    assert back.max_error_rate == 1 / 10
def test_info_record():
    """Check the info record produced by a hand-built RemoveAfterMatch.

    The expected single record holds, among empty fields, the error count
    (2), the read coordinates of the match (5 and 21), the read split at
    those coordinates, and the adapter name.
    """
    adapter_kwargs = dict(
        sequence='GAACTCCAGTCACNNNNN',
        max_errors=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
        name="Foo",
    )
    foo = BackAdapter(**adapter_kwargs)
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    hit = RemoveAfterMatch(
        astart=0, astop=17,
        rstart=5, rstop=21,
        matches=15, errors=2,
        adapter=foo,
        sequence=read.sequence,
    )

    expected_row = [
        "", 2, 5, 21,
        'CCCCA', 'GAACTACAGTCCCGGC', '',
        'Foo',
        '', '', '',
    ]
    assert hit.get_info_records(read) == [expected_row]
def test_back_adapter_indel_and_mismatch_occurrence():
    """The earlier of two one-error occurrences must be the reported match.

    The read holds the adapter once with a missing base and, further right,
    once with a substitution; the coordinates asserted below are those of
    the first occurrence.
    """
    back = BackAdapter(
        sequence="GATCGGAAGA",
        max_errors=0.1,
        min_overlap=3,
    )
    found = back.match_to("CTGGATCGGAGAGCCGTAGATCGGGAGAGGC")
    # CTGGATCGGA-GAGCCGTAGATCGGGAGAGGC
    #    ||||||| ||      ||||||X|||
    #    GATCGGAAGA      GATCGGAAGA
    assert (found.errors, found.matches) == (1, 9)
    assert (found.astart, found.astop) == (0, 10)
    assert (found.rstart, found.rstop) == (3, 12)
示例#9
0
def test_end_trim_with_mismatch():
    """
    Cover the non-obvious case of a 13-base adapter trimmed from the end of
    a read with an overlap of 9 and a single deletion.
    Here the algorithm starts out with 10 adapter bases to obtain the hit,
    so the match counts as good. An insertion or a substitution at the same
    position does not give a match.
    """
    adapter = BackAdapter("TCGATCGATCGAT", max_errors=0.1)

    # Case 1: deletion -> the adapter occurrence is removed.
    with_deletion = Sequence('foo1', 'AAAAAAAAAAATCGTCGATC')
    cutter = AdapterCutter([adapter], times=1)
    result = cutter(with_deletion, ModificationInfo(with_deletion))
    back_stats = cutter.adapter_statistics[adapter].back

    assert result.sequence == 'AAAAAAAAAAA'
    assert back_stats.lengths == {9: 1}
    # One error is recorded at length 9 although zero mismatches are allowed
    # at that length.
    assert back_stats.errors[9][1] == 1

    # Case 2: no acceptable occurrence -> the read stays untouched.
    unmatched = Sequence('foo2', 'AAAAAAAAAAATCGAACGA')
    cutter = AdapterCutter([adapter], times=1)
    result = cutter(unmatched, ModificationInfo(unmatched))

    assert result.sequence == unmatched.sequence
    assert cutter.adapter_statistics[adapter].back.lengths == {}
示例#10
0
def test_linked_matches_property():
    """The ``matches`` property works for non-anchored linked adapters."""
    # Regression check for issue #265
    linked = LinkedAdapter(
        FrontAdapter("GGG"),
        BackAdapter("TTT"),
        front_required=False,
        back_required=False,
        name='name',
    )
    linked_match = linked.match_to("AAAATTTT")
    assert linked_match.matches == 3
示例#11
0
def test_back_adapter_indel_and_exact_occurrence():
    """The leftmost adapter occurrence wins even if a later one is exact.

    The read contains the adapter twice: first with one extra base in the
    read (an indel), then as an exact copy. The match reported by
    ``match_to`` is expected to be the first, leftmost occurrence.
    """
    adapter = BackAdapter(
        sequence="GATCGGAAGA",
        max_errors=0.1,
        min_overlap=3,
    )
    match = adapter.match_to("GATCGTGAAGAGATCGGAAGA")
    # We want the leftmost match of these two possible ones:
    # GATCGTGAAGAGATCGGAAGA
    # GATCG-GAAGA
    #            GATCGGAAGA
    #
    # The leftmost occurrence covers eleven read bases (read[0:11]) because
    # of the extra T, so it carries exactly one error. A zero-error match
    # ending at read position 10 is impossible: read[0:10] is "GATCGTGAAG",
    # which differs from the adapter.
    assert match.errors == 1
    # All ten adapter bases pair with an identical read base in the diagram.
    assert match.matches == 10
    assert match.astart == 0
    assert match.astop == 10
    assert match.rstart == 0
    assert match.rstop == 11
示例#12
0
def test_issue_80():
    """Pin the alignment that was reported as surprising in issue 80."""
    # The report turned out not to be a defect in the alignment algorithm.
    # The alignment shown here is chosen because it has more matches than
    # the 'obvious' one:
    #
    # TCGTATGCCGTCTTC
    # =========X==XX=
    # TCGTATGCCCTC--C
    #
    # That is correct, if a little surprising, because an alignment without
    # indels would contain only two errors.

    options = dict(
        sequence="TCGTATGCCGTCTTC",
        max_errors=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False,
    )
    hit = BackAdapter(**options).match_to("TCGTATGCCCTCC")
    assert hit.errors == 3, hit
    assert hit.astart == 0, hit
    assert hit.astop == 15, hit
示例#13
0
def test_statistics():
    """The recorded trimmed length must never exceed the read length.

    The read is processed with ``times=3``; afterwards the front and back
    length histograms of every adapter are summed up as length * count.
    """
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [BackAdapter("CCCC", max_errors=0.1)]
    cutter = AdapterCutter(adapters, times=3)
    cutter(read, ModificationInfo(read))
    # TODO: this bookkeeping could still be made a lot simpler
    trimmed_bp = sum(
        length * occurrences
        for adapter in adapters
        for histogram in (cutter.adapter_statistics[adapter].front.lengths,
                          cutter.adapter_statistics[adapter].back.lengths)
        for length, occurrences in histogram.items()
    )
    assert trimmed_bp <= len(read), trimmed_bp
示例#14
0
def test_issue_52():
    """Pin the wildcard string reported for a hand-built match (issue 52).

    A RemoveAfterMatch is constructed directly with fixed coordinates for an
    adapter that ends in ``N`` wildcards; ``wildcards()`` is expected to
    return 'GGC' for it.
    """
    adapter = BackAdapter(
        sequence='GAACTCCAGTCACNNNNN',
        max_errors=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True)
    sequence = "CCCCAGAACTACAGTCCCGGC"
    am = RemoveAfterMatch(
        astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2,
        adapter=adapter, sequence=sequence)
    assert am.wildcards() == 'GGC'
示例#15
0
def test_linked_adapter():
    """A linked adapter keeps its parts' settings and trims both ends.

    The prefix adapter is required, the back adapter is optional; after
    trimming, only the bases between the two occurrences should remain.
    """
    prefix = PrefixAdapter('AAAA', min_overlap=4)
    suffix = BackAdapter('TTTT', min_overlap=3)
    linked = LinkedAdapter(prefix, suffix,
                           front_required=True,
                           back_required=False,
                           name='name')

    # The component adapters must be reachable with their own min_overlap.
    assert linked.front_adapter.min_overlap == 4
    assert linked.back_adapter.min_overlap == 3

    read = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    linked_match = linked.match_to(read.sequence)
    result = linked_match.trimmed(read)
    assert result.name == 'seq'
    assert result.sequence == 'CCCCC'
示例#16
0
def test_linked_action_retain(s, expected):
    """Trim read *s* with a linked adapter using action="retain".

    Both parts of the linked adapter are optional. *s* and *expected* are
    supplied from outside this function (presumably by a pytest
    parametrization that is not visible here).
    """
    linked = LinkedAdapter(FrontAdapter("GGTTAACC"),
                           BackAdapter("AACCGG"),
                           front_required=False,
                           back_required=False,
                           name="linked")
    adapters: List[Adapter] = [linked]
    cutter = AdapterCutter(adapters, action="retain")
    read = Sequence("r1", s)
    result = cutter(read, ModificationInfo(read))
    assert expected == result.sequence
示例#17
0
def test_multiple_adapters():
    """MultipleAdapters.match_to must report the better-fitting adapter.

    The read ends with a prefix of the second adapter, which differs from
    the first one in a single base.
    """
    first = BackAdapter("GTAGTCCCGC")
    second = BackAdapter("GTAGTCCCCC")
    best = MultipleAdapters([first, second]).match_to("ATACCCCTGTAGTCCCC")
    assert best.adapter is second
示例#18
0
def test_back_adapter_partial_occurrence_in_front():
    """A back adapter's tail at the very start of a read is not a match.

    The read begins with the last four adapter bases; ``match_to`` is
    expected to return None for it.
    """
    back = BackAdapter("CTGAATT", max_errors=0, min_overlap=4)
    result = back.match_to("AATTGGGGGGG")
    assert result is None
示例#19
0
def test_str():
    """Smoke test: str() on an adapter and on its match must not raise."""
    adapter = BackAdapter('ACGT', max_errors=0.1)
    str(adapter)
    found = adapter.match_to("TTACGT")
    str(found)
示例#20
0
def test_retain_times():
    """action="retain" together with times=2 must raise a ValueError.

    The first exception argument has to mention that the action cannot be
    combined with ``times``.
    """
    with pytest.raises(ValueError) as excinfo:
        AdapterCutter([BackAdapter("ACGT")], times=2, action="retain")
    message = excinfo.value.args[0]
    assert "cannot be combined with times" in message