예제 #1
0
def test_advance_scores_nasty_del_softclip():
    """Test a situation that truly failed: deletion followed by softclip (not really allowed but...)"""
    read1 = mkread('AAAGCGGTTCACAAGACGCCGGACGTATGAGTTGAGAG' + 'CTATAAAGTAAA',
                   None, '38M3D10M2S', 2)
    read2 = mkread('AAAGCGGTTCACAAGACGCCGGACGTATGAGTTGAGAG' + 'CTATAAAGTAAA',
                   None, '38M3D12M', 2)
    ref = 'TCAAAGCGGTTCACAAGACGCCTGACGTATGAGTTGAGTGGAACGATTTAGTATCATATCTTGGGACGGTCAAATAGACTGTACCCTTCC'
    offset = 0
    win_end = 50
    score_start = 24
    refscore = np.array(map(int, (
        '0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0'
        '0 3 0 2 2 0 0 0 0 0 0 0 0 0').split()),
                        dtype=int)
    depth = np.array(map(int, (
        '1 17 17 17 16 15 14 14 14 14 14 14 13 13 13 12 10  9  8  7  7  7  7  7  7'
        '0  7  7  7  6  6  6  5  4  4  4  3  3  3  3  2  2  1  0  0  0  0  0  0  0'
        '0').split()),
                     dtype=int)
    scores1 = get_active_score(ref, read1, offset, win_end, score_start)
    scores2 = get_active_score(ref, read2, offset, win_end, score_start)
    if scores1.shape[0] != scores2.shape[0] or not all(
        [a == b for a, b in zip(scores1, scores2)]):
        raise ValueError('Different shapes for 10M2S and 12M'
                         '\n{}\n{}'.format(scores1, scores2))
    _advance_scores(pool=[read1],
                    offset=25,
                    win_stop=win_end,
                    flank=25,
                    reference=ref,
                    refscore=refscore,
                    depth=depth)
예제 #2
0
def test_advance_scores2():
    """\
    A second test for the advancement of the window

 
    scores:        00001112223339344333322
    window:      23|----------o----------|45
    reads:             ------------>  (27, 0, 13)
                          ------|------>    (30, 3, 14)
                             ---|-------->    (33, 3, 13)
                                  -------------->   (38, 0, 15)  
    next_window:            34|----------o----------|56
    next_scores:              33934433332211111110000

    """
    offset, win_end = 23, 45
    ref = 'A' * 60
    read1 = mkread('T' * 13, None, '13M', 27)
    read2 = mkread('T' * 17, None, '7M3I7M', 30)
    read3 = mkread('T' * 16, None, '4M3I9M', 33)
    read4 = mkread('T' * 15, None, '15M', 38)
    pool = [read1, read2, read3, read4]
    refscore = np.array(map(int, '00001112223339344333322'))
    depth = np.array(map(int, '00001112223333344333322'))
    expected = map(int, '33934433332211111110000')
    refscore, depth = _advance_scores(pool=pool,
                                      offset=34,
                                      win_stop=56,
                                      flank=11,
                                      reference=ref,
                                      refscore=refscore,
                                      depth=depth)
    if not all([a == b for a, b in zip(refscore.tolist(), expected)]):
        raise ValueError('Observed != Expected\n{}\n{}'.format(
            refscore.tolist(), expected))
예제 #3
0
def test_read_likelihood():
    """Test that the read likelihood is doing the right thing"""
    ref = 'ACTAGAGATCGTTAGCGTCTCGATCGACGATGACGTTAAGGCCATTAGCGAT'
    # first, a read that matches the ref at Q20 should have a well known score
    r1 = 'TCGTTAGCGTCTCGATCGACGATGACGTTAAGG'
    r2 = 'TCGTTAGCGTCTCTATCGTCGATGTCGTTAAGG'  # snphere
    read1 = mkread(r1, [20] * len(r1), '{}M'.format(len(r1)), 1)
    likelihood = _read_likelihood(read1, ref, {
        'open': 0.005,
        'continue': 0.05
    }, {
        'open': 0.005,
        'continue': 0.05
    })
    if likelihood < np.log(0.96**len(r1) / len(r1)):
        raise ValueError('Read likelihood is less than it ought to be.'
                         '\nExpected: {}, Observed: {}'.format(
                             np.log(0.98**len(r1) / len(r1)), likelihood))
    read2 = mkread(r2, [20] * len(r2), '{}M'.format(len(r2)), 1)
    likelihood2 = _read_likelihood(read2, ref, {
        'open': 0.005,
        'continue': 0.05
    }, {
        'open': 0.005,
        'continue': 0.05
    })
    if likelihood2 - likelihood > -0.1:
        raise ValueError('L2={} should be < L1={}'.format(
            likelihood2, likelihood))
예제 #4
0
def test_advance_scores():
    """\
    Test that the score advancement is functioning properly. Set up the following situation:

    scores:          000000112233333444443
    window:        12|---------o---------|32
    reads:                 ------------->
                             ------------>
                               -------------->
                                    ------------->
    next window:             22|---------o---------|42
    next scores:               333334444432222111100

    the reads are all 100% mismatch so that we can validate using sums

    """
    offset, win_end = 12, 32
    ref = 'A' * 60
    r1, r1o = 'T' * 14, 6
    r2, r2o = 'T' * 13, 8
    r3, r3o = 'T' * 15, 10
    r4, r4o = 'T' * 14, 15
    pool = [
        mkread(r, None, '{}M'.format(len(r)), offset + ro)
        for r, ro in zip([r1, r2, r3, r4], [r1o, r2o, r3o, r4o])
    ]
    refscore = np.array(map(int, '000000112233333444443'))
    depth = np.array([r for r in refscore])
    expected = map(int, '333334444432222111100')
    refscore, depth = _advance_scores(pool, 22, 42, 10, ref, refscore, depth)
    if not all([a == b for a, b in zip(refscore.tolist(), expected)]):
        raise ValueError('Observed != Expected\n{}\n{}'.format(
            refscore.tolist(), expected))
예제 #5
0
def test_path_counting():
    """Check that path counting from the assembly is working properly"""
    #          0    1         2         3         4         5         6         7
    #          567890123456789012345678901234567890123456789012345678901234567890123
    ref = 'TATAGATCTAGCGGCCTATTGCATGTACGTATACGGCAGTCACGTCGTCGCTAATAGCGATCCACTACT'
    h1 = 'TATAGATCTAGCGGCCTATTGCATGTACGT'
    h2 = 'GATCTAGCGGCCTATTGCATGTACGTAT' + 'TC'
    h3 = 'TAGCCGCCTATTGCATGTACGTAT' + 'TCACGTCGTCG'
    h4 = 'CGGCCTATTGCATGTACGTAT' + 'TGACGTCGTCGCTAATAGC'
    h5 = 'CCTTTTGCATGTACGTAT' + 'TCACGTCGTCGCTAATAGCGATCC'
    h6 = 'GTACGTAT' + 'TCACGTCATCGCTAATAGCGATCCACTACT'
    h7 = 'CGTAT' + 'TCACGTCGTCGCTAATAGCGATCCACTACT'
    h8 = 'TAT' + 'TCACGTCGTCGCTAATAGCGATCCACTACT'
    ref_pos = 5
    escores = '222233333334455555655' + '432333322222233333333333222222'
    h1p, h2p, h3p, h4p, h5p, h6p = 5, 9, 13, 16, 19, 29
    reads = [
        mkread(h, None, '{}M'.format(len(h)),
               p)  # cigar doesn't matter for assembly (!)
        for h, p in [(h1, h1p), (h2, h2p), (h3, h3p), (h4,
                                                       h4p), (h5,
                                                              h5p), (h6, h6p)]
    ]
    q = list(reads[5].qual)
    q[15] = chr(3 + 33)
    reads[5].qual = ''.join(q)
    result = build_haplotype(ref, reads, k=9, min_kmer_count=0)
    expected = 'TATAGATCTAGCGGCCTATTGCATGTACGTAT' + 'TCACGTCGTCGCTAATAGCGATCCACTACT'
    if result.seq != expected:
        raise ValueError('Observed not equal expected:\n'
                         '{}\n{}'.format(result.seq, expected))
예제 #6
0
def test_cread_likelihood():
    """test the c read likelihood"""
    def _is_rel_close(a, b, frac):
        return -frac < (a - b) / a < frac

    ref = 'TATTAGA'
    r1 = 'TAG'
    r2 = 'ATCAG'
    r3 = 'TTTAGG'
    for q in xrange(15, 25):
        read = mkread(r1, [q] * len(r1), '{}M'.format(len(r1)), 1)
        r1_lik = _read_likelihood(read, ref, {
            'open': 0.005,
            'continue': 0.05
        }, {
            'open': 0.005,
            'continue': 0.05
        })
        r1_clik = _c_read_likelihood(read, ref)
        assert _is_rel_close(r1_lik, r1_clik, 0.01)
    for q in xrange(15, 25):
        read = mkread(r2, [q] * len(r2), '{}M'.format(len(r2)), 1)
        r2_lik = _read_likelihood(read, ref, {
            'open': 0.005,
            'continue': 0.05
        }, {
            'open': 0.005,
            'continue': 0.05
        })
        r2_clik = _c_read_likelihood(read, ref)
        assert _is_rel_close(r2_lik, r2_clik, 0.01)
    for q in xrange(15, 25):
        read = mkread(r3, [q] * len(r3), '{}M'.format(len(r3)), 1)
        r3_lik = _read_likelihood(read, ref, {
            'open': 0.005,
            'continue': 0.05
        }, {
            'open': 0.005,
            'continue': 0.05
        })
        r3_clik = _c_read_likelihood(read, ref)
        assert _is_rel_close(r3_lik, r3_clik, 0.05), (r3_lik, r3_clik)
예제 #7
0
def test_active_score():
    """\
    Test that the active score function is working properly. Set up some situations:
             0            
             0
             0  1         2         3         4         5         6         7
             789012345678901234567890123456789012345678901234567890123456789012|
    Ref:     CCTATGTGTGTGCTATACGATAGCGATCCAGTACGGTAAACGTATTCCTGATGGAACTGCGCTCTCGAGATCAGTCATG
    Read1:        GTGTGTGCTATATGATAGCGATCCACTAC
    Exp:          00000000000010000000000001000
    Read2:                       TAGCGAT-----ACGGTAAACGTATTCCTGAT
    Exp:                         0000003000030000000000000000000
    Read3:                                           CGTATTCCXGATGGAACTGCGCTC  (X = 'TCCCCC')
    Exp:                                             0000000030000000000000000
    Read4:      ATGTCTGTGCT------TAGCGATCCAGT
    Exp:        00001000003000003000000000000
    Read5:                                          (X = 'GTG')    AACTXCGCTCACGAGATCAGTCT
    Exp:                                                           000040000010
    Read6: TACCTATGTGTGTGCTTTACGATAGC
    Exp:     000000000000001000000000

    """
    Example = namedtuple('Example', ['seq', 'cigar', 'pos', 'expected'])
    offset = 7
    ref = 'AGTACAA' + 'CCTATGTGTGTGCTATACGATAGCGATCCAGTACGGTAAACGTATTCCTGATGGAACTGCGCTCTCGAGATCAGTCATG'
    examples = [
        Example('GTGTGTGCTATATGATAGCGATCCACTAC', '29M', 12,
                map(int, '00000000000010000000000001000')),
        Example('TAGCGATACGGTAAACGTATTCCTGAT', '7M5D20M', 27,
                map(int, '00000030000300000000000000000000')),
        Example('CGTATTCCTCCCCCGATGGAACTGCGCTC', '9M5I15M', 47,
                map(int, '000000004000000000000000')),
        Example('ATGTCTGTGCTTAGCGATCCAGT', '11M6D12M', 10,
                map(int, '00001000003000003000000000000')),
        Example('AACTGTGCGCTCACGAGATCAGTCT', '5M2I18M', 61,
                map(int, '000040000010')),
        Example('TACCTATGTGTGTGCTTTACGATAGC', '26M', 5,
                map(int, '000000000000001000000000'))
    ]
    for i, example in enumerate(examples):
        read = mkread(seq=example.seq,
                      cigar=example.cigar,
                      pos=example.pos,
                      quals=None)
        read_offset = read.pos - offset
        active_score = get_active_score(ref, read, offset, 3192).tolist()
        if not all([a == b for a, b in zip(active_score, example.expected)]):
            raise ValueError('Error in example {}. Observed != expected: \n'
                             '{}\n{}'.format(
                                 1 + i, ''.join(map(str, active_score)),
                                 ''.join(map(str, example.expected))))
예제 #8
0
def test_advance_scores_reads_bigger_than_window():
    """\
    Advance scores broke when reads were larger than the window. This test
    ensures we shall never regress.

    """
    ref = 'GGTGGAGTAGCCCACGTAGATGCGACCATCC' + 'T' * 100  # suffix doesn't matter
    offset, win_stop, flank = 0, 20, 10
    read1 = mkread('GATGGAGTAGCCCACGTAGATGCGACCATCCTCTCCGGGCTTAGCGGTCT', None,
                   '50M', 0)
    read2 = mkread('CCCACGTAGATGCGACCATCCTCTCCGGGCTTAGCGGTCTTTCTATTCAT', None,
                   '50M', 10)
    read3 = mkread(
        'ACGTAGATGCGACGATCCTCTCCGGGCTTAGCAGTCTTTCTATTCATGTG',  # note snp: GATC not CATC
        None,
        '50M',
        13)
    read4 = mkread('ACGTAGATGCGACCATCCTCTCCGGGCTTAGCGGTCTTTCTATTCATGTG', None,
                   '50M', 13)
    read5 = mkread('CGTAGATGCGACCATCCTCTGCGGGCTTAGCGGTCTTTCTATTCATGTGA', None,
                   '50M', 14)
    read6 = mkread('TAGATGCGACCATCCTCTCCGGGCTTAGCGGTCTTTCTATTCATGTGAGG', None,
                   '50M', 16)
    refscore = np.array(
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    expected = [0] * 21
    expected[-5] = 1
    refscore, depth = _advance_scores(
        pool=[read1, read2, read3, read4, read5, read6],
        offset=10,
        win_stop=30,
        flank=10,
        reference=ref,
        refscore=refscore,
        depth=np.array([4] * 21))
    if not all([a == b for a, b in zip(refscore.tolist(), expected)]):
        raise ValueError('Observed != Expected\n{}\n{}'.format(
            refscore.tolist(), expected))
예제 #9
0
def test_advance_window():
    """\
    Test that advance window is doing the right thing

    """
    offset = 21273
    wend = 21294
    pool = list()
    for start_pos in xrange(21250, 21274):
        for seqsize in [20, 25, 30, 35, 40, 45, 50]:
            if start_pos + seqsize >= 21273:
                pool.append(
                    mkread(seq='A' * seqsize,
                           cigar='{}M'.format(seqsize),
                           pos=start_pos,
                           quals=None))
            for dsize in [1, 3, 4, 5, 8]:
                if start_pos + seqsize + dsize >= 21273:
                    pool.append(
                        mkread(seq='A' * seqsize,
                               cigar='{}M{}D{}M'.format(
                                   10, dsize, seqsize - 10),
                               quals=None))
            for isize in [1, 2, 5, 10]:
                if start_pos + seqsize - isize >= 21273:
                    pool.append(
                        mkread(seq='A' * seqsize,
                               cigar='{}M{}I{}M'.format(
                                   5, isize, seqsize - 5 - isize),
                               quals=None))
    pool_copy = [read for read in pool]
    old_offset = offset
    pool, offset, new_wend = _advance_window(pool, offset, wend, 10, 23130)
    # make sure everything left overlaps the new offset
    if not all([read.pos + read.alen >= offset for read in pool]):
        raise ValueError(
            'Error in pool advancement: some reads do not overlap the offset:\n'
            '{}'.format(
                [read for read in pool if read.pos + read.alen < offset]))
    # make sure everything not in the pool does not overlap the new offset
    kept = {(read.pos, read.alen, read.cigarstring) for read in pool}
    dropped = [
        read for read in pool_copy
        if (read.pos, read.alen, read.cigarstring) not in kept
    ]
    if not all([read.pos + read.alen <= offset for read in dropped]):
        raise ValueError(
            'Error in pool advancement; some reads which overlap the offset fail'
            ' to make it into the pool:\n'
            '{}'.format('\n'.join([
                str(read) for read in dropped if read.pos + read.alen >= offset
            ])))
    if offset != old_offset + 11:
        raise ValueError(
            'Error in pool advancement: the new offset does not equal the old offset but shifted by flank+1'
        )

    # now check the edge case
    pool = [read for read in pool_copy]
    pool, offset, new_wend = _advance_window(pool, offset, wend, 10, 21303)
    if new_wend != 21302:
        raise ValueError(
            'The window advance does not properly truncate window size at end of contig'
        )
예제 #10
0
 def _mk(seq, p):
     return mkread(seq, [20] * len(seq), '{}M'.format(len(seq)), p)