示例#1
0
def test_lossless_reads():
    """Check that ``rand_read`` samples exact substrings of a sequence.

    Covers argument validation, control over the number of sampled reads,
    absence of noise in the sampled reads, randomness of read lengths, and
    sampling from an alphabet whose letters are longer than one character.
    """
    A = Alphabet('ACGT')
    S = rand_seq(A, 100)
    with pytest.raises(AssertionError):
        next(rand_read(S, len_mean=200, num=1))  # len_mean must be < len(S)
    with pytest.raises(AssertionError):
        # at most one of num or expected_coverage given
        next(rand_read(S, len_mean=50, num=1, expected_coverage=1))

    assert sum(1 for _ in rand_read(S, len_mean=50, num=10)) == 10, \
        'The number of sampled reads should be controllable'
    assert sum(1 for _ in rand_read(S, len_mean=50)) == 1, \
        'If neither num or expected coverage is given only one sample is read'

    # there should be no noise added
    read, pos = next(rand_read(S, len_mean=40, num=1))
    assert S[pos:pos+len(read)] == read

    S = A.parse('ACT' * 100)
    reads = list(rand_read(S, len_mean=100, len_sd=0.01, num=100))
    # Count the distinct read lengths; comparing the set itself to an integer
    # is vacuously true on Python 2 and a TypeError on Python 3.
    assert len(set(len(read) for read, _ in reads)) > 1, \
        'Read lengths should be randomly chosen'
    len_mean = sum(len(read) for read, _ in reads) / 100.
    assert 50 < len_mean < 150, \
        'Normal distribution of read lengths works'

    # index edge cases
    A = Alphabet(['00', '01'])
    S = A.parse('01' * 10)
    _bak = np.random.normal
    np.random.normal = mock.Mock(return_value=[1])
    try:
        assert next(rand_read(S, len_mean=1, num=1))[0] == A.parse('01'), \
            'sequences in alphabets with > 1 long letters can be sampled too'
    finally:
        # Always undo the monkeypatch so a failure here cannot leak the mock
        # into other tests.
        np.random.normal = _bak
示例#2
0
文件: test_pw.py 项目: amirkdv/biseqt
def test_alignment_std_local(err):
    """Check standard-mode local alignment against a known mutation transcript.

    A random sequence is mutated with all error probabilities set to ``err``
    and the mutant is padded on both sides with homopolymer runs. The score
    reported by the aligner must be at least that of the known transcript,
    must agree with the score recomputed from the traceback, and the traced
    alignment must not span the whole padded mutant.
    """
    alphabet = Alphabet('ACGT')
    process = MutationProcess(alphabet, subst_probs=err, go_prob=err,
                              ge_prob=err)
    subst_scores, gap_scores = process.log_odds_scores()
    go_score, ge_score = gap_scores

    origin = rand_seq(alphabet, 100)
    mutant, known_transcript = process.mutate(origin)
    # Flank the mutant so that only a local alignment can recover it.
    mutant = alphabet.parse('A' * 100) + mutant + alphabet.parse('G' * 100)
    known_aln = Alignment(origin, mutant, known_transcript)
    known_score = known_aln.calculate_score(subst_scores, go_score, ge_score)

    scoring = dict(subst_scores=subst_scores, go_score=go_score,
                   ge_score=ge_score)
    aligner = Aligner(origin, mutant, alnmode=STD_MODE, alntype=LOCAL,
                      **scoring)
    with aligner:
        solved_score = aligner.solve()
        assert round(solved_score, 3) >= round(known_score, 3), \
            'optimal alignment scores better than the known transcript'
        traced = aligner.traceback()
        traced_score = traced.calculate_score(subst_scores, go_score,
                                              ge_score)
        assert round(traced_score, 3) == round(solved_score, 3), \
            'The alignment score should be calculated correctly'

        transcript = traced.transcript
        len_on_origin = Alignment.projected_len(transcript, on='origin')
        len_on_mutant = Alignment.projected_len(transcript, on='mutant')
        assert len_on_origin <= len(origin) and len_on_mutant < len(mutant), \
            'Local alignments do not cover the entirety of both sequences'
示例#3
0
def test_scan_kmers(seq_db, wordlen):
    """Check that scanning a sequence yields one kmer per window position.

    For a sequence of length ``n`` and word length ``wordlen`` the scan is
    expected to produce ``n - wordlen + 1`` items.
    """
    alphabet = seq_db.alphabet
    index = KmerIndex(seq_db, wordlen)
    seq = rand_seq(alphabet, 50)
    num_scanned = sum(1 for _ in index.scan_kmers(seq))
    assert num_scanned == len(seq) - wordlen + 1, \
        'correct number of kmers should be scanned'
示例#4
0
def sequencing_sample(request):
    """Build a random genome, overlapping mutated reads, and a seed index.

    The parameters ``(gap_prob, subst_prob, wordlen)`` are read from
    ``request.param``. Reads are produced by mutating fixed-size windows of
    the genome that start at whole multiples of half the window size, so
    successive windows overlap starting at their halfway position. Each read
    is inserted into an in-memory database backing the seed index.

    Returns:
        tuple:
            A tuple containing the full genome, the list of reads, the
            values returned by the database for each inserted read, the gap
            probability, and the seed index.
    """
    alphabet = Alphabet('ACGT')
    gap_prob, subst_prob, wordlen = request.param
    seq_len, read_len = 2000, 500
    genome = rand_seq(alphabet, seq_len).to_named('genome')
    mutator = MutationProcess(alphabet, subst_probs=subst_prob,
                              go_prob=gap_prob, ge_prob=gap_prob)

    step = int(read_len/2)
    reads = []
    for start in range(0, seq_len - read_len, step):
        window = genome[start: start + read_len]
        mutated, _ = mutator.mutate(window)
        reads.append(mutated.to_named('read#%d' % start))

    db = DB(':memory:', A)  if False else DB(':memory:', alphabet)
    index = SeedIndex(KmerIndex(db, wordlen))
    index.db.initialize()
    records = []
    for read in reads:
        records.append(db.insert(read))
    return genome, reads, records, gap_prob, index
示例#5
0
def seed_index():
    """Return a seed index over an in-memory database of random sequences.

    The database uses the ``ACGT`` alphabet and is loaded, via an in-memory
    FASTA buffer, with 3 random sequences of length 100 named ``#0`` to
    ``#2``. The index uses word length 5 and ``index_seeds()`` is called
    before it is returned.
    """
    alphabet = Alphabet('ACGT')
    num_seqs, seq_len, wordlen = 3, 100, 5

    db = DB(':memory:', alphabet)
    index = SeedIndex(KmerIndex(db, wordlen))
    index.db.initialize()

    # Round-trip the sequences through a FASTA buffer to load the database.
    buf = StringIO()
    named_seqs = (
        rand_seq(alphabet, seq_len).to_named('#%d' % idx)
        for idx in range(num_seqs)
    )
    write_fasta(buf, named_seqs)
    buf.seek(0)
    db.load_fasta(buf)

    index.index_seeds()
    return index
示例#6
0
def test_expected_coverage():
    """Check the number of reads sampled for a requested expected coverage.

    With a mean read length of half the sequence length, a coverage of
    ``cov`` is expected to yield ``2 * cov`` reads.
    """
    alphabet = Alphabet('ACGT')
    seq = rand_seq(alphabet, 100)
    cov = 10
    half_len = len(seq)/2
    reads = list(rand_read(seq, len_mean=half_len, expected_coverage=cov))
    assert len(reads) == 2 * cov
示例#7
0
def test_rand_seq():
    """Check that ``rand_seq`` builds its sequence from ``np.random.choice``.

    ``np.random.choice`` is replaced with a mock returning three zeros; the
    resulting sequence must then equal the parse of ``'AAA'``.
    """
    _bak = np.random.choice
    np.random.choice = mock.Mock(return_value=[0, 0, 0])
    try:
        A = Alphabet('ACGT')
        assert rand_seq(A, 10) == A.parse('AAA')
    finally:
        # Always undo the monkeypatch so a failure here cannot leak the mock
        # into other tests.
        np.random.choice = _bak