Exemplo n.º 1
0
def test_extract_paired_reads_4_output_files():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('out_pe')
    outfile2 = utils.get_temp_filename('out_se')

    script = 'extract-paired-reads.py'
    args = [infile, '-p', outfile1, '-s', outfile2]

    utils.runscript(script, args)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 2
0
def test_interleave_read_stdout():
    # create input files
    infile1 = utils.get_test_data('paired-slash1.fq.1')
    infile2 = utils.get_test_data('paired-slash1.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired-slash1.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2]

    (stats, out, err) = utils.runscript(script, args)

    with open(outfile, 'w') as ofile:
        ofile.write(out)

    n = 0
    for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 3
0
def test_extract_paired_reads_3_output_dir():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # output directory
    out_dir = utils.get_temp_filename('output')

    script = 'extract-paired-reads.py'
    args = [infile, '-d', out_dir]

    utils.runscript(script, args)

    outfile1 = os.path.join(out_dir, 'paired-mixed.fa.pe')
    outfile2 = os.path.join(out_dir, 'paired-mixed.fa.se')
    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 4
0
def test_split_paired_reads_3_output_files_right():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    output_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('yyy', output_dir)

    script = 'split-paired-reads.py'
    args = ['-2', outfile2, '-d', output_dir, infile]

    utils.runscript(script, args)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0
Exemplo n.º 5
0
def test_extract_paired_reads_3_output_dir():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # output directory
    out_dir = utils.get_temp_filename('output')

    script = 'extract-paired-reads.py'
    args = [infile, '-d', out_dir]

    utils.runscript(script, args)

    outfile1 = os.path.join(out_dir, 'paired-mixed.fa.pe')
    outfile2 = os.path.join(out_dir, 'paired-mixed.fa.se')
    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 6
0
def test_extract_paired_reads_4_output_files():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('out_pe')
    outfile2 = utils.get_temp_filename('out_se')

    script = 'extract-paired-reads.py'
    args = [infile, '-p', outfile1, '-s', outfile2]

    utils.runscript(script, args)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 7
0
def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired.fa')

    ex_outfile1 = utils.get_test_data('paired.fa.1')
    ex_outfile2 = utils.get_test_data('paired.fa.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fa.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fa.2', in_dir)

    script = 'split-paired-reads.py'
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 8
0
def test_split_paired_reads_3_output_files_right():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    output_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('yyy', output_dir)

    script = 'split-paired-reads.py'
    args = ['-2', outfile2, '-d', output_dir, infile]

    utils.runscript(script, args)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0
Exemplo n.º 9
0
def test_extract_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired-mixed.fq')

    ex_outfile1 = utils.get_test_data('paired-mixed.fq.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fq.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fq.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fq.se', in_dir)

    script = 'extract-paired-reads.py'
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name, (r.name, q.name, n)
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0
Exemplo n.º 10
0
def test_interleave_read_stdout():
    # create input files
    infile1 = utils.get_test_data('paired-slash1.fq.1')
    infile2 = utils.get_test_data('paired-slash1.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired-slash1.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2]

    (stats, out, err) = utils.runscript(script, args)

    with open(outfile, 'w') as ofile:
        ofile.write(out)

    n = 0
    for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 11
0
def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired.fa')

    ex_outfile1 = utils.get_test_data('paired.fa.1')
    ex_outfile2 = utils.get_test_data('paired.fa.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fa.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fa.2', in_dir)

    script = 'split-paired-reads.py'
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 12
0
def test_interleave_reads_broken_fq():
    # test input files
    infile1 = utils.get_test_data('paired-broken.fq.1')
    infile2 = utils.get_test_data('paired-broken.fq.2')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    status, out, err = utils.runscript(script, args, fail_ok=True)
    assert status == 1
    assert 'ERROR: Input files contain different number of records.' in err
Exemplo n.º 13
0
def test_interleave_reads_broken_fq_5():
    # test input files
    infile1 = utils.get_test_data('paired-broken4.fq.1')
    infile2 = utils.get_test_data('paired-broken4.fq.2')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    status, out, err = utils.runscript(script, args, fail_ok=True)
    assert status == 1
    assert "ERROR: This doesn't look like paired data!" in err
Exemplo n.º 14
0
def test_interleave_reads_broken_fq_5():
    # test input files
    infile1 = utils.get_test_data('paired-broken4.fq.1')
    infile2 = utils.get_test_data('paired-broken4.fq.2')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    status, out, err = utils.runscript(script, args, fail_ok=True)
    assert status == 1
    assert "ERROR: This doesn't look like paired data!" in err
Exemplo n.º 15
0
def test_interleave_reads_broken_fq():
    # test input files
    infile1 = utils.get_test_data('paired-broken.fq.1')
    infile2 = utils.get_test_data('paired-broken.fq.2')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    status, out, err = utils.runscript(script, args, fail_ok=True)
    assert status == 1
    assert 'ERROR: Input files contain different number of records.' in err
Exemplo n.º 16
0
def test_interleave_reads_no_reformat():
    infile1 = utils.get_test_data('paired.fq.1')
    infile2 = utils.get_test_data('paired.malformat.fq.2')

    ex_outfile = utils.get_test_data('paired.malformat.fq')
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '--no-reformat', '-o', outfile]

    utils.runscript(script, args)

    r = open(ex_outfile).read()
    q = open(outfile).read()

    assert r == q, (r, q)
Exemplo n.º 17
0
def test_interleave_reads_no_reformat():
    infile1 = utils.get_test_data('paired.fq.1')
    infile2 = utils.get_test_data('paired.malformat.fq.2')

    ex_outfile = utils.get_test_data('paired.malformat.fq')
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '--no-reformat', '-o', outfile]

    utils.runscript(script, args)

    r = open(ex_outfile).read()
    q = open(outfile).read()

    assert r == q, (r, q)
Exemplo n.º 18
0
def test_read_bundler():
    infile = utils.get_test_data('unclean-reads.fastq')
    records = [r for r in khmer.ReadParser(infile)]
    bundle = khmer.utils.ReadBundle(*records)

    raw_seqs = (
        'GGTTGACGGGGNNNAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGANNNCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    cleaned_seqs = (
        'GGTTGACGGGGAAAAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGAAAACCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    assert bundle.num_reads == 2
    assert bundle.total_length == 200

    for read, raw_seq, clean_seq in zip(bundle.reads, raw_seqs, cleaned_seqs):
        assert read.sequence == raw_seq
        assert read.cleaned_seq == clean_seq
Exemplo n.º 19
0
def test_read_bundler():
    infile = utils.get_test_data('unclean-reads.fastq')
    records = [r for r in khmer.ReadParser(infile)]
    bundle = khmer.utils.ReadBundle(*records)

    raw_seqs = (
        'GGTTGACGGGGNNNAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGANNNCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    cleaned_seqs = (
        'GGTTGACGGGGAAAAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGAAAACCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    assert bundle.num_reads == 2
    assert bundle.total_length == 200

    for read, raw_seq, clean_seq in zip(bundle.reads, raw_seqs, cleaned_seqs):
        assert read.sequence == raw_seq
        assert read.cleaned_seq == clean_seq
Exemplo n.º 20
0
def test_interleave_reads_1_fq():
    # test input files
    infile1 = utils.get_test_data('paired.fq.1')
    infile2 = utils.get_test_data('paired.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    utils.runscript(script, args)

    r = open(ex_outfile).read()
    q = open(outfile).read()

    assert r == q, (r, q)
Exemplo n.º 21
0
def test_interleave_reads_1_fq():
    # test input files
    infile1 = utils.get_test_data('paired.fq.1')
    infile2 = utils.get_test_data('paired.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    utils.runscript(script, args)

    r = open(ex_outfile).read()
    q = open(outfile).read()

    assert r == q, (r, q)
Exemplo n.º 22
0
def test_interleave_read_badleft_badright():
    # create input files
    infile1 = utils.get_test_data('paired-broken.fq.badleft')
    infile2 = utils.get_test_data('paired-broken.fq.badright')

    # correct output
    ex_outfile = utils.get_test_data('paired-broken.fq.paired_bad')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    utils.runscript(script, args)

    n = 0
    for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 23
0
def test_interleave_read_seq1_fq():
    # create input files
    infile1 = utils.get_test_data('paired-slash1.fq.1')
    infile2 = utils.get_test_data('paired-slash1.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired-slash1.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    utils.runscript(script, args)

    n = 0
    for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 24
0
def test_interleave_reads_2_fa():
    # test input files
    infile1 = utils.get_test_data('paired.fa.1')
    infile2 = utils.get_test_data('paired.fa.2')

    # correct output
    ex_outfile = utils.get_test_data('paired.fa')

    # actual output file
    outfile = utils.get_temp_filename('out.fa')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]

    utils.runscript(script, args)

    n = 0
    for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Exemplo n.º 25
0
def test_extract_paired_reads_unpaired():
    # test input file
    infile = utils.get_test_data('random-20-a.fa')

    # actual output files...
    outfile1 = utils.get_temp_filename('unpaired.pe.fa')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('unpaired.se.fa', in_dir)

    script = 'extract-paired-reads.py'
    args = [infile]

    (_, _, err) = utils.runscript(script, args, in_dir, fail_ok=True)
    assert 'no paired reads!? check file formats...' in err, err
Exemplo n.º 26
0
def test_extract_paired_reads_unpaired():
    # test input file
    infile = utils.get_test_data('random-20-a.fa')

    # actual output files...
    outfile1 = utils.get_temp_filename('unpaired.pe.fa')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('unpaired.se.fa', in_dir)

    script = 'extract-paired-reads.py'
    args = [infile]

    (_, _, err) = utils.runscript(script, args, in_dir, fail_ok=True)
    assert 'no paired reads!? check file formats...' in err, err
Exemplo n.º 27
0
def test_extract_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired-mixed.fq')

    ex_outfile1 = utils.get_test_data('paired-mixed.fq.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fq.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fq.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fq.se', in_dir)

    script = 'extract-paired-reads.py'
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1),
                    screed.open(outfile1)):
        n += 1
        assert r.name == q.name, (r.name, q.name, n)
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2),
                    screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.quality == q.quality
    assert n > 0
Exemplo n.º 28
0
def test_read_bundler_empty_file():
    infile = utils.get_test_data('empty-file')
    with pytest.raises(OSError):
        records = [r for r in khmer.ReadParser(infile)]
Exemplo n.º 29
0
def test_read_bundler_single_read():
    infile = utils.get_test_data('single-read.fq')
    records = [r for r in khmer.ReadParser(infile)]
    bundle = khmer.utils.ReadBundle(*records)
    assert bundle.num_reads == 1
    assert bundle.reads[0].sequence == bundle.reads[0].cleaned_seq
Exemplo n.º 30
0
def test_read_bundler_single_read():
    infile = utils.get_test_data('single-read.fq')
    records = [r for r in khmer.ReadParser(infile)]
    bundle = khmer.utils.ReadBundle(*records)
    assert bundle.num_reads == 1
    assert bundle.reads[0].sequence == bundle.reads[0].cleaned_seq
Exemplo n.º 31
0
def test_read_bundler_empty_file():
    infile = utils.get_test_data('empty-file')
    with pytest.raises(OSError):
        records = [r for r in khmer.ReadParser(infile)]