def test_extract_paired_reads_4_output_files():
    """Run extract-paired-reads.py with explicit ``-p``/``-s`` output paths.

    Both requested output files must exist, and their records must match
    the ``paired-mixed.fa.pe`` / ``paired-mixed.fa.se`` fixtures by name
    and sequence.
    """
    # test input file and the expected results
    infile = utils.get_test_data('paired-mixed.fa')
    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('out_pe')
    outfile2 = utils.get_temp_filename('out_se')

    utils.runscript('extract-paired-reads.py',
                    [infile, '-p', outfile1, '-s', outfile2])

    for produced in (outfile1, outfile2):
        assert os.path.exists(produced), produced

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    for expected_path, actual_path in ((ex_outfile1, outfile1),
                                       (ex_outfile2, outfile2)):
        n = 0
        record_pairs = zip(screed.open(expected_path),
                           screed.open(actual_path))
        for n, (expected, actual) in enumerate(record_pairs, start=1):
            assert expected.name == actual.name
            assert expected.sequence == actual.sequence
        # guard against a vacuous pass when nothing was compared
        assert n > 0
def test_interleave_read_stdout():
    """Check the output interleave-reads.py produces when ``-o`` is absent.

    The ``out`` value returned by ``utils.runscript`` (presumably the
    captured stdout, as the test name suggests) is written to a temp file
    and compared with ``paired-slash1.fq`` by record name and sequence.
    """
    # input files
    left_reads = utils.get_test_data('paired-slash1.fq.1')
    right_reads = utils.get_test_data('paired-slash1.fq.2')

    # correct output
    expected_path = utils.get_test_data('paired-slash1.fq')

    # actual output file
    actual_path = utils.get_temp_filename('out.fq')

    _status, captured, _err = utils.runscript('interleave-reads.py',
                                              [left_reads, right_reads])

    # persist the captured text so screed can parse it like any other file
    with open(actual_path, 'w') as handle:
        handle.write(captured)

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    n = 0
    record_pairs = zip(screed.open(expected_path), screed.open(actual_path))
    for n, (expected, actual) in enumerate(record_pairs, start=1):
        assert expected.name == actual.name
        assert expected.sequence == actual.sequence
    # guard against a vacuous pass when nothing was compared
    assert n > 0
def test_extract_paired_reads_3_output_dir():
    """Run extract-paired-reads.py with ``-d`` set to a temp directory.

    The test expects ``paired-mixed.fa.pe`` and ``paired-mixed.fa.se`` to
    appear inside that directory and to match the fixtures of the same
    names by record name and sequence.
    """
    # test input file and the expected results
    infile = utils.get_test_data('paired-mixed.fa')
    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # output directory
    out_dir = utils.get_temp_filename('output')

    utils.runscript('extract-paired-reads.py', [infile, '-d', out_dir])

    # where the test expects the script to have written its results
    outfile1 = os.path.join(out_dir, 'paired-mixed.fa.pe')
    outfile2 = os.path.join(out_dir, 'paired-mixed.fa.se')
    for produced in (outfile1, outfile2):
        assert os.path.exists(produced), produced

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    for expected_path, actual_path in ((ex_outfile1, outfile1),
                                       (ex_outfile2, outfile2)):
        n = 0
        record_pairs = zip(screed.open(expected_path),
                           screed.open(actual_path))
        for n, (expected, actual) in enumerate(record_pairs, start=1):
            assert expected.name == actual.name
            assert expected.sequence == actual.sequence
        # guard against a vacuous pass when nothing was compared
        assert n > 0
def test_split_paired_reads_3_output_files_right():
    """Run split-paired-reads.py with both ``-2`` and ``-d`` supplied.

    The test expects the first output at ``paired.fq.1`` inside the
    directory given to ``-d`` and the second at the explicit ``-2`` path.
    Each is compared with its fixture by name, sequence and quality.
    """
    # test input file and the expected results
    infile = utils.get_test_data('paired.fq')
    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    output_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('yyy', output_dir)

    utils.runscript('split-paired-reads.py',
                    ['-2', outfile2, '-d', output_dir, infile])

    for produced in (outfile1, outfile2):
        assert os.path.exists(produced), produced

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    for expected_path, actual_path in ((ex_outfile1, outfile1),
                                       (ex_outfile2, outfile2)):
        n = 0
        record_pairs = zip(screed.open(expected_path),
                           screed.open(actual_path))
        for n, (expected, actual) in enumerate(record_pairs, start=1):
            assert expected.name == actual.name
            assert expected.sequence == actual.sequence
            assert expected.quality == actual.quality
        # guard against a vacuous pass when nothing was compared
        assert n > 0
def test_split_paired_reads_1_fa():
    """Run split-paired-reads.py on ``paired.fa`` with no output options.

    The temp directory is handed to ``utils.runscript`` as its third
    argument (presumably the working directory); ``paired.fa.1`` and
    ``paired.fa.2`` are expected there and are compared with the fixtures
    by record name and sequence.
    """
    # test input file and the expected results
    infile = utils.get_test_data('paired.fa')
    ex_outfile1 = utils.get_test_data('paired.fa.1')
    ex_outfile2 = utils.get_test_data('paired.fa.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fa.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fa.2', in_dir)

    utils.runscript('split-paired-reads.py', [infile], in_dir)

    for produced in (outfile1, outfile2):
        assert os.path.exists(produced), produced

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    for expected_path, actual_path in ((ex_outfile1, outfile1),
                                       (ex_outfile2, outfile2)):
        n = 0
        record_pairs = zip(screed.open(expected_path),
                           screed.open(actual_path))
        for n, (expected, actual) in enumerate(record_pairs, start=1):
            assert expected.name == actual.name
            assert expected.sequence == actual.sequence
        # guard against a vacuous pass when nothing was compared
        assert n > 0
def test_extract_paired_reads_2_fq():
    """Run extract-paired-reads.py on ``paired-mixed.fq`` with no options.

    The temp directory is handed to ``utils.runscript`` as its third
    argument (presumably the working directory); ``paired-mixed.fq.pe``
    and ``paired-mixed.fq.se`` are expected there and are compared with
    the fixtures by record name, sequence and quality.
    """
    # test input file and the expected results
    infile = utils.get_test_data('paired-mixed.fq')
    ex_outfile1 = utils.get_test_data('paired-mixed.fq.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fq.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fq.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fq.se', in_dir)

    utils.runscript('extract-paired-reads.py', [infile], in_dir)

    for produced in (outfile1, outfile2):
        assert os.path.exists(produced), produced

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    n = 0
    paired = zip(screed.open(ex_outfile1), screed.open(outfile1))
    for n, (expected, actual) in enumerate(paired, start=1):
        # report both names and the 1-based record index on a mismatch
        assert expected.name == actual.name, (expected.name, actual.name, n)
        assert expected.sequence == actual.sequence
        assert expected.quality == actual.quality
    assert n > 0

    n = 0
    singles = zip(screed.open(ex_outfile2), screed.open(outfile2))
    for n, (expected, actual) in enumerate(singles, start=1):
        assert expected.name == actual.name
        assert expected.sequence == actual.sequence
        assert expected.quality == actual.quality
    assert n > 0
def test_interleave_reads_broken_fq():
    """interleave-reads.py must fail on the ``paired-broken.fq`` inputs.

    Expects status 1 from ``utils.runscript`` and the "different number
    of records" message in the returned error text.
    """
    # test input files
    left_reads = utils.get_test_data('paired-broken.fq.1')
    right_reads = utils.get_test_data('paired-broken.fq.2')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    # fail_ok=True: a failing run is the expected outcome here
    status, _out, err = utils.runscript(
        'interleave-reads.py',
        [left_reads, right_reads, '-o', outfile],
        fail_ok=True)

    assert status == 1
    assert 'ERROR: Input files contain different number of records.' in err
def test_interleave_reads_broken_fq_5():
    """interleave-reads.py must reject the ``paired-broken4.fq`` inputs.

    Expects status 1 from ``utils.runscript`` and the "doesn't look like
    paired data" message in the returned error text.
    """
    # test input files
    left_reads = utils.get_test_data('paired-broken4.fq.1')
    right_reads = utils.get_test_data('paired-broken4.fq.2')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    # fail_ok=True: a failing run is the expected outcome here
    status, _out, err = utils.runscript(
        'interleave-reads.py',
        [left_reads, right_reads, '-o', outfile],
        fail_ok=True)

    assert status == 1
    assert "ERROR: This doesn't look like paired data!" in err
def test_interleave_reads_no_reformat():
    """Run interleave-reads.py with ``--no-reformat`` and compare the text.

    The file written via ``-o`` must be identical, character for
    character, to the ``paired.malformat.fq`` fixture.
    """
    infile1 = utils.get_test_data('paired.fq.1')
    infile2 = utils.get_test_data('paired.malformat.fq.2')
    ex_outfile = utils.get_test_data('paired.malformat.fq')
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '--no-reformat', '-o', outfile]
    utils.runscript(script, args)

    # Read through context managers so both handles are closed promptly
    # instead of being left to the garbage collector.
    with open(ex_outfile) as expected_fp:
        r = expected_fp.read()
    with open(outfile) as actual_fp:
        q = actual_fp.read()

    assert r == q, (r, q)
def test_read_bundler():
    """Bundle the reads of ``unclean-reads.fastq`` and check the bundle.

    The bundle must report two reads with a total length of 200, and each
    read must expose the expected raw ``sequence`` and ``cleaned_seq``.
    In the expected data below, the cleaned sequences differ from the raw
    ones only in having ``A`` where the raw read has ``N``.
    """
    infile = utils.get_test_data('unclean-reads.fastq')
    bundle = khmer.utils.ReadBundle(*list(khmer.ReadParser(infile)))

    # sequences exactly as stored in the file
    raw_seqs = (
        'GGTTGACGGGGNNNAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGANNNCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    # the same sequences as expected from ``cleaned_seq``
    cleaned_seqs = (
        'GGTTGACGGGGAAAAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
        'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGAAAACCG'
        'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
    )

    assert bundle.num_reads == 2
    assert bundle.total_length == 200

    expectations = zip(bundle.reads, raw_seqs, cleaned_seqs)
    for read, expected_raw, expected_clean in expectations:
        assert read.sequence == expected_raw
        assert read.cleaned_seq == expected_clean
def test_interleave_reads_1_fq():
    """Run interleave-reads.py on ``paired.fq.1``/``.2`` and compare text.

    The file written via ``-o`` must be identical, character for
    character, to the ``paired.fq`` fixture.
    """
    # test input files
    infile1 = utils.get_test_data('paired.fq.1')
    infile2 = utils.get_test_data('paired.fq.2')

    # correct output
    ex_outfile = utils.get_test_data('paired.fq')

    # actual output file
    outfile = utils.get_temp_filename('out.fq')

    script = 'interleave-reads.py'
    args = [infile1, infile2, '-o', outfile]
    utils.runscript(script, args)

    # Read through context managers so both handles are closed promptly
    # instead of being left to the garbage collector.
    with open(ex_outfile) as expected_fp:
        r = expected_fp.read()
    with open(outfile) as actual_fp:
        q = actual_fp.read()

    assert r == q, (r, q)
def test_interleave_read_badleft_badright():
    """Interleave the ``badleft``/``badright`` fixtures and check the result.

    The file written via ``-o`` is compared with
    ``paired-broken.fq.paired_bad`` by record name and sequence.
    """
    # input files
    left_reads = utils.get_test_data('paired-broken.fq.badleft')
    right_reads = utils.get_test_data('paired-broken.fq.badright')

    # correct output
    expected_path = utils.get_test_data('paired-broken.fq.paired_bad')

    # actual output file
    actual_path = utils.get_temp_filename('out.fq')

    utils.runscript('interleave-reads.py',
                    [left_reads, right_reads, '-o', actual_path])

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    n = 0
    record_pairs = zip(screed.open(expected_path), screed.open(actual_path))
    for n, (expected, actual) in enumerate(record_pairs, start=1):
        assert expected.name == actual.name
        assert expected.sequence == actual.sequence
    # guard against a vacuous pass when nothing was compared
    assert n > 0
def test_interleave_read_seq1_fq():
    """Interleave ``paired-slash1.fq.1``/``.2`` into a file and check it.

    The file written via ``-o`` is compared with ``paired-slash1.fq`` by
    record name and sequence.
    """
    # input files
    left_reads = utils.get_test_data('paired-slash1.fq.1')
    right_reads = utils.get_test_data('paired-slash1.fq.2')

    # correct output
    expected_path = utils.get_test_data('paired-slash1.fq')

    # actual output file
    actual_path = utils.get_temp_filename('out.fq')

    utils.runscript('interleave-reads.py',
                    [left_reads, right_reads, '-o', actual_path])

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    n = 0
    record_pairs = zip(screed.open(expected_path), screed.open(actual_path))
    for n, (expected, actual) in enumerate(record_pairs, start=1):
        assert expected.name == actual.name
        assert expected.sequence == actual.sequence
    # guard against a vacuous pass when nothing was compared
    assert n > 0
def test_interleave_reads_2_fa():
    """Interleave ``paired.fa.1``/``.2`` into a file and check the result.

    The file written via ``-o`` is compared with ``paired.fa`` by record
    name and sequence.
    """
    # test input files
    left_reads = utils.get_test_data('paired.fa.1')
    right_reads = utils.get_test_data('paired.fa.2')

    # correct output
    expected_path = utils.get_test_data('paired.fa')

    # actual output file
    actual_path = utils.get_temp_filename('out.fa')

    utils.runscript('interleave-reads.py',
                    [left_reads, right_reads, '-o', actual_path])

    # NOTE: zip() stops at the shorter input, so only the leading records
    # common to both files are compared.
    n = 0
    record_pairs = zip(screed.open(expected_path), screed.open(actual_path))
    for n, (expected, actual) in enumerate(record_pairs, start=1):
        assert expected.name == actual.name
        assert expected.sequence == actual.sequence
    # guard against a vacuous pass when nothing was compared
    assert n > 0
def test_extract_paired_reads_unpaired():
    """extract-paired-reads.py must complain about ``random-20-a.fa``.

    The run is allowed to fail (``fail_ok=True``); the test only checks
    that the "no paired reads" message is in the returned error text.
    """
    # test input file
    infile = utils.get_test_data('random-20-a.fa')

    # actual output files...
    outfile1 = utils.get_temp_filename('unpaired.pe.fa')
    in_dir = os.path.dirname(outfile1)
    # never read afterwards; the call is kept exactly as the test had it
    outfile2 = utils.get_temp_filename('unpaired.se.fa', in_dir)

    result = utils.runscript('extract-paired-reads.py', [infile], in_dir,
                             fail_ok=True)
    # runscript yields a 3-tuple; only its last element is checked
    _status, _out, err = result

    assert 'no paired reads!? check file formats...' in err, err
def test_read_bundler_empty_file():
    """ReadParser must raise OSError for the ``empty-file`` fixture."""
    infile = utils.get_test_data('empty-file')
    with pytest.raises(OSError):
        # Build and exhaust the parser; only the raised error matters, so
        # the records are not bound to a name.
        list(khmer.ReadParser(infile))
def test_read_bundler_single_read():
    """Bundle the read in ``single-read.fq`` and check its sequences.

    The bundle must report exactly one read, whose ``sequence`` equals its
    ``cleaned_seq``.
    """
    infile = utils.get_test_data('single-read.fq')
    parsed_reads = list(khmer.ReadParser(infile))
    bundle = khmer.utils.ReadBundle(*parsed_reads)

    assert bundle.num_reads == 1
    only_read = bundle.reads[0]
    assert only_read.sequence == only_read.cleaned_seq