def test_read_pair_iterator_in_error_mode():
    # NOTE(review): this leading 'assert 0' fires immediately, so everything
    # below it is unreachable -- the test is effectively disabled.  Confirm
    # whether it should be re-enabled or replaced with a proper skip.
    assert 0

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # If walks like an iterator and quacks like an iterator...
    rpi = rparser.iter_read_pairs()
    assert "__iter__" in dir(rpi)
    assert "next" in dir(rpi)

    # Are the alleged pairs actually pairs?
    read_pairs_1 = []
    for read_1, read_2 in rpi:
        read_pairs_1.append([read_1, read_2])
        # paired reads share the first 19 characters of their names
        assert read_1.name[: 19] == read_2.name[: 19]

    # Reload parser.
    # Note: No 'rewind' or 'reset' capability at the time of this writing.
    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # Ensure that error mode is the default mode.
    read_pairs_2 = []
    for read_1, read_2 \
            in rparser.iter_read_pairs(ReadParser.PAIR_MODE_ERROR_ON_UNPAIRED):
        read_pairs_2.append([read_1, read_2])
    # element-wise comparison of the two pair lists by first-read name
    matches = \
        map(
            lambda rp1, rp2: rp1[0].name == rp2[0].name,
            read_pairs_1, read_pairs_2
        )
    assert all(matches)  # Assert ALL the matches. :-]
def test_count_overlap():
    # Run count-overlap.py: build a graph from dataset 1, then measure how
    # many of dataset 2's unique k-mers overlap it.
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize',
            '10000000', htfile + '.pt', seqfile2, outfile]
    (status, out, err) = runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    # summary report: check the expected stat lines are present
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    # curve file: spot-check a few data points
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data("paired.fa")

    # expected outputs, shipped with the test data
    ex_outfile1 = utils.get_test_data("paired.fa.1")
    ex_outfile2 = utils.get_test_data("paired.fa.2")

    # actual output files...
    outfile1 = utils.get_temp_filename("paired.fa.1")
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename("paired.fa.2", in_dir)

    script = scriptpath("split-paired-reads.py")
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    # compare each output stream record-by-record against the expected
    # file; 'n > 0' guards against both streams being empty
    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
def test_normalize_by_median_force():
    # '-f' (force) should let normalize-by-median continue past a corrupt
    # input file, dumping its partial table to '<file>.ct.failed'.
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename('test-good.fq',
                                          tempdir=os.path.dirname(
                                              corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'),
                    corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = runscript(script, args, in_dir)

    # the dumped table must still contain k-mers from the good reads
    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    # the script must report that it skipped the corrupt input
    assert '*** Skipping' in err
    assert '** IOErrors' in err
def test_with_multiple_threads(testfile="test-reads.fq.bz2"):
    # A ReadParser shared by N threads must hand out every read exactly
    # once: the multi-threaded total must equal a single-threaded count.
    import operator
    import threading

    # single-threaded baseline count
    reads_count_1thr = 0
    rparser = ReadParser(utils.get_test_data(testfile))
    for read in rparser:
        reads_count_1thr += 1

    def count_reads(rparser, counters, tnum):
        # each thread records its own count in its slot of 'counters'
        counters[tnum] = reduce(operator.add, (1 for read in rparser))

    N_THREADS = 4
    threads = []
    reads_counts_per_thread = [0] * N_THREADS
    rparser = ReadParser(utils.get_test_data(testfile))
    for tnum in xrange(N_THREADS):
        t = \
            threading.Thread(
                target=count_reads,
                args=[rparser, reads_counts_per_thread, tnum]
            )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

    assert reads_count_1thr == sum(reads_counts_per_thread), \
        reads_counts_per_thread
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    # expected outputs (paired-end / single-end), shipped with test data
    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    # compare each output stream record-by-record against the expected
    # file; 'n > 0' guards against both streams being empty
    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    # expected outputs, shipped with the test data
    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    # FASTQ input: also compare the quality ('accuracy') strings
    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
def test_with_multiple_threads():
    # NOTE(review): same name as the parametrized test_with_multiple_threads
    # defined earlier; if both live in one module the later definition
    # shadows the earlier one -- verify they come from separate files.
    import operator
    import threading

    # single-threaded baseline count
    reads_count_1thr = 0
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"))
    for read in rparser:
        reads_count_1thr += 1

    def count_reads(rparser, counters, tnum):
        counters[tnum] = reduce(operator.add, (1 for read in rparser))

    N_THREADS = 4
    # enlarge the reads-input buffer for the threaded run; restored below
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(N_THREADS * 64 * 1024)
    threads = []
    reads_counts_per_thread = [0] * N_THREADS
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), N_THREADS)
    for tnum in xrange(N_THREADS):
        t = \
            threading.Thread(
                target=count_reads,
                args=[rparser, reads_counts_per_thread, tnum]
            )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    config.set_reads_input_buffer_size(bufsz)

    # all threads together must see every read exactly once
    assert reads_count_1thr == sum(reads_counts_per_thread)
def test_make_initial_stoptags():
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input file gen'd by load-graphs
    infile = utils.get_temp_filename('test-reads.pt')
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)
    # NOTE(review): 'infile', 'infile2' and 'ex_outfile' are never used
    # below, and 'infile' is created in a *different* temp dir (no in_dir
    # argument) -- verify whether these can be dropped.

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
def test_extract_long_sequences():
    """Run extract-long-sequences.py on FASTQ and FASTA input.

    Bug fixed: '-o' was previously passed the string literals
    'fq_outfile' / 'fa_outfile' instead of the path variables, so output
    landed in the working directory under those literal names; the FASTQ
    run was also executed in the FASTA temp dir.
    """
    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = runscript(script, args, in_dir_fq)

    # NOTE(review): these line counts are taken on the *input* files, so
    # they only verify the inputs were staged intact, not the script's
    # filtering -- consider asserting on fq_outfile/fa_outfile instead.
    countlines = sum(1 for line in open(fq_infile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = runscript(script, args, in_dir_fa)
    countlines = sum(1 for line in open(fa_infile))
    assert countlines == 22, countlines
def test_normalize_by_median_indent():
    # normalize-by-median with a preloaded counting table must exit cleanly.
    infile = utils.get_test_data('paired-mixed.fa.pe')
    hashfile = utils.get_test_data('normC20k20.kh')
    script = scriptpath('normalize-by-median.py')
    args = ['--loadtable', hashfile, infile]
    (status, out, err) = utils.runscript(script, args)
    assert status == 0, (out, err)
    print(out, err)
def test_filter_if_present():
    """filter_if_present() keeps only reads absent from the mask table."""
    mask_fa = utils.get_test_data("filter-test-A.fa")
    reads_fa = utils.get_test_data("filter-test-B.fa")
    filtered_fa = utils.get_temp_filename("filter")

    table = khmer.Hashbits(32, 1e6, 2)
    table.consume_fasta(mask_fa)
    table.filter_if_present(reads_fa, filtered_fa)

    # exactly one read (named '3') should survive the filter
    surviving = list(fasta_iter(open(filtered_fa)))
    assert len(surviving) == 1
    assert surviving[0]["name"] == "3"
def test_filter_if_present():
    """LabelHash.filter_if_present() keeps only reads absent from the mask."""
    mask_fa = utils.get_test_data('filter-test-A.fa')
    reads_fa = utils.get_test_data('filter-test-B.fa')
    filtered_fa = utils.get_temp_filename('filter')

    labelhash = khmer.LabelHash(32, 1e6, 2)
    labelhash.consume_fasta(mask_fa)
    labelhash.filter_if_present(reads_fa, filtered_fa)

    # exactly one read (named '3') should survive the filter
    surviving = list(fasta_iter(open(filtered_fa)))
    assert len(surviving) == 1
    assert surviving[0]['name'] == '3'
def test_sweep_reads_fq():
    # sweep-reads.py on FASTQ input: bad extra input file must be skipped,
    # and reads must be binned by contig partition id.
    readfile = utils.get_temp_filename('reads.fa')
    contigfile = utils.get_temp_filename('contigs.fp')
    in_dir = os.path.dirname(contigfile)

    shutil.copyfile(utils.get_test_data('test-sweep-reads.fq'), readfile)
    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)

    script = scriptpath('sweep-reads.py')
    args = ['-k', '25', '--prefix', 'test', '--label-by-pid',
            contigfile, readfile, 'junkfile.fa']

    status, out, err = utils.runscript(
        script, args, in_dir, fail_ok=True, sandbox=True)

    # check if the bad file was skipped without issue
    assert 'ERROR' in err, err
    assert 'skipping' in err, err

    out1 = os.path.join(in_dir, 'test_0.fq')
    out2 = os.path.join(in_dir, 'test_1.fq')
    mout = os.path.join(in_dir, 'test_multi.fq')
    oout = os.path.join(in_dir, 'test_orphaned.fq')

    assert os.path.exists(out1)
    assert os.path.exists(out2)
    assert os.path.exists(mout)
    assert os.path.exists(oout)
    print open(out1).read()

    print os.listdir(in_dir)

    seqs1 = set([r.name for r in screed.open(out1)])
    seqs2 = set([r.name for r in screed.open(out2)])
    seqsm = set([r.name for r in screed.open(mout)])
    seqso = set([r.name for r in screed.open(oout)])

    print seqs1
    print seqs2
    print seqsm
    print seqso
    # binned read names carry their partition id(s) after tabs
    assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
    assert seqs2 == set(['read3_p1\t1'])
    # multi-partition read may list its two pids in either order
    assert (seqsm == set(['read4_multi\t0\t1']) or
            seqsm == set(['read4_multi\t1\t0']))
    assert seqso == set(['read5_orphan'])

    # FASTQ output should still carry per-read quality strings
    seqs1 = set([r.quality for r in screed.open(out1)])
    seqs2 = set([r.quality for r in screed.open(out2)])
    seqsm = set([r.quality for r in screed.open(mout)])
    seqso = set([r.quality for r in screed.open(oout)])
def test_fakelump_stop():
    """Stop tags loaded from a text file should split the fake lump in three."""
    graph = khmer.new_hashbits(32, 1e5, 4)
    graph.consume_fasta_and_tag(utils.get_test_data('fakelump.fa'))

    # load one stop tag per line from the companion text file
    stoptag_path = utils.get_test_data('fakelump.fa.stoptags.txt')
    for tag in (line.strip() for line in open(stoptag_path)):
        graph.add_stop_tag(tag)

    piece = graph.do_subset_partition(0, 0, True)
    graph.merge_subset(piece)

    n_partitions, _ = graph.count_partitions()
    assert n_partitions == 3, n_partitions
def test_tiny_real_partitions():
    """Both reads in the tiny real-data file get one shared, nonzero partition."""
    seqpath = utils.get_test_data('real-partition-tiny.fa')

    graph = khmer.new_hashbits(32, 8e1, 4)
    graph.consume_fasta_and_tag(seqpath)
    graph.merge_subset(graph.do_subset_partition(0, 0))

    partpath = utils.get_temp_filename('part')
    graph.output_partitions(seqpath, partpath)

    assert len(open(partpath).read())

    # partition ids are appended to read names after a tab
    pids = [rec.name.rsplit('\t', 1)[1] for rec in screed.open(partpath)]
    assert len(pids) == 2, len(pids)
    assert len(set(pids)) == 1
    assert set(pids) != set(['0'])

test_tiny_real_partitions.runme = True
def test_abund(self):
    # Count 10-mers from the test reads, exercise the error paths of
    # consume_fasta()/output_fasta_kmer_pos_freq(), then verify the
    # per-position k-mer frequency output.
    ht = khmer.new_hashtable(10, 4 ** 10)
    filename = utils.get_test_data('test-abund-read.fa')
    outname = utils.get_temp_filename('test_abund.out')

    ht.consume_fasta(filename)
    # missing argument -> TypeError
    try:
        ht.consume_fasta()
        assert 0, "should fail"
    except TypeError as err:
        print str(err)
    # nonexistent input path -> IOError
    try:
        ht.consume_fasta("nonexistent")
        assert 0, "should fail"
    except IOError as err:
        print str(err)
    ht.output_fasta_kmer_pos_freq(filename, outname)
    # missing arguments -> TypeError
    try:
        ht.output_fasta_kmer_pos_freq()
        assert 0, "should fail"
    except TypeError as err:
        print str(err)

    fd = open(outname, "r")

    output = fd.readlines()
    assert len(output) == 1

    output = output[0]
    output = output.strip().split()

    # one frequency value ('1') per k-mer position: read length 114, K=10
    assert ['1'] * (114 - 10 + 1) == output

    fd.close()
def test_extract_partitions():
    # partition + annotate the graph, then extract partitions into groups
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = _make_graph(
        seqfile, do_partition=True, annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['extracted', partfile]

    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)

    # distribution line: '99 1 1 99' for this data set
    dist = open(distfile).readline()
    assert dist.strip() == '99 1 1 99'

    # all 99 reads must share a single partition id
    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = runscript(script, args)
    assert status == 0

    # load-graph must write both the hashtable and the tagset
    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition). This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
def test_partition_graph_nojoin_k21():
    # test with K=21
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), K=21)
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(21)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    # reload the graph and the merged partition map, then count
    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (99, 0)  # should be 99 partitions at K=21
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
    in_dir = os.path.dirname(graphbase)

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    # reload and count: stop tags should keep the graph in two partitions
    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0)  # should be 2 partitions
def test_n_labels(): lh = LabelHash(20, 1e7, 4) filename = utils.get_test_data('test-labels.fa') lh.consume_fasta_and_tag_with_labels(filename) print lh.n_labels() assert lh.n_labels() == 4
def test_sweep_reads_3():
    # sweep-reads.py with --label-by-group 10: reads binned into 10 groups.
    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    # one output file per group; remove each after checking it exists
    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    # each group's read count must be 9 or 10
    counts_fn = os.path.join(wdir, 'test.counts.csv')
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(counts_fn)
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    # no multi-group reads expected for this input
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
def test_save_merge_from_disk_2(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')
    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

    # split the tags into two roughly equal subsets
    subset_size = total_reads // 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    # partition each half and save the partition maps to disk
    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    # merge the saved maps back in from disk
    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions        # combined.
def test_save_merge_from_disk(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    # partition tag ranges [a, b) and [b, end), saving each map to disk
    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    # merging the saved maps must reassemble a single partition
    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions        # combined.
def test_casava_1_8_pair_mating():
    # Two threads share one paired-mode ReadParser; the second thread
    # checks the first read name it sees against a known mate name.
    import threading

    # adjust the reads-input buffer for this test; restored at the end
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(128 * 1024)
    # Note: This file, when used in conjunction with a 64 KiB per-thread
    #       prefetch buffer, tests the paired read mating logic with the
    #       Casava >= 1.8 read name format.
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), 2)

    def thread_1_runtime(rparser):
        for read in rparser:
            pass

    def thread_2_runtime(rparser):
        for readnum, read in enumerate(rparser):
            if 0 == readnum:
                # Casava 1.8-style name: '<tile coords> 2:N:0:NNNNN'
                assert "895:1:1:1761:13189 2:N:0:NNNNN" == read.name

    t1 = threading.Thread(target=thread_1_runtime, args=[rparser])
    t2 = threading.Thread(target=thread_2_runtime, args=[rparser])

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    config.set_reads_input_buffer_size(bufsz)
def test_random_20_a_succ_IV_save(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    savefile_ht = utils.get_temp_filename('ht')
    savefile_tags = utils.get_temp_filename('tags')
    # NOTE(review): 'outfile' concatenates the input path with a temp
    # filename -- looks suspicious; verify this produces a writable path.
    outfile = filename + utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    # save graph + tagset, then reload into a fresh table
    ht.save(savefile_ht)
    ht.save_tagset(savefile_tags)

    del ht
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)

    ht.load(savefile_ht)
    ht.load_tagset(savefile_tags)

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    subsets = []
    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    # merge the subsets in reverse order
    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions
def test_tag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.LabelHash(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two partition;
    # with, it is one partition.
    ht.add_stop_tag('CCGAATATATAACAGCGACG')

    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = ht.do_subset_partition(0, 0)
    n, _ = ht.count_partitions()
    assert n == 99  # reads only connected by traversal...

    n, _ = ht.subset_count_partitions(subset)
    assert n == 2  # but need main to cross stoptags.

    ht.merge_subset(subset)

    n, _ = ht.count_partitions()  # ta-da!
    assert n == 1, n
def test_normalize_by_median_dumpfrequency():
    # '-d 2' dumps a backup counting table after every two input files.
    CUTOFF = '1'

    # stage five identical FASTQ inputs in one temp dir
    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                    tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    # the dumped table must contain k-mers from the reads
    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    # five files at dump-every-2 => exactly two backup saves
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
def test_normalize_by_median_fpr():
    # A deliberately tiny table should trigger the false-positive-rate
    # error, after the .keep output has already been started.
    MIN_TABLESIZE_PARAM = 1

    infile = utils.get_temp_filename('test-fpr.fq')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    # NOTE(review): '-k 17' and '-x 1' are single arguments with embedded
    # spaces rather than separate tokens -- confirm the script's argument
    # parser accepts this form.
    args = ['-f', '-k 17', '-x ' + str(MIN_TABLESIZE_PARAM), infile]

    (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)

    assert os.path.exists(infile + '.keep')
    assert 'fp rate estimated to be' in out, out
    assert '** ERROR: the k-mer counting table is too small' in err, err
def test_partition_graph_no_big_traverse():
    # do NOT exhaustively traverse
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True, stop_big_traverse=True)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    # reload graph, tags and merged partition map, then count partitions
    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x[0] == 4, x  # should be four partitions, broken at knot.
def test_load_partitioned():
    """consume_partitioned_fasta() restores both partitions and all k-mers."""
    table = khmer.new_hashbits(32, 1, 1)
    table.consume_partitioned_fasta(utils.get_test_data('combine_parts_1.fa'))
    assert table.count_partitions() == (2, 0)

    # spot-check a 32-mer from each sequence, plus the tail of a longer one
    for kmer in ("CATGCAGAAGTTCCGCAACCATACCGTTCAGT",
                 "CAAATGTACATGCACTTAAAATCATCCAGCCG",
                 "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]):
        assert table.get(kmer)
def test_n_occupied_1():
    """n_occupied() after consuming random-20-a.fa matches a known value."""
    seqpath = utils.get_test_data('random-20-a.fa')

    # test modified c++ n_occupied code
    # K=20, hashtable size 100000, a single table
    table = khmer.new_hashbits(20, 100000, 1)
    for record in fasta_iter(open(seqpath)):
        table.consume(record['sequence'])

    # this number calculated independently
    assert table.n_occupied() == 3877
def test_consume_absentfasta_with_reads_parser():
    # consume_fasta_with_reads_parser() must reject a missing argument,
    # and an empty input file must raise IOError or ValueError.
    presencetable = khmer.new_hashbits(31, 1, 1)
    try:
        presencetable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        presencetable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except IOError as err:
        print str(err)
    except ValueError, err:
        print str(err)
def _make_graph(infilename, SIZE=1e7, N=2, K=20,
                do_partition=False,
                annotate_partitions=False,
                stop_big_traverse=False):
    # Helper: run load-graph.py on 'infilename' (and optionally
    # partition-graph.py, merge-partitions.py, annotate-partitions.py).
    # Returns the temp-file base name; generated files live next to it.
    script = scriptpath('load-graph.py')
    args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data(infilename)

    args.extend([outfile, infile])

    runscript(script, args)

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    if do_partition:
        script = scriptpath('partition-graph.py')
        args = [outfile]
        if stop_big_traverse:
            # disable exhaustive traversal of big lumps
            args.insert(0, '--no-big-traverse')
        runscript(script, args)

        script = scriptpath('merge-partitions.py')
        args = [outfile, '-k', str(K)]
        runscript(script, args)

        final_pmap_file = outfile + '.pmap.merged'
        assert os.path.exists(final_pmap_file)

        if annotate_partitions:
            script = scriptpath('annotate-partitions.py')
            args = ["-k", str(K), outfile, infilename]

            in_dir = os.path.dirname(outfile)
            runscript(script, args, in_dir)

            baseinfile = os.path.basename(infilename)
            assert os.path.exists(os.path.join(in_dir, baseinfile + '.part'))

    return outfile
def test_not_output_unassigned(self):
    """With output_unassigned=False, unpartitioned reads are not written."""
    import screed

    seqpath = utils.get_test_data('random-20-a.fa')

    graph = khmer.new_hashbits(21, 4, 4)
    graph.consume_fasta_and_tag(seqpath)

    partpath = utils.get_temp_filename('parttest')
    graph.output_partitions(seqpath, partpath, False)

    # the input has reads, but none should make it into the output
    n_input = sum(1 for _ in screed.open(seqpath))
    n_output = sum(1 for _ in screed.open(partpath))
    assert n_input > 0
    assert n_output == 0, n_output
def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    # '-b' disables bigcount, so abundances saturate at 255
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    utils.runscript(script, args, in_dir)

    # check the first two lines of the distribution output
    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
def test_bloom_c_1():
    """Unique k-mer counting via the bloom filter matches known values."""
    # test c++ code to count unique kmers using bloom filter
    seqpath = utils.get_test_data('random-20-a.fa')

    # K=20, hashtable size 100000, three tables
    bloom = khmer.Hashbits(20, 100000, 3)
    for record in fasta_iter(open(seqpath)):
        bloom.consume(record['sequence'])

    assert bloom.n_occupied() == 3882
    assert bloom.n_unique_kmers() == 3960
def test_do_partition():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = utils.get_temp_filename('out')
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('do-partition.py')
    args = ["-k", "20", graphbase, seqfile]

    utils.runscript(script, args, in_dir)

    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # at K=20 all reads join into one partition with id '2'
    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    parts = set(parts)
    assert '2' in parts
    assert len(parts) == 1
def test_bigcount_abund_dist_2(): kh = khmer.new_counting_hash(18, 1e7, 4) tracking = khmer.new_hashbits(18, 1e7, 4) kh.set_use_bigcount(True) seqpath = utils.get_test_data('test-abund-read.fa') kh.consume_fasta(seqpath) for i in range(1000): kh.count('GGTTGACGGGGCTCAGGG') dist = kh.abundance_distribution(seqpath, tracking) print kh.get('GGTTGACGGGGCTCAGGG') pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]] assert dist[1001] == 1, pdist
def test_consume_build_readmask(self):
    ht = khmer.new_hashtable(10, 4**10)

    filename = utils.get_test_data('simple_2.fa')
    outname = utils.get_temp_filename('test_filter.out')
    # NOTE(review): 'outname' is unused below -- verify it can be dropped.

    # sequence #4 (index 3) is bad; the new readmask should have that.
    x = ht.consume_fasta_build_readmask(filename)
    (total_reads, n_consumed, readmask) = x

    assert total_reads == 4, total_reads
    assert n_consumed == 63, n_consumed
    assert readmask.get(0)
    assert readmask.get(1)
    assert readmask.get(2)
    assert not readmask.get(3)
def test_filter_abund_1_singlefile():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('filter-abund-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', infile]
    runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    # after filtering, only one unique trimmed sequence remains
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
def test_filter_n(self):
    ht = khmer.new_hashtable(10, 4**10)

    filename = utils.get_test_data('simple_2.fa')
    outname = utils.get_temp_filename('test_filter.out')

    (total_reads, n_consumed) = ht.consume_fasta(filename)
    assert total_reads == 4, total_reads
    assert n_consumed == 63, n_consumed

    # filter with threshold 1: only reads '1', '2', '3' should survive
    (total_reads, n_seq_kept) = khmer.filter_fasta_file_any(ht, filename,
                                                            total_reads,
                                                            outname, 1)
    assert n_seq_kept == 3, n_seq_kept

    names = load_fa_seq_names(outname)
    assert names == ['1', '2', '3']
def test_consume_no_update_readmask(self):
    ht = khmer.new_hashtable(10, 4**10)

    filename = utils.get_test_data('simple_2.fa')
    outname = utils.get_temp_filename('test_filter.out')
    # NOTE(review): 'outname' is unused below -- verify it can be dropped.

    readmask = khmer.new_readmask(4)

    # sequence #4 (index 3) is bad; the new readmask should NOT have that.
    # final 'False' argument: do not update the readmask while consuming
    (total_reads, n_consumed) = ht.consume_fasta(filename, 0, 0, readmask,
                                                 False)
    assert total_reads == 4, total_reads
    assert n_consumed == 63, n_consumed
    assert readmask.get(0)
    assert readmask.get(1)
    assert readmask.get(2)
    assert readmask.get(3)  # NOT updated
def test_do_partition_2():
    # test with K=21 (no joining of sequences)
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = utils.get_temp_filename('out')
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('do-partition.py')
    args = ["-k", "21", graphbase, seqfile]

    runscript(script, args, in_dir)

    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # every read stays in its own partition: 99 distinct partition ids
    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    parts = set(parts)

    assert len(parts) == 99, len(parts)
def test_filter_abund_2():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=17)

    script = scriptpath('filter-abund.py')
    # cutoff of 1: with this threshold two unique sequences survive
    args = ['-C', '1', counting_ht, infile, infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 2, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
def test_extract_partitions_no_groups():
    empty_file = utils.get_temp_filename('empty-file')
    basefile = utils.get_test_data('empty-file')
    shutil.copyfile(basefile, empty_file)
    in_dir = os.path.dirname(empty_file)

    # ok, now run extract-partitions.
    utils.runscript(scriptpath('extract-partitions.py'),
                    ['extracted', empty_file], in_dir, fail_ok=True)

    # No group files should be created
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    assert not os.path.exists(groupfile)
def test_save_fail_readonly():
    # Saving labels/tags to a read-only file must raise IOError.
    lb_pre = LabelHash(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # create an empty target file, then strip all write permission
    savepath = utils.get_temp_filename('saved.labels')
    fp = open(savepath, 'w')
    fp.close()
    # BUG FIX: the mode was written as HEX 0x444 (== 0o2104), which does
    # not clear the owner's write bit; octal 0o444 is the intended
    # r--r--r-- permission set.
    os.chmod(savepath, 0o444)

    try:
        lb_pre.save_labels_and_tags(savepath)
        assert 0, "this should fail: read-only file"
    except IOError as err:
        print(str(err))
def test_trim_low_abund_5_trim_high_abund():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)

    args = ['-k', '17', '-x', '1e7', '-N', '2', '-V', infile]
    utils.runscript('trim-low-abund.py', args, in_dir, sandbox=True)

    outfile = infile + '.abundtrim'
    assert os.path.exists(outfile), outfile

    seqs = set(r.sequence for r in screed.open(outfile))
    assert len(seqs) == 2, seqs

    # trimmed sequence @ error
    assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGC' in seqs
def test_5_merge_046(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph5.fa')

    total_reads, _ = ht.consume_fasta_and_tag(filename)
    assert total_reads == 6, total_reads

    divvy = ht.divide_tags_into_subsets(1)

    # partition two tag ranges in turn, merging each back immediately
    for start, end in ((divvy[0], divvy[4]), (divvy[4], 0)):
        subset = ht.do_subset_partition(start, end)
        ht.merge_subset(subset)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_load_graph_no_tags():
    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    # '-n' asks load-graph.py to skip tagset generation
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-n', outfile, infile]
    utils.runscript(scriptpath('load-graph.py'), args)

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    # the presence-table itself must still be loadable
    assert khmer.load_hashbits(ht_file)
def test_count_median():
    infile = utils.get_temp_filename('test.fa')
    outfile = infile + '.counts'

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=8)

    utils.runscript(scriptpath('count-median.py'),
                    [counting_ht, infile, outfile])
    assert os.path.exists(outfile), outfile

    data = set(line.strip() for line in open(outfile))
    assert len(data) == 2, data
    assert 'seq 1001 1001.0 0.0 18' in data
    assert '895:1:37:17593:9954/1 1 103.803741455 303.702941895 114' in data
def test_badfasta_count_kmers_by_position():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    # calling with no arguments at all is a TypeError
    try:
        countingtable.fasta_count_kmers_by_position()
    except TypeError as err:
        print(str(err))

    filename = utils.get_test_data("test-short.fa")

    # either numeric argument being negative is a ValueError
    for bad_args in ((filename, -1, 0), (filename, 0, -1)):
        try:
            countingtable.fasta_count_kmers_by_position(*bad_args)
            assert 0, "this should fail"
        except ValueError as err:
            print(str(err))
def test_find_all_tags_list_error():
    ct = khmer.new_counting_hash(4, 4 ** 4, 4)

    # load each sequence but do not build tags - everything should be empty.
    for record in screed.open(utils.get_test_data('test-graph2.fa')):
        ct.consume(record.sequence)

    # both a too-short and a too-long query must be rejected
    for kmer in ('ATA', 'ATAGA'):
        try:
            ct.find_all_tags_list(kmer)
            assert False, \
                "a ValueError should be raised for incorrect k-mer size"
        except ValueError:
            pass
def test_abundance_dist_single():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile, outfile]
    _, _, err = utils.runscript(script, args, in_dir)

    assert 'Total number of k-mers: 98' in err, err

    # check the first two lines of the distribution output
    fp = iter(open(outfile))
    line = next(fp).strip()
    assert line == '1 96 96 0.98', line
    line = next(fp).strip()
    assert line == '1001 2 98 1.0', line
def test_abundance_dist_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # with bigcount disabled, counts saturate at 255
    htfile = _make_counting(infile, K=17, BIGCOUNT=False)

    utils.runscript(scriptpath('abundance-dist.py'),
                    ['-z', htfile, infile, outfile], in_dir)

    fp = iter(open(outfile))
    line = next(fp).strip()
    assert line == '1 96 96 0.98', line
    line = next(fp).strip()
    assert line == '255 2 98 1.0', line
def test_abundance_by_pos(self):
    kh = self.kh

    # count ATCG 300 times and ATGG 10 times
    for _ in range(300):
        kh.count('ATCG')
    for _ in range(10):
        kh.count('ATGG')

    short_filename = utils.get_test_data('test-short.fa')

    # with an abundance cutoff of 10, a single position registers
    dist = kh.fasta_count_kmers_by_position(short_filename, 6, 10)
    assert dist[4] == 1
    assert sum(dist) == 1

    # with the maximum cutoff, two positions register
    dist = kh.fasta_count_kmers_by_position(short_filename, 6, MAX_COUNT)
    assert dist[0] == 1, dist[0]
    assert dist[2] == 1
    assert sum(dist) == 2
def test_3_merge_023(self):
    ht = khmer.new_hashbits(20, 4 ** 10 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    total_reads, _ = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    a, b, c = ht.divide_tags_into_subsets(1)

    # partition (b, c) then (a, b), merging each subset as it is built
    for start, end in ((b, c), (a, b)):
        subset = ht.do_subset_partition(start, end)
        ht.merge_subset(subset)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_random_20_a_succ_II(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    # ceiling of total_reads / 2 (Python-2 integer division kept as-is)
    subset_size = total_reads / 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)
    assert len(divvy) == 4

    first = ht.do_subset_partition(divvy[0], divvy[2])
    second = ht.do_subset_partition(divvy[2], 0)
    ht.merge_subset(first)
    ht.merge_subset(second)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions