def test_load_gz():
    """khmer.load_countgraph must read a gzip-compressed countgraph file."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)

    # save uncompressed hashtable.
    orig = khmer._Countgraph(12, table_sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    # compress.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    try:
        loaded = khmer.load_countgraph(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, table_sizes)
    dist_orig = orig.abundance_distribution(inpath, tracking)
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_loaded = loaded.abundance_distribution(inpath, tracking)

    assert sum(dist_orig) == 3966, sum(dist_orig)
    assert dist_orig == dist_loaded, (dist_orig, dist_loaded)
def test_save_load_gz():
    """A countgraph saved to a .gz path can be loaded back via load()."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)

    orig = khmer._Countgraph(12, table_sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    loaded = khmer._Countgraph(12, table_sizes)
    try:
        loaded.load(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # both graphs must yield the same abundance distribution
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_orig = orig.abundance_distribution(inpath, tracking)
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_loaded = loaded.abundance_distribution(inpath, tracking)

    assert sum(dist_orig) == 3966, sum(dist_orig)
    assert dist_orig == dist_loaded, (dist_orig, dist_loaded)
def test_bloom_c_2():  # simple one
    """n_unique_kmers() reflects hash collisions; more tables resolve some."""
    ksize = 4

    # use only 1 hashtable, no bloom filter
    ng = khmer._Nodegraph(ksize, [11])
    ng.count('AAAA')  # 00 00 00 00 = 0
    ng.count('ACTG')  # 00 10 01 11 =
    assert ng.n_unique_kmers() == 2
    ng.count('AACG')  # 00 00 10 11 = 11  # collision with 1st kmer
    assert ng.n_unique_kmers() == 2
    ng.count('AGAC')  # 00 11 00 10  # collision with 2nd kmer
    assert ng.n_unique_kmers() == 2

    # use two hashtables with 11, 13
    other = khmer._Nodegraph(ksize, [11, 13])
    other.count('AAAA')  # 00 00 00 00 = 0
    other.count('ACTG')  # 00 10 01 11 = 2*16 + 4 + 3 = 39
    assert other.n_unique_kmers() == 2
    other.count('AACG')  # 00 00 10 11 = 11  # collision with only 1st kmer
    assert other.n_unique_kmers() == 3
    other.count('AGAC')  # 00 11 00 10  3*16 + 2 = 50
    # collision with both 2nd and 3rd kmers
    assert other.n_unique_kmers() == 3
def test_abund_dist_gz_bigcount_compressed_first():
    """Bigcount data must survive a gzip round trip of the saved table.

    Fix: the compressed table was read via ``gzip.open(...).read()`` with
    no handle to close -- the file object leaked.  Both file operations
    now use context managers.
    """
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)

    # read compressed bigcount table (handle was previously leaked)
    with gzip.open(htfile, 'rb') as in_fp:
        data = in_fp.read()

    # output the bigcount table, uncompressed
    outfile = utils.get_temp_filename('test_ct')
    with open(outfile, 'wb') as f_out:
        f_out.write(data)

    # load the decompressed bigcount table
    try:
        countgraph = khmer.load_countgraph(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer._Nodegraph(kmer_size, hashsizes)
    # calculate abundance distribution for compressed bigcount table
    abundances = countgraph.abundance_distribution(infile, tracking)

    # check if any abundance > 255 has a nonzero count;
    # if so, the gzipped bigcount data was loaded correctly
    flag = False
    for abundance, count in enumerate(abundances):
        print(abundance, count)
        if abundance > 255 and count > 0:
            flag = True
            break
    assert flag
def test_save_load_merge_nexist(self):
    """load_subset_partitionmap on a missing path must raise OSError."""
    graph = khmer._Nodegraph(20, [1])
    try:
        graph.load_subset_partitionmap('this does not exist')
        assert 0, "this should not succeed"
    except OSError as e:
        print(str(e))
def main():
    """Write per-k-mer counts from a saved countgraph as CSV rows.

    Loads the countgraph named on the command line, then scans each
    input sequence file and emits one "<kmer>,<count>" row per unique
    k-mer (deduplicated via a tracking nodegraph).
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print ('hashtable from', args.input_count_graph_filename,
           file=sys.stderr)
    countgraph = khmer.load_countgraph(
        args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            # fold N -> A so every window is a countable k-mer
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i+kmer_size]
                # tracking graph ensures each k-mer is written only once
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(countgraph.get(kmer))])

    print ('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
def test_save_load_tagset_trunc():
    """Every truncated prefix of a tagset file must fail to load."""
    graph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')
    graph.add_tag('A' * 32)
    graph.add_tag('G' * 32)
    graph.save_tagset(outfile)

    # read back the intact tagset bytes
    with open(outfile, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # truncate tagset file...
        with open(outfile, 'wb') as fp:
            fp.write(data[:i])

        # try loading it...
        try:
            graph.load_tagset(outfile)
            assert 0, "this test should fail"
        except OSError as err:
            print(str(err), i)

    # try loading it...
    try:
        graph.load_tagset(outfile)
        assert 0, "this test should fail"
    except OSError:
        pass
def test__get_set_tag_density():
    """_set_tag_density(2) must round-trip through _get_tag_density()."""
    graph = khmer._Nodegraph(32, [1])

    original = graph._get_tag_density()
    assert original != 2
    graph._set_tag_density(2)
    assert graph._get_tag_density() == 2
def test_extract_unique_paths_2():
    """A path whose leading k-mer is already known loses that k-mer only."""
    kh = khmer._Nodegraph(10, [5, 7, 11, 13])

    kh.consume('ATGGAGAGAC')
    paths = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG',
                                    10, 1)
    print(paths)
    # all but the 1st k-mer
    assert paths == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']
def test_count_A(self):
    """An all-A input yields exactly one k-mer, with abundance 10."""
    A_filename = utils.get_test_data('all-A.fa')
    tracking = khmer._Nodegraph(4, [5])

    dist = self.kh.abundance_distribution(A_filename, tracking)
    assert sum(dist) == 1
    assert dist[10] == 1
def test_find_stoptags():
    """identify_stoptags_by_position matches a tag and its complement."""
    graph = khmer._Nodegraph(5, [1])
    graph.add_stop_tag("AAAAA")

    assert graph.identify_stoptags_by_position("AAAAA") == [0]
    assert graph.identify_stoptags_by_position("AAAAAA") == [0, 1]
    # the T-runs are the reverse complement of the stored tag
    assert graph.identify_stoptags_by_position("TTTTT") == [0]
    assert graph.identify_stoptags_by_position("TTTTTT") == [0, 1]
def test_save_load_tagset_notexist():
    """Loading a tagset from a nonexistent path must raise OSError."""
    graph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')
    try:
        graph.load_tagset(outfile)
        assert 0, "this test should fail"
    except OSError as e:
        print(str(e))
def test_tagset_filetype_check():
    """load_tagset must reject a file of a different type (stoptags)."""
    graph = khmer._Nodegraph(31, [1])
    inpath = utils.get_test_data('goodversion-k32.stoptags')

    try:
        graph.load_tagset(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_extract_unique_paths_0():
    """A fully novel path is returned whole; a consumed one yields nothing."""
    kh = khmer._Nodegraph(10, [5, 7, 11, 13])

    seq = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'
    paths = kh.extract_unique_paths(seq, 10, 1)
    assert paths == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']

    kh.consume(seq)
    paths = kh.extract_unique_paths(seq, 10, 1)
    assert not paths
def test_count_kmer_degree():
    """kmer_degree reports how many neighbor k-mers are present."""
    inpfile = utils.get_test_data('all-A.fa')
    graph = khmer._Nodegraph(4, [3, 5])
    graph.consume_fasta(inpfile)

    assert graph.kmer_degree('AAAA') == 2
    assert graph.kmer_degree('AAAT') == 1
    assert graph.kmer_degree('AATA') == 0
    assert graph.kmer_degree('TAAA') == 1
def test_hashbits_file_version_check():
    """Loading a nodegraph file with a stale version must raise OSError."""
    graph = khmer._Nodegraph(12, [1])
    inpath = utils.get_test_data('badversion-k12.htable')

    try:
        graph.load(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_count_within_radius_simple():
    """An all-A read collapses to a single k-mer at any search radius."""
    inpfile = utils.get_test_data('all-A.fa')
    graph = khmer._Nodegraph(4, [3, 5])

    print(graph.consume_fasta(inpfile))
    found = graph.count_kmers_within_radius('AAAA', 1)
    assert found == 1
    found = graph.count_kmers_within_radius('AAAA', 10)
    assert found == 1
def test_stoptags_file_version_check():
    """Loading stop tags with a stale file version must raise OSError."""
    graph = khmer._Nodegraph(32, [1])
    inpath = utils.get_test_data('badversion-k32.stoptags')

    try:
        graph.load_stop_tags(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_fakelump_load_stop_tags_notexist():
    """load_stop_tags on a missing file must raise OSError."""
    missing = utils.get_temp_filename('fakelump.fa.stopfoo')

    # ok, now try loading these stop tags; should fail.
    graph = khmer._Nodegraph(32, [5, 7, 11, 13])
    try:
        graph.load_stop_tags(missing)
        assert 0, "this test should fail"
    except OSError:
        pass
def test_read_cleaning_abundance_distribution(Countingtype):
    """abundance_distribution ignores k-mers containing non-ACGTN bases."""
    infile = utils.get_test_data('valid-read-testing.fq')

    counts = Countingtype(15, PRIMES_1m)
    tracking = _Nodegraph(15, PRIMES_1m)

    counts.consume_seqfile(infile)
    dist = counts.abundance_distribution(infile, tracking)
    assert dist[1] == 35  # k-mers with non-ACGTN => ignored.
    assert dist[2] == 69
def test_abund_dist_A(tabletype):
    """All-A input produces a single nonzero abundance-distribution entry."""
    A_filename = utils.get_test_data('all-A.fa')

    kh = tabletype(4, PRIMES_1m)
    tracking = khmer._Nodegraph(4, PRIMES_1m)

    kh.consume_seqfile(A_filename)
    dist = kh.abundance_distribution(A_filename, tracking)
    print(dist[:10])

    assert sum(dist) == 1
    assert dist[0] == 0
def test_nodegraph_file_type_check():
    """A nodegraph must refuse to load a saved countgraph file."""
    countgraph = khmer._Countgraph(12, [1])
    savepath = utils.get_temp_filename('tempcountingsave0.ct')
    countgraph.save(savepath)

    graph = khmer._Nodegraph(12, [1])
    try:
        graph.load(savepath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_filter_if_present():
    """filter_if_present keeps only reads with no k-mers in the mask graph."""
    graph = khmer._Nodegraph(32, [3, 5])

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    graph.consume_fasta(maskfile)
    graph.filter_if_present(inputfile, outfile)

    records = list(screed.open(outfile))
    assert len(records) == 1
    assert records[0]['name'] == '3'
def main():
    """Count k-mers in one sequence file and write them to CSV.

    Builds a countgraph over the input using multiple consumer threads,
    then rescans the input and emits one "<kmer>,<count>" row per
    unique k-mer (deduplicated via a tracking nodegraph).
    """
    info('count-kmers-single.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_sequence_filename, False)

    print ('making k-mer countgraph', file=sys.stderr)
    countgraph = khmer.Countgraph(args.ksize, args.max_tablesize,
                                  args.n_tables)
    # @CTB countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print ('kmer_size: %s' % countgraph.ksize(), file=sys.stderr)
    print ('k-mer countgraph sizes: %s' % (countgraph.hashsizes(),),
           file=sys.stderr)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print ('consuming input, round 1 -- %s' % (args.input_sequence_filename),
           file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    # wait for all consumer threads before reading counts back out
    for thread in threads:
        thread.join()

    for record in screed.open(args.input_sequence_filename):
        # fold N -> A so every window is a countable k-mer
        seq = record.sequence.replace('N', 'A')
        for i in range(len(seq) - kmer_size + 1):
            kmer = seq[i:i+kmer_size]
            # tracking graph ensures each k-mer is written only once
            if not tracking.get(kmer):
                tracking.count(kmer)
                writer.writerow([kmer, str(countgraph.get(kmer))])

    print ('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
def _build_testfiles():
    """Regenerate the good-version fixture files under /tmp.

    Produces a saved nodegraph, a tagset file, and a stoptags file that
    the version-check tests load.
    """
    # nodegraph file
    inpath = utils.get_test_data('random-20-a.fa')

    # NOTE(review): _Nodegraph elsewhere in this file is constructed with a
    # *list* of table sizes; a bare `2` here looks suspect -- confirm intended.
    hi = khmer._Nodegraph(12, 2)
    hi.consume_fasta(inpath)
    hi.save('/tmp/goodversion-k12.htable')

    # tagset file
    nodegraph = khmer._Nodegraph(32, [1])
    nodegraph.add_tag('A' * 32)
    nodegraph.add_tag('G' * 32)
    nodegraph.save_tagset('/tmp/goodversion-k32.tagset')

    # stoptags file
    fakelump_fa = utils.get_test_data('fakelump.fa')
    nodegraph = khmer.Nodegraph(32, 4, 4)
    nodegraph.consume_fasta_and_tag(fakelump_fa)

    subset = nodegraph.do_subset_partition(0, 0)
    nodegraph.merge_subset(subset)

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1

    counting = khmer.Countgraph(32, 4, 4)
    nodegraph.repartition_largest_partition(None, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    nodegraph.save_stop_tags('/tmp/goodversion-k32.stoptags')
def test_load_partitioned():
    """consume_partitioned_fasta records both partitions and all k-mers."""
    inpfile = utils.get_test_data('combine_parts_1.fa')
    graph = khmer._Nodegraph(32, [1])

    graph.consume_partitioned_fasta(inpfile)
    assert graph.count_partitions() == (2, 0)

    # k-mers from both sequences, plus a window spanning into the second
    # half of sequence one, must all be present
    for kmer in ("CATGCAGAAGTTCCGCAACCATACCGTTCAGT",
                 "CAAATGTACATGCACTTAAAATCATCCAGCCG",
                 "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]):
        assert graph.get(kmer)
def test_not_output_unassigned(self):
    """With output_unassigned=False, unpartitioned reads are dropped."""
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer._Nodegraph(21, [5, 7, 11, 13])
    graph.consume_seqfile_and_tag(filename)

    output_file = utils.get_temp_filename('parttest')
    graph.output_partitions(filename, output_file, False)

    n_in = len(list(screed.open(filename)))
    n_out = len(list(screed.open(output_file)))
    assert n_in > 0
    assert n_out == 0, n_out
def test_save_load():
    """A saved countgraph reloads with an identical abundance distribution."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht')

    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)

    orig = khmer._Countgraph(12, table_sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    try:
        loaded = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    tracking = khmer._Nodegraph(12, table_sizes)
    dist_orig = orig.abundance_distribution(inpath, tracking)
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_loaded = loaded.abundance_distribution(inpath, tracking)

    assert sum(dist_orig) == 3966, sum(dist_orig)
    assert dist_orig == dist_loaded, (dist_orig, dist_loaded)
def test_n_occupied_2():  # simple one
    """With a single table, colliding k-mers do not raise n_occupied()."""
    ksize = 4

    ng = khmer._Nodegraph(ksize, [11])
    ng.count('AAAA')  # 00 00 00 00 = 0
    assert ng.n_occupied() == 1

    ng.count('ACTG')  # 00 10 01 11 =
    assert ng.n_occupied() == 2

    ng.count('AACG')  # 00 00 10 11 = 11  # collision 1
    assert ng.n_occupied() == 2

    ng.count('AGAC')  # 00 11 00 10  # collision 2
    assert ng.n_occupied() == 2, ng.n_occupied()
def test_n_occupied_2_add_is_count():
    """add() behaves exactly like count(), including collision behavior."""
    # 'add' synonym for 'count'
    ksize = 4

    ng = khmer._Nodegraph(ksize, [11])
    ng.add('AAAA')  # 00 00 00 00 = 0
    assert ng.n_occupied() == 1

    ng.add('ACTG')  # 00 10 01 11 =
    assert ng.n_occupied() == 2

    ng.add('AACG')  # 00 00 10 11 = 11  # collision 1
    assert ng.n_occupied() == 2

    ng.add('AGAC')  # 00 11 00 10  # collision 2
    assert ng.n_occupied() == 2, ng.n_occupied()
def test_load_gz():
    """Countgraph.load() must read a gzip-compressed saved table.

    NOTE(review): another ``test_load_gz`` appears earlier in this file;
    with both in one module the later definition shadows the earlier one.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)

    # save uncompressed hashtable.
    orig = khmer._Countgraph(12, table_sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    # compress.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    loaded = khmer._Countgraph(12, table_sizes)
    try:
        loaded.load(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, table_sizes)
    dist_orig = orig.abundance_distribution(inpath, tracking)
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_loaded = loaded.abundance_distribution(inpath, tracking)

    assert sum(dist_orig) == 3966, sum(dist_orig)
    assert dist_orig == dist_loaded, (dist_orig, dist_loaded)
def test_consume_absentfasta_with_reads_parser():
    """consume_fasta_with_reads_parser rejects missing and empty input."""
    graph = khmer._Nodegraph(31, [1])

    # no argument at all
    try:
        graph.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))

    # an empty file is also rejected
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
def test_output_unassigned(self):
    """With output_unassigned=True, every input read is written out."""
    import screed
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer._Nodegraph(21, [5, 7, 11, 13])
    graph.consume_fasta_and_tag(filename)

    output_file = utils.get_temp_filename('part0test')
    graph.output_partitions(filename, output_file, True)

    n_in = len(list(screed.open(filename)))
    n_out = len(list(screed.open(output_file)))
    assert n_in > 0
    assert n_in == n_out, (n_in, n_out)
def test_n_occupied_2():  # simple one
    """n_occupied() counts occupied bins, so collisions do not add.

    Fix: removed the unused locals ``htable_size`` and
    ``num_nodegraphs`` -- the single table size (11) is passed directly.
    NOTE(review): a function with this same name appears earlier in this
    file; in one module the later definition shadows the earlier one.
    """
    ksize = 4

    # use only one hashtable of size 11
    nodegraph = khmer._Nodegraph(ksize, [11])
    nodegraph.count('AAAA')  # 00 00 00 00 = 0
    assert nodegraph.n_occupied() == 1

    nodegraph.count('ACTG')  # 00 10 01 11 =
    assert nodegraph.n_occupied() == 2

    nodegraph.count('AACG')  # 00 00 10 11 = 11  # collision 1
    assert nodegraph.n_occupied() == 2

    nodegraph.count('AGAC')  # 00 11 00 10  # collision 2
    assert nodegraph.n_occupied() == 2, nodegraph.n_occupied()
def test_stop_tags_truncate_check():
    """Loading any truncated prefix of a stop-tags file must raise OSError.

    Fix: the input was read via ``open(inpath, 'rb').read()`` with no
    handle to close -- the file object leaked.  All file access now uses
    context managers.
    """
    nodegraph = khmer._Nodegraph(32, [1])
    inpath = utils.get_test_data('goodversion-k32.tagset')

    with open(inpath, 'rb') as fp:
        data = fp.read()

    truncpath = utils.get_temp_filename('zzz')
    for i in range(len(data)):
        # write a truncated copy and try to load it
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])
        try:
            nodegraph.load_stop_tags(truncpath)
            assert 0, "expect failure of previous command"
        except OSError as e:
            print(i, str(e))
def test_fakelump_load_stop_tags_trunc():
    """A truncated stop-tags file must fail to load with OSError.

    Fix: the read side used ``open(...).read()`` with no handle to
    close -- the file object leaked.  File access now uses context
    managers.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Nodegraph(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    (n_partitions, _) = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...
    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1

    counting = khmer._Countgraph(32, [5, 7, 11, 13])
    ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # truncate the saved stop tags to 10 bytes
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()
    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer._Nodegraph(32, [5, 7, 11, 13])
    ht.consume_fasta_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
        assert 0, "this test should fail"
    except OSError:
        pass
def test_save_load_tagset():
    """load_tagset with the default clear_tags=True replaces existing tags."""
    graph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')

    graph.add_tag('A' * 32)
    graph.save_tagset(outfile)

    graph.add_tag('G' * 32)
    graph.load_tagset(outfile)  # implicitly => clear_tags=True
    graph.save_tagset(outfile)

    # the reload cleared the 'G' tag, so the re-saved file holds only the
    # single original tag -- compare with test_save_load_tagset_noclear
    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 30, len(data)
def test_save_load_tagset_noclear():
    """load_tagset with clear_tags=False keeps the in-memory tags."""
    graph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')

    graph.add_tag('A' * 32)
    graph.save_tagset(outfile)

    graph.add_tag('G' * 32)
    graph.load_tagset(outfile, False)  # set clear_tags => False; zero tags
    graph.save_tagset(outfile)

    # both tags survive the reload, so the re-saved file is larger than
    # the single-tag file in test_save_load_tagset
    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 38, len(data)
def test_output_partitions():
    """output_partitions appends each read's partition id to its name.

    Fix: the output was read via ``open(outfile).read()`` with no handle
    to close -- the file object leaked.  It now uses a context manager.
    """
    filename = utils.get_test_data('test-output-partitions.fa')

    ht = khmer._Nodegraph(10, [1])
    ht.set_partition_id('TTAGGACTGC', 2)
    ht.set_partition_id('TGCGTTTCAA', 3)
    ht.set_partition_id('ATACTGTAAA', 4)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    with open(outfile) as fp:
        data = fp.read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    # partition id is appended to the name after a tab
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert parts[0] == '2'
    assert parts[1] == '3'
    assert parts[2] == '4'
def test_kmer_neighbors():
    """neighbors() accepts either a forward-hash value or a string k-mer."""
    inpfile = utils.get_test_data('all-A.fa')
    graph = khmer._Nodegraph(4, [3, 5])
    graph.consume_fasta(inpfile)

    print(type('AAAA'))
    h = khmer.forward_hash('AAAA', 4)
    assert graph.neighbors(h) == [0, 0]       # AAAA on both sides
    assert graph.neighbors('AAAA') == [0, 0]  # AAAA on both sides

    h = khmer.forward_hash('AAAT', 4)
    assert graph.neighbors(h) == [0]          # AAAA on one side
    assert graph.neighbors('AAAT') == [0]     # AAAA on one side

    h = khmer.forward_hash('AATA', 4)
    assert graph.neighbors(h) == []           # no neighbors
    assert graph.neighbors('AATA') == []      # AAAA on one side

    h = khmer.forward_hash('TAAA', 4)
    assert graph.neighbors(h) == [0]          # AAAA on both sides
    assert graph.neighbors('TAAA') == [0]     # AAAA on both sides
def test_save_merge_from_disk_ksize(self):
    """merge_subset_from_disk must reject a pmap saved at another ksize."""
    graph = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, _) = graph.consume_seqfile_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = graph.divide_tags_into_subsets(1)
    print(divvy)
    (a, b, _) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    subset = graph.do_subset_partition(a, b)
    graph.save_subset_partitionmap(subset, outfile1)
    del subset

    # a graph built with a different ksize must refuse the saved pmap
    graph = khmer._Nodegraph(19, [1])
    try:
        graph.merge_subset_from_disk(outfile1)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_kmer_neighbors_wrong_ksize():
    """neighbors() raises ValueError for wrong-length or non-hash input."""
    inpfile = utils.get_test_data('all-A.fa')
    graph = khmer._Nodegraph(4, [3, 5])
    graph.consume_fasta(inpfile)

    # str and bytes that are longer than ksize are both rejected
    for too_long in ('AAAAA', b'AAAAA'):
        try:
            graph.neighbors(too_long)
            assert 0, "neighbors() should fail with too long string"
        except ValueError:
            pass

    try:
        graph.neighbors({})
        assert 0, "neighbors() should fail with non hash/str arg"
    except ValueError:
        pass
def test_combine_pe():
    """join_partitions merges two partitions into a single one."""
    inpfile = utils.get_test_data('combine_parts_1.fa')
    graph = khmer._Nodegraph(32, [1])

    graph.consume_partitioned_fasta(inpfile)
    assert graph.count_partitions() == (2, 0)

    first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"

    pid1 = graph.get_partition_id(first_seq)
    pid2 = graph.get_partition_id(second_seq)
    assert pid1 == 2
    assert pid2 == 80293

    graph.join_partitions(pid1, pid2)

    # both sequences now map to the same (single) partition
    pid1 = graph.get_partition_id(first_seq)
    pid2 = graph.get_partition_id(second_seq)
    assert pid1 == pid2
    assert graph.count_partitions() == (1, 0)
def _build_testfiles():
    """Regenerate the good-version fixture files under /tmp.

    Produces a saved nodegraph, a tagset file, and a stoptags file that
    the version-check tests load.
    """
    # nodegraph file
    inpath = utils.get_test_data('random-20-a.fa')

    # NOTE(review): khmer.Nodegraph is called with three args elsewhere in
    # this file (ksize, size, n_tables); only two are given here -- confirm.
    hi = khmer.Nodegraph(12, 2)
    hi.consume_fasta(inpath)
    hi.save('/tmp/goodversion-k12.htable')

    # tagset file
    nodegraph = khmer._Nodegraph(32, [1])
    nodegraph.add_tag('A' * 32)
    nodegraph.add_tag('G' * 32)
    nodegraph.save_tagset('/tmp/goodversion-k32.tagset')

    # stoptags file
    fakelump_fa = utils.get_test_data('fakelump.fa')
    nodegraph = khmer.Nodegraph(32, 4, 4)
    nodegraph.consume_fasta_and_tag(fakelump_fa)

    subset = nodegraph.do_subset_partition(0, 0)
    nodegraph.merge_subset(subset)

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1

    counting = khmer.Countgraph(32, 4, 4)
    nodegraph.repartition_largest_partition(None, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    nodegraph.save_stop_tags('/tmp/goodversion-k32.stoptags')
def test_get_ksize():
    """ksize() returns the k value the graph was constructed with."""
    graph = khmer._Nodegraph(22, [1])
    assert graph.ksize() == 22
def test_bad_create():
    """_Nodegraph with an empty tablesizes list must raise ValueError.

    Fix: the original test passed silently if no exception was raised;
    an ``assert 0`` after the call (as in the other failure tests in
    this file) makes the expectation explicit.  The unused local binding
    of the constructed graph was also dropped.
    """
    try:
        khmer._Nodegraph(5, [])
        assert 0, "empty tablesizes should fail"
    except ValueError as err:
        assert 'tablesizes needs to be one or more numbers' in str(err)
def main():
    """Compute and write the abundance histogram of k-mers as CSV.

    Loads a saved countgraph, computes the abundance distribution of the
    input sequence file against it, and writes rows of
    (abundance, count, cumulative, cumulative_fraction).
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}',
             output=args.output_histogram_filename)

    # stdout targets are never "squashed"; real files need --squash
    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        # everything after the last nonzero abundance is zero; stop early
        if sofar == total:
            break
def test_add_stop_tag():
    """A stop tag added to the graph is reported by get_stop_tags()."""
    graph = khmer._Nodegraph(6, [1])

    graph.add_stop_tag('AATAAG')
    print(graph.get_stop_tags())
    assert graph.get_stop_tags() == ['AATAAG']
def test_find_stoptagsecond_seq():
    """Stop-tag positions are reported for every occurrence in a sequence."""
    graph = khmer._Nodegraph(4, [1])
    graph.add_stop_tag("ATGC")

    positions = graph.identify_stoptags_by_position("ATGCATGCGCAT")
    assert positions == [0, 2, 4, 8], positions
def test_consume_partitioned_fail():
    """A non-partitioned FASTA must be rejected with ValueError."""
    inpfile = utils.get_test_data('test-reads.fa')
    graph = khmer._Nodegraph(32, [1])

    with pytest.raises(ValueError):
        graph.consume_partitioned_fasta(inpfile)
def test_bad_primes_list():
    """Non-numeric table sizes must raise TypeError at construction."""
    try:
        khmer._Nodegraph(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError as e:
        print(str(e))