def test_get_primes():
    """get_n_primes_near_x returns descending primes as plain ints, even for a float bound."""
    expected = [19, 17, 13, 11, 7, 5, 3]

    primes = khmer.get_n_primes_near_x(7, 20)
    assert primes == expected

    # a float upper bound must still produce integer primes
    primes_not_float = khmer.get_n_primes_near_x(7, 20.)
    assert primes_not_float == expected
    assert all(isinstance(p, int) for p in primes_not_float)
def test_get_primes_fal():
    """Requesting 5 primes below 5 must fail with a descriptive message."""
    try:
        primes = khmer.get_n_primes_near_x(5, 5)
        # reaching this point means no error was raised — fail the test
        assert 0, "previous statement should fail"
    except AssertionError:
        # let our own failure propagate rather than being swallowed below
        raise
    except Exception as err:
        assert "unable to find 5 prime numbers < 5" in str(err)
def test_save_load_large(ctfile):
    """Round-trip a countgraph with a table size beyond 2**31 through save/load."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    # a single prime just above 2**31 exercises large-table serialization
    table_sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)
    orig = khmer._Countgraph(12, table_sizes)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = khmer.load_countgraph(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
def do_test(ctfile):
    """Save/load round-trip for a countgraph larger than 2**31, via consume_fasta."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    # one prime just past the 32-bit signed boundary
    table_sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)
    original = khmer._Countgraph(12, table_sizes)
    original.consume_fasta(inpath)
    original.save(savepath)

    restored = khmer.load_countgraph(savepath)

    orig_count = original.n_occupied()
    loaded_count = restored.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
def test_load_truncated():
    """Loading any truncated prefix of a saved countgraph must raise OSError.

    Fix: the original read the saved file via ``open(...).read()`` and wrote
    each truncated copy through a manually closed handle — both file objects
    are now managed with ``with`` so no handle leaks if an assert fires.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)
    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # read the full saved image once, closing the handle promptly
    with open(savepath, 'rb') as saved:
        data = saved.read()

    # every proper prefix — including the empty file — must be rejected
    for i in range(len(data)):
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])
        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def test_load_truncated():
    """Loading any truncated prefix of a saved countgraph must raise OSError.

    Fix: the original leaked the read handle (``open(...).read()``) and wrote
    truncated copies through a manually closed file; both are now ``with``
    blocks so handles are released even when an assertion fails.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)
    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # slurp the complete saved image, closing the handle promptly
    with open(savepath, 'rb') as saved:
        data = saved.read()

    # every proper prefix — including the empty file — must be rejected
    for i in range(len(data)):
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])
        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def test_get_primes():
    """The seven primes nearest below 20, largest first."""
    result = khmer.get_n_primes_near_x(7, 20)
    assert result == [19, 17, 13, 11, 7, 5, 3]
def test_get_primes_fal():
    """Requesting more primes than exist below x raises RuntimeError."""
    try:
        khmer.get_n_primes_near_x(5, 5)
    except RuntimeError as err:
        assert "unable to find 5 prime numbers < 5" in str(err)
    else:
        # no exception at all — the call should have failed
        assert 0, "previous statement should fail"
def test_init_with_primes(sketchtype):
    """A sketch constructed with explicit primes reports exactly those table sizes."""
    upper_bound = random.randint(1000, 2000)
    primes = khmer.get_n_primes_near_x(4, upper_bound)
    sketch = sketchtype(31, 1, 1, primes=primes)
    assert sketch.hashsizes() == primes
def main(filename):
    # Benchmark driver (Python 2 syntax): partition the reads in `filename`
    # five times, each run drawing a freshly shuffled set of hashtable-size
    # primes, and append each run's partition count to log.txt.
    #
    # NOTE(review): depends on module-level globals (K, N_HT, HASHTABLE_SIZE,
    # SUBSET_SIZE, N_THREADS, load_ht, save_ht, save_merged_pmap,
    # remove_orig_pmap, stop_after_n_subsets, worker) — confirm they are
    # defined elsewhere in this file.
    global ht
    n = 5
    basename = os.path.basename(filename)
    fd = open("log.txt", "w")
    primes = []
    # collect N_HT*n candidate primes below AND above the target size, then
    # shuffle so each run uses a different combination of table sizes
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)
    primes = below + above
    random.shuffle(primes)

    for run in range(n):
        # each run takes its own slice of N_HT primes
        print primes[run * N_HT:run * N_HT + N_HT]
        ht = khmer._new_hashbits(K, primes[run * N_HT:run * N_HT + N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(
                HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

            # did we just want to load the ht/tagset?
            if stop_after_n_subsets == 0:
                sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))
        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        # sentinel end-offset so divvy[i + 1] is valid for the last subset
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break
            start = divvy[i]
            end = divvy[i + 1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

        # fan the subset jobs out across N_THREADS worker threads
        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        # free the big graph before merging; merge only needs K, not size
        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i, )
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i, )
                os.unlink(pmap_file)

        # output partitions!
        n_partitions = ht.output_partitions(filename, basename + '.part')
        # NOTE(review): n_partitions is immediately overwritten by
        # count_partitions() — presumably intentional; confirm.
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions
        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())
        # clean up this run's intermediate files before the next run
        for file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(file)
    fd.close()
def main(filename):
    # Benchmark driver (Python 2 syntax): run five partitioning passes over
    # `filename`, each with a shuffled draw of hashtable-size primes, logging
    # the partition count of each pass to log.txt.
    #
    # NOTE(review): depends on module-level globals (K, N_HT, HASHTABLE_SIZE,
    # SUBSET_SIZE, N_THREADS, load_ht, save_ht, save_merged_pmap,
    # remove_orig_pmap, stop_after_n_subsets, worker) — confirm they are
    # defined elsewhere in this file.
    global ht
    n = 5
    basename = os.path.basename(filename)
    fd = open("log.txt", "w")
    primes = []
    # gather N_HT*n primes below and above the target size, shuffled so each
    # run gets a different combination of table sizes
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)
    primes = below + above
    random.shuffle(primes)

    for run in range(n):
        # each pass consumes its own slice of N_HT primes
        print primes[run*N_HT:run*N_HT+N_HT]
        ht = khmer._new_hashbits(K, primes[run*N_HT:run*N_HT+N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

            # did we just want to load the ht/tagset?
            if stop_after_n_subsets == 0:
                sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))
        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        # sentinel end-offset so divvy[i+1] is valid for the final subset
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break
            start = divvy[i]
            end = divvy[i+1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

        # fan the subset jobs out across N_THREADS worker threads
        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        # drop the big graph before merging; merge needs K but not its size
        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i,)
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i,)
                os.unlink(pmap_file)

        # output partitions!
        n_partitions = ht.output_partitions(filename, basename + '.part')
        # NOTE(review): n_partitions is immediately replaced by the value from
        # count_partitions() — presumably intentional; confirm.
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions
        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())
        # remove this pass's intermediate files before the next pass
        for file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(file)
    fd.close()