def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    ht = khmer._new_counting_hash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
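# Note on the compression step in test_load_gz: `out_file.writelines(in_file)`
# works because binary file objects are iterable, but chunked copying is the
# clearer idiom. A minimal sketch of the same round-trip using the standard
# library's shutil.copyfileobj (the helper name here is illustrative):
import gzip
import shutil

def gzip_copy(src_path, dst_path):
    """Compress src_path into dst_path in chunks."""
    with open(src_path, 'rb') as src:
        with gzip.open(dst_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)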
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename, file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' %
              args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for index, count in enumerate(abundances):
        if count == 0 and not args.output_zero:
            continue

        sofar += count
        frac = sofar / float(total)

        print(index, count, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")
    parser.add_argument('hashname')
    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')

    args = parser.parse_args()
    hashfile = args.hashname
    datafile = args.datafile
    histout = args.histout

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    if os.path.exists(histout):
        if not args.squash_output:
            print >>sys.stderr, 'ERROR: %s exists; not squashing.' % histout
            sys.exit(-1)

        print '** squashing existing file %s' % histout

    print 'preparing hist...'
    z = ht.abundance_distribution(datafile, tracking)
    total = sum(z)

    # guard against empty input before dividing by total below.
    if 0 == total:
        print >>sys.stderr, "ERROR: abundance distribution is uniformly " \
            "zero; nothing to report."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in enumerate(z):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break
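# For reference, `abundance_distribution` returns a histogram indexed by
# abundance: entry c is the number of distinct k-mers that occur exactly c
# times in the input. A minimal pure-Python sketch of the same idea, as an
# illustration only: this is not khmer's implementation, and it holds every
# k-mer in memory instead of a Bloom-filter-backed counting table.
def naive_abundance_distribution(sequences, k):
    from collections import Counter
    kmer_counts = Counter()
    for seq in sequences:
        for i in range(len(seq) - k + 1):
            kmer_counts[seq[i:i + k]] += 1
    # histogram: dist[c] = number of distinct k-mers seen exactly c times
    dist = Counter(kmer_counts.values())
    if not dist:
        return [0]
    return [dist.get(c, 0) for c in range(max(dist) + 1)]

# e.g. naive_abundance_distribution(['ATCGATCG'], 4) == [0, 3, 1]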
def test_save_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    ht = khmer._new_counting_hash(12, sizes)
    ht.load(savepath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def test_save_load_gz():
    thisdir = os.path.dirname(__file__)

    inpath = os.path.join(thisdir, 'test-data/random-20-a.fa')
    savepath = os.path.join(thisdir, 'tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    ht = khmer._new_counting_hash(12, sizes)
    ht.load(savepath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading: one ReadParser shared by n_threads consumer threads.
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=do_abundance_dist,
            args=(rparser,)
        )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)

    # merge the per-thread histograms into a single distribution.
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        ht.save(args.savehash)
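# The merge step above combines per-thread histograms that may have different
# lengths, since each thread only saw part of the input. A small standalone
# sketch of that reduction, with illustrative names, handy for testing the
# logic in isolation:
def merge_histograms(histograms):
    """Sum a list of count lists into one dict keyed by abundance."""
    merged = {}
    for hist in histograms:
        for abundance, count in enumerate(hist):
            merged[abundance] = merged.get(abundance, 0) + count
    return merged

# e.g. merge_histograms([[0, 5, 2], [0, 3]]) == {0: 0, 1: 8, 2: 2}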
def test_bad_primes_list():
    try:
        countingtable = khmer._new_hashbits(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError, e:
        print str(e)
def test_bad_primes_list():
    try:
        countingtable = khmer._new_hashbits(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError as e:
        print str(e)
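# The only difference between the two variants above is the except clause:
# `except TypeError, e` is Python-2-only syntax, while `except TypeError as e`
# works on Python 2.6+ and is required on Python 3, where the print statement
# would also need to become a function call. A sketch of a version that runs
# under both interpreters:
def test_bad_primes_list_portable():
    try:
        khmer._new_hashbits(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError as err:
        print(str(err))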
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")
    parser.add_argument('hashname')
    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')

    args = parser.parse_args()
    hashfile = args.hashname
    datafile = args.datafile
    histout = args.histout

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    if os.path.exists(histout):
        if not args.squash_output:
            print >>sys.stderr, 'ERROR: %s exists; not squashing.' % histout
            sys.exit(-1)

        print '** squashing existing file %s' % histout

    print 'preparing hist...'
    z = ht.abundance_distribution(datafile, tracking)
    total = sum(z)

    if 0 == total:
        print >>sys.stderr, "ERROR: abundance distribution is uniformly " \
            "zero; nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in enumerate(z):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break
def main(filename):
    global ht

    n = 5
    basename = os.path.basename(filename)
    fd = open("log.txt", "w")

    primes = []
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)
    primes = below + above
    random.shuffle(primes)

    for run in range(n):
        print primes[run * N_HT:run * N_HT + N_HT]
        ht = khmer._new_hashbits(K, primes[run * N_HT:run * N_HT + N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', \
                ht.n_occupied() / float(HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

        # did we just want to load the ht/tagset?
        if stop_after_n_subsets == 0:
            sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))
        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i + 1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename, 'w').write('%d subsets total\n' %
                                              n_subsets)

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i,)
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i,)
                os.unlink(pmap_file)

        # output partitions!
        n_partitions = ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions
        fd.write(str(n_partitions) + "\n")

        # clean up per-run intermediate files
        #print os.listdir(os.getcwd())
        for temp_file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(temp_file)
        for temp_file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(temp_file)
        for temp_file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(temp_file)

    fd.close()
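# The two khmer helpers above return lists of primes to use as hash table
# sizes; distinct prime moduli keep collisions across the tables independent.
# A minimal pure-Python sketch of the idea behind get_n_primes_above_x,
# illustrative only and not khmer's implementation:
def n_primes_above(n, x):
    """Return the first n primes strictly greater than x (trial division)."""
    def is_prime(m):
        if m < 2:
            return False
        i = 2
        while i * i <= m:
            if m % i == 0:
                return False
            i += 1
        return True

    primes = []
    candidate = x + 1
    while len(primes) < n:
        if is_prime(candidate):
            primes.append(candidate)
        candidate += 1
    return primes

# e.g. n_primes_above(3, 10) == [11, 13, 17]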