Пример #1
0
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    ht = khmer._new_counting_hash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Пример #2
0
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    ht = khmer._new_counting_hash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Пример #3
0
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [
        args.input_counting_table_filename, args.input_sequence_filename
    ]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print(
            "ERROR: abundance distribution is uniformly zero; "
            "nothing to report.",
            file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)
    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
Пример #4
0
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)
    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
Пример #5
0
def main():
    parser = argparse.ArgumentParser(description="Output k-mer abundance distribution.")
    
    parser.add_argument('hashname')
    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')

    args = parser.parse_args()
    hashfile = args.hashname
    datafile = args.datafile
    histout = args.histout

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    if os.path.exists(histout):
        if not args.squash_output:
            print >>sys.stderr, 'ERROR: %s exists; not squashing.' % histout
            sys.exit(-1)
        
        print '** squashing existing file %s' % histout

    print 'preparing hist...'
    z = ht.abundance_distribution(datafile, tracking)
    total = sum(z)
        
    fp = open(histout, 'w')

    sofar = 0
    for n, i in enumerate(z):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break
Пример #6
0
def test_save_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    ht = khmer._new_counting_hash(12, sizes)
    ht.load(savepath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Пример #7
0
def test_save_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    ht = khmer._new_counting_hash(12, sizes)
    ht.load(savepath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Пример #8
0
def test_save_load_gz():
    thisdir = os.path.dirname(__file__)
    inpath = os.path.join(thisdir, 'test-data/random-20-a.fa')    
    savepath = os.path.join(thisdir, 'tempcountingsave2.ht.gz')
    
    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    ht = khmer._new_counting_hash(12, sizes)
    ht.load(savepath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x,y)
Пример #9
0
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)
    
    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []
    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Пример #10
0
def test_bad_primes_list():
    try:
        coutingtable = khmer._new_hashbits(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError, e:
        print str(e)
Пример #11
0
def test_bad_primes_list():
    try:
        coutingtable = khmer._new_hashbits(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError as e:
        print str(e)
Пример #12
0
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')

    args = parser.parse_args()
    hashfile = args.hashname
    datafile = args.datafile
    histout = args.histout

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    if os.path.exists(histout):
        if not args.squash_output:
            print >> sys.stderr, 'ERROR: %s exists; not squashing.' % histout
            sys.exit(-1)

        print '** squashing existing file %s' % histout

    print 'preparing hist...'
    z = ht.abundance_distribution(datafile, tracking)
    total = sum(z)

    if 0 == total:
        print >> sys.stderr, "ERROR: abundance distribution is uniformly zero; nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in enumerate(z):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break
Пример #13
0
def main(filename):
    global ht

    n = 5

    basename = os.path.basename(filename)

    fd = open("log.txt", "w")

    primes = []

    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)

    primes = below + above
    random.shuffle(primes)

    for run in range(n):

        print primes[run * N_HT:run * N_HT + N_HT]

        ht = khmer._new_hashbits(K, primes[run * N_HT:run * N_HT + N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(
                HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

        # did we just want to load the ht/tagset?
        if stop_after_n_subsets == 0:
            sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))

        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i + 1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename,
             'w').write('%d subsets total\n' % (n_subsets))

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i, )
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i, )
                os.unlink(pmap_file)

        # output partitions!
        n_partitions = ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions

        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())

        for file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(file)

    fd.close()
Пример #14
0
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Пример #15
0
def main(filename):
    global ht
    
    n = 5

    basename = os.path.basename(filename)

    fd = open("log.txt", "w")
    
    primes = []

    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)

    primes = below + above
    random.shuffle(primes)

    for run in range(n):

        print primes[run*N_HT:run*N_HT+N_HT]

        ht = khmer._new_hashbits(K, primes[run*N_HT:run*N_HT+N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    
        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')
            
            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

        # did we just want to load the ht/tagset?
        if stop_after_n_subsets == 0:
            sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))

        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i+1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i,)
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i,)
                os.unlink(pmap_file)

        # output partitions!
        n_partitions = ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions

        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())

        for file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(file)

    fd.close()
Пример #16
0
def main():
    parser = argparse.ArgumentParser(description="Output k-mer abundance distribution.")

    parser.add_argument("hashname")
    parser.add_argument("datafile")
    parser.add_argument("histout")

    parser.add_argument(
        "-z", "--no-zero", dest="output_zero", default=True, action="store_false", help="Do not output 0-count bins"
    )
    parser.add_argument(
        "-s",
        "--squash",
        dest="squash_output",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )

    args = parser.parse_args()
    hashfile = args.hashname
    datafile = args.datafile
    histout = args.histout

    print "hashtable from", hashfile
    ht = khmer.load_counting_hash(hashfile)

    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print "K:", K
    print "HT sizes:", sizes
    print "outputting to", histout

    if os.path.exists(histout):
        if not args.squash_output:
            print >> sys.stderr, "ERROR: %s exists; not squashing." % histout
            sys.exit(-1)

        print "** squashing existing file %s" % histout

    print "preparing hist..."
    z = ht.abundance_distribution(datafile, tracking)
    total = sum(z)

    if 0 == total:
        print >> sys.stderr, "ERROR: abundance distribution is uniformly zero; nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, "w")

    sofar = 0
    for n, i in enumerate(z):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break