Example #1
def test_casava_1_8_pair_mating():

    import threading

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(128 * 1024)
    # Note: This file, when used in conjunction with a 64 KiB per-thread
    #       prefetch buffer, tests the paired read mating logic with the
    #       Casava >= 1.8 read name format.
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), 2)

    def thread_1_runtime(rparser):
        for read in rparser:
            pass

    def thread_2_runtime(rparser):
        for readnum, read in enumerate(rparser):
            if 0 == readnum:
                assert "895:1:1:1761:13189 2:N:0:NNNNN" == read.name

    t1 = threading.Thread(target=thread_1_runtime, args=[rparser])
    t2 = threading.Thread(target=thread_2_runtime, args=[rparser])

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    config.set_reads_input_buffer_size(bufsz)
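
The pair-mating tests above restore the saved buffer size only when every assertion passes. A try/finally would keep the global config clean on failure too; a minimal sketch, assuming the same khmer.get_config() API and a hypothetical run_pair_mating_checks() helper wrapping the thread setup above:

def test_casava_1_8_pair_mating_safe():
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(128 * 1024)
    try:
        run_pair_mating_checks()  # hypothetical helper: thread setup and joins as above
    finally:
        # restored even if an assertion in a worker thread fails
        config.set_reads_input_buffer_size(bufsz)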
Example #2
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)
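    # 64 KiB of input buffer per consumer thread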

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)
Example #3
def test_with_multiple_threads():

    import operator
    import threading

    reads_count_1thr = 0
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"))
    for read in rparser:
        reads_count_1thr += 1

    def count_reads(rparser, counters, tnum):
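        # per-thread read count; the per-thread totals must sum to the
        # single-threaded count asserted at the end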
        counters[tnum] = reduce(operator.add, (1 for read in rparser))

    N_THREADS = 4
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(N_THREADS * 64 * 1024)
    threads = []
    reads_counts_per_thread = [0] * N_THREADS
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), N_THREADS)
    for tnum in xrange(N_THREADS):
        t = \
            threading.Thread(
                target=count_reads,
                args=[rparser, reads_counts_per_thread, tnum]
            )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    config.set_reads_input_buffer_size(bufsz)

    assert reads_count_1thr == sum(reads_counts_per_thread)
Example #4
def test_old_illumina_pair_mating():

    import threading

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(65600 * 2)
    # Note: This file, when used in conjunction with a 65600 byte per-thread
    #       prefetch buffer, tests the paired read mating logic with the
    #       old Illumina read name format.
    rparser = ReadParser(utils.get_test_data("test-reads.fa"), 2)

    def thread_1_runtime(rparser):
        for read in rparser:
            pass

    def thread_2_runtime(rparser):
        for readnum, read in enumerate(rparser):
            if 0 == readnum:
                assert "850:2:1:1198:16820/1" == read.name

    t1 = threading.Thread(target=thread_1_runtime, args=[rparser])
    t2 = threading.Thread(target=thread_2_runtime, args=[rparser])

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    config.set_reads_input_buffer_size(bufsz)
Example #8
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
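    # a read counts as a singleton if its final k-mer was seen exactly once
    # during loading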
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique/float(total))
Example #10
def test_KmerCount():
    # test KmerCount class

    km = khmer.KmerCount(4)
    km.consume('AAAAAC')
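    # 'AAAAAC' yields three 4-mers: AAAA, AAAA, AAAC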
    expected = (('AAAA', 2), ('AAAC', 1))

    for i, (kmer, count) in enumerate(km.pairs):
        e = expected[i]
        assert kmer == e[0], (kmer, i)
        assert count == e[1], (count, i)

    assert km['AAAA'] == 2
    assert km['AAAC'] == 1

    km = khmer.KmerCount(4, report_zero=True)
    km.consume('AAAAAC')
    expected = (('AAAA', 2), ('AAAC', 1))

    i = 0
    for kmer, count in km.pairs:
        if count:
            e = expected[i]
            assert kmer == e[0], (kmer, i)
            assert count == e[1], (count, i)
            i += 1

    assert i == 2

    # test capital letters vs lowercase
    config = khmer.get_config()
    if config.has_extra_sanity_checks():
        km = khmer.KmerCount(4, report_zero=True)
        km.consume('AAAAAC'.lower())
        expected = (('AAAA', 2), ('AAAC', 1))

        assert km['AAAA'] == 2
        assert km['AAAC'] == 1

    # hooray, done!

    print 'SUCCESS, all tests passed.'
Example #12
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
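    # returning (None, None) drops the read (it contains an N, or was trimmed below k)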
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #13
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    #

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = \
                threading.Thread(
                    target=ht.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
Example #14
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables, args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print 'kmer_size:', counting_hash.ksize()
    print 'k-mer counting table sizes:', counting_hash.hashsizes()
    print 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            counting_hash.n_occupied())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
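        # each thread appends its partial distribution; the lists are merged
        # after the joins below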
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print 'preparing hist from %s...' % args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 2 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for index, count in sorted(abundance.items()):
        if count == 0 and not args.output_zero:
            continue

        sofar += count
        frac = sofar / float(total)

        print >> hist_fp, index, count, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print 'Saving k-mer counting table ', args.savetable
        print '...saving to', args.savetable
        counting_hash.save(args.savetable)
Example #15
def main():

    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
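
The spawn/start/join pattern above is repeated verbatim in every loading script. A small helper, sketched here against the khmer.ReadParser API used throughout these examples, would factor it out:

import threading

import khmer

def consume_in_threads(target, filename, n_threads):
    # one ReadParser is shared by all workers, exactly as in the scripts above
    rparser = khmer.ReadParser(filename, n_threads)
    threads = [threading.Thread(target=target, args=(rparser,))
               for _ in xrange(n_threads)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

# usage sketch:
#   consume_in_threads(htable.consume_fasta_with_reads_parser,
#                      filename, args.n_threads)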
Example #16
def main():

    info('load-into-counting.py', ['counting'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.threads)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
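        # optionally emit a machine-readable summary (JSON or TSV) alongside the saved table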
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it.  HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', base + '.info'
Example #17
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print >>sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables,
                                            args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >>sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >>sys.stderr, 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >>sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for index, count in sorted(abundance.items()):
        if count == 0 and not args.output_zero:
            continue

        sofar += count
        frac = sofar / float(total)

        print >> hist_fp, index, count, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
Example #18
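# NOTE: fragment -- 'parser', 'start', and the imports (khmer, sys, time)
# are defined earlier in the original script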
add_threading_args(parser)

parser.add_argument('htfile')
parser.add_argument('input')
parser.add_argument('output')

args = parser.parse_args()

htfile = args.htfile
input_filename = args.input
output_filename = args.output

n_threads = int(args.n_threads)

config = khmer.get_config()
bufsz = config.get_reads_input_buffer_size()
#default_threads = config.get_number_of_threads()
#print '>>>>> bufsz: %d; default_threads: %d' %(bufsz, default_threads)

config.set_number_of_threads(n_threads)
new_bufsz = n_threads * bufsz
config.set_reads_input_buffer_size(new_bufsz)
rparser = khmer.ReadParser(input_filename, n_threads)
print >> sys.stderr, '### buffer size: %d; threads: %d' %(new_bufsz,\
                                                            n_threads)

print >> sys.stderr, 'loading counting hash from %s' %htfile
ht = khmer.load_counting_hash(htfile)
end1 = time.time()
print >> sys.stderr, 'loading took %d sec' %(end1-start)
Example #19
File: load-graph.py  Project: b-wyss/khmer
def main():
    info("load-graph.py", ["graph"])
    args = get_parser().parse_args()
    report_on_config(args, hashtype="hashbits")

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.0)

    print >>sys.stderr, "Saving k-mer presence table to %s" % base
    print >>sys.stderr, "Loading kmers from sequences in %s" % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, "We WILL NOT build the tagset."
    else:
        print >>sys.stderr, "We WILL build the tagset", " (for partitioning/traversal)."

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >>sys.stderr, "making k-mer presence table"
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, 1)
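        # unlike the threaded variants above, this version consumes each file
        # in the main thread with a single-threaded ReadParser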
        print >>sys.stderr, "consuming input", filename
        target_method(rparser)

    if args.report_total_kmers:
        print >>sys.stderr, "Total number of unique k-mers: {0}".format(htable.n_unique_kmers())

    print >>sys.stderr, "saving k-mer presence table in", base + ".pt"
    htable.save(base + ".pt")

    if not args.no_build_tagset:
        print >>sys.stderr, "saving tagset in", base + ".tagset"
        htable.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, "fp rate estimated to be %1.3f" % fp_rate
    if args.write_fp_rate:
        print >> info_fp, "\nfalse positive rate estimated to be %1.3f" % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, (
            "** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables."
        )
        print >>sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, "wrote to", base + ".info and", base + ".pt"
    if not args.no_build_tagset:
        print >>sys.stderr, "and " + base + ".tagset"
Example #20
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    #

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the graph structure is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)
Example #21
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
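        # each thread appends its partial abundance distribution to z_list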
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #23
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = \
                threading.Thread(
                    target=ht.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
Example #24
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)
    
    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []
    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #25
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables,
                                     args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >>sys.stderr, 'wrote to: ', outfile
Example #26
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset',
                        '-n',
                        default=False,
                        action='store_true',
                        dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    #

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the graph structure is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)
Example #27
File: load-graph.py  Project: RamRS/khmer
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set.  Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
예제 #28
0
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >> sys.stderr, 'saving k-mer counting table to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to:', outfile
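
The process_fn contract used by ThreadedSequenceProcessor above is small: it receives a record dict with 'name' and 'sequence' keys and returns a (name, sequence) pair to keep the read, or (None, None) to drop it. A minimal sketch under that contract; the khmer.thread_utils import path and the file names are assumptions:

from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader


def drop_short_reads(record):
    # Hypothetical filter: discard reads shorter than 50 bp.
    if len(record['sequence']) < 50:
        return None, None
    return record['name'], record['sequence']

outfp = open('filtered.fa', 'w')              # hypothetical output file
tsp = ThreadedSequenceProcessor(drop_short_reads)
tsp.start(verbose_loader('reads.fa'), outfp)  # hypothetical input file
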
Example #29
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for filename in args.input_filenames:
        check_file_status(filename)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for filename in filenames:

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate

    with open(base + '.info', 'w') as info_fp:
        info_fp.write('%d unique k-mers' % htable.n_unique_kmers())
        if args.write_fp_rate:
            print >> info_fp, \
                '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
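
Note the division by 8 in check_space_for_hashtable here: a hashbits presence table stores one bit per slot, so its footprint in bytes is n_tables * min_tablesize / 8, whereas the counting tables in the other scripts budget a full byte per slot. A quick check of that arithmetic:

def hashbits_bytes(n_tables, min_tablesize):
    # One bit per slot, so bytes = (tables * slots) / 8.
    return float(n_tables * min_tablesize) / 8.

# Four tables of 1e9 slots need about 0.5 GB:
print hashbits_bytes(4, 1e9)  # -> 500000000.0
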
Example #30
def main():

    info('load-into-counting.py', ['counting'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print >> sys.stderr, 'Saving k-mer counting table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.threads)
        threads = []
        print >> sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >> sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >> info_fp, 'Total number of unique k-mers:', n_kmers

    print >> sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base),
                    fpr=fp_rate,
                    k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it.  HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >> sys.stderr, 'DONE.'
    print >> sys.stderr, 'wrote to:', base + '.info'
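
The machine-readable summary written by the --summary-info branch above can be read back with nothing but the standard library. A small sketch, assuming the table was saved as 'counts.kh' so the JSON summary lands in 'counts.kh.info.json' (hypothetical names):

import json

with open('counts.kh.info.json') as summary_fh:
    summary = json.load(summary_fh)

# Keys written by the script above: ht_name, fpr, num_kmers, files,
# mrinfo_version.
print 'table %s: %d unique k-mers, fp rate %1.3f' % (
    summary['ht_name'], summary['num_kmers'], summary['fpr'])
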
Example #31
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'saving hash table to', args.savehash
        ht.save(args.savehash)
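
trim_on_abundance returns the (possibly shortened) sequence and the position it was trimmed at; a read survives only when at least one full k-mer remains, i.e. trim_at >= K. A toy sketch of that behavior, with a made-up sequence and table sizes:

import khmer

K = 17
ht = khmer.new_counting_hash(K, 1000003, 4, 1)

seq = 'ATGGCATGGCATGGCATGGCA'  # hypothetical 21 bp read
ht.consume(seq)                # count its k-mers once

trim_seq, trim_at = ht.trim_on_abundance(seq, 1)
if trim_at >= K:
    print 'kept %d of %d bp' % (len(trim_seq), len(seq))
else:
    print 'read discarded'
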
Example #32
def main():

    info("load-into-counting.py", ["counting"])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print "Saving k-mer counting table to %s" % base
    print "Loading kmers from sequences in %s" % repr(filenames)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print "consuming input", filename
        for _ in xrange(args.n_threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print "mid-save", base
            htable.save(base)
            open(base + ".info", "w").write("through %s" % filename)

    if args.report_total_kmers:
        print >> sys.stderr, "Total number of k-mers: {0}".format(htable.n_occupied())

    print "saving", base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    with open(base + ".info", "w") as info_fp:
        info_fp.write("through end: %s\n" % filename)
        print >> info_fp, "fp rate estimated to be %1.3f" % fp_rate

    # Change 0.2 only if you really grok it.  HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, (
            "** ERROR: the k-mer counting table is too small "
            "for this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print "DONE."