def test_casava_1_8_pair_mating():
    import threading

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(128 * 1024)
    # Note: This file, when used in conjunction with a 64 KiB per-thread
    #       prefetch buffer, tests the paired read mating logic with the
    #       Casava >= 1.8 read name format.
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), 2)

    def thread_1_runtime(rparser):
        for read in rparser:
            pass

    def thread_2_runtime(rparser):
        for readnum, read in enumerate(rparser):
            if 0 == readnum:
                assert "895:1:1:1761:13189 2:N:0:NNNNN" == read.name

    t1 = threading.Thread(target=thread_1_runtime, args=[rparser])
    t2 = threading.Thread(target=thread_2_runtime, args=[rparser])

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    config.set_reads_input_buffer_size(bufsz)

def main(): info("filter-abund-single.py", ["counting"]) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print "consuming input, round 1 --", args.datafile for _ in xrange(args.threads): cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print "filtering", args.datafile outfile = os.path.basename(args.datafile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print "output in", outfile if args.savetable: print "Saving k-mer counting table filename", args.savetable print "...saving to", args.savetable htable.save(args.savetable)
def test_with_multiple_threads():
    import operator
    import threading

    reads_count_1thr = 0
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"))
    for read in rparser:
        reads_count_1thr += 1

    def count_reads(rparser, counters, tnum):
        counters[tnum] = reduce(operator.add, (1 for read in rparser))

    N_THREADS = 4
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(N_THREADS * 64 * 1024)
    threads = []
    reads_counts_per_thread = [0] * N_THREADS
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), N_THREADS)
    for tnum in xrange(N_THREADS):
        t = threading.Thread(
            target=count_reads,
            args=[rparser, reads_counts_per_thread, tnum])
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    config.set_reads_input_buffer_size(bufsz)

    assert reads_count_1thr == sum(reads_counts_per_thread)

def test_old_illumina_pair_mating():
    import threading

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(65600 * 2)
    # Note: This file, when used in conjunction with a 65600 byte per-thread
    #       prefetch buffer, tests the paired read mating logic with the
    #       old Illumina read name format.
    rparser = ReadParser(utils.get_test_data("test-reads.fa"), 2)

    def thread_1_runtime(rparser):
        for read in rparser:
            pass

    def thread_2_runtime(rparser):
        for readnum, read in enumerate(rparser):
            if 0 == readnum:
                assert "850:2:1:1198:16820/1" == read.name

    t1 = threading.Thread(target=thread_1_runtime, args=[rparser])
    t2 = threading.Thread(target=thread_2_runtime, args=[rparser])

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    config.set_reads_input_buffer_size(bufsz)

def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique / float(total))

def test_KmerCount():
    # test KmerCount class
    km = khmer.KmerCount(4)
    km.consume('AAAAAC')
    expected = (('AAAA', 2), ('AAAC', 1))

    for i, (kmer, count) in enumerate(km.pairs):
        e = expected[i]
        assert kmer == e[0], (kmer, i)
        assert count == e[1], (count, i)

    assert km['AAAA'] == 2
    assert km['AAAC'] == 1

    km = khmer.KmerCount(4, report_zero=True)
    km.consume('AAAAAC')
    expected = (('AAAA', 2), ('AAAC', 1))

    i = 0
    for kmer, count in km.pairs:
        if count:
            e = expected[i]
            assert kmer == e[0], (kmer, i)
            assert count == e[1], (count, i)
            i += 1

    assert i == 2

    # test capital letters vs lowercase
    config = khmer.get_config()
    if config.has_extra_sanity_checks():
        km = khmer.KmerCount(4, report_zero=True)
        km.consume('AAAAAC'.lower())
        expected = (('AAAA', 2), ('AAAC', 1))

        assert km['AAAA'] == 2
        assert km['AAAC'] == 1

    # hooray, done!
    print 'SUCCESS, all tests passed.'

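# A minimal usage sketch of the KmerCount API exercised by test_KmerCount()
# above; the input sequence is illustrative. Only calls the test itself makes
# are used: KmerCount(k), consume(), the .pairs attribute, and item lookup.
def example_kmercount_usage():
    km = khmer.KmerCount(4)       # count 4-mers
    km.consume('AAAAAC')          # windows: AAAA, AAAA, AAAC
    for kmer, count in km.pairs:  # each observed k-mer with its count
        print kmer, count
    print km['AAAA']              # prints 2
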
def main(): parser = build_construct_args( "Filter k-mers at the given abundance (inmem version).") add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savehash', dest='savehash', default='') parser.add_argument('datafile') args = parser.parse_args() report_on_config(args) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes n_threads = int(args.n_threads) config = khmer.get_config() bufsz = config.get_reads_input_buffer_size() config.set_reads_input_buffer_size(n_threads * 64 * 1024) print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads) filename = args.datafile # first, load reads into hash table rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input, round 1 --', filename for tnum in xrange(n_threads): t = \ threading.Thread( target=ht.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(t) t.start() for t in threads: t.join() fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None # the filtering loop print 'filtering', filename outfile = os.path.basename(filename) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(filename), outfp) print 'output in', outfile if args.savehash: print 'Saving hashfile', args.savehash print '...saving to', args.savehash ht.save(args.savehash)
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        default=True, action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    # print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser,))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set. Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'

def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables, args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print 'kmer_size:', counting_hash.ksize()
    print 'k-mer counting table sizes:', counting_hash.hashsizes()
    print 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = threading.Thread(
            target=counting_hash.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            counting_hash.n_occupied())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print 'preparing hist from %s...' % args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 2 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = threading.Thread(
            target=__do_abundance_dist__,
            args=(rparser,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print 'Saving k-mer counting table ', args.savetable
        print '...saving to', args.savetable
        counting_hash.save(args.savetable)

def main():
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'

def main():
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print >> sys.stderr, 'Saving k-mer counting table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, args.threads)
        threads = []
        print >> sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >> sys.stderr, 'mid-save', base
            htable.save(base)
            with open(base + '.info', 'a') as info_fh:
                print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >> info_fp, 'Total number of unique k-mers:', n_kmers

    print >> sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it. HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >> sys.stderr, 'DONE.'
    print >> sys.stderr, 'wrote to:', base + '.info'

def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print >> sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables, args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >> sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >> sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >> sys.stderr, 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = threading.Thread(
            target=counting_hash.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >> sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print >> sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = threading.Thread(
            target=__do_abundance_dist__,
            args=(rparser,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print >> sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename

add_threading_args(parser)
parser.add_argument('htfile')
parser.add_argument('input')
parser.add_argument('output')
args = parser.parse_args()

htfile = args.htfile
input_filename = args.input
output_filename = args.output
n_threads = int(args.n_threads)

config = khmer.get_config()
bufsz = config.get_reads_input_buffer_size()
# default_threads = config.get_number_of_threads()
# print '>>>>> bufsz: %d; default_threads: %d' % (bufsz, default_threads)
config.set_number_of_threads(n_threads)
new_bufsz = n_threads * bufsz
config.set_reads_input_buffer_size(new_bufsz)
rparser = khmer.ReadParser(input_filename, n_threads)
print >> sys.stderr, '### buffer size: %d; threads: %d' % (new_bufsz,
                                                           n_threads)

print >> sys.stderr, 'loading counting hash from %s' % htfile
ht = khmer.load_counting_hash(htfile)
end1 = time.time()
print >> sys.stderr, 'loading took %d sec' % (end1 - start)

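# The fragment above sizes the global read-input buffer before constructing a
# ReadParser. A distilled sketch of that setup convention, using the 64 KiB
# per-thread figure the other scripts here use; the filename is hypothetical,
# and the consumer threads are elided.
def example_reads_input_buffer_setup(n_threads=4):
    config = khmer.get_config()
    old_bufsz = config.get_reads_input_buffer_size()
    # one 64 KiB prefetch buffer per consumer thread
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)
    rparser = khmer.ReadParser('reads.fq', n_threads)
    # ... hand rparser to n_threads consumer threads and join them here ...
    config.set_reads_input_buffer_size(old_bufsz)  # restore, as the tests do
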
def main(): info("load-graph.py", ["graph"]) args = get_parser().parse_args() report_on_config(args, hashtype="hashbits") base = args.output_filename filenames = args.input_filenames for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.0) print >>sys.stderr, "Saving k-mer presence table to %s" % base print >>sys.stderr, "Loading kmers from sequences in %s" % repr(filenames) if args.no_build_tagset: print >>sys.stderr, "We WILL NOT build the tagset." else: print >>sys.stderr, "We WILL build the tagset", " (for partitioning/traversal)." config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print >>sys.stderr, "making k-mer presence table" htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, 1) print >>sys.stderr, "consuming input", filename target_method(rparser) if args.report_total_kmers: print >>sys.stderr, "Total number of unique k-mers: {0}".format(htable.n_unique_kmers()) print >>sys.stderr, "saving k-mer presence table in", base + ".pt" htable.save(base + ".pt") if not args.no_build_tagset: print >>sys.stderr, "saving tagset in", base + ".tagset" htable.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print >>sys.stderr, "fp rate estimated to be %1.3f" % fp_rate if args.write_fp_rate: print >> info_fp, "\nfalse positive rate estimated to be %1.3f" % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >>sys.stderr, "**" print >>sys.stderr, ( "** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables." ) print >>sys.stderr, "**" sys.exit(1) print >>sys.stderr, "wrote to", base + ".info and", base + ".pt" if not args.no_build_tagset: print >>sys.stderr, "and " + base + ".tagset"
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    # print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser,))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the graph structure is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

def main(): parser = build_construct_args( "Output k-mer abundance distribution (single file version).") add_threading_args(parser) parser.add_argument('datafile') parser.add_argument('histout') parser.add_argument('-z', '--no-zero', dest='output_zero', default=True, action='store_false', help='Do not output 0-count bins') parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') parser.add_argument('-s', '--squash', dest='squash_output', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('--savehash', dest='savehash', default='') args = parser.parse_args() report_on_config(args) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes n_threads = int(args.n_threads) datafile = args.datafile histout = args.histout print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads) ht.set_use_bigcount(args.bigcount) print 'building tracking ht' K = ht.ksize() sizes = ht.hashsizes() tracking = khmer._new_hashbits(K, sizes) print 'K:', K print 'HT sizes:', sizes print 'outputting to', histout config = khmer.get_config() config.set_reads_input_buffer_size(n_threads * 64 * 1024) # start loading rparser = khmer.ReadParser(datafile, n_threads) threads = [] print 'consuming input, round 1 --', datafile for tnum in xrange(n_threads): t = \ threading.Thread( target=ht.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(t) t.start() for t in threads: t.join() z_list = [] def do_abundance_dist(r): z = ht.abundance_distribution_with_reads_parser(r, tracking) z_list.append(z) print 'preparing hist from %s...' % datafile rparser = khmer.ReadParser(datafile, n_threads) threads = [] print 'consuming input, round 2 --', datafile for tnum in xrange(n_threads): t = \ threading.Thread( target=do_abundance_dist, args=(rparser,) ) threads.append(t) t.start() for t in threads: t.join() assert len(z_list) == n_threads, len(z_list) z = {} for zz in z_list: for i, count in enumerate(zz): z[i] = z.get(i, 0) + count total = sum(z.values()) if 0 == total: print >>sys.stderr, \ "ERROR: abundance distribution is uniformly zero; " \ "nothing to report." print >> sys.stderr, "\tPlease verify that the input files are valid." sys.exit(-1) fp = open(histout, 'w') sofar = 0 for n, i in sorted(z.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) print >> fp, n, i, sofar, round(frac, 3) if sofar == total: break if args.savehash: print 'Saving hashfile', args.savehash print '...saving to', args.savehash ht.save(args.savehash)
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)

    print >>sys.stderr, 'wrote to: ', outfile

def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method,
                                        args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method,
                                        args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

def main(): info("load-into-counting.py", ["counting"]) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print "Saving k-mer counting table to %s" % base print "Loading kmers from sequences in %s" % repr(filenames) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.n_threads) htable.set_use_bigcount(args.bigcount) config = khmer.get_config() config.set_reads_input_buffer_size(args.n_threads * 64 * 1024) filename = None for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, args.n_threads) threads = [] print "consuming input", filename for _ in xrange(args.n_threads): cur_thrd = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thrd) cur_thrd.start() for _ in threads: _.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize) print "mid-save", base htable.save(base) open(base + ".info", "w").write("through %s" % filename) if args.report_total_kmers: print >> sys.stderr, "Total number of k-mers: {0}".format(htable.n_occupied()) print "saving", base htable.save(base) info_fp = open(base + ".info", "w") info_fp.write("through end: %s\n" % filename) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate print >> info_fp, "fp rate estimated to be %1.3f" % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ( "** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables." ) print >> sys.stderr, "**" sys.exit(1) print "DONE."