# Count reads whose final k-mer occurs exactly once ("singletons").
import sys
import threading
import screed
import khmer
# Helper imports as in khmer's scripts of this era (module paths assumed).
from khmer.counting_args import build_construct_args, report_on_config
from khmer.threading_args import add_threading_args


def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('datafile')
    args = parser.parse_args()

    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into the hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(target=ht.consume_fasta_with_reads_parser,
                             args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, count reads whose last k-mer was seen exactly once.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique / float(total))


if __name__ == '__main__':
    main()
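# --- Illustrative sketch (not part of the original script) ---
# Every script in this collection repeats the same spawn/join dance
# around a shared khmer.ReadParser. A helper along these lines could
# factor that out; the name 'consume_in_threads' is hypothetical, and
# this assumes the khmer/threading imports above.
def consume_in_threads(target, filename, n_threads):
    # All workers pull reads from one shared parser, which hands out
    # reads in a thread-safe way; join() waits for the file to drain.
    rparser = khmer.ReadParser(filename, n_threads)
    threads = [threading.Thread(target=target, args=(rparser, ))
               for _ in xrange(n_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

# usage: consume_in_threads(ht.consume_fasta_with_reads_parser,
#                           filename, n_threads)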
# Trim reads at low-abundance k-mers, single-pass in-memory version.
import os
import threading
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
# Helper imports as in khmer's scripts of this era (module paths assumed).
from khmer.counting_args import build_construct_args, report_on_config
from khmer.threading_args import add_threading_args

DEFAULT_CUTOFF = 2              # default as in khmer's filter scripts (assumed)


def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into the hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(target=ht.consume_fasta_with_reads_parser,
                             args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim. the filtering function:
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        # keep the read only if at least one full k-mer survived
        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile to', args.savehash
        ht.save(args.savehash)


if __name__ == '__main__':
    main()
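# --- Illustrative sketch (not part of the original script) ---
# A rough pure-Python model of the trimming the script above relies on.
# The real trim_on_abundance lives in khmer's C++ core; this sketch
# assumes the returned position is the length of the kept prefix, which
# is consistent with the script's `trim_at >= K` keep/drop test.
def trim_on_abundance_model(ht, seq, K, cutoff):
    # Walk k-mers left to right; cut just before the first k-mer whose
    # stored count falls below the cutoff.
    for i in xrange(len(seq) - K + 1):
        if ht.get(seq[i:i + K]) < cutoff:
            return seq[:i + K - 1], i + K - 1
    return seq, len(seq)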
# Build and save a k-mer counting hash from one or more sequence files.
import sys
import threading
import khmer
# Helper imports as in khmer's scripts of this era (module paths assumed).
from khmer.counting_args import build_construct_args, report_on_config
from khmer.threading_args import add_threading_args


def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        default=True, action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=ht.consume_fasta_with_reads_parser,
                                 args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        # checkpoint every ten files so a crash doesn't lose everything
        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set. Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'


if __name__ == '__main__':
    main()
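# --- Illustrative sketch (not part of the original script) ---
# Why the 0.20 abort above: a k-mer reads as present only if it
# collides in *every* one of the N_HT tables, so the false-positive
# rate is roughly the product of the tables' fill fractions. This is a
# back-of-envelope model, not khmer's exact calc_expected_collisions.
def expected_fp_model(n_unique_kmers, table_sizes):
    fp = 1.0
    for size in table_sizes:
        fp *= min(1.0, n_unique_kmers / float(size))
    return fp

# e.g. 5e8 unique k-mers in four 1e9-entry tables -> 0.5 ** 4 = 0.0625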
# Compute the k-mer abundance distribution of a single file.
import sys
import threading
import khmer
# Helper imports as in khmer's scripts of this era (module paths assumed).
from khmer.counting_args import build_construct_args, report_on_config
from khmer.threading_args import add_threading_args


def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    # a presence table to track which k-mers have been tallied already
    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # round 1: load the counts
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = threading.Thread(target=ht.consume_fasta_with_reads_parser,
                             args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile

    # round 2: tally the abundance distribution
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = threading.Thread(target=do_abundance_dist, args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)

    # merge the per-thread distributions
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile to', args.savehash
        ht.save(args.savehash)


if __name__ == '__main__':
    main()
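# --- Illustrative sketch (not part of the original script) ---
# The per-thread merge above is equivalent to summing Counters, which
# reads a little more directly; shown here as an optional refactor.
from collections import Counter


def merge_distributions(z_list):
    # Each zz is a list indexed by abundance; sum them bin by bin.
    z = Counter()
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] += count
    return dict(z)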
# Build and save a k-mer presence table (hashbits) plus optional tagset.
import sys
import threading
import khmer
# Helper imports (module paths assumed; load-graph used the hashbits
# variants of these helpers in khmer's scripts).
from khmer.hashbits_args import build_construct_args, report_on_config
from khmer.threading_args import add_threading_args


def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the graph structure is too small for"
        print >> sys.stderr, "** this data set. Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)


if __name__ == '__main__':
    main()
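# --- Illustrative sketch (not part of the original script) ---
# Downstream partitioning scripts would reload the pair saved above
# along these lines; khmer of this era exposed load_hashbits and
# load_tagset, but treat the exact call names here as assumptions.
def load_graph(base):
    ht = khmer.load_hashbits(base + '.ht')
    ht.load_tagset(base + '.tagset')
    return ht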