def main():
    parser = build_construct_args()
    parser.add_argument('-l', '--lower-cutoff', type=int, dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u', '--upper-cutoff', type=int, dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)
    parser.add_argument('output_filename')
    parser.add_argument('input_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % \
            (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    output = args.output_filename
    input = args.input_filename

    print 'lower cutoff:', args.lower_cutoff
    print 'upper cutoff:', args.upper_cutoff
    print 'Saving stoptags to %s' % output
    print 'Loading sequences in %s' % input

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    print 'consuming input', input
    hb = ht.collect_high_abundance_kmers(input, args.lower_cutoff,
                                         args.upper_cutoff)

    print 'saving stoptags', output
    hb.save_stop_tags(output)
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('datafile')
    args = parser.parse_args()

    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique / float(total))
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    report_fp = args.report_file
    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 100000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print >>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')

            med, _, _ = ht.get_median_count(seq)

            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if -1 < n:
            print 'DONE with', input_filename, '; kept', total - discarded, \
                'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ' - paired = %s \t\t(-p)' % args.paired
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(batchwise(screed.open(input_filename),
                                            batch_size)):
            if n > 0 and n % 100000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print >>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print >>sys.stderr, \
                        'Error: Improperly interleaved pairs %s %s' % \
                        (batch[0].name, batch[1].name)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, _, _ = ht.get_median_count(seq)

                if med < DESIRED_COVERAGE:
                    ht.consume(seq)
                    passed_filter = True

            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'accuracy'):
                        outfp.write('@%s\n%s\n+\n%s\n' %
                                    (record.name, record.sequence,
                                     record.accuracy))
                    else:
                        outfp.write('>%s\n%s\n' %
                                    (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print 'DONE with', input_filename, '; kept', total - discarded, \
                'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
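# The paired-mode loop above calls two helpers, batchwise() and validpair(),
# that are not part of this excerpt.  A minimal sketch of what they might
# look like, assuming itertools and the old-style /1 + /2 read-name
# convention (an illustration, not the script's own definitions):

import itertools


def batchwise(iterable, size):
    # yield tuples of `size` consecutive records; size == 2 in paired mode
    it = iter(iterable)
    return itertools.izip(*[it] * size)


def validpair(read0, read1):
    # a proper pair differs only in a trailing '1' vs '2' in the read name
    return read0.name[-1] == '1' and \
        read1.name[-1] == '2' and \
        read0.name[0:-1] == read1.name[0:-1]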
def main():
    parser = build_construct_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inputlist = args.input_filenames
    readsfile = args.read_filename

    query_list = []
    for n, inp_name in enumerate(inputlist):
        # create a hashbits data structure
        # ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

        # create a counting hash
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        outfile = os.path.basename(inp_name) + '.sweep3'
        outfp = open(outfile, 'w')
        query_list.append((ht, outfp))

    for n, inp_name in enumerate(inputlist):
        ht = query_list[n][0]

        # load contigs, connect into N partitions
        print 'loading input reads from', inp_name
        ht.consume_fasta(inp_name)

        # Change 0.2 only if you really grok it. HINT: You don't.
        fp_rate = khmer.calc_expected_collisions(ht)
        print 'fp rate estimated to be %1.3f' % fp_rate

        if fp_rate > 0.20:
            print >>sys.stderr, "**"
            print >>sys.stderr, "** ERROR: the counting hash is too small for"
            print >>sys.stderr, "** this data set. Increase hashsize/num ht."
            print >>sys.stderr, "**"
            print >>sys.stderr, "** Do not use these results!!"
            sys.exit(-1)

    print 'starting sweep.'

    n = 0
    m = 0
    for n, record in enumerate(screed.open(readsfile)):
        if n % 10000 == 0:
            print '...', n, m

        for ht, outfp in query_list:
            count = ht.get_median_count(record.sequence)[0]
            if count:
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=do_abundance_dist,
            args=(rparser,))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)

    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
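# The histogram written above has four whitespace-separated columns per
# line: abundance, number of distinct k-mers at that abundance, cumulative
# count, and cumulative fraction.  A small sketch of reading it back in;
# the filename is hypothetical and stands in for whatever was passed as
# histout:

for line in open('reads.fa.hist'):
    abundance, count, cumulative, fraction = line.split()
    print abundance, count, cumulative, fraction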
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('-f', '--force-processing', dest='force',
                        action='store_true',
                        help='continue on next file if read errors are '
                             'encountered')
    parser.add_argument('-d', '--dump-frequency', dest='dump_frequency',
                        type=int, default=-1,
                        help='dump hashtable every d files')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, \
            ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
        print >>sys.stderr, \
            ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
        print >>sys.stderr, \
            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
        print >>sys.stderr, \
            ' - paired = {pr} \t\t(-p)'.format(pr=args.paired)
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is {prod:.2g} bytes ' \
            '(n_hashes x min_hashsize)'.format(
                prod=args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    report_fp = args.report_file
    filenames = args.input_filenames
    force = args.force
    dump_frequency = args.dump_frequency

    # list to save error files along with throwing exceptions
    if force:
        corrupt_files = []

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for n, input_filename in enumerate(filenames):
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, outfp, ht, args, report_fp)
        except IOError as e:
            handle_error(e, output_name, input_filename, ht)
            if not force:
                print >>sys.stderr, '** Exiting!'
                sys.exit(-1)
            else:
                print >>sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename, kept=total - discarded,
                            total=total,
                            perc=int(100. - discarded / float(total) * 100.))
                print 'output in', output_name

        if dump_frequency > 0 and n > 0 and n % dump_frequency == 0:
            print 'Backup: Saving hashfile through', input_filename
            if args.savehash:
                hashname = args.savehash
                print '...saving to', hashname
            else:
                hashname = 'backup.ht'
                print 'Nothing given for savehash, saving to', hashname
            ht.save(hashname)

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if force and len(corrupt_files) > 0:
        print >>sys.stderr, "** WARNING: Finished with errors!"
        print >>sys.stderr, "** IOErrors occurred in the following files:"
        print >>sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
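# The loop above delegates the per-file work to normalize_by_median() and
# error reporting to handle_error(); neither helper is shown in this excerpt.
# A minimal sketch of what normalize_by_median() might look like, following
# the same median-count rule the other scripts here use (this is an
# assumption about the helper, not its verbatim definition; pairing and
# progress reporting are omitted):

import screed


def normalize_by_median(input_filename, outfp, ht, args, report_fp=None):
    desired_coverage = args.cutoff
    ksize = ht.ksize()

    total = 0
    discarded = 0
    for record in screed.open(input_filename):
        total += 1

        if len(record.sequence) < ksize:
            continue

        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)

        if med < desired_coverage:
            ht.consume(seq)
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))
        else:
            discarded += 1

    return total, discarded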
def main():
    parser = build_construct_args()
    parser.add_argument("-C", "--cutoff", type=int, dest="cutoff",
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument("-s", "--savehash", dest="savehash", default="")
    parser.add_argument("-l", "--loadhash", dest="loadhash", default="")
    parser.add_argument("input_filenames", nargs="+")

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, "\nPARAMETERS:"
        print >>sys.stderr, " - kmer size = %d \t\t(-k)" % args.ksize
        print >>sys.stderr, " - n hashes = %d \t\t(-N)" % args.n_hashes
        print >>sys.stderr, " - min hashsize = %-5.2g \t(-x)" % \
            args.min_hashsize
        print >>sys.stderr, ""
        print >>sys.stderr, \
            "Estimated memory usage is %.2g bytes " \
            "(n_hashes x min_hashsize)" % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, "-" * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print "loading hashtable from", args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print "making hashtable"
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + ".minkeep"
        outfp = open(output_name, "w")

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print "... kept", total - discarded, "of", total, ", or", \
                    int(100.0 - discarded / float(total) * 100.0), "%"
                print "... in file", input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace("N", "A")

            mincount = ht.get_min_count(seq)
            if mincount < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write(">%s\n%s\n" % (record.name, record.sequence))
            else:
                discarded += 1

        print "DONE with", input_filename, "; kept", total - discarded, \
            "of", total, "or", \
            int(100.0 - discarded / float(total) * 100.0), "%"
        print "output in", output_name

        if args.savehash:
            print "Saving hashfile through", input_filename
            print "...saving to", args.savehash
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print "fp rate estimated to be %1.3f" % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepkad'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')

            kad = ht.get_kadian_count(seq)

            if kad < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, \
            'of', total, 'or', \
            int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
def main():
    parser = build_construct_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    #

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
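# Once a table has been saved with ht.save(), it can be reloaded and queried
# in a later session.  A minimal sketch; the filename is hypothetical and
# stands in for whatever was passed as output_filename above:

import khmer

ht = khmer.load_counting_hash('reads.kh')
kmer = 'A' * ht.ksize()          # any K-long string
print 'count for %s: %d' % (kmer, ht.get(kmer))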
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
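# Note on the filtering contract used above: process_fn() returns
# (name, trimmed_sequence) to keep a read and (None, None) to drop it;
# ThreadedSequenceProcessor applies it to every record.  A minimal,
# self-contained restatement of the same per-read decision, with the table,
# k-mer size and cutoff passed in rather than taken from main's scope:

def keep_or_drop(ht, ksize, cutoff, name, seq):
    # mirrors process_fn: drop reads containing Ns, otherwise trim where
    # the k-mer count first falls below the cutoff
    if 'N' in seq:
        return None, None
    trim_seq, trim_at = ht.trim_on_abundance(seq, cutoff)
    if trim_at >= ksize:
        return name, trim_seq
    return None, None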