def main():
    parser = build_construct_args()
    parser.add_argument("--build-tagset", "-t", default=True,
                        action="store_false",
                        help="Construct tagset while loading sequences")
    parser.add_argument("input_contigs")
    parser.add_argument("input_reads", nargs="+")

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, "\nPARAMETERS:"
        print >>sys.stderr, " - kmer size = %d \t\t(-k)" % args.ksize
        print >>sys.stderr, " - n hashes = %d \t\t(-N)" % args.n_hashes
        print >>sys.stderr, " - min hashsize = %-5.2g \t(-x)" % args.min_hashsize
        print >>sys.stderr, ""
        print >>sys.stderr, "Estimated memory usage is %.2g bytes " \
            "(n_hashes x min_hashsize / 8)" % (
                args.n_hashes * args.min_hashsize / 8.0)
        print >>sys.stderr, "-" * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.input_contigs
    filenames = args.input_reads

    # build a k-mer presence table from the read files
    print "making hashtable"
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for filename in filenames:
        print "consuming input", filename
        ht.consume_fasta(filename)

    # check each contig's k-mers against the read table; report contigs
    # with k-mers that were never seen in the reads
    print "reading contigs from", args.input_contigs

    fp = open(os.path.basename(args.input_contigs) + ".unfound", "w")

    for contig in screed.open(base):
        seq = contig.sequence
        n_not_found = 0
        found = True

        # note: this range stops one k-mer short of the end of the sequence
        for start in range(0, len(seq) - K):
            kmer = seq[start:start + K]
            if not ht.get(kmer):
                n_not_found += 1
                found = False

        if not found:
            fp.write(">%s %d %d\n%s\n" % (contig.name, n_not_found,
                                          len(contig.sequence),
                                          contig.sequence))
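The script above uses a khmer hashbits table as a simple k-mer presence filter: every k-mer from the reads is loaded, then a contig is reported whenever one of its k-mers is absent. Below is a minimal, illustrative sketch of that membership test. It sticks to the old khmer API used above (new_hashbits, get) and additionally assumes hashbits exposes consume() for a raw string, as that API generation did; the k-mer size, table sizes, and sequences are made up for the demo.

# Illustrative sketch only -- not part of the original script.
import khmer

K = 8
ht = khmer.new_hashbits(K, 1e6, 4)          # tiny table, demo sizes only
ht.consume('ATGGCATTACCGGTTAACCGGTT')       # stands in for the read files

contig = 'ATGGCATTACCGTTTTTTTT'             # made-up contig
n_not_found = 0
for start in range(0, len(contig) - K + 1):
    if not ht.get(contig[start:start + K]):
        n_not_found += 1
print '%d of %d k-mers not found in reads' % (n_not_found,
                                              len(contig) - K + 1)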
def main():
    parser = build_construct_args()
    parser.add_argument("input_filenames", nargs="+")
    parser.add_argument("read_filename")

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, "\nPARAMETERS:"
        print >>sys.stderr, " - kmer size = %d \t\t(-k)" % args.ksize
        print >>sys.stderr, " - n hashes = %d \t\t(-N)" % args.n_hashes
        print >>sys.stderr, " - min hashsize = %-5.2g \t(-x)" % args.min_hashsize
        print >>sys.stderr, ""
        print >>sys.stderr, "Estimated memory usage is %.2g bytes " \
            "(n_hashes x min_hashsize / 8)" % (
                args.n_hashes * args.min_hashsize *
                len(args.input_filenames) / 8.0)
        print >>sys.stderr, "-" * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inputlist = args.input_filenames
    readsfile = args.read_filename

    # one hashbits table and one output file per query input
    query_list = []
    for n, inp_name in enumerate(inputlist):
        # create a hashbits data structure
        ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
        outfile = os.path.basename(inp_name) + ".sweep3"
        outfp = open(outfile, "w")
        query_list.append((ht, outfp))

    for n, inp_name in enumerate(inputlist):
        ht = query_list[n][0]

        # load contigs, connect into N partitions
        print "loading input reads from", inp_name
        ht.consume_fasta(inp_name)

    print "starting sweep."

    n = 0
    m = 0
    for n, record in enumerate(screed.open(readsfile)):
        if len(record.sequence) < K:
            continue

        if n % 10000 == 0:
            print "...", n, m

        # write the read to every query whose k-mers cover it;
        # output_single() is assumed to be defined/imported elsewhere
        for ht, outfp in query_list:
            count = ht.get_median_count(record.sequence)[0]
            if count:
                outfp.write(output_single(record))
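A sketch of the classification step the sweep loop above performs: get_median_count() returns the median per-k-mer count first (the scripts only use element [0]), so against a presence-only hashbits table a nonzero median means at least half of the read's k-mers were seen in that query file. The sequences and table sizes below are illustrative, and consume() on a raw string is assumed as in the previous sketch.

# Sketch only: classify one read against two query tables, mirroring the
# query_list loop above.  Values are made up.
import khmer

K = 8
seqs = ['ATGGCATTACCGGTTAACCG', 'TTTTCCCCAAAAGGGGTTTT']
query_list = []
for s in seqs:
    ht = khmer.new_hashbits(K, 1e6, 4)
    ht.consume(s)
    query_list.append(ht)

read = 'ATGGCATTACCGGTTA'
for i, ht in enumerate(query_list):
    if ht.get_median_count(read)[0]:
        print 'read would be written to output %d' % i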
def main():
    parser = build_construct_args()
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize / 8)' % (
                args.n_hashes * args.min_hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    outfp = open(outfile, 'w')

    # create a hashbits data structure
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # load contigs, connect into N partitions
    print 'loading input reads from', inp
    ht.consume_fasta(inp)

    print 'starting sweep.'

    n = 0
    m = 0
    for record in screed.open(readsfile):
        if len(record.sequence) < K:
            continue

        if n % 10000 == 0:
            print '...', n, m

        count = ht.get_median_count(record.sequence)[0]
        if count:
            m += 1
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))
        n += 1
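Every script in this collection prints the same memory estimate: n_hashes x min_hashsize / 8 bytes, i.e. one bit per slot per table. A worked example with hypothetical -N and -x values:

# Worked example of the memory estimate printed above; the -N and -x values
# here are hypothetical, not defaults from the scripts.
n_hashes = 4            # -N
min_hashsize = 16e9     # -x
print 'Estimated memory usage is %.2g bytes' % (n_hashes * min_hashsize / 8.)
# -> 8e+09 bytes, i.e. roughly 8 GB for four 16e9-slot bit tables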
def main():
    parser = build_construct_args()
    parser.add_argument('--build-tagset', '-t', default=True,
                        action='store_false',
                        help='Construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize / 8)' % (
                args.n_hashes * args.min_hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    # build the graph, tagging k-mers as we go
    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta_and_tag(filename)

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    print 'saving tagset in', base + '.tagset'
    ht.save_tagset(base + '.tagset')

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the graph structure is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)
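The graph-loading scripts above abort when khmer.calc_expected_collisions() reports a false-positive rate above 0.15; the sweep scripts further down apply the same guard with a 0.20 cutoff. A minimal sketch of that guard, using only calls that appear in these scripts; the input file name and the deliberately tiny table sizes are hypothetical.

# Sketch of the fp-rate guard; the table is sized far too small on purpose so
# the check trips.  'reads.fa' is a hypothetical input file.
import sys
import khmer

ht = khmer.new_hashbits(32, 1e4, 4)
ht.consume_fasta('reads.fa')

fp_rate = khmer.calc_expected_collisions(ht)
print 'fp rate estimated to be %1.3f' % fp_rate
if fp_rate > 0.15:                      # same threshold as the script above
    sys.exit('increase hashsize (-x) and/or number of tables (-N)')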
def main():
    parser = build_construct_args()
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')
    parser.add_argument('tag')      # added: args.tag is used below but was never declared

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize / 8)' % (
                args.n_hashes * args.min_hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inp = args.input_filename
    readsfile = args.read_filename
    tag = args.tag

    outfile = os.path.basename(readsfile) + '.' + tag + '.sweep2'
    outfp = open(outfile, 'w')

    # create a hashbits data structure
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # load contigs, connect into N partitions
    print 'loading input reads from', inp
    ht.consume_fasta(inp)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)

    print 'starting sweep.'

    n = 0
    m = 0
    for record in screed.open(readsfile):
        if len(record.sequence) < K:
            continue

        if n % 100000 == 0:
            print '...', n, m

        count = ht.get_median_count(record.sequence)[0]
        if count:
            m += 1
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))
        n += 1
def main():
    parser = build_construct_args()
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--threads', '-T', dest='n_threads',
                        default=DEFAULT_N_THREADS,
                        help='Number of simultaneous threads to execute')
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, \
            ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize / 8)' \
            % (args.n_hashes * args.min_hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.graphbase
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    print '--'

    ### load-graph

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the graph structure is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    ### partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = ht.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for i in range(0, n_subsets):
        start = divvy[i]
        end = divvy[i + 1]
        worker_q.put((ht, i, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % base, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = int(args.n_threads)
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % n_threads
    print '---'

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=worker,
                             args=(worker_q, base, stop_big_traversals))
        threads.append(t)
        t.start()

    print 'done starting threads'

    # wait for threads
    for t in threads:
        t.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (base,)

    ### merge-partitions

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    ht = khmer.new_hashbits(K, 1, 1)
    for pmap_file in pmap_files:
        print 'merging', pmap_file
        ht.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    ### annotate-partitions

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        n = ht.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (n, infile)
        print 'partitions are in', outfile
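The partition-graph step above chunks the tag boundaries returned by divide_tags_into_subsets() into (start, end) pairs, with a trailing 0 appended so the last subset also gets an end marker, and hands the pairs to worker threads through a Queue. A generic sketch of that fan-out pattern follows; do_subset() is a hypothetical stand-in for the real worker(), and the boundary values are made up.

# Generic sketch of the queue/thread fan-out used above -- not khmer code.
import threading
import Queue

def do_subset(task_q):
    while True:
        try:
            i, start, end = task_q.get(block=False)
        except Queue.Empty:
            return
        print 'would partition subset %d: tags %r..%r' % (i, start, end)

divvy = [0, 100000, 200000]     # pretend tag boundaries
divvy.append(0)                 # end marker for the final range
task_q = Queue.Queue()
for i in range(len(divvy) - 1):
    task_q.put((i, divvy[i], divvy[i + 1]))

threads = [threading.Thread(target=do_subset, args=(task_q,))
           for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()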
def main():
    parser = build_construct_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')
    parser.add_argument('tag')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize / 8)' % (
                args.n_hashes * args.min_hashsize *
                len(args.input_filenames) / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inputlist = args.input_filenames
    readsfile = args.read_filename
    tag = args.tag

    logfile = open('%s.sweep3.log' % (tag), 'wb')

    # one hashbits table and one output file per query input
    query_list = []
    for n, inp_name in enumerate(inputlist):
        # create a hashbits data structure
        ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
        outfile = os.path.basename(inp_name) + '.' + tag + '.sweep3'
        outfp = open(outfile, 'w')
        query_list.append((ht, outfp))

    new_lis = []
    cnt = 0
    for n, inp_name in enumerate(inputlist):
        ht = query_list[n][0]
        outfp = query_list[n][1]

        # load contigs, connect into N partitions
        print 'loading input reads from', inp_name
        ht.consume_fasta(inp_name)

        # Change 0.2 only if you really grok it. HINT: You don't.
        fp_rate = khmer.calc_expected_collisions(ht)
        print 'fp rate estimated to be %1.3f' % fp_rate

        if fp_rate > 0.20:
            print >>sys.stderr, "**"
            print >>sys.stderr, "** ERROR: the counting hash is too small for"
            print >>sys.stderr, "** %s. Increase hashsize/num ht." % (inp_name)
            print >>sys.stderr, "**"
            print >>sys.stderr, "** Do not use these results!!"
            print >>sys.stderr, "%s is not processed, increase mem" % (inp_name)
            print >>logfile, "%s is not processed, increase mem" % (inp_name)
            cnt += 1
            outfp.close()
            # remove the output file opened above (basename + tag + .sweep3)
            os.remove(os.path.basename(inp_name) + '.' + tag + '.sweep3')
        else:
            new_lis.append(query_list[n])

    print '%d files do not have enough mem assigned' % cnt

    print 'starting sweep.'

    n = 0
    m = 0
    for n, record in enumerate(screed.open(readsfile)):
        if len(record.sequence) < K:
            continue

        if n % 100000 == 0:
            print '...', n, m

        for ht, outfp in new_lis:
            count = ht.get_median_count(record.sequence)[0]
            if count:
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # pick the loading method: with or without tagging
    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    # size the reads-parser input buffer to the number of threads
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()    # previous size (unused below)
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(t)
            t.start()

        # wait for this file's threads before moving on
        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the graph structure is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)
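The threaded loader above chooses one of two bound methods on the hashbits object and hands that method directly to threading.Thread as its target, so all threads feed the same table through the same ReadParser. A stripped-down sketch of that bound-method dispatch; Worker and its methods are hypothetical stand-ins, not khmer classes.

# Sketch of the bound-method dispatch used above -- illustrative only.
import threading

class Worker(object):
    def with_tags(self, data):
        print 'tagging while loading', data
    def without_tags(self, data):
        print 'loading only', data

w = Worker()
no_build_tagset = False
target_method = w.without_tags if no_build_tagset else w.with_tags

t = threading.Thread(target=target_method, args=('reads.fa',))
t.start()
t.join()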