def test_banding_to_disk(ksize, memory, numbands):
    """
    Test accuracy of banding in terms of the data structure contents.

    Stronger than the functional in-memory test, this function tests whether
    computing k-mer abundances in banding mode produces the same data
    structure as counting k-mer abundances in the normal fashion.
    """
    infile = utils.get_test_data('banding-reads.fq')
    path1 = utils.get_temp_filename('normal.ct')
    path2 = utils.get_temp_filename('banding.ct')

    ct = khmer.Counttable(ksize, memory / 4, 4)
    ct.consume_seqfile(infile)
    ct.save(path1)
    fpr = khmer.calc_expected_collisions(ct)
    print('FPR', fpr)

    ct = khmer.Counttable(ksize, memory / 4, 4)
    for band in range(numbands):
        ct.consume_seqfile_banding(infile, numbands, band)
    ct.save(path2)
    fpr = khmer.calc_expected_collisions(ct)
    print('FPR', fpr)

    with open(path1, 'rb') as f1, open(path2, 'rb') as f2:
        assert f1.read() == f2.read()

def normalizeByMedian(cf):
    """Wrapper for the normalizeByMedian_impl function."""
    kmersize = cf.get_parameter('kmersize', 'int')
    minhashsize = cf.get_parameter('minhashsize', 'float')
    nhashes = cf.get_parameter('nhashes', 'int')
    cutoff = cf.get_parameter('cutoff', 'int')
    inputfile = cf.get_input('inputfile')
    outputfile = cf.get_output('outputfile')

    infp = screed.open(inputfile)
    outfp = open(outputfile, 'w')

    discarded, total, ht, n = normalizeByMedian_impl(
        cf, infp, outfp, kmersize, minhashsize, nhashes, cutoff)

    outfp.close()
    infp.close()

    if -1 < n:
        percent_kept = int(100. - discarded / float(total) * 100.)
        cf.write_log("DONE with %s; kept %s of %s or %s%%" %
                     (inputfile, total - discarded, total, percent_kept))
        cf.write_log("Output in %s" % outputfile)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    cf.write_log("fp rate estimated to be %1.3f" % fp_rate)

    if fp_rate > 0.20:
        cf.write_error("ERROR: the counting hash is too small.")
        cf.write_error("Increase the hashsize/num ht.")
        return constants.GENERIC_ERROR

    return constants.OK

def __str__(self):
    return ('**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}]'
            '-> {metadata}'.format(name=self.name,
                                   metadata=self.metadata,
                                   nb=self.data.n_occupied(),
                                   fpr=khmer.calc_expected_collisions(
                                       self.data, True, 1.1)))

def __str__(self):
    return "**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}] -> {metadata}".format(
        name=self.name,
        metadata=self.metadata,
        nb=self.graph.n_occupied(),
        fpr=khmer.calc_expected_collisions(self.graph, True, 1.1),
    )

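# All of the snippets in this file share one pattern: fill a khmer table with
# k-mers, ask khmer.calc_expected_collisions() for the estimated false-positive
# rate, and refuse to trust the results if that rate is too high.  A minimal
# sketch of that pattern follows; the file name and the 0.15 threshold are
# illustrative assumptions only (the scripts below use different thresholds).
import sys

import khmer


def load_and_check_fp_rate(reads='reads.fq',      # hypothetical input file
                           ksize=21, tablesize=int(1e8), n_tables=4,
                           max_false_pos=0.15):   # threshold varies per script
    # build a counting table and load every k-mer from the reads
    counts = khmer.Counttable(ksize, tablesize, n_tables)
    counts.consume_seqfile(reads)

    # estimate the collision (false-positive) rate of the filled table
    fp_rate = khmer.calc_expected_collisions(counts)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > max_false_pos:
        print('** ERROR: the table is too small for this data set; '
              'increase tablesize/# tables.', file=sys.stderr)
        sys.exit(1)

    return counts
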
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)

def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)

def main(args): info('build-graph.py', ['graph', 'SeqAn']) report_on_config(args, hashtype='nodegraph') base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) # if optimization args are given, do optimization args = functions.do_sanity_checking(args, 0.01) check_space(args.input_filenames, args.force) check_space_for_hashtable(args, 'nodegraph', args.force) print('Saving k-mer presence table to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) if args.no_build_tagset: print('We WILL NOT build the tagset.', file=sys.stderr) else: print('We WILL build the tagset (for partitioning/traversal).', file=sys.stderr) print('making nodegraph', file=sys.stderr) htable = khmer_args.create_nodegraph(args) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()), file=sys.stderr) print('saving k-mer presence table in', base + '.pt', file=sys.stderr) htable.save(base + '.pt') if not args.no_build_tagset: print('saving tagset in', base + '.tagset', file=sys.stderr) htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print('false positive rate estimated to be %1.3f' % fp_rate, file=sys.stderr) print('\nfalse positive rate estimated to be %1.3f' % fp_rate, file=info_fp) print('wrote to', base + '.info and', base + '.pt', file=sys.stderr) if not args.no_build_tagset: print('and ' + base + '.tagset', file=sys.stderr) sys.exit(0)
def main(args): graph_type = 'nodegraph' report_on_config(args, graphtype=graph_type) base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) graphsize = calculate_graphsize(args, graph_type) space_needed = (args.n_tables * graphsize / khmer._buckets_per_byte[graph_type]) check_space_for_graph(args.output_filename, space_needed, args.force) print('Saving k-mer nodegraph to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) if args.no_build_tagset: print('We WILL NOT build the tagset.', file=sys.stderr) else: print('We WILL build the tagset (for partitioning/traversal).', file=sys.stderr) print('making nodegraph', file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) oxfuncs.build_graph(filenames, nodegraph, args.threads, not args.no_build_tagset) print('Total number of unique k-mers: {0}'.format( nodegraph.n_unique_kmers()), file=sys.stderr) print('saving k-mer nodegraph in', base, file=sys.stderr) nodegraph.save(base) if not args.no_build_tagset: print('saving tagset in', base + '.tagset', file=sys.stderr) nodegraph.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions( nodegraph, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print('false positive rate estimated to be %1.3f' % fp_rate, file=sys.stderr) print('\nfalse positive rate estimated to be %1.3f' % fp_rate, file=info_fp) print('wrote to ' + base + '.info and ' + base, file=sys.stderr) if not args.no_build_tagset: print('and ' + base + '.tagset', file=sys.stderr) sys.exit(0)
def main():
    parser = build_construct_args()
    parser.add_argument('--build-tagset', '-t', default=True,
                        action='store_false',
                        help='Construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize / 8)' % (
            args.n_hashes * args.min_hashsize / 8.)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta_and_tag(filename)

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    print 'saving tagset in', base + '.tagset'
    ht.save_tagset(base + '.tagset')

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the graph structure is too small for"
        print >> sys.stderr, "** this data set. Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

def main(): info('optimal_args_hashbits.py', ['graph', 'SeqAn']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') filenames = args.input_filenames base = filenames[0] for _ in args.input_filenames: check_input_files(_, False) check_space(args.input_filenames, False) print('Counting kmers from sequences in %s' % repr(filenames), file=sys.stderr) htable = khmer.new_hashbits(args.ksize, args.max_tablesize, args.n_tables) target_method = htable.consume_fasta_with_reads_parser for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print('consuming input', filename, file=sys.stderr) for num in xrange(args.threads): cur_thread = threading.Thread(target=target_method, args=(rparser, )) threads.append(cur_thread) cur_thread.start() for thread in threads: thread.join() unique_kmers = htable.n_unique_kmers() print('Total number of unique k-mers: {0}'.format(unique_kmers), file=sys.stderr) info_optimal = open(base + '.optimal_args', 'w') fp_rate = khmer.calc_expected_collisions(htable) print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print("**", file=sys.stderr) print( "** ERROR: the graph structure is too small for this data set." "Increase table size/# tables.", file=sys.stderr) print("**", file=sys.stderr) if not False: sys.exit(1) to_print = output_gen(unique_kmers, fp_rate) print(to_print, file=info_optimal) print('optimal arguments were written to', base + '.optimal_args', file=sys.stderr)
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique / float(total))

def main(): info('optimal_args_nodegraph.py', ['graph', 'SeqAn']) args = get_parser().parse_args() report_on_config(args, graphtype='nodegraph') filenames = args.input_filenames base = filenames[0] for _ in args.input_filenames: check_input_files(_, False) check_space(args.input_filenames, False) print('Counting kmers from sequences in %s' % repr(filenames), file=sys.stderr) htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables) target_method = htable.consume_fasta_with_reads_parser for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print('consuming input', filename, file=sys.stderr) for num in xrange(args.threads): cur_thread = threading.Thread( target=target_method, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for thread in threads: thread.join() unique_kmers = htable.n_unique_kmers() print('Total number of unique k-mers: {0}'.format(unique_kmers), file=sys.stderr) info_optimal = open(base + '.optimal_args', 'w') fp_rate = khmer.calc_expected_collisions(htable) print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print("**", file=sys.stderr) print("** ERROR: the graph structure is too small for this data set." "Increase table size/# tables.", file=sys.stderr) print("**", file=sys.stderr) if not False: sys.exit(1) to_print = graphsize_args_report(unique_kmers, fp_rate) print(to_print, file=info_optimal) print('optimal arguments were written to', base + '.optimal_args', file=sys.stderr)
def main(args): info('build-graph.py', ['graph', 'SeqAn']) report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) check_space(args.input_filenames, args.force) check_space_for_hashtable( (float(args.n_tables * args.min_tablesize) / 8.), args.force) print >>sys.stderr, 'Saving k-mer presence table to %s' % base print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print >>sys.stderr, 'We WILL NOT build the tagset.' else: print >>sys.stderr, 'We WILL build the tagset', \ ' (for partitioning/traversal).' print >>sys.stderr, 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) print >>sys.stderr, 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print >>sys.stderr, 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print >>sys.stderr, 'false positive rate estimated to be %1.3f' % fp_rate print >>info_fp, '\nfalse positive rate estimated to be %1.3f' % fp_rate print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt' if not args.no_build_tagset: print >> sys.stderr, 'and ' + base + '.tagset' sys.exit(0)
def main(args): info("build-graph.py", ["graph", "SeqAn"]) report_on_config(args, hashtype="hashbits") base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) check_space(args.input_filenames, args.force) check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.0), args.force) print("Saving k-mer presence table to %s" % base, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr) if args.no_build_tagset: print("We WILL NOT build the tagset.", file=sys.stderr) else: print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr) print("making k-mer presence table", file=sys.stderr) htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print("Total number of unique k-mers: {0}".format(htable.n_unique_kmers()), file=sys.stderr) print("saving k-mer presence table in", base + ".pt", file=sys.stderr) htable.save(base + ".pt") if not args.no_build_tagset: print("saving tagset in", base + ".tagset", file=sys.stderr) htable.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.15) # 0.18 is ACTUAL MAX. Do not change. print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr) print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp) print("wrote to", base + ".info and", base + ".pt", file=sys.stderr) if not args.no_build_tagset: print("and " + base + ".tagset", file=sys.stderr) sys.exit(0)
def main(args):
    info("build-graph.py", ["graph", "SeqAn"])

    report_on_config(args, graphtype="nodegraph")
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, "nodegraph")
    check_space_for_graph(args.output_filename, graphsize, args.force)

    print("Saving k-mer nodegraph to %s" % base, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(filenames),
          file=sys.stderr)
    if args.no_build_tagset:
        print("We WILL NOT build the tagset.", file=sys.stderr)
    else:
        print("We WILL build the tagset (for partitioning/traversal).",
              file=sys.stderr)

    print("making nodegraph", file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads,
                        not args.no_build_tagset)

    print("Total number of unique k-mers: {0}".format(
        nodegraph.n_unique_kmers()), file=sys.stderr)

    print("saving k-mer nodegraph in", base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print("saving tagset in", base + ".tagset", file=sys.stderr)
        nodegraph.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % nodegraph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(nodegraph, args.force,
                                             max_false_pos=0.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print("false positive rate estimated to be %1.3f" % fp_rate,
          file=sys.stderr)
    print("\nfalse positive rate estimated to be %1.3f" % fp_rate,
          file=info_fp)

    print("wrote to " + base + ".info and " + base, file=sys.stderr)
    if not args.no_build_tagset:
        print("and " + base + ".tagset", file=sys.stderr)

    sys.exit(0)

def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report report_frequency = args.report_frequency check_valid_file_exists(args.input_filenames) check_space(args.input_filenames) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) total = 0 discarded = 0 for index, input_filename in enumerate(args.input_filenames): total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median( input_filename, htable, args, report_fp, report_frequency) except IOError as err: handle_error(err, input_filename) if not args.force: print >> sys.stderr, '** Exiting!' sys.exit(1) else: print >> sys.stderr, '*** Skipping error file, moving on...' corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print 'SKIPPED empty file', input_filename else: total += total_acc discarded += discarded_acc print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(1)
def main():
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'

def main(): parser = build_construct_args() parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print >> sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!" print >> sys.stderr, '\nPARAMETERS:' print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize print >> sys.stderr, '' print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % ( args.n_hashes * args.min_hashsize) print >> sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes DESIRED_COVERAGE = args.cutoff filenames = args.input_filenames if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keepkad' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename total += 1 if len(record.sequence) < K: continue seq = record.sequence.replace('N', 'A') kad = ht.get_kadian_count(seq) if kad < DESIRED_COVERAGE: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += 1 print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, "** ERROR: the counting hash is too small for" print >> sys.stderr, "** this data set. Increase hashsize/num ht." print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(-1)
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))

def main():
    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)
    check_file_writable(base)
    check_file_writable(base + ".info")

    print('Saving k-mer countgraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print('making countgraph', file=sys.stderr)
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for _ in range(args.threads):
            cur_thrd = threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            print('mid-save', base, file=sys.stderr)

            countgraph.save(base)
        with open(base + '.info', 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
    with open(base + '.info', 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    print('saving', base, file=sys.stderr)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print("Writing summary info to", mr_file, file=sys.stderr)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    print('DONE.', file=sys.stderr)
    print('wrote to:', base + '.info', file=sys.stderr)

def main(): parser = argparse.ArgumentParser(description='XXX') env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K) env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT) env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE) parser.add_argument('--ksize', '-k', type=int, dest='ksize', default=env_ksize, help='k-mer size to use') parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=env_n_hashes, help='number of hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize', default=env_hashsize, help='lower bound on hashsize to use') parser.add_argument('--cutoff', '-C', type=int, dest='abund_cutoff', help='remove k-mers below this abundance', default=DEFAULT_CUTOFF) parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base cutoff on median k-mer abundance of this', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--variable-coverage', '-V', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('--tempdir', '-T', type=str, dest='tempdir', default='./') parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes CUTOFF = args.abund_cutoff NORMALIZE_LIMIT = args.normalize_to print >>sys.stderr, "K:", K print >>sys.stderr, "HT SIZE:", HT_SIZE print >>sys.stderr, "N HT:", N_HT print >>sys.stderr, "CUTOFF:", CUTOFF print >>sys.stderr, "NORMALIZE_LIMIT:", NORMALIZE_LIMIT print >>sys.stderr, 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print >>sys.stderr, 'created temporary directory %s; ' % tempdir + \ 'use -T to change location' ### save_pass2 = 0 read_bp = 0 read_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2list.append((filename, pass2filename)) pass2fp = open(pass2filename, 'w') for n, read in enumerate(screed.open(filename)): if n % 100000 == 0: print >>sys.stderr, '...', n, filename, save_pass2, \ read_reads, read_bp read_reads += 1 read_bp += len(read.sequence) seq = read.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ht.consume(seq) pass2fp.write(output_single(read)) save_pass2 += 1 else: posns = ht.find_spectral_error_positions(seq, CUTOFF) posns = add_n_posns(posns, read.sequence) print read.name, ",".join(map(str, posns)) pass2fp.close() print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' %\ (filename, save_pass2, n + 1, filename) n_omitted = 0 for orig_filename, pass2filename in pass2list: print >>sys.stderr,'second pass: looking at ' + \ 'sequences kept aside in %s' % pass2filename for n, read in enumerate(screed.open(pass2filename)): if n % 100000 == 0: print >>sys.stderr, '... 
x 2', n, pass2filename, read_reads, \ read_bp seq = read.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med >= NORMALIZE_LIMIT or not args.variable_coverage: posns = ht.find_spectral_error_positions(seq, CUTOFF) posns = add_n_posns(posns, read.sequence) print read.name, ",".join(map(str, posns)) if args.variable_coverage and med < NORMALIZE_LIMIT: print read.name, 'V' n_omitted += 1 print >>sys.stderr, 'removing %s' % pass2filename os.unlink(pass2filename) print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir shutil.rmtree(tempdir) print >>sys.stderr, 'read %d reads, %d bp' % (read_reads, read_bp,) if args.variable_coverage: print >>sys.stderr, 'omitted %d reads for -V' % (n_omitted) fp_rate = khmer.calc_expected_collisions(ht) print >>sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(1)
def main(): parser = build_construct_args() parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('-R', '--report-to-file', dest='report_file', type=argparse.FileType('w')) parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print>>sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!" print>>sys.stderr, '\nPARAMETERS:' print>>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print>>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize print>>sys.stderr, '' print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize) print>>sys.stderr, '-'*8 K=args.ksize HT_SIZE=args.min_hashsize N_HT=args.n_hashes DESIRED_COVERAGE=args.cutoff report_fp = args.report_file filenames = args.input_filenames if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') n = -1 for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 100000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename if report_fp: print>>report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += 1 if len(record.sequence) < K: continue seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med < DESIRED_COVERAGE: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += 1 if -1 < n: print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name else: print 'SKIPPED empty file', input_filename if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >>sys.stderr, "**" print >>sys.stderr, "** ERROR: the counting hash is too small for" print >>sys.stderr, "** this data set. Increase hashsize/num ht." print >>sys.stderr, "**" print >>sys.stderr, "** Do not use these results!!" sys.exit(-1)
def main(): parser = build_construct_args() parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-p', '--paired', action='store_true') parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('-R', '--report-to-file', dest='report_file', type=argparse.FileType('w')) parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print >>sys.stderr, \ "** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ "Please read the docs!" print >>sys.stderr, '\nPARAMETERS:' print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print >>sys.stderr, \ ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize print >>sys.stderr, ' - paired = %s \t\t(-p)' % args.paired print >>sys.stderr, '' print >>sys.stderr, \ 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' \ % (args.n_hashes * args.min_hashsize) print >>sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes DESIRED_COVERAGE = args.cutoff report_fp = args.report_file filenames = args.input_filenames # In paired mode we read two records at a time batch_size = 1 if args.paired: batch_size = 2 if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') n = -1 for n, batch in enumerate(batchwise(screed.open( input_filename), batch_size)): if n > 0 and n % 10000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename if report_fp: print>>report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += batch_size # If in paired mode, check that the reads are properly interleaved if args.paired: if not validpair(batch[0], batch[1]): print >>sys.stderr, \ 'Error: Improperly interleaved pairs %s %s' \ % (batch[0].name, batch[1].name) sys.exit(-1) # Emit the batch of reads if any read passes the filter # and all reads are longer than K passed_filter = False passed_length = True for record in batch: if len(record.sequence) < K: passed_length = False continue seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med < DESIRED_COVERAGE: ht.consume(seq) passed_filter = True # Emit records if any passed if passed_length and passed_filter: for record in batch: if hasattr(record, 'accuracy'): outfp.write('@%s\n%s\n+\n%s\n' % (record.name, record.sequence, record.accuracy)) else: outfp.write( '>%s\n%s\n' % (record.name, record.sequence)) else: discarded += batch_size if -1 < n: print \ 'DONE with', input_filename, '; kept', total - discarded, \ 'of', total, 'or', \ int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name else: print 'SKIPPED empty file', input_filename if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. 
fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >>sys.stderr, "**" print >>sys.stderr, "** ERROR: the counting hash is too small for" print >>sys.stderr, "** this data set. Increase hashsize/num ht." print >>sys.stderr, "**" print >>sys.stderr, "** Do not use these results!!" sys.exit(-1)
def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) parser = sanitize_help(get_parser()) args = parser.parse_args() report_on_config(args) report_fp = args.report report_frequency = args.report_frequency check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, False) if args.savegraph: check_space_for_graph(args, 'countgraph', False) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadgraph: print('loading k-mer countgraph from', args.loadgraph) htable = khmer.load_countgraph(args.loadgraph) else: print('making countgraph') htable = create_countgraph(args) total = 0 discarded = 0 for index, input_filename in enumerate(args.input_filenames): total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median(input_filename, htable, args, report_fp, report_frequency) except IOError as err: handle_error(err, input_filename) if not args.force: print("NOTE: This can be overridden using the --force" " argument", file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) else: print('*** Skipping error file, moving on...', file=sys.stderr) corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print('SKIPPED empty file', input_filename) else: total += total_acc discarded += discarded_acc print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.))) if args.savegraph: print('Saving k-mer countgraph through', input_filename) print('...saving to', args.savegraph) htable.save(args.savegraph) # re: threshold, see Zhang et al., # http://arxiv.org/abs/1309.2975 fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): parser = argparse.ArgumentParser() parser.add_argument('seqfiles', nargs='+') parser.add_argument('-o', '--output', default=None) parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int) parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE, type=float) parser.add_argument('--force', action='store_true') args = parser.parse_args() assert args.ksize % 2, "ksize must be odd" assert args.output, "you probably want an output file" print('building graphs and loading files') # Create graph, and two stop bloom filters - one for loading, one for # traversing. Create them all here so that we can error out quickly # if memory is a problem. graph = khmer.Nodegraph(args.ksize, args.tablesize, 2) print(graph.ksize(), graph.hashsizes()) stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2) stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2) n = 0 # load in all of the input sequences, one file at a time. for seqfile in args.seqfiles: for record in screed.open(seqfile): n += 1 if n % 10000 == 0: print('...', seqfile, n) graph.consume(record.sequence) # complain if too small set of graphs was used. fp_rate = khmer.calc_expected_collisions(graph, args.force, max_false_pos=.05) # initialize the object that will track information for us. pathy = Pathfinder(args.ksize) print('finding high degree nodes') degree_nodes = khmer.HashSet(args.ksize) n = 0 for seqfile in args.seqfiles: for record in screed.open(seqfile): n += 1 if n % 10000 == 0: print('...2', seqfile, n) # walk across sequences, find all high degree nodes, # name them and cherish them. Don't do this on identical sequences. if min(stop_bf2.get_kmer_counts(record.sequence)) == 0: stop_bf2.consume(record.sequence) degree_nodes += graph.find_high_degree_nodes(record.sequence) del stop_bf2 if not len(degree_nodes): print('no high degree nodes; exiting.') sys.exit(0) # get all of the degree > 2 nodes and give them IDs. for node in degree_nodes: pathy.new_segment(node) print('traversing linear segments from', len(degree_nodes), 'nodes') # now traverse from each high degree nodes into all neighboring nodes, # seeking adjacencies. if neighbor is high degree node, add it to # adjacencies; if neighbor is not, then traverse the linear path. also # track minhashes while we're at it. for n, k in enumerate(degree_nodes): if n % 10000 == 0: print('...', n, 'of', len(degree_nodes)) # retrieve the segment ID of the primary node. k_id = pathy.segments_r[k] # find all the neighbors of this high-degree node. nbh = graph.neighbors(k) for nk in nbh: # neighbor is high degree? fine, mark its adjacencies. if nk in degree_nodes: nk_id = pathy.segments_r[nk] pathy.add_adjacency(k_id, nk_id) else: # linear! walk it. traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy, degree_nodes) print(len(pathy.segments), 'segments, containing', sum(pathy.segments.values()), 'nodes') # save to GML if args.output: print('saving to', args.output) fp = open(args.output, 'w') w = GmlWriter(fp, [], []) for k, v in pathy.segments.items(): w.add_vertex(k, v, []) for k, v in pathy.adjacencies.items(): for edge in v: w.add_edge(k, edge, [])
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('trim-low-abund.py', ['streaming']) configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) ct = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info( 'created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info( "... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('output in *.abundtrim') if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph)
def main(): parser = build_counting_args() parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3) parser.add_argument("--bits-theta", help="Tuning parameter controlling" "trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta") parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base cutoff on abundance', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('--details-out', dest="details_out") parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: print('\nPARAMETERS:', file=sys.stderr) print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr) print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr) print(' - min hashsize = %-5.2g \t(-x)' % \ args.max_tablesize, file=sys.stderr) print('', file=sys.stderr) print('Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize)' % \ (args.n_tables * args.max_tablesize), file=sys.stderr) print('-' * 8, file=sys.stderr) K = args.ksize HT_SIZE = args.max_tablesize N_HT = args.n_tables filenames = args.input_filenames if args.loadhash: print('loading hashtable from', args.loadhash) ht = khmer.load_countgraph(args.loadhash) else: print('making hashtable') ht = khmer.Countgraph(K, HT_SIZE, N_HT) aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta) if args.details_out is not None: details_out = open(args.details_out, "w") else: details_out = None total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keepvar' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print('... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%') print('... in file', input_filename) total += 1 if len(record.sequence) < K: continue seq = record.sequence.upper().replace('N', 'A') ## # build the alignment... score, graph_alignment, read_alignment, truncated = \ aligner.align(record.sequence) # next, decide whether or to keep it. keep = False if truncated: keep = True # keep all truncated alignments - why? else: # build a better sequence -- this is the corrected one. graph_seq = graph_alignment.replace("-", "") # OR? #graph_seq = "" #for i in range(len(graph_alignment)): # if graph_alignment[i] == "-": # graph_seq += read_alignment[i] # else: # graph_seq += graph_alignment[i] # get the minimum count for this new sequence mincount = ht.get_min_count(graph_seq) if mincount < args.normalize_to: keep = True if details_out is not None: details_out.write( "+{7}\t{0:0.2f}\t{3}\t{4}\nread: " "{6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n" "".format(score, graph_alignment, read_alignment, truncated, keep, seq, record.sequence, record.name)) if keep: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += 1 if total: print('DONE with', input_filename, \ '; kept', total - discarded, 'of', total, 'or', \ int(100. - discarded / float(total) * 100.), '%') print('output in', output_name) if args.savehash: print('Saving hashfile through', input_filename) print('...saving to', args.savehash) ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. 
    fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
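# A minimal sketch, not taken from any one script above: the false-positive
# check that recurs throughout these scripts, pulled into a helper. `table`
# is assumed to be an already-populated khmer counting table or graph; the
# helper name and the 0.2 threshold are illustrative. Newer khmer releases
# can do the same check via calc_expected_collisions(table, force,
# max_false_pos), as several of the snippets here show.
import sys

import khmer


def check_fp_rate(table, max_allowed=0.2):
    """Exit unless the estimated k-mer collision rate is acceptable."""
    fp_rate = khmer.calc_expected_collisions(table)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
    if fp_rate > max_allowed:
        print('** ERROR: the counting structure is too small for this data '
              'set; increase the table size / number of tables.',
              file=sys.stderr)
        sys.exit(1)
    return fp_rate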
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable( args.n_tables * args.min_tablesize, args.force) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) total = 0 discarded = 0 input_filename = None for index, input_filename in enumerate(args.input_filenames): if args.single_output_filename != '': output_name = args.single_output_filename outfp = open(args.single_output_filename, 'a') else: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median(input_filename, outfp, htable, args, report_fp) except IOError as err: handle_error(err, output_name, input_filename, args.fail_save, htable) if not args.force: print >> sys.stderr, '** Exiting!' sys.exit(1) else: print >> sys.stderr, '*** Skipping error file, moving on...' corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print 'SKIPPED empty file', input_filename else: total += total_acc discarded += discarded_acc print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) print 'output in', output_name if (args.dump_frequency > 0 and index > 0 and index % args.dump_frequency == 0): print 'Backup: Saving k-mer counting file through', input_filename if args.savetable: hashname = args.savetable print '...saving to', hashname else: hashname = 'backup.ct' print 'Nothing given for savetable, saving to', hashname htable.save(hashname) if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" if not args.force: sys.exit(1)
def main(): parser = argparse.ArgumentParser(description='XXX') env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K) env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT) env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE) parser.add_argument('--ksize', '-k', type=int, dest='ksize', default=env_ksize, help='k-mer size to use') parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=env_n_hashes, help='number of hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize', default=env_hashsize, help='lower bound on hashsize to use') parser.add_argument('--cutoff', '-C', type=int, dest='abund_cutoff', help='remove k-mers below this abundance', default=DEFAULT_CUTOFF) parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base cutoff on median k-mer abundance of this', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--variable-coverage', '-V', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('--tempdir', '-T', type=str, dest='tempdir', default='./') parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print >>sys.stderr, \ "Error: Cannot input the same filename multiple times." sys.exit(1) ### K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes CUTOFF = args.abund_cutoff NORMALIZE_LIMIT = args.normalize_to print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print 'created temporary directory %s; use -T to change location' % tempdir ### save_pass2_total = 0 read_bp = 0 read_reads = 0 wrote_bp = 0 wrote_reads = 0 trimmed_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) trimfilename = os.path.basename(filename) + '.abundtrim' pass2list.append((filename, pass2filename, trimfilename)) pass2fp = open(pass2filename, 'w') trimfp = open(trimfilename, 'w') save_pass2 = 0 for n, read in enumerate(screed.open(filename)): if n % 10000 == 0: print '...', n, filename, save_pass2, read_reads, read_bp, \ wrote_reads, wrote_bp read_reads += 1 read_bp += len(read.sequence) seq = read.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ht.consume(seq) pass2fp.write(output_single(read)) save_pass2 += 1 else: # trim!! trim_seq, trim_at = ht.trim_on_abundance(seq, CUTOFF) if trim_at >= K: trimfp.write(output_single(read, trim_at)) wrote_reads += 1 wrote_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 pass2fp.close() trimfp.close() print '%s: kept aside %d of %d from first pass, in %s' % \ (filename, save_pass2, n, filename) save_pass2_total += save_pass2 skipped_n = 0 skipped_bp = 0 for orig_filename, pass2filename, trimfilename in pass2list: print 'second pass: looking at sequences kept aside in %s' % \ pass2filename for n, read in enumerate(screed.open(pass2filename)): if n % 10000 == 0: print '... x 2', n, pass2filename, read_reads, read_bp, \ wrote_reads, wrote_bp trimfp = open(trimfilename, 'a') seq = read.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) # do we retain low-abundance components unchanged? 
if med < NORMALIZE_LIMIT and args.variable_coverage: trimfp.write(output_single(read)) wrote_reads += 1 wrote_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/trim/truncate. else: # med >= NORMALIZE LIMIT or not args.variable_coverage trim_seq, trim_at = ht.trim_on_abundance(seq, CUTOFF) if trim_at >= K: trimfp.write(output_single(read, trim_at)) wrote_reads += 1 wrote_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 print 'removing %s' % pass2filename os.unlink(pass2filename) print 'removing temp directory & contents (%s)' % tempdir shutil.rmtree(tempdir) print 'read %d reads, %d bp' % ( read_reads, read_bp, ) print 'wrote %d reads, %d bp' % ( wrote_reads, wrote_bp, ) print 'removed %d reads and trimmed %d reads' % ( read_reads - wrote_reads, trimmed_reads, ) print 'looked at %d reads twice' % (save_pass2_total, ) print 'trimmed or removed %.2f%% of bases (%d total)' % \ ((1 - (wrote_bp / float(read_bp))) * 100., read_bp - wrote_bp) if args.variable_coverage: print 'skipped %d reads/%d bases because of low coverage' % \ (skipped_n, skipped_bp) print 'output in *.abundtrim' fp_rate = khmer.calc_expected_collisions(ht) print >>sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(1)
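# A minimal sketch, not part of the script above: the abundance-trimming
# decision applied in both passes. trim_on_abundance returns the trimmed
# sequence and the offset at which the first low-abundance k-mer was found;
# the read is kept only if that offset is at least k. `ct` is assumed to be a
# populated khmer counting table and `trim_low_abundance` is an illustrative
# name.
def trim_low_abundance(ct, sequence, cutoff, ksize):
    """Return the trimmed read, or None if it would end up shorter than k."""
    seq = sequence.replace('N', 'A')      # khmer tables only index ACGT
    _, trim_at = ct.trim_on_abundance(seq, cutoff)
    if trim_at >= ksize:
        return sequence[:trim_at]         # keep the original letters, Ns too
    return None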
def main(): # pylint: disable=too-many-locals,too-many-statements info('do-partition.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') for infile in args.input_filenames: check_file_status(infile) check_space(args.input_filenames) print 'Saving k-mer presence table to %s' % args.graphbase print 'Loading kmers from sequences in %s' % repr(args.input_filenames) print '--' print 'SUBSET SIZE', args.subset_size print 'N THREADS', args.n_threads print '--' # load-graph print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) for _, filename in enumerate(args.input_filenames): print 'consuming input', filename htable.consume_fasta_and_tag(filename) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for" " this data set. Increase k-mer presence table " "size/num of tables.") print >> sys.stderr, "**" sys.exit(1) # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print '** This script brakes for lumps: stop_big_traversals is true.' else: print '** Traverse all the things: stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = htable.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((htable, _, start, end)) print 'enqueued %d subset tasks' % n_subsets open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' % (n_subsets)) if n_subsets < args.n_threads: args.n_threads = n_subsets # start threads! print 'starting %d threads' % args.n_threads print '---' threads = [] for _ in range(args.n_threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() print 'done starting threads' # wait for threads for _ in threads: _.join() print '---' print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase, ) # merge-partitions pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print 'loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]) htable = khmer.new_hashbits(args.ksize, 1, 1) for pmap_file in pmap_files: print 'merging', pmap_file htable.merge_subset_from_disk(pmap_file) if args.remove_subsets: print 'removing pmap files' for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print 'outputting partitions for', infile outfile = os.path.basename(infile) + '.part' part_count = htable.output_partitions(infile, outfile) print 'output %d partitions for %s' % (part_count, infile) print 'partitions are in', outfile
def main(): parser = build_construct_args() add_threading_args(parser) parser.add_argument('output_filename') parser.add_argument('input_filenames', nargs='+') parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') args = parser.parse_args() report_on_config(args) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes base = args.output_filename filenames = args.input_filenames n_threads = int(args.n_threads) print 'Saving hashtable to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) ### print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads) ht.set_use_bigcount(args.bigcount) config = khmer.get_config() bufsz = config.get_reads_input_buffer_size() config.set_reads_input_buffer_size(n_threads * 64 * 1024) for n, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input', filename for tnum in xrange(n_threads): t = \ threading.Thread( target=ht.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(t) t.start() for t in threads: t.join() if n > 0 and n % 10 == 0: print 'mid-save', base ht.save(base) open(base + '.info', 'w').write('through %s' % filename) print 'saving', base ht.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filename) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >>sys.stderr, "**" print >>sys.stderr, "** ERROR: the counting hash is too small for" print >>sys.stderr, "** this data set. Increase hashsize/num ht." print >>sys.stderr, "**" sys.exit(-1) print 'DONE.'
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, False) check_space(args.input_sequence_filename, False) check_space_for_hashtable(args.n_tables * args.min_tablesize, False) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'A') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n, ) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate print 'DONE.'
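# A minimal sketch of the stopping rule used by collect-reads.py above. The
# estimated coverage of a read is the median count of its k-mers; collection
# stops once the running mean of those medians exceeds the target. `htable`
# is assumed to be a khmer counting table and `collect_until_coverage` is an
# illustrative name.
def collect_until_coverage(htable, records, target_coverage):
    """Consume reads until the average estimated coverage reaches the target."""
    total_coverage = 0.
    n = 0
    for record in records:
        seq = record.sequence.upper().replace('N', 'A')
        try:
            med, _, _ = htable.get_median_count(seq)
        except ValueError:                # read shorter than k
            continue
        total_coverage += med
        n += 1
        if total_coverage / float(n) > target_coverage:
            break                         # target average coverage reached
        htable.consume(seq)
    return n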
def main(): # pylint: disable=too-many-locals,too-many-statements args = sanitize_help(get_parser()).parse_args() report_on_config(args, graphtype='nodegraph') for infile in args.input_filenames: check_input_files(infile, args.force) check_space(args.input_filenames, args.force) print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(args.input_filenames), file=sys.stderr) print('--', file=sys.stderr) print('SUBSET SIZE', args.subset_size, file=sys.stderr) print('N THREADS', args.threads, file=sys.stderr) print('--', file=sys.stderr) # load-graph.py print('making nodegraph', file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) for _, filename in enumerate(args.input_filenames): print('consuming input', filename, file=sys.stderr) nodegraph.consume_seqfile_and_tag(filename) # 0.18 is ACTUAL MAX. Do not change. fp_rate = \ khmer.calc_expected_collisions( nodegraph, args.force, max_false_pos=.15) print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print('** This script brakes for lumps: ', 'stop_big_traversals is true.', file=sys.stderr) else: print('** Traverse all the things:', ' stop_big_traversals is false.', file=sys.stderr) # # now, partition! # # divide the tags up into subsets divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size)) divvy = list(divvy) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((nodegraph, _, start, end)) print('enqueued %d subset tasks' % n_subsets, file=sys.stderr) open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' % (n_subsets)) if n_subsets < args.threads: args.threads = n_subsets # start threads! print('starting %d threads' % args.threads, file=sys.stderr) print('---', file=sys.stderr) threads = [] for _ in range(args.threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() print('done starting threads', file=sys.stderr) # wait for threads for _ in threads: _.join() print('---', file=sys.stderr) print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ), file=sys.stderr) # merge-partitions pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print('loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]), file=sys.stderr) nodegraph = khmer.Nodegraph(args.ksize, 1, 1) for pmap_file in pmap_files: print('merging', pmap_file, file=sys.stderr) nodegraph.merge_subset_from_disk(pmap_file) if not args.keep_subsets: print('removing pmap files', file=sys.stderr) for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print('outputting partitions for', infile, file=sys.stderr) outfile = os.path.basename(infile) + '.part' part_count = nodegraph.output_partitions(infile, outfile) print('output %d partitions for %s' % (part_count, infile), file=sys.stderr) print('partitions are in', outfile, file=sys.stderr)
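# A minimal sketch of the merge-and-annotate stage shared by the two
# do-partition variants above. A tiny Nodegraph is enough to hold the merged
# partition map; the reads themselves are not reloaded. The k-mer size must
# match the graph that produced the .pmap files; filenames and k here are
# illustrative.
import glob
import os

import khmer

merged = khmer.Nodegraph(32, 1, 1)
for pmap_file in glob.glob('mygraph.subset.*.pmap'):
    merged.merge_subset_from_disk(pmap_file)

for infile in ['reads.fa']:
    outfile = os.path.basename(infile) + '.part'
    n_parts = merged.output_partitions(infile, outfile)
    print('output %d partitions for %s' % (n_parts, infile))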
def main(): parser = build_counting_args() parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3) parser.add_argument( "--bits-theta", help= "Tuning parameter controlling trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta") parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_MINIMUM_COVERAGE) parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('--details-out', dest="details_out") parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: print >> sys.stderr, '\nPARAMETERS:' print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ args.min_hashsize print >> sys.stderr, '' print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize)' % ( args.n_hashes * args.min_hashsize) print >> sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes DESIRED_COVERAGE = args.cutoff filenames = args.input_filenames if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta) if args.details_out != None: details_out = open(args.details_out, "w") else: details_out = None total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keepalign' outfp = open(output_name, 'w') for n, record in enumerate(screed.open(input_filename)): if n > 0 and n % 10000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename total += 1 if len(record.sequence) < K: continue seq = record.sequence.upper().replace('N', 'A') ## score, graph_alignment, read_alignment, truncated = aligner.align( record.sequence) keep = False if truncated: keep = True else: if False: graph_seq = graph_alignment.replace("-", "") else: graph_seq = "" for i in range(len(graph_alignment)): if graph_alignment[i] == "-": graph_seq += read_alignment[i] else: graph_seq += graph_alignment[i] mincount = ht.get_min_count(graph_seq) keep = True seq = graph_seq #if mincount < DESIRED_COVERAGE: # keep = True # seq = graph_seq #else: # assert not keep if details_out != None: details_out.write( "+{7}\t{0:0.2f}\t{3}\t{4}\nread: {6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n" .format(score, graph_alignment, read_alignment, truncated, keep, seq, record.sequence, record.name)) if keep: ht.consume(seq) outfp.write('>%s\n%s\n' % (record.name, seq)) else: discarded += 1 print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, "** ERROR: the counting hash is too small for" print >> sys.stderr, "** this data set. Increase hashsize/num ht." 
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
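# A minimal sketch of the ReadAligner usage shared by the two alignment-based
# scripts above. align() returns (score, graph_alignment, read_alignment,
# truncated); dropping the gap characters from the graph side gives the
# sequence as spelled by trusted k-mers in the graph. `ht` is assumed to be a
# populated counting table; 3 and 1.0 mirror the scripts' defaults for
# trusted_cutoff and bits_theta, and `graph_correct` is an illustrative name.
# (The scripts build the aligner once, outside the read loop.)
import khmer


def graph_correct(ht, sequence, trusted_cutoff=3, bits_theta=1.0):
    """Return (graph-corrected sequence, truncated flag) for one read."""
    aligner = khmer.ReadAligner(ht, trusted_cutoff, bits_theta)
    _, graph_alignment, _, truncated = aligner.align(sequence)
    return graph_alignment.replace('-', ''), truncated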
def main(): parser = build_construct_args() parser.add_argument('-C', '--cutoff', type=int, dest='cutoff', default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-p', '--paired', action='store_true') parser.add_argument('-s', '--savehash', dest='savehash', default='') parser.add_argument('-l', '--loadhash', dest='loadhash', default='') parser.add_argument('-R', '--report-to-file', dest='report_file', type=argparse.FileType('w')) parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print>>sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!" print>>sys.stderr, '\nPARAMETERS:' print>>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print>>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize print>>sys.stderr, ' - paired = %s \t\t(-p)' % args.paired print>>sys.stderr, '' print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize) print>>sys.stderr, '-'*8 K=args.ksize HT_SIZE=args.min_hashsize N_HT=args.n_hashes DESIRED_COVERAGE=args.cutoff report_fp = args.report_file filenames = args.input_filenames # In paired mode we read two records at a time batch_size = 1 if args.paired: batch_size = 2 if args.loadhash: print 'loading hashtable from', args.loadhash ht = khmer.load_counting_hash(args.loadhash) else: print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) total = 0 discarded = 0 for input_filename in filenames: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') n = -1 for n, batch in enumerate(batchwise(screed.open(input_filename), batch_size)): if n > 0 and n % 100000 == 0: print '... kept', total - discarded, 'of', total, ', or', \ int(100. - discarded / float(total) * 100.), '%' print '... in file', input_filename if report_fp: print>>report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += batch_size # If in paired mode, check that the reads are properly interleaved if args.paired: if not validpair(batch[0], batch[1]): print >>sys.stderr, 'Error: Improperly interleaved pairs %s %s' % (batch[0].name, batch[1].name) sys.exit(-1) # Emit the batch of reads if any read passes the filter # and all reads are longer than K passed_filter = False passed_length = True for record in batch: if len(record.sequence) < K: passed_length = False continue seq = record.sequence.replace('N', 'A') med, _, _ = ht.get_median_count(seq) if med < DESIRED_COVERAGE: ht.consume(seq) passed_filter = True # Emit records if any passed if passed_length and passed_filter: for record in batch: if hasattr(record,'accuracy'): outfp.write('@%s\n%s\n+\n%s\n' % (record.name, record.sequence, record.accuracy)) else: outfp.write('>%s\n%s\n' % (record.name, record.sequence)) else: discarded += batch_size if -1 < n: print 'DONE with', input_filename, '; kept', total - discarded, 'of',\ total, 'or', int(100. - discarded / float(total) * 100.), '%' print 'output in', output_name else: print 'SKIPPED empty file', input_filename if args.savehash: print 'Saving hashfile through', input_filename print '...saving to', args.savehash ht.save(args.savehash) # Change 0.2 only if you really grok it. HINT: You don't. 
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set. Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
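# A minimal sketch of the digital-normalization decision at the heart of the
# script above, reduced to a single unpaired read (the script applies it per
# batch so that pairs stay together). A read is kept, and its k-mers counted,
# only while the median k-mer count of its sequence is still below the
# desired coverage. `ht` is assumed to be a khmer counting table and
# `keep_read` an illustrative name; reads shorter than k are assumed to have
# been filtered out already.
def keep_read(ht, sequence, desired_coverage):
    seq = sequence.replace('N', 'A')
    med, _, _ = ht.get_median_count(seq)
    if med < desired_coverage:
        ht.consume(seq)
        return True
    return False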
def main(): info('load-graph.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames n_threads = int(args.n_threads) for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.) print 'Saving k-mer presence table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print 'We WILL NOT build the tagset.' else: print 'We WILL build the tagset (for partitioning/traversal).' print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser config = khmer.get_config() config.set_reads_input_buffer_size(n_threads * 64 * 1024) for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input', filename for _ in xrange(n_threads): cur_thrd = threading.Thread(target=target_method, args=(rparser, )) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if args.write_fp_rate: print >> info_fp, \ '\nfalse positive rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables.") print >> sys.stderr, "**" sys.exit(1)
def main(): info('filter-abund-single.py', ['counting', 'SeqAn']) args = get_parser().parse_args() check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) report_on_config(args) print >> sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile) threads = [] print >> sys.stderr, 'consuming input, round 1 --', args.datafile for _ in xrange(args.threads): cur_thread = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable, args.force) print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record.name seq = record.sequence seqN = seq.replace('N', 'A') _, trim_at = htable.trim_on_abundance(seqN, args.cutoff) if trim_at >= args.ksize: # be sure to not to change the 'N's in the trimmed sequence - # so, return 'seq' and not 'seqN'. return name, seq[:trim_at] return None, None # the filtering loop print >> sys.stderr, 'filtering', args.datafile outfile = os.path.basename(args.datafile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print >> sys.stderr, 'output in', outfile if args.savetable: print >>sys.stderr, 'Saving k-mer counting table filename', \ args.savetable print >> sys.stderr, '...saving to', args.savetable htable.save(args.savetable) print >> sys.stderr, 'wrote to: ', outfile
def main(): # pylint: disable=too-many-locals,too-many-statements info("do-partition.py", ["graph"]) args = sanitize_help(get_parser()).parse_args() report_on_config(args, graphtype="nodegraph") for infile in args.input_filenames: check_input_files(infile, args.force) check_space(args.input_filenames, args.force) print("Saving k-mer nodegraph to %s" % args.graphbase, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(args.input_filenames), file=sys.stderr) print("--", file=sys.stderr) print("SUBSET SIZE", args.subset_size, file=sys.stderr) print("N THREADS", args.threads, file=sys.stderr) print("--", file=sys.stderr) # load-graph.py print("making nodegraph", file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) for _, filename in enumerate(args.input_filenames): print("consuming input", filename, file=sys.stderr) nodegraph.consume_fasta_and_tag(filename) # 0.18 is ACTUAL MAX. Do not change. fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15) print("fp rate estimated to be %1.3f" % fp_rate, file=sys.stderr) # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print("** This script brakes for lumps: ", "stop_big_traversals is true.", file=sys.stderr) else: print("** Traverse all the things:", " stop_big_traversals is false.", file=sys.stderr) # # now, partition! # # divide the tags up into subsets divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size)) divvy = list(divvy) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((nodegraph, _, start, end)) print("enqueued %d subset tasks" % n_subsets, file=sys.stderr) open("%s.info" % args.graphbase, "w").write("%d subsets total\n" % (n_subsets)) if n_subsets < args.threads: args.threads = n_subsets # start threads! print("starting %d threads" % args.threads, file=sys.stderr) print("---", file=sys.stderr) threads = [] for _ in range(args.threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() assert threading.active_count() == args.threads + 1 print("done starting threads", file=sys.stderr) # wait for threads for _ in threads: _.join() print("---", file=sys.stderr) print("done making subsets! see %s.subset.*.pmap" % (args.graphbase,), file=sys.stderr) # merge-partitions pmap_files = glob.glob(args.graphbase + ".subset.*.pmap") print("loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0]), file=sys.stderr) nodegraph = khmer.Nodegraph(args.ksize, 1, 1) for pmap_file in pmap_files: print("merging", pmap_file, file=sys.stderr) nodegraph.merge_subset_from_disk(pmap_file) if args.remove_subsets: print("removing pmap files", file=sys.stderr) for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print("outputting partitions for", infile, file=sys.stderr) outfile = os.path.basename(infile) + ".part" part_count = nodegraph.output_partitions(infile, outfile) print("output %d partitions for %s" % (part_count, infile), file=sys.stderr) print("partitions are in", outfile, file=sys.stderr)
def main(): # pylint: disable=too-many-branches,too-many-statements parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph = Countgraph.load(args.loadgraph) else: log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, countgraph) with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) else: if '-' in filenames or '/dev/stdin' in filenames: print("Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(reader, filename): if record is not None: write_record(record, outfp) log_info('output in {name}', name=describe_file_handle(outfp)) if not args.single_output_file: outfp.close() # finished - print out some diagnostics. 
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
def main(): info('correct-reads.py', ['streaming']) args = sanitize_help(get_parser()).parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print("Error: Cannot input the same filename multiple times.", file=sys.stderr) sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) tablesize = calculate_graphsize(args, 'countgraph') if args.savegraph: check_space_for_graph(args.savegraph, tablesize, args.force) K = args.ksize CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to if args.loadgraph: print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr) ct = Countgraph.load(args.loadgraph) else: print('making k-mer countgraph', file=sys.stderr) ct = create_countgraph(args, multiplier=8 / (9. + 0.3)) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; use -T to change location' % tempdir, file=sys.stderr) aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta) # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 corrected_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.out is None: corrfp = open(os.path.basename(filename) + '.corr', 'w') else: corrfp = args.out pass2list.append((filename, pass2filename, corrfp)) screed_iter = screed.open(filename, parse_description=False) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print('...', n, filename, save_pass2, n_reads, n_bp, written_reads, written_bp, file=sys.stderr) # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: is_aligned, new_seq1 = correct_sequence(aligner, seq1) if is_aligned: if new_seq1 != read1.sequence: corrected_reads += 1 read1.sequence = new_seq1 if hasattr(read1, 'quality'): fix_quality(read1) is_aligned, new_seq2 = correct_sequence(aligner, seq2) if is_aligned: if new_seq2 != read2.sequence: corrected_reads += 1 read2.sequence = new_seq2 if hasattr(read2, 'quality'): fix_quality(read2) write_record_pair(read1, read2, corrfp) written_reads += 2 written_bp += len(read1) written_bp += len(read2) else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! 
is_aligned, new_seq = correct_sequence(aligner, seq) if is_aligned: if new_seq != read1.sequence: corrected_reads += 1 read1.sequence = new_seq if hasattr(read1, 'quality'): fix_quality(read1) write_record(read1, corrfp) written_reads += 1 written_bp += len(new_seq) pass2fp.close() print('%s: kept aside %d of %d from first pass, in %s' % (filename, save_pass2, n, filename), file=sys.stderr) save_pass2_total += save_pass2 # ### SECOND PASS. ### skipped_n = 0 skipped_bp = 0 for _, pass2filename, corrfp in pass2list: print(('second pass: looking at sequences kept aside in %s') % pass2filename, file=sys.stderr) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate( screed.open(pass2filename, parse_description=False)): if n % 10000 == 0: print('... x 2', n, pass2filename, written_reads, written_bp, file=sys.stderr) seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, corrfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/correct. else: # med >= NORMALIZE LIMIT or not args.variable_coverage is_aligned, new_seq = correct_sequence(aligner, seq) if is_aligned: if new_seq != read.sequence: corrected_reads += 1 read.sequence = new_seq if hasattr(read, 'quality'): fix_quality(read) write_record(read, corrfp) written_reads += 1 written_bp += len(new_seq) print('removing %s' % pass2filename, file=sys.stderr) os.unlink(pass2filename) print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr) shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_corrected = float(corrected_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print('read %d reads, %d bp' % ( n_reads, n_bp, ), file=sys.stderr) print('wrote %d reads, %d bp' % ( written_reads, written_bp, ), file=sys.stderr) print('looked at %d reads twice (%.2f passes)' % (save_pass2_total, n_passes), file=sys.stderr) print('removed %d reads and corrected %d reads (%.2f%%)' % (n_reads - written_reads, corrected_reads, percent_reads_corrected), file=sys.stderr) print('removed %.2f%% of bases (%d total)' % ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp), file=sys.stderr) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr) print(('skipped %d reads/%d bases because of low coverage') % (skipped_n, skipped_bp), file=sys.stderr) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) print('output in *.corr', file=sys.stderr) if args.savegraph: print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr) ct.save(args.savegraph)
def main(): info('trim-low-abund.py', ['streaming']) parser = get_parser() args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print >>sys.stderr, \ "Error: Cannot input the same filename multiple times." sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) K = args.ksize CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to if args.loadtable: print >> sys.stderr, 'loading k-mer counting table from', args.loadtable ct = khmer.load_counting_hash(args.loadtable) else: print >> sys.stderr, 'making k-mer counting table' ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print >>sys.stderr, 'created temporary directory %s; ' \ 'use -T to change location' % tempdir # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 trimmed_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.out is None: trimfp = open(os.path.basename(filename) + '.abundtrim', 'w') else: trimfp = args.out pass2list.append((filename, pass2filename, trimfp)) screed_iter = screed.open(filename, parse_description=False) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print >>sys.stderr, '...', n, filename, save_pass2, \ n_reads, n_bp, written_reads, written_bp # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF) _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF) if trim_at1 >= K: read1 = trim_record(read1, trim_at1) if trim_at2 >= K: read2 = trim_record(read2, trim_at2) if trim_at1 != len(seq1): trimmed_reads += 1 if trim_at2 != len(seq2): trimmed_reads += 1 write_record_pair(read1, read2, trimfp) written_reads += 2 written_bp += trim_at1 + trim_at2 else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read1, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read1.sequence): trimmed_reads += 1 pass2fp.close() print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \ % (filename, save_pass2, n, filename) save_pass2_total += save_pass2 # ### SECOND PASS. 
### skipped_n = 0 skipped_bp = 0 for _, pass2filename, trimfp in pass2list: print >> sys.stderr, ('second pass: looking at sequences kept aside ' 'in %s') % pass2filename # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate( screed.open(pass2filename, parse_description=False)): if n % 10000 == 0: print >>sys.stderr, '... x 2', n, pass2filename, \ written_reads, written_bp seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, trimfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/trim/truncate. else: # med >= NORMALIZE LIMIT or not args.variable_coverage _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 print >> sys.stderr, 'removing %s' % pass2filename os.unlink(pass2filename) print >> sys.stderr, 'removing temp directory & contents (%s)' % tempdir shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print >> sys.stderr, 'read %d reads, %d bp' % ( n_reads, n_bp, ) print >> sys.stderr, 'wrote %d reads, %d bp' % ( written_reads, written_bp, ) print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \ (save_pass2_total, n_passes) print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \ (n_reads - written_reads, trimmed_reads, percent_reads_trimmed) print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \ ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \ (n_reads - skipped_n, percent_reads_hicov) print >> sys.stderr, ('skipped %d reads/%d bases because of low' 'coverage') % (skipped_n, skipped_bp) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print >>sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) print >> sys.stderr, 'output in *.abundtrim' if args.savetable: print >> sys.stderr, "Saving k-mer counting table to", args.savetable ct.save(args.savetable)
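# A minimal sketch of the paired-aware streaming used by the two-pass scripts
# above. broken_paired_reader yields (index, is_pair, read1, read2), keeping
# proper pairs together and passing orphans through with read2=None, which is
# what lets both members of a pair land in the same pass. The import path
# matches recent khmer releases; the filename and k-mer size are
# illustrative.
import screed
from khmer.utils import broken_paired_reader

K = 21
n_pairs = n_singles = 0
for _, is_pair, read1, read2 in broken_paired_reader(screed.open('reads.fq'),
                                                     min_length=K):
    if is_pair:
        n_pairs += 1
    else:
        n_singles += 1
print('saw %d pairs and %d unpaired reads' % (n_pairs, n_singles))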
def main(): parser = build_construct_args() parser.add_argument('input_filename') parser.add_argument('read_filename') args = parser.parse_args() if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print >>sys.stderr, "** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ "Please read the docs!" print >>sys.stderr, '\nPARAMETERS:' print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \ args.min_hashsize print >>sys.stderr, '' print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \ '(n_hashes x min_hashsize / 8)' % ( args.n_hashes * args.min_hashsize / 8.) print >>sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes inp = args.input_filename readsfile = args.read_filename tag = args.tag outfile = os.path.basename(readsfile) + '.' + tag + '.sweep2' outfp = open(outfile, 'w') # create a hashbits data structure ht = khmer.new_hashbits(K, HT_SIZE, N_HT) # load contigs, connect into N partitions print 'loading input reads from', inp ht.consume_fasta(inp) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >>sys.stderr, "**" print >>sys.stderr, "** ERROR: the counting hash is too small for" print >>sys.stderr, "** this data set. Increase hashsize/num ht." print >>sys.stderr, "**" print >>sys.stderr, "** Do not use these results!!" sys.exit(-1) print 'starting sweep.' n = 0 m = 0 for record in screed.open(readsfile): if len(record.sequence) < K: continue if n % 100000 == 0: print '...', n, m count = ht.get_median_count(record.sequence)[0] if count: m += 1 outfp.write('>%s\n%s\n' % (record.name, record.sequence)) n += 1
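# A minimal sketch of the sweep test used just above. The contigs are loaded
# into a presence-only table (hashbits, called Nodegraph in newer khmer), and
# a read is swept out when the median presence of its k-mers is non-zero,
# i.e. at least half of its k-mers occur in the contigs. Sizes and filenames
# are illustrative.
import khmer
import screed

K = 32
ht = khmer.new_hashbits(K, 1e8, 4)
ht.consume_fasta('contigs.fa')

swept = []
for record in screed.open('reads.fq'):
    if len(record.sequence) < K:
        continue
    if ht.get_median_count(record.sequence)[0]:
        swept.append(record.name)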
def main(): info('load-into-counting.py', ['counting', 'SeqAn']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, args.force) check_space(args.input_sequence_filename, args.force) check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) check_file_writable(base) check_file_writable(base + ".info") print >>sys.stderr, 'Saving k-mer counting table to %s' % base print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') print >>sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) htable.set_use_bigcount(args.bigcount) filename = None for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print >>sys.stderr, 'consuming input', filename for _ in xrange(args.threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) print >>sys.stderr, 'mid-save', base htable.save(base) with open(base + '.info', 'a') as info_fh: print >> info_fh, 'through', filename n_kmers = htable.n_unique_kmers() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers:', n_kmers with open(base + '.info', 'a') as info_fp: print >>info_fp, 'Total number of unique k-mers:', n_kmers print >>sys.stderr, 'saving', base htable.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) with open(base + '.info', 'a') as info_fp: print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt print >> sys.stderr, "Writing summmary info to", mr_file with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.1.0", } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n") mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format( b=os.path.basename(base), fpr=fp_rate, k=n_kmers, fls=";".join(filenames))) print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate print >>sys.stderr, 'DONE.' print >>sys.stderr, 'wrote to:', base + '.info'
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()
    report_on_config(args)

    report_fp = args.report

    # check for duplicate basenames among the inputs
    filenames = []
    for pathfilename in args.input_filenames:
        filename = pathfilename.split('/')[-1]
        if filename in filenames:
            print >>sys.stderr, \
                "WARNING: At least two input files are named %s. " \
                "normalize-by-median.py cannot handle this; only one " \
                ".keep file will be generated for them." % filename
        else:
            filenames.append(filename)

    # check for other problems with the inputs
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                input_filename, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.paired and args.unpaired_reads:
        args.paired = False
        output_name = args.unpaired_reads
        if not args.single_output_file:
            output_name = os.path.basename(args.unpaired_reads) + '.keep'
        outfp = open(output_name, 'w')
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                args.unpaired_reads, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >> sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
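Stripped of the bookkeeping, digital normalization is a one-line decision per read: if the median k-mer count of the read against the countgraph is still below the cutoff, keep the read and add its k-mers; otherwise drop it. A minimal sketch of that inner loop, assuming a recent khmer Countgraph API and hypothetical filenames and parameters:

import khmer
import screed

K, CUTOFF = 21, 20
countgraph = khmer.Countgraph(K, 1e8, 4)    # illustrative sizing

kept = total = 0
with open('reads.fq.keep', 'w') as outfp:
    for record in screed.open('reads.fq'):  # hypothetical input
        total += 1
        if len(record.sequence) < K or 'N' in record.sequence:
            continue
        median, _, _ = countgraph.get_median_count(record.sequence)
        if median < CUTOFF:
            # read is not yet saturated: count its k-mers and keep it
            countgraph.consume(record.sequence)
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            kept += 1

print('kept %d of %d reads' % (kept, total))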
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('load-into-counting.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)
            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it.  HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile)

    check_space(args.input_filenames)

    print 'Saving k-mer presence table to %s' % args.graphbase
    print 'Loading kmers from sequences in %s' % repr(args.input_filenames)
    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    print '--'

    # load-graph

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize,
                                args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set. Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    if n_subsets < args.n_threads:
        args.n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % args.n_threads
    print '---'

    threads = []
    for _ in range(args.n_threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase,)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
def main():
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument("--bits-theta",
                        help="Tuning parameter controlling trade-off of speed "
                             "vs alignment sensitivity",
                        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
                args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            # align the read against the graph of trusted k-mers
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                # splice graph bases over the read wherever the aligner
                # produced a non-gap column
                graph_seq = ""
                for i in range(len(graph_alignment)):
                    if graph_alignment[i] == "-":
                        graph_seq += read_alignment[i]
                    else:
                        graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                # if mincount < DESIRED_COVERAGE:
                #     keep = True
                #     seq = graph_seq
                # else:
                #     assert not keep

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread: {6}\n"
                    "graph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n".format(
                        score, graph_alignment, read_alignment, truncated,
                        keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
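The filtering function carries the whole algorithm: trim_on_abundance(seq, cutoff) returns the read truncated at the first k-mer whose count falls below the cutoff, together with the trim position, and a position of at least K means at least one solid k-mer survived. A small self-contained sketch, assuming a recent khmer Countgraph API and synthetic sequences invented for illustration:

import khmer

K, CUTOFF = 21, 2
countgraph = khmer.Countgraph(K, 1e7, 4)    # illustrative sizing

# Synthetic data: a "solid" region counted several times, plus a tail
# that is seen only once and should be trimmed away.
solid = 'ACGTACGGCATTAGCCATAGGCCATAGGCATAG'
for _ in range(3):
    countgraph.consume(solid)

read = solid + 'TTGACCATTTGGCATCA'          # low-abundance tail, seen once
countgraph.consume(read)

trim_seq, trim_at = countgraph.trim_on_abundance(read, CUTOFF)
if trim_at >= K:
    print('kept %d of %d bases' % (len(trim_seq), len(read)))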
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method,
                                        args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_seqfile_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)
            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it.  HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
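Every loader above ends with the same guard: khmer.calc_expected_collisions() estimates the false positive rate from the occupancy of the Count-Min/Bloom tables and, unless force is set, refuses to continue past max_false_pos. The pattern in isolation, a sketch assuming a recent khmer API and a hypothetical input file:

import sys
import khmer

countgraph = khmer.Countgraph(21, 1e8, 4)      # illustrative sizing
countgraph.consume_seqfile('reads.fq')         # hypothetical input

# force=True only reports; with force=False the call itself aborts the
# process when the estimate exceeds max_false_pos.
fp_rate = khmer.calc_expected_collisions(countgraph, force=True,
                                         max_false_pos=.2)
print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
if fp_rate > 0.2:
    print('** countgraph is too small; increase its size', file=sys.stderr)
    sys.exit(1)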
def main(args):
    # Input and output files
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(
        outstream, source='kevlar::call', refr=args.refr,
    )
    writer.write_header()

    # Contigs = query sequences
    contigstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(args.queryseq, 'r'))
    )
    contigs_by_partition = load_contigs(contigstream)

    gdnastream = kevlar.parse_partitioned_reads(
        kevlar.reference.load_refr_cutouts(kevlar.open(args.targetseq, 'r'))
    )
    mask = None
    if args.gen_mask:
        message = 'generating mask of variant-spanning k-mers'
        kevlar.plog('[kevlar::call]', message)
        ntables = 4
        buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables
        mask = khmer.Nodetable(args.ksize, buckets, ntables)
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::call] processed contigs/gDNAs for {counter} partitions',
        interval=10, breaks=[100, 1000, 10000],
    )
    for partid, gdnas in gdnastream:
        progress_indicator.update()
        if partid not in contigs_by_partition:
            continue
        contigs = contigs_by_partition[partid]
        caller = call(
            gdnas, contigs, partid,
            match=args.match, mismatch=args.mismatch,
            gapopen=args.open, gapextend=args.extend,
            ksize=args.ksize, refrfile=args.refr,
            debug=args.debug, mindist=5,
            homopolyfilt=not args.no_homopoly_filter,
            maxtargetlen=args.max_target_length,
        )
        for varcall in caller:
            if args.gen_mask:
                window = varcall.attribute('ALTWINDOW')
                if window is not None and len(window) >= args.ksize:
                    mask.consume(window)
            writer.write(varcall)
    if args.gen_mask:
        fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
        if fpr > args.mask_max_fpr:
            message = 'WARNING: mask FPR is {:.4f}'.format(fpr)
            message += '; exceeds user-specified limit'
            message += ' of {:.4f}'.format(args.mask_max_fpr)
            kevlar.plog('[kevlar::call]', message)
        mask.save(args.gen_mask)
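For the mask, kevlar only needs presence/absence, so it uses a khmer Nodetable sized from the user's memory budget and checks the FPR with max_false_pos=1.0 so the call only reports and never aborts. A compact sketch of that bookkeeping, with hypothetical window strings, FPR limit, and output path standing in for the ALTWINDOW values, args.mask_max_fpr, and args.gen_mask:

import khmer

ksize = 31
# Hypothetical sizing: 4 tables of ~8M buckets each.
mask = khmer.Nodetable(ksize, 8e6, 4)

for window in ('ACGT' * 20, 'TTGCA' * 15):     # placeholder ALTWINDOW strings
    if len(window) >= ksize:
        mask.consume(window)

fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
if fpr > 0.01:                                 # hypothetical FPR limit
    print('WARNING: mask FPR is {:.4f}'.format(fpr))
mask.save('mask.nodetable')                    # hypothetical output path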