def main(): info("filter-abund-single.py", ["counting"]) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print "consuming input, round 1 --", args.datafile for _ in xrange(args.threads): cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print "filtering", args.datafile outfile = os.path.basename(args.datafile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print "output in", outfile if args.savetable: print "Saving k-mer counting table filename", args.savetable print "...saving to", args.savetable htable.save(args.savetable)
def main(): info('load-into-counting.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'Saving k-mer counting table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.n_threads) htable.set_use_bigcount(args.bigcount) config = khmer.get_config() config.set_reads_input_buffer_size(args.n_threads * 64 * 1024) for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, args.n_threads) threads = [] print 'consuming input', filename for _ in xrange(args.n_threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for _ in threads: _.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'mid-save', base htable.save(base) open(base + '.info', 'w').write('through %s' % filename) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filename) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
def main(): info('load-graph.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames n_threads = int(args.n_threads) for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.) print 'Saving k-mer presence table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print 'We WILL NOT build the tagset.' else: print 'We WILL build the tagset (for partitioning/traversal).' print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser config = khmer.get_config() config.set_reads_input_buffer_size(n_threads * 64 * 1024) for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input', filename for _ in xrange(n_threads): cur_thrd = threading.Thread(target=target_method, args=(rparser, )) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if args.write_fp_rate: print >> info_fp, \ '\nfalse positive rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. 
print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables.") print >> sys.stderr, "**" sys.exit(1)
def main(): info('load-graph.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames n_threads = int(args.n_threads) for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.) print 'Saving k-mer presence table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print 'We WILL NOT build the tagset.' else: print 'We WILL build the tagset (for partitioning/traversal).' print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser config = khmer.get_config() config.set_reads_input_buffer_size(n_threads * 64 * 1024) for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input', filename for _ in xrange(n_threads): cur_thrd = threading.Thread(target=target_method, args=(rparser, )) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() print 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables.") print >> sys.stderr, "**" sys.exit(1)
def main(): # pylint: disable=too-many-locals,too-many-branches info('abundance-dist-single.py', ['counting']) args = get_parser().parse_args() report_on_config(args) check_file_status(args.input_sequence_filename) check_space([args.input_sequence_filename]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \ args.output_histogram_filename sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') print 'making k-mer counting table' counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) counting_hash.set_use_bigcount(args.bigcount) print 'building k-mer tracking table' tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize, args.n_tables) print 'kmer_size:', counting_hash.ksize() print 'k-mer counting table sizes:', counting_hash.hashsizes() print 'outputting to', args.output_histogram_filename khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024) # start loading rparser = khmer.ReadParser(args.input_sequence_filename, args.threads) threads = [] print 'consuming input, round 1 --', args.input_sequence_filename for _ in xrange(args.threads): thread = \ threading.Thread( target=counting_hash.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( counting_hash.n_occupied()) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = counting_hash.abundance_distribution_with_reads_parser( read_parser, tracking) abundance_lists.append(abundances) print 'preparing hist from %s...' 
% args.input_sequence_filename rparser = khmer.ReadParser(args.input_sequence_filename, args.threads) threads = [] print 'consuming input, round 2 --', args.input_sequence_filename for _ in xrange(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: print >> sys.stderr, \ "ERROR: abundance distribution is uniformly zero; " \ "nothing to report." print >> sys.stderr, "\tPlease verify that the input files are valid." sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) print >> hist_fp, _, i, sofar, round(frac, 3) if sofar == total: break if args.savetable: print 'Saving k-mer counting table ', args.savetable print '...saving to', args.savetable counting_hash.save(args.savetable)
def main():
    """Load the k-mers of all input files into one counting table.

    Each input file is consumed by ``args.threads`` threads sharing one
    read parser and is then noted in ``<base>.info``; after every tenth
    file (index 10, 20, ...) the table is also saved.  The final table
    goes to ``args.output_countingtable_filename``, the false positive
    rate is appended to ``<base>.info``, and a JSON or TSV summary is
    written to ``<base>.info.<format>`` when ``args.summary_info`` is set.

    Exits with status 1 when the estimated false positive rate is above
    0.20.
    """
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename
    info_path = base + '.info'
    n_threads = args.threads
    table_bytes = args.n_tables * args.min_tablesize

    for name in args.input_sequence_filename:
        check_file_status(name)
    check_space(args.input_sequence_filename)
    check_space_for_hashtable(table_bytes)

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # Start from a fresh '.info' file: all writes below append to it.
    if os.path.exists(info_path):
        os.remove(info_path)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, n_threads)
    htable.set_use_bigcount(args.bigcount)

    khmer.get_config().set_reads_input_buffer_size(n_threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        consumers = []
        print >>sys.stderr, 'consuming input', filename
        for _ in xrange(n_threads):
            consumer = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, ))
            consumers.append(consumer)
            consumer.start()

        for consumer in consumers:
            consumer.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(table_bytes)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(info_path, 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(info_path, 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(info_path, 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        summary_fmt = args.summary_info.lower()
        summary_path = base + '.info.' + summary_fmt
        table_name = os.path.basename(base)
        print >> sys.stderr, "Writing summmary info to", summary_path
        with open(summary_path, 'w') as summary_fh:
            if summary_fmt == 'json':
                summary = {
                    "ht_name": table_name,
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(summary, summary_fh)
                summary_fh.write('\n')
            elif summary_fmt == 'tsv':
                summary_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                row = "{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=table_name, fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames))
                summary_fh.write(row)

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it. HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', info_path
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n,) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
def main():
    """Fill one counting table from every input file and save it.

    For each input file, ``args.threads`` threads consume reads from one
    shared parser; the file name is then appended to ``<base>.info``, and
    the table is saved as a checkpoint after every tenth file (index 10,
    20, ...).  When all files are done the table is saved to
    ``args.output_countingtable_filename``, the false positive rate is
    appended to ``<base>.info``, and ``args.summary_info`` (if set)
    selects a JSON or TSV summary written to ``<base>.info.<format>``.

    Exits with status 1 when the estimated false positive rate is above
    0.20.
    """
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename
    info_name = base + '.info'
    thread_count = args.threads

    for name in args.input_sequence_filename:
        check_file_status(name)
    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print >> sys.stderr, 'Saving k-mer counting table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # Every write to '.info' below appends, so drop any stale copy first.
    if os.path.exists(info_name):
        os.remove(info_name)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, thread_count)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(thread_count * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, thread_count)
        running = []
        print >> sys.stderr, 'consuming input', filename
        for _ in xrange(thread_count):
            loader = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, ))
            running.append(loader)
            loader.start()

        for loader in running:
            loader.join()

        checkpoint_due = index > 0 and index % 10 == 0
        if checkpoint_due:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >> sys.stderr, 'mid-save', base
            htable.save(base)
        with open(info_name, 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(info_name, 'a') as info_fp:
            print >> info_fp, 'Total number of unique k-mers:', n_kmers

    print >> sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(info_name, 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        fmt = args.summary_info.lower()
        out_name = base + '.info.' + fmt
        print >> sys.stderr, "Writing summmary info to", out_name
        with open(out_name, 'w') as out_fh:
            if fmt == 'json':
                payload = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(payload, out_fh)
                out_fh.write('\n')
            elif fmt == 'tsv':
                out_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                out_fh.write(
                    "{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                        b=os.path.basename(base),
                        fpr=fp_rate,
                        k=n_kmers,
                        fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it. HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >> sys.stderr, 'DONE.'
    print >> sys.stderr, 'wrote to:', info_name
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n, ) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
def main():
    """Count k-mers in one read file, then abundance-trim that same file.

    A counting table is filled from ``args.datafile`` by ``args.threads``
    threads sharing a single read parser.  Every read is then handed to a
    per-read function that returns ``(name, trimmed sequence)`` or
    ``(None, None)`` when the read contains ``N`` or its trim point is
    below ``args.ksize``.  Output goes to
    ``<basename of datafile>.abundfilt``; the table is saved too when
    ``args.savetable`` is set.  Progress messages go to stderr.
    """
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    datafile = args.datafile
    n_threads = args.threads

    check_file_status(datafile)
    check_space([datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    khmer.get_config().set_reads_input_buffer_size(n_threads * 64 * 1024)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, n_threads)

    # Pass 1: all workers draw reads from one shared parser.
    rparser = khmer.ReadParser(datafile, n_threads)
    workers = []
    print >>sys.stderr, 'consuming input, round 1 --', datafile
    for _ in xrange(n_threads):
        worker = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, ))
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Pass 2: trim each read against the table filled above.
    def trim_read(record):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        name = record['name']
        seq = record['sequence']
        if 'N' not in seq:
            trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
            if trim_at >= args.ksize:
                return name, trim_seq
        return None, None

    print >>sys.stderr, 'filtering', datafile
    outfile = os.path.basename(datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    processor = ThreadedSequenceProcessor(trim_read)
    processor.start(verbose_loader(datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >>sys.stderr, 'wrote to: ', outfile
def main():
    """Build a counting table from one file and trim its reads with it.

    Round 1 fills the table from ``args.datafile`` using ``args.threads``
    threads on one shared read parser.  Round 2 runs every read through a
    filter that returns ``(name, trimmed sequence)``, or ``(None, None)``
    for reads that contain ``N`` or whose trim point falls below
    ``args.ksize``.  Results are written to
    ``<basename of datafile>.abundfilt``; ``args.savetable`` additionally
    saves the table.  Progress messages go to stderr.
    """
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    reads_path = args.datafile
    thread_count = args.threads

    check_file_status(reads_path)
    check_space([reads_path])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(thread_count * 64 * 1024)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, thread_count)

    # Round 1: load reads into the table.
    rparser = khmer.ReadParser(reads_path, thread_count)
    loaders = []
    print >> sys.stderr, 'consuming input, round 1 --', reads_path
    for _ in xrange(thread_count):
        loader = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, ))
        loaders.append(loader)
        loader.start()

    for loader in loaders:
        loader.join()

    if args.report_total_kmers:
        unique_kmers = htable.n_unique_kmers()
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            unique_kmers)

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Round 2: the per-read filter.
    def keep_or_reject(record):
        """Return (name, trimmed sequence), or (None, None) to reject."""
        rejected = (None, None)
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return rejected

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
        if trim_at < args.ksize:
            return rejected
        return name, trim_seq

    print >> sys.stderr, 'filtering', reads_path
    outfile = os.path.basename(reads_path) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(keep_or_reject)
    tsp.start(verbose_loader(reads_path), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
def main():
    """Build a k-mer presence table, and optionally a tagset, from reads.

    Each input file is consumed through its own single-threaded read
    parser, with the tagging consumer unless ``args.no_build_tagset`` is
    set.  The table is saved to ``<base>.pt``, the tagset (when built) to
    ``<base>.tagset``, and the unique k-mer count to ``<base>.info`` (plus
    the false positive rate when ``args.write_fp_rate`` is set).  Progress
    messages go to stderr.

    Exits with status 1 when the estimated false positive rate is above
    0.15.
    """
    info("load-graph.py", ["graph"])
    args = get_parser().parse_args()
    report_on_config(args, hashtype="hashbits")

    base = args.output_filename
    filenames = args.input_filenames

    for name in filenames:
        check_file_status(name)

    check_space(filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.0)

    print >>sys.stderr, "Saving k-mer presence table to %s" % base
    print >>sys.stderr, "Loading kmers from sequences in %s" % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, "We WILL NOT build the tagset."
    else:
        # One string: a comma-separated print inserted a second space
        # before the parenthesis.
        print >>sys.stderr, \
            "We WILL build the tagset (for partitioning/traversal)."

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >>sys.stderr, "making k-mer presence table"
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for filename in filenames:
        rparser = khmer.ReadParser(filename, 1)
        print >>sys.stderr, "consuming input", filename
        target_method(rparser)

    if args.report_total_kmers:
        print >>sys.stderr, "Total number of unique k-mers: {0}".format(
            htable.n_unique_kmers())

    print >>sys.stderr, "saving k-mer presence table in", base + ".pt"
    htable.save(base + ".pt")

    if not args.no_build_tagset:
        print >>sys.stderr, "saving tagset in", base + ".tagset"
        htable.save_tagset(base + ".tagset")

    # Close the info file here so that it is complete on disk both for the
    # error exit and before the 'wrote to' message below.
    with open(base + ".info", "w") as info_fp:
        info_fp.write("%d unique k-mers" % htable.n_unique_kmers())

        fp_rate = khmer.calc_expected_collisions(htable)
        print >>sys.stderr, "fp rate estimated to be %1.3f" % fp_rate
        if args.write_fp_rate:
            print >> info_fp, \
                "\nfalse positive rate estimated to be %1.3f" % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, (
            "** ERROR: the graph structure is too small for "
            "this data set. Increase table size/# tables."
        )
        print >>sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, "wrote to", base + ".info and", base + ".pt"
    if not args.no_build_tagset:
        print >>sys.stderr, "and " + base + ".tagset"
def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report report_frequency = args.report_frequency check_valid_file_exists(args.input_filenames) check_space(args.input_filenames) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) total = 0 discarded = 0 for index, input_filename in enumerate(args.input_filenames): total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median( input_filename, htable, args, report_fp, report_frequency) except IOError as err: handle_error(err, input_filename) if not args.force: print >> sys.stderr, '** Exiting!' sys.exit(1) else: print >> sys.stderr, '*** Skipping error file, moving on...' corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print 'SKIPPED empty file', input_filename else: total += total_acc discarded += discarded_acc print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" 
print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(1)
def main():  # pylint: disable=too-many-locals,too-many-branches
    """Write the k-mer abundance histogram of a single read file.

    The file is read twice, each time by ``args.threads`` threads sharing
    one read parser: round 1 fills a counting table, round 2 collects one
    abundance distribution per thread (a tracking table is passed to the
    distribution call).  The per-thread distributions are summed and
    written to ``args.output_histogram_filename`` as
    ``abundance count cumulative fraction`` lines.  Progress messages go
    to stderr.

    Exits with status 1 when the output file already exists and
    ``args.squash_output`` is not set, or when the summed distribution is
    all zeros.
    """
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)

    # Opened before the expensive work, as before; the try/finally closes
    # the handle on every path so the histogram is on disk before
    # 'wrote to' is reported.
    hist_fp = open(args.output_histogram_filename, 'w')
    try:
        print >>sys.stderr, 'making k-mer counting table'
        counting_hash = khmer.new_counting_hash(args.ksize,
                                                args.min_tablesize,
                                                args.n_tables, args.threads)
        counting_hash.set_use_bigcount(args.bigcount)

        print >> sys.stderr, 'building k-mer tracking table'
        tracking = khmer.new_hashbits(counting_hash.ksize(),
                                      args.min_tablesize, args.n_tables)

        print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
        print >>sys.stderr, 'k-mer counting table sizes:', \
            counting_hash.hashsizes()
        print >>sys.stderr, 'outputting to', args.output_histogram_filename

        khmer.get_config().set_reads_input_buffer_size(
            args.threads * 64 * 1024)

        # start loading
        rparser = khmer.ReadParser(args.input_sequence_filename,
                                   args.threads)
        threads = []
        print >>sys.stderr, 'consuming input, round 1 --', \
            args.input_sequence_filename
        for _ in xrange(args.threads):
            thread = threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, ))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        if args.report_total_kmers:
            print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
                counting_hash.n_unique_kmers())

        abundance_lists = []

        def __do_abundance_dist__(read_parser):
            """Collect one abundance distribution and store it."""
            abundances = \
                counting_hash.abundance_distribution_with_reads_parser(
                    read_parser, tracking)
            abundance_lists.append(abundances)

        print >>sys.stderr, 'preparing hist from %s...' % \
            args.input_sequence_filename
        rparser = khmer.ReadParser(args.input_sequence_filename,
                                   args.threads)
        threads = []
        print >>sys.stderr, 'consuming input, round 2 --', \
            args.input_sequence_filename
        for _ in xrange(args.threads):
            thread = threading.Thread(target=__do_abundance_dist__,
                                      args=(rparser, ))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        assert len(abundance_lists) == args.threads, len(abundance_lists)

        # Sum the per-thread distributions position by position.
        abundance = {}
        for abundance_list in abundance_lists:
            for i, count in enumerate(abundance_list):
                abundance[i] = abundance.get(i, 0) + count

        total = sum(abundance.values())

        if 0 == total:
            print >> sys.stderr, \
                "ERROR: abundance distribution is uniformly zero; " \
                "nothing to report."
            print >> sys.stderr, \
                "\tPlease verify that the input files are valid."
            sys.exit(1)

        sofar = 0
        for abund, count in sorted(abundance.items()):
            if count == 0 and not args.output_zero:
                continue

            sofar += count
            frac = sofar / float(total)

            print >> hist_fp, abund, count, sofar, round(frac, 3)

            # Everything is accounted for; the remaining bins are empty.
            if sofar == total:
                break
    finally:
        hist_fp.close()

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable( args.n_tables * args.min_tablesize, args.force) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) total = 0 discarded = 0 input_filename = None for index, input_filename in enumerate(args.input_filenames): if args.single_output_filename != '': output_name = args.single_output_filename outfp = open(args.single_output_filename, 'a') else: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median(input_filename, outfp, htable, args, report_fp) except IOError as err: handle_error(err, output_name, input_filename, args.fail_save, htable) if not args.force: print >> sys.stderr, '** Exiting!' sys.exit(1) else: print >> sys.stderr, '*** Skipping error file, moving on...' corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print 'SKIPPED empty file', input_filename else: total += total_acc discarded += discarded_acc print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. 
- discarded / float(total) * 100.)) print 'output in', output_name if (args.dump_frequency > 0 and index > 0 and index % args.dump_frequency == 0): print 'Backup: Saving k-mer counting file through', input_filename if args.savetable: hashname = args.savetable print '...saving to', hashname else: hashname = 'backup.ct' print 'Nothing given for savetable, saving to', hashname htable.save(hashname) if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" if not args.force: sys.exit(1)
def main(): info("load-into-counting.py", ["counting"]) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print "Saving k-mer counting table to %s" % base print "Loading kmers from sequences in %s" % repr(filenames) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.n_threads) htable.set_use_bigcount(args.bigcount) config = khmer.get_config() config.set_reads_input_buffer_size(args.n_threads * 64 * 1024) filename = None for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, args.n_threads) threads = [] print "consuming input", filename for _ in xrange(args.n_threads): cur_thrd = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thrd) cur_thrd.start() for _ in threads: _.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize) print "mid-save", base htable.save(base) open(base + ".info", "w").write("through %s" % filename) if args.report_total_kmers: print >> sys.stderr, "Total number of k-mers: {0}".format(htable.n_occupied()) print "saving", base htable.save(base) info_fp = open(base + ".info", "w") info_fp.write("through end: %s\n" % filename) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate print >> info_fp, "fp rate estimated to be %1.3f" % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ( "** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables." ) print >> sys.stderr, "**" sys.exit(1) print "DONE."