def main():
    """Estimate the number of unique k-mers per input file and overall.

    Builds one HyperLogLog counter per input file (consumed with
    ``consume_fasta``) plus a merged counter for the running total.
    Per-file and total estimates go to stderr; if ``--report`` was given,
    they are also appended to that file along with the sizing report
    from ``graphsize_args_report``.
    """
    args = sanitize_help(get_parser()).parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
    report_fp = args.report

    for input_filename in args.input_filenames:
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        hllcpp.consume_fasta(input_filename,
                             stream_records=args.stream_records)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()

        # fold this file's counter into the running total
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
def main():
    """Estimate the number of unique k-mers per input file and overall.

    Builds one HyperLogLog counter per input file (consumed with
    ``consume_seqfile``) plus a merged counter for the running total.
    Per-file and total estimates go to stderr; if ``--report`` was given,
    they are also appended to that file along with the sizing report
    from ``graphsize_args_report``.
    """
    args = sanitize_help(get_parser()).parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
    report_fp = args.report

    for input_filename in args.input_filenames:
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        hllcpp.consume_seqfile(input_filename,
                               stream_records=args.stream_records)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()

        # fold this file's counter into the running total
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
def main():
    """Load sequences into a nodegraph and report optimal table arguments.

    Consumes every input file with ``args.threads`` worker threads, checks
    the estimated false-positive rate of the graph (aborting if it exceeds
    0.15), and writes the ``graphsize_args_report`` output to
    ``<first input>.optimal_args``.
    """
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')

    filenames = args.input_filenames
    base = filenames[0]
    for _ in args.input_filenames:
        check_input_files(_, False)
    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize,
                                 args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for filename in filenames:
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        # range, not xrange: this code already uses the Python 3
        # print() function, and xrange does not exist on Python 3
        for _ in range(args.threads):
            cur_thread = threading.Thread(target=target_method,
                                          args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()
        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data "
              "set. Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)
    # open the report only on the success path, and close it reliably
    with open(base + '.optimal_args', 'w') as info_optimal:
        print(to_print, file=info_optimal)
    print('optimal arguments were written to', base + '.optimal_args',
          file=sys.stderr)
def main():
    """Load sequences into a nodegraph and report optimal table arguments.

    Consumes every input file with ``args.threads`` worker threads, checks
    the estimated false-positive rate of the graph (aborting if it exceeds
    0.15), and writes the ``graphsize_args_report`` output to
    ``<first input>.optimal_args``.
    """
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')

    filenames = args.input_filenames
    base = filenames[0]
    for _ in args.input_filenames:
        check_input_files(_, False)
    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize,
                                 args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for filename in filenames:
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        # range, not xrange: this code already uses the Python 3
        # print() function, and xrange does not exist on Python 3
        for _ in range(args.threads):
            cur_thread = threading.Thread(target=target_method,
                                          args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()
        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data "
              "set. Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)
    # open the report only on the success path, and close it reliably
    with open(base + '.optimal_args', 'w') as info_optimal:
        print(to_print, file=info_optimal)
    print('optimal arguments were written to', base + '.optimal_args',
          file=sys.stderr)
def main():
    """Estimate unique k-mer counts per file and overall via HyperLogLog.

    Reads each input with ``screed``, uppercases each sequence and
    replaces 'N' with 'A' before counting (presumably to avoid ambiguous
    bases — confirm against HLLCounter's alphabet handling). Optionally
    echoes records to stdout (``--stream-out``) and appends estimates to
    the ``--report`` file.
    """
    info('unique-kmers.py', ['SeqAn', 'hll'])
    args = get_parser().parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
    report_fp = args.report

    for input_filename in args.input_filenames:
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        for record in screed.open(input_filename):
            seq = record.sequence.upper().replace('N', 'A')
            hllcpp.consume_string(seq)
            if args.stream_out:
                write_record(record, sys.stdout)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()

        # fold this file's counter into the running total
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
def test_output_gen():
    """Smoke test: graphsize_args_report must not raise for a valid
    (cardinality, fp_rate) pair; the output itself is not checked."""
    graphsize_args_report(99, 0.00701925498897)