def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='nodegraph')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    # if optimization args are given, do optimization
    args = functions.do_sanity_checking(args, 0.01)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(args, 'nodegraph', args.force)

    print('Saving k-mer presence table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    htable = khmer_args.create_nodegraph(args)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        htable.n_unique_kmers()), file=sys.stderr)

    print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
def main(args):
    graph_type = 'nodegraph'

    report_on_config(args, graphtype=graph_type)
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, graph_type)
    space_needed = (args.n_tables * graphsize /
                    khmer._buckets_per_byte[graph_type])
    check_space_for_graph(args.output_filename, space_needed, args.force)

    print('Saving k-mer nodegraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads,
                        not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        nodegraph.n_unique_kmers()), file=sys.stderr)

    print('saving k-mer nodegraph in', base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        nodegraph.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to ' + base + '.info and ' + base, file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
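# The space_needed arithmetic above is worth unpacking: a nodegraph stores
# one bit per bucket, so khmer packs eight buckets into each byte (compare
# the `sum(nodegraph.hashsizes()) / 8.0` assertions in the tests below).
# A minimal sketch with hypothetical sizes -- the real per-table size comes
# from calculate_graphsize(args), and the 8 is an assumed value of
# khmer._buckets_per_byte['nodegraph']:
def sketch_space_needed():
    buckets_per_byte = 8   # assumed: one bit per nodegraph bucket
    n_tables = 4           # khmer's default table count
    graphsize = 2e7        # hypothetical per-table size, in buckets
    space_needed = n_tables * graphsize / buckets_per_byte
    return space_needed    # 1e7 bytes, i.e. roughly 10 MB on disk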
def test_create_nodegraph_4():
    # tests too-big number of tables WITHOUT force
    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0, 0)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports number of tables <= 20.' in err, err
def test_create_nodegraph_3():
    # tests too-big ksize
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=35)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports k-mer sizes <= 32.' in err, err
def test_create_nodegraph_4():
    # tests too-big number of tables WITHOUT force
    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0,
                              False, 0)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports number of tables <= 20.' in err, err
def test_create_nodegraph_5():
    # tests too-big number of tables WITH force
    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0, 1)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        message = "Warning: Maximum recommended number of tables is 20, " + \
                  "discarded by force nonetheless!"
        assert message in capture.getvalue()
    except SystemExit as e:
        print(str(e))
def test_create_nodegraph_5():
    # tests too-big number of tables WITH force
    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0,
                              False, 1)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        message = "Warning: Maximum recommended number of tables is 20, " + \
                  "discarded by force nonetheless!"
        assert message in capture.getvalue()
    except SystemExit as e:
        print(str(e))
def test_create_nodegraph_4_multiplier():
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    nodegraph = khmer_args.create_nodegraph(args, multiplier=2.0)
    assert sum(nodegraph.hashsizes()) / 8.0 < max_mem / 2.0, \
        sum(nodegraph.hashsizes())
def test_create_nodegraph_2():
    # tests overriding ksize by passing into create_nodegraph explicitly.
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    nodegraph = khmer_args.create_nodegraph(args, ksize=15)
    assert nodegraph.ksize() == 15
def test_create_nodegraph_1():
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    nodegraph = khmer_args.create_nodegraph(args)
    expected_hashsz = utils.longify([19999999, 19999981, 19999963, 19999927])
    assert nodegraph.hashsizes() == expected_hashsz, nodegraph.hashsizes()
    assert sum(nodegraph.hashsizes()) / 8.0 < max_mem, \
        sum(nodegraph.hashsizes())
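# Every test above builds its arguments through a FakeArgparseObject fixture
# standing in for an argparse.Namespace; its definition is not part of this
# selection, and the call sites pass five, six, or seven positional values
# depending on the test-suite version. A hypothetical sketch of the
# five-argument form, with field names inferred from how khmer_args consumes
# them -- not khmer's actual fixture:
class FakeArgparseObject(object):
    def __init__(self, ksize, n_tables, max_tablesize, max_memory_usage,
                 unique_kmers):
        # field names are assumptions inferred from create_nodegraph usage
        self.ksize = ksize
        self.n_tables = n_tables
        self.max_tablesize = max_tablesize
        self.max_memory_usage = max_memory_usage
        self.unique_kmers = unique_kmers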
def main(args): info("build-graph.py", ["graph", "SeqAn"]) report_on_config(args, graphtype="nodegraph") base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) graphsize = calculate_graphsize(args, "nodegraph") check_space_for_graph(args.output_filename, graphsize, args.force) print("Saving k-mer nodegraph to %s" % base, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr) if args.no_build_tagset: print("We WILL NOT build the tagset.", file=sys.stderr) else: print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr) print("making nodegraph", file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) oxfuncs.build_graph(filenames, nodegraph, args.threads, not args.no_build_tagset) print("Total number of unique k-mers: {0}".format(nodegraph.n_unique_kmers()), file=sys.stderr) print("saving k-mer nodegraph in", base, file=sys.stderr) nodegraph.save(base) if not args.no_build_tagset: print("saving tagset in", base + ".tagset", file=sys.stderr) nodegraph.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % nodegraph.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15) # 0.18 is ACTUAL MAX. Do not change. print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr) print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp) print("wrote to " + base + ".info and " + base, file=sys.stderr) if not args.no_build_tagset: print("and " + base + ".tagset", file=sys.stderr) sys.exit(0)
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='nodegraph')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer_args.create_nodegraph(args, ksize=kmer_size)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    # list_curve packs two 100-point series: entries [0:100] are the overlap
    # curve and entries [100:200] the dataset2 k-mer curve, matching the
    # column order of the CSV header above.
    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
def main(): # pylint: disable=too-many-locals,too-many-branches info("abundance-dist-single.py", ["counting", "SeqAn"]) args = get_parser().parse_args() report_on_config(args) check_input_files(args.input_sequence_filename, args.force) check_space([args.input_sequence_filename], args.force) if args.savetable: check_space_for_hashtable(args, "countgraph", args.force) if not args.squash_output and os.path.exists(args.output_histogram_filename): print("ERROR: %s exists; not squashing." % args.output_histogram_filename, file=sys.stderr) sys.exit(1) else: hist_fp = open(args.output_histogram_filename, "w") if args.csv: hist_fp_csv = csv.writer(hist_fp) # write headers: hist_fp_csv.writerow(["abundance", "count", "cumulative", "cumulative_fraction"]) print("making countgraph", file=sys.stderr) counting_hash = khmer_args.create_countgraph(args, multiplier=1.1) counting_hash.set_use_bigcount(args.bigcount) print("building k-mer tracking table", file=sys.stderr) tracking = khmer_args.create_nodegraph(args, multiplier=1.1) print("kmer_size:", counting_hash.ksize(), file=sys.stderr) print("k-mer counting table sizes:", counting_hash.hashsizes(), file=sys.stderr) print("outputting to", args.output_histogram_filename, file=sys.stderr) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print("consuming input, round 1 --", args.input_sequence_filename, file=sys.stderr) for _ in range(args.threads): thread = threading.Thread(target=counting_hash.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(thread) thread.start() for thread in threads: thread.join() if args.report_total_kmers: print("Total number of unique k-mers: {0}".format(counting_hash.n_unique_kmers()), file=sys.stderr) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = counting_hash.abundance_distribution_with_reads_parser(read_parser, tracking) abundance_lists.append(abundances) print("preparing hist from %s..." % args.input_sequence_filename, file=sys.stderr) rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print("consuming input, round 2 --", args.input_sequence_filename, file=sys.stderr) for _ in range(args.threads): thread = threading.Thread(target=__do_abundance_dist__, args=(rparser,)) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: print("ERROR: abundance distribution is uniformly zero; " "nothing to report.", file=sys.stderr) print("\tPlease verify that the input files are valid.", file=sys.stderr) sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) if args.csv: hist_fp_csv.writerow([_, i, sofar, round(frac, 3)]) else: print(_, i, sofar, round(frac, 3), file=hist_fp) if sofar == total: break if args.savetable: print("Saving k-mer counting table ", args.savetable, file=sys.stderr) print("...saving to", args.savetable, file=sys.stderr) counting_hash.save(args.savetable) print("wrote to: " + args.output_histogram_filename, file=sys.stderr)
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
    countgraph.set_use_bigcount(args.bigcount)

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
def main(): # pylint: disable=too-many-locals,too-many-statements info("do-partition.py", ["graph"]) args = sanitize_help(get_parser()).parse_args() report_on_config(args, graphtype="nodegraph") for infile in args.input_filenames: check_input_files(infile, args.force) check_space(args.input_filenames, args.force) print("Saving k-mer nodegraph to %s" % args.graphbase, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(args.input_filenames), file=sys.stderr) print("--", file=sys.stderr) print("SUBSET SIZE", args.subset_size, file=sys.stderr) print("N THREADS", args.threads, file=sys.stderr) print("--", file=sys.stderr) # load-graph.py print("making nodegraph", file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) for _, filename in enumerate(args.input_filenames): print("consuming input", filename, file=sys.stderr) nodegraph.consume_fasta_and_tag(filename) # 0.18 is ACTUAL MAX. Do not change. fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15) print("fp rate estimated to be %1.3f" % fp_rate, file=sys.stderr) # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print("** This script brakes for lumps: ", "stop_big_traversals is true.", file=sys.stderr) else: print("** Traverse all the things:", " stop_big_traversals is false.", file=sys.stderr) # # now, partition! # # divide the tags up into subsets divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size)) divvy = list(divvy) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((nodegraph, _, start, end)) print("enqueued %d subset tasks" % n_subsets, file=sys.stderr) open("%s.info" % args.graphbase, "w").write("%d subsets total\n" % (n_subsets)) if n_subsets < args.threads: args.threads = n_subsets # start threads! print("starting %d threads" % args.threads, file=sys.stderr) print("---", file=sys.stderr) threads = [] for _ in range(args.threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() assert threading.active_count() == args.threads + 1 print("done starting threads", file=sys.stderr) # wait for threads for _ in threads: _.join() print("---", file=sys.stderr) print("done making subsets! see %s.subset.*.pmap" % (args.graphbase,), file=sys.stderr) # merge-partitions pmap_files = glob.glob(args.graphbase + ".subset.*.pmap") print("loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0]), file=sys.stderr) nodegraph = khmer.Nodegraph(args.ksize, 1, 1) for pmap_file in pmap_files: print("merging", pmap_file, file=sys.stderr) nodegraph.merge_subset_from_disk(pmap_file) if args.remove_subsets: print("removing pmap files", file=sys.stderr) for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print("outputting partitions for", infile, file=sys.stderr) outfile = os.path.basename(infile) + ".part" part_count = nodegraph.output_partitions(infile, outfile) print("output %d partitions for %s" % (part_count, infile), file=sys.stderr) print("partitions are in", outfile, file=sys.stderr)
def main():  # pylint: disable=too-many-locals,too-many-statements
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_seqfile_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)

    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' %
                                                (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ),
          file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if not args.keep_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename, file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
                              'cumulative_fraction'])

    print('making countgraph', file=sys.stderr)
    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
    countgraph.set_use_bigcount(args.bigcount)

    print('building k-mer tracking graph', file=sys.stderr)
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

    print('kmer_size:', countgraph.ksize(), file=sys.stderr)
    print('k-mer countgraph sizes:', countgraph.hashsizes(), file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --', args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' % args.input_sequence_filename,
          file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --', args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph:
        print('Saving k-mer countgraph ', args.savegraph, file=sys.stderr)
        print('...saving to', args.savegraph, file=sys.stderr)
        countgraph.save(args.savegraph)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename, file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(
                ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    print('making countgraph', file=sys.stderr)
    counting_hash = khmer_args.create_countgraph(args, multiplier=1.1)
    counting_hash.set_use_bigcount(args.bigcount)

    print('building k-mer tracking table', file=sys.stderr)
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

    print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
    print('k-mer counting table sizes:', counting_hash.hashsizes(),
          file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --', args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers()), file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' % args.input_sequence_filename,
          file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --', args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        else:
            print(_, i, sofar, round(frac, 3), file=hist_fp)

        if sofar == total:
            break

    if args.savetable:
        print('Saving k-mer counting table ', args.savetable,
              file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        counting_hash.save(args.savetable)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
def create_nodegraph(self):
    return khmer_args.create_nodegraph(self.args)
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)

    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' %
                                                (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    assert threading.active_count() == args.threads + 1

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (args.graphbase,),
          file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
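# All three do-partition variants above hand (nodegraph, index, start, end)
# tuples to a `worker` thread target that is not part of this selection. A
# minimal sketch of what such a worker might look like, assuming khmer's
# do_subset_partition / save_subset_partitionmap API; the exact argument list
# in any given khmer release may differ:
import gc
import os
import queue
import sys


def worker(jobs, basename, stop_big_traversals):
    # hypothetical reconstruction: drain the task queue, partition each tag
    # subset, and write one .pmap file per job
    while True:
        try:
            (nodegraph, index, start, stop) = jobs.get(False)
        except queue.Empty:
            print('exiting', file=sys.stderr)
            return

        outfile = basename + '.subset.%d.pmap' % (index,)
        if os.path.exists(outfile):
            print('SKIPPING', outfile, '-- already exists', file=sys.stderr)
            continue

        print('starting:', basename, index, file=sys.stderr)

        # assumed API: partition the tags in [start, stop), honoring the
        # big-traversal switch passed down from the command line
        subset = nodegraph.do_subset_partition(start, stop, True,
                                               stop_big_traversals)

        print('saving:', basename, index, file=sys.stderr)
        nodegraph.save_subset_partitionmap(subset, outfile)
        del subset
        gc.collect()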