Example #1
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='nodegraph')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    # if optimization args are given, do optimization
    args = functions.do_sanity_checking(args, 0.01)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(args, 'nodegraph', args.force)

    print('Saving k-mer presence table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    htable = khmer_args.create_nodegraph(args)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
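In the real script the `args` namespace comes from khmer's argparse setup; none of its fields are defined in this snippet. A hypothetical minimal stand-in, with every field name read off main() above (the sizing fields that report_on_config() and create_nodegraph() consume are assumptions), would be:

from types import SimpleNamespace

# Hypothetical stand-in for the parsed command-line arguments; field
# names are inferred from main() above, not taken from khmer's parser.
fake_args = SimpleNamespace(
    output_filename='example.graph',   # becomes `base`
    input_filenames=['reads.fa'],
    no_build_tagset=False,
    force=False,
    threads=1,
    # sizing fields (assumed) consumed by khmer_args.create_nodegraph():
    ksize=20,
    n_tables=4,
    max_tablesize=int(2e7),
    unique_kmers=0,
)
# main(fake_args)  # would write example.graph.pt and example.graph.info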
Example #2
def main(args):
    graph_type = 'nodegraph'
    report_on_config(args, graphtype=graph_type)
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, graph_type)
    space_needed = (args.n_tables * graphsize /
                    khmer._buckets_per_byte[graph_type])
    check_space_for_graph(args.output_filename, space_needed, args.force)

    print('Saving k-mer nodegraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads,
                        not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        nodegraph.n_unique_kmers()),
          file=sys.stderr)

    print('saving k-mer nodegraph in', base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        nodegraph.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to ' + base + '.info and ' + base, file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
Example #3
def test_create_nodegraph_4():
    # tests too-big number of tables WITHOUT force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0, 0)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports number of tables <= 20.' in err, err
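FakeArgparseObject lives in khmer's test suite and is not shown in these snippets. A stand-in consistent with the seven-argument calls in Examples #6 and #8 below might look like the sketch here (attribute names are inferred from what khmer_args.create_nodegraph() reads, not copied from khmer's tests); in the older six-argument calls, here and in Example #7, the sixth positional argument is force and small_count does not exist.

# Hypothetical stand-in for the test helper; attribute names are
# inferred from the call sites and from create_nodegraph()'s inputs.
class FakeArgparseObject(object):
    def __init__(self, ksize, n_tables, max_tablesize, max_memory_usage,
                 unique_kmers=0, small_count=False, force=False):
        self.ksize = ksize
        self.n_tables = n_tables
        self.max_tablesize = max_tablesize
        self.max_memory_usage = max_memory_usage
        self.unique_kmers = unique_kmers
        self.small_count = small_count
        self.force = force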
Example #4
def test_create_nodegraph_3():
    # tests too-big ksize

    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=35)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports k-mer sizes <= 32.' in err, err
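The k <= 32 ceiling enforced here follows from khmer packing each base into two bits of a 64-bit hash word, so 32 bases exactly fill the word. An illustrative encoding (an inference from the error message, not khmer's own code):

def pack_kmer(seq):
    # two bits per base: 32 bases fill a 64-bit integer exactly
    encode = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    assert len(seq) <= 32, 'khmer only supports k-mer sizes <= 32.'
    value = 0
    for base in seq.upper():
        value = (value << 2) | encode[base]
    return value

assert pack_kmer('ACGT') == 0b00011011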
Example #5
def main(args):
    graph_type = 'nodegraph'
    report_on_config(args, graphtype=graph_type)
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, graph_type)
    space_needed = (args.n_tables * graphsize /
                    khmer._buckets_per_byte[graph_type])
    check_space_for_graph(args.output_filename, space_needed, args.force)

    print('Saving k-mer nodegraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads,
                        not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        nodegraph.n_unique_kmers()), file=sys.stderr)

    print('saving k-mer nodegraph in', base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        nodegraph.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to ' + base + '.info and ' + base, file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
Example #6
def test_create_nodegraph_4():
    # tests too-big number of tables WITHOUT force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0,
                              False, 0)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports number of tables <= 20.' in err, err
Example #7
def test_create_nodegraph_5():
    # tests too-big number of tables WITH force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0, 1)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        message = "Warning: Maximum recommended number of tables is 20, " + \
                  "discarded by force nonetheless!"
        assert message in capture.getvalue()
    except SystemExit as e:
        print(str(e))
Example #8
def test_create_nodegraph_5():
    # tests too-big number of tables WITH force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0,
                              False, 1)

    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_nodegraph(args, ksize=None)
        message = "Warning: Maximum recommended number of tables is 20, " + \
                  "discarded by force nonetheless!"
        assert message in capture.getvalue()
    except SystemExit as e:
        print(str(e))
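Note that both capture tests replace sys.stderr with a StringIO and never restore it, so the redirect leaks into whatever runs next. A scoped alternative (a sketch, not code from the khmer test suite):

import contextlib
import io

def capture_stderr(func, *args, **kwargs):
    # redirect_stderr restores sys.stderr even if func() raises
    out = io.StringIO()
    with contextlib.redirect_stderr(out):
        try:
            func(*args, **kwargs)
        except SystemExit:
            pass
    return out.getvalue()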
Example #9
def test_create_nodegraph_4_multiplier():
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    nodegraph = khmer_args.create_nodegraph(args, multiplier=2.0)
    assert sum(nodegraph.hashsizes()) / 8.0 < max_mem / 2.0, \
        sum(nodegraph.hashsizes())
Example #10
def test_create_nodegraph_2():
    # tests overriding ksize by passing into create_nodegraph explicitly.

    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    nodegraph = khmer_args.create_nodegraph(args, ksize=15)
    assert nodegraph.ksize() == 15
Example #11
def test_create_nodegraph_1():
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    nodegraph = khmer_args.create_nodegraph(args)
    expected_hashsz = utils.longify([19999999, 19999981, 19999963, 19999927])
    assert nodegraph.hashsizes() == expected_hashsz, nodegraph.hashsizes()

    assert sum(nodegraph.hashsizes()) / \
        8.0 < max_mem, sum(nodegraph.hashsizes())
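The expected sizes fall out of the memory budget: 1e7 bytes at one bit per nodegraph bucket gives 8e7 buckets, or 2e7 per table across four tables, and khmer rounds each table down to a distinct prime within that budget. A small sketch that reproduces the list above, assuming that rounding rule:

def _is_prime(n):
    if n < 4:
        return n > 1
    if n % 2 == 0:
        return False
    factor = 3
    while factor * factor <= n:
        if n % factor == 0:
            return False
        factor += 2
    return True

def primes_at_or_below(limit, count):
    # descending distinct primes, mirroring the assumed table-sizing rule
    n = limit if limit % 2 else limit - 1
    found = []
    while len(found) < count:
        if _is_prime(n):
            found.append(n)
        n -= 2
    return found

print(primes_at_or_below(int(2e7), 4))
# -> [19999999, 19999981, 19999963, 19999927], matching expected_hashsz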
Example #12
def main(args):
    info("build-graph.py", ["graph", "SeqAn"])

    report_on_config(args, graphtype="nodegraph")
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, "nodegraph")
    check_space_for_graph(args.output_filename, graphsize, args.force)

    print("Saving k-mer nodegraph to %s" % base, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print("We WILL NOT build the tagset.", file=sys.stderr)
    else:
        print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr)

    print("making nodegraph", file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads, not args.no_build_tagset)

    print("Total number of unique k-mers: {0}".format(nodegraph.n_unique_kmers()), file=sys.stderr)

    print("saving k-mer nodegraph in", base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print("saving tagset in", base + ".tagset", file=sys.stderr)
        nodegraph.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % nodegraph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr)
    print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp)

    print("wrote to " + base + ".info and " + base, file=sys.stderr)
    if not args.no_build_tagset:
        print("and " + base + ".tagset", file=sys.stderr)

    sys.exit(0)
Example #13
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='nodegraph')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer_args.create_nodegraph(args, ksize=kmer_size)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
Example #15
def main():  # pylint: disable=too-many-locals,too-many-branches
    info("abundance-dist-single.py", ["counting", "SeqAn"])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args, "countgraph", args.force)

    if not args.squash_output and os.path.exists(args.output_histogram_filename):
        print("ERROR: %s exists; not squashing." % args.output_histogram_filename, file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, "w")
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(["abundance", "count", "cumulative", "cumulative_fraction"])

    print("making countgraph", file=sys.stderr)
    counting_hash = khmer_args.create_countgraph(args, multiplier=1.1)
    counting_hash.set_use_bigcount(args.bigcount)

    print("building k-mer tracking table", file=sys.stderr)
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

    print("kmer_size:", counting_hash.ksize(), file=sys.stderr)
    print("k-mer counting table sizes:", counting_hash.hashsizes(), file=sys.stderr)
    print("outputting to", args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print("consuming input, round 1 --", args.input_sequence_filename, file=sys.stderr)
    for _ in range(args.threads):
        thread = threading.Thread(target=counting_hash.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print("Total number of unique k-mers: {0}".format(counting_hash.n_unique_kmers()), file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(read_parser, tracking)
        abundance_lists.append(abundances)

    print("preparing hist from %s..." % args.input_sequence_filename, file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print("consuming input, round 2 --", args.input_sequence_filename, file=sys.stderr)
    for _ in range(args.threads):
        thread = threading.Thread(target=__do_abundance_dist__, args=(rparser,))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; " "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.", file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        else:
            print(_, i, sofar, round(frac, 3), file=hist_fp)

        if sofar == total:
            break

    if args.savetable:
        print("Saving k-mer counting table ", args.savetable, file=sys.stderr)
        print("...saving to", args.savetable, file=sys.stderr)
        counting_hash.save(args.savetable)

    print("wrote to: " + args.output_histogram_filename, file=sys.stderr)
Example #16
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
    countgraph.set_use_bigcount(args.bigcount)

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)
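    # the tracking nodegraph marks k-mers whose count has already been
    # reported, so each unique k-mer lands in the histogram exactly once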

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
Example #17
def main():  # pylint: disable=too-many-locals,too-many-statements
    info("do-partition.py", ["graph"])
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype="nodegraph")

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print("Saving k-mer nodegraph to %s" % args.graphbase, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(args.input_filenames), file=sys.stderr)
    print("--", file=sys.stderr)
    print("SUBSET SIZE", args.subset_size, file=sys.stderr)
    print("N THREADS", args.threads, file=sys.stderr)
    print("--", file=sys.stderr)

    # load-graph.py

    print("making nodegraph", file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print("consuming input", filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15)
    print("fp rate estimated to be %1.3f" % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print("** This script brakes for lumps: ", "stop_big_traversals is true.", file=sys.stderr)
    else:
        print("** Traverse all the things:", " stop_big_traversals is false.", file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print("enqueued %d subset tasks" % n_subsets, file=sys.stderr)
    open("%s.info" % args.graphbase, "w").write("%d subsets total\n" % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print("starting %d threads" % args.threads, file=sys.stderr)
    print("---", file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    assert threading.active_count() == args.threads + 1

    print("done starting threads", file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print("---", file=sys.stderr)
    print("done making subsets! see %s.subset.*.pmap" % (args.graphbase,), file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + ".subset.*.pmap")

    print("loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print("merging", pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print("removing pmap files", file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print("outputting partitions for", infile, file=sys.stderr)
        outfile = os.path.basename(infile) + ".part"
        part_count = nodegraph.output_partitions(infile, outfile)
        print("output %d partitions for %s" % (part_count, infile), file=sys.stderr)
        print("partitions are in", outfile, file=sys.stderr)
Example #18
def main():  # pylint: disable=too-many-locals,too-many-statements
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_seqfile_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.',
              file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.',
              file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % args.graphbase,
         'w').write('%d subsets total\n' % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ),
          file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if not args.keep_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
Example #19
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename, file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
                              'cumulative_fraction'])

    print('making countgraph', file=sys.stderr)
    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
    countgraph.set_use_bigcount(args.bigcount)

    print('building k-mer tracking graph', file=sys.stderr)
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

    print('kmer_size:', countgraph.ksize(), file=sys.stderr)
    print('k-mer countgraph sizes:',
          countgraph.hashsizes(), file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --',
          args.input_sequence_filename, file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' %
          args.input_sequence_filename, file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --',
          args.input_sequence_filename, file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print(
            "\tPlease verify that the input files are valid.", file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph:
        print('Saving k-mer countgraph ', args.savegraph, file=sys.stderr)
        print('...saving to', args.savegraph, file=sys.stderr)
        countgraph.save(args.savegraph)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
Example #20
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)

    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename,
              file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(
                ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    print('making countgraph', file=sys.stderr)
    counting_hash = khmer_args.create_countgraph(args, multiplier=1.1)
    counting_hash.set_use_bigcount(args.bigcount)

    print('building k-mer tracking table', file=sys.stderr)
    tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

    print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
    print('k-mer counting table sizes:',
          counting_hash.hashsizes(),
          file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --',
          args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers()),
              file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' % args.input_sequence_filename,
          file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --',
          args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print(
            "ERROR: abundance distribution is uniformly zero; "
            "nothing to report.",
            file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        else:
            print(_, i, sofar, round(frac, 3), file=hist_fp)

        if sofar == total:
            break

    if args.savetable:
        print('Saving k-mer counting table ', args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        counting_hash.save(args.savetable)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
Example #21
    def create_nodegraph(self):
        return khmer_args.create_nodegraph(self.args)
Example #22
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' %
          args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(args.input_filenames), file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    assert threading.active_count() == args.threads + 1

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' %
          (args.graphbase,), file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (
            part_count, infile), file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)