def main():
    """Sweep reads into one output file per input, based on k-mer presence.

    One khmer Nodegraph is built per input file and loaded with that file's
    sequences. Every record of the reads file that is at least ``K`` long is
    then checked against each graph; when the first element of
    ``get_median_count`` is non-zero the record is written to that input's
    ``<basename>.sweep3`` file in the current working directory.

    All output files are closed before returning, even if loading or
    sweeping fails part-way through.
    """
    parser = build_construct_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
            print("** WARNING: hashsize is default!  "
                  "You absodefly want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' %
              args.min_hashsize, file=sys.stderr)
        print('', file=sys.stderr)
        # One graph is allocated per input file, hence the extra factor of
        # len(input_filenames) on top of the formula shown in the message.
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize / 8)' % (
                  args.n_hashes * args.min_hashsize *
                  len(args.input_filenames) / 8.), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inputlist = args.input_filenames
    readsfile = args.read_filename

    # (nodegraph, open output file) pairs, in the same order as inputlist.
    query_list = []
    try:
        # Allocate every graph and open every output up front, before any
        # loading starts.
        for inp_name in inputlist:
            # create a nodegraph data structure
            ht = khmer.Nodegraph(K, HT_SIZE, N_HT)

            outfile = os.path.basename(inp_name) + '.sweep3'
            outfp = open(outfile, 'w')
            query_list.append((ht, outfp))

        for (ht, _), inp_name in zip(query_list, inputlist):
            # load contigs, connect into N partitions
            print('loading input reads from', inp_name)
            ht.consume_fasta(inp_name)

        print('starting sweep.')

        # NOTE(review): m is reported in the progress line but is never
        # incremented anywhere in this function, so it always prints 0.
        m = 0
        for n, record in enumerate(screed.open(readsfile)):
            # Reads shorter than K are skipped without being queried.
            if len(record.sequence) < K:
                continue

            if n % 10000 == 0:
                print('...', n, m)

            for ht, outfp in query_list:
                count = ht.get_median_count(record.sequence)[0]
                if count:
                    outfp.write(output_single(record))
    finally:
        # Close (and thereby flush) every output that was opened.
        for _, outfp in query_list:
            outfp.close()
Exemplo n.º 2
0
def test_add_stop_tag():
    """A stop tag added to a Nodegraph is returned by get_stop_tags()."""
    stop_tag = 'AATAAG'
    graph = khmer.Nodegraph(6, 1, 1)
    graph.add_stop_tag(stop_tag)

    # Printed as well as asserted, so the value shows up in the test output.
    print(graph.get_stop_tags())
    expected = [stop_tag]
    assert graph.get_stop_tags() == expected
Exemplo n.º 3
0
 def setup(self):
     """Attach a fresh Nodegraph to the instance as ``self.ht``."""
     ksize, table_size, n_tables = 12, 1e4, 2
     self.ht = khmer.Nodegraph(ksize, table_size, n_tables)
Exemplo n.º 4
0
def main():
    """Build and save a khmer Nodegraph (bloom filter) for a query file.

    Command-line arguments select the false positive rate, the k-mer size,
    the thread count, an optional pre-built "intersect" nodegraph, the input
    sequence file and the output directory.

    The table dimensions come from ``optimal_size`` applied to a HyperLogLog
    cardinality estimate of the query file. Without an intersect nodegraph
    the whole query file is consumed, using ``--threads`` threads. With one,
    only k-mers that are present in the intersect nodegraph are inserted,
    in a single thread.

    The result is saved to
    ``<out_dir>/<basename(in_file)>.NodeGraph.K<k_size>``.

    Raises:
        Exception: if the intersect nodegraph file does not exist, cannot be
            loaded, or was built with a different k-mer size.
    """
    parser = argparse.ArgumentParser(
        description=
        "This script will create node graph for a given k-mer size and query file (can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        help=
        "Location of Node Graph. Will only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space (unfortunately will also disable threading)."
    )
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    intersect_nodegraph_file = args.intersect_nodegraph
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        # Only the load itself is guarded, so that the k-mer size mismatch
        # below is reported with its own message instead of being rewritten
        # as a load failure.
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
        except Exception as err:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file) from err
        if intersect_nodegraph.ksize() != ksize:
            raise Exception(
                "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                   ksize))
    fprate = args.fp_rate

    # Estimate the number of distinct k-mers in the query to size the filter.
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        # All threads pull reads from the same parser object.
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
        # (WARNING: this will cause the Jaccard index to be calculated in terms of J(query\intersect hash_list, training)
        #  instead of J(query, training)
        # (TODO: fix this after khmer is updated)
        # n_unique_kmers() was the intended call but doesn't work due to a
        # khmer bug, so n_occupied() is used instead.
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()
        # At most we can insert as many k-mers as the intersect nodegraph
        # holds, so shrink the table when that count is the smaller one.
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                       res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            # Slide a window of width ksize across the sequence.
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)
    # Save the sample_kmers
    sample_kmers.save(node_graph_out)
Exemplo n.º 5
0
def main():
    """Build a graph of high-degree nodes and linear segments; save as GML.

    Loads every sequence file into a khmer Nodegraph, collects the high
    degree nodes reported by ``find_high_degree_nodes``, registers each as a
    segment in a ``Pathfinder``, then records adjacencies between high
    degree neighbors and walks the remaining neighbors with
    ``traverse_and_mark_linear_paths``. Segments and adjacencies are written
    to ``--output`` through ``GmlWriter``.

    Exits with status 0 if no high degree nodes are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x',
                        '--tablesize',
                        default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.

    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if too small set of graphs was used. The returned rate is
    # not needed here; the call is kept for the check it performs.
    khmer.calc_expected_collisions(graph,
                                   args.force,
                                   max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical sequences.
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)
    # The loading-phase stop filter is no longer needed.
    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree nodes into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path.  also
    # track minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML
    if args.output:
        print('saving to', args.output)
        # The context manager guarantees the file is flushed and closed
        # once all vertices and edges have been handed to the writer.
        with open(args.output, 'w') as fp:
            w = GmlWriter(fp, [], [])

            for k, v in pathy.segments.items():
                w.add_vertex(k, v, [])

            for k, v in pathy.adjacencies.items():
                for edge in v:
                    w.add_edge(k, edge, [])
Exemplo n.º 6
0
 def setup(self):
     """Attach a fresh Nodegraph to the instance as ``self.ht``."""
     ksize = 12
     table_size = 4**3 + 1
     n_tables = 2
     self.ht = khmer.Nodegraph(ksize, table_size, n_tables)
Exemplo n.º 7
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    """Run load-graph, partition-graph, merge and annotate in one pass.

    Steps, in order:

    1. Build a nodegraph from the input files, tagging as it consumes them.
    2. Divide the tags into subsets and hand each subset to ``worker``
       threads through a queue.
    3. Merge the ``<graphbase>.subset.*.pmap`` files found on disk into a
       fresh nodegraph (optionally deleting them afterwards).
    4. Write ``<basename>.part`` partition output for every input file.

    Progress is reported on stderr, and the subset count is written to
    ``<graphbase>.info``.
    """
    info('do-partition.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for filename in args.input_filenames:
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.',
              file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.',
              file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    # Trailing 0 gives the last subset an end value (divvy[i + 1] below).
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for subset_idx in range(n_subsets):
        start = divvy[subset_idx]
        end = divvy[subset_idx + 1]
        worker_q.put((nodegraph, subset_idx, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    # Use a context manager so the info file is flushed and closed promptly.
    with open('%s.info' % args.graphbase, 'w') as info_fp:
        info_fp.write('%d subsets total\n' % (n_subsets))

    # Never start more threads than there are subsets to process.
    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    # NOTE(review): this assumes every worker is still alive at this point
    # and that no other threads exist in the process - confirm.
    assert threading.active_count() == args.threads + 1

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for cur_thread in threads:
        cur_thread.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ),
          file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    # Guard the "first one" lookup: with no subsets there are no pmap files
    # and indexing an empty list would raise IndexError.
    if pmap_files:
        print('loading %d pmap files (first one: %s)' %
              (len(pmap_files), pmap_files[0]),
              file=sys.stderr)
    else:
        print('no pmap files found; nothing to merge', file=sys.stderr)

    # Fresh graph with 1 table of size 1, used for the merged subsets.
    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
Exemplo n.º 8
0
Arquivo: sbt.py Projeto: kdm9/sourmash
 def __call__(self):
     """Return a new Nodegraph built from this object's stored settings."""
     graph = khmer.Nodegraph(
         self.ksize,
         self.starting_size,
         self.n_tables,
     )
     return graph
Exemplo n.º 9
0
 def create_nodegraph():
     """Return a new Nodegraph using ksize, starting_size and n_tables
     from the enclosing scope."""
     graph = khmer.Nodegraph(ksize, starting_size, n_tables)
     return graph
Exemplo n.º 10
0
def test_KmerDegreeFunction():
    """KmerDegreeFunction evaluates to 0 for the only k-mer in the graph."""
    lone_kmer = Kmer('ACCTA')
    graph = khmer.Nodegraph(5, 1e4, 4)
    graph.add(str(lone_kmer))

    degree_fn = KmerDegreeFunction(graph)
    degree = degree_fn.evaluate_kmer(lone_kmer)
    assert degree == 0
Exemplo n.º 11
0
def test_KmerCountFunction():
    """KmerCountFunction evaluates to 1 for a k-mer added once."""
    added_kmer = Kmer('AAAAA')
    graph = khmer.Nodegraph(5, 1e4, 4)
    graph.add(str(added_kmer))

    count_fn = KmerCountFunction(graph)
    observed = count_fn.evaluate_kmer(added_kmer)
    assert observed == 1