def main():
    """Sweep reads against one nodegraph per input file and bin the hits.

    A ``khmer.Nodegraph`` is created for every name in ``input_filenames``
    and loaded with that file's sequences.  The reads in ``read_filename``
    are then streamed once; each read whose median k-mer count in a graph
    is non-zero is written to that graph's ``<basename>.sweep3`` file in
    the current working directory.
    """
    parser = build_construct_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
            print("** WARNING: hashsize is default! "
                  "You absodefly want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' %
              args.min_hashsize, file=sys.stderr)
        print('', file=sys.stderr)
        # One graph is allocated per input file, hence the extra factor.
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize / 8)' % (
                  args.n_hashes * args.min_hashsize *
                  len(args.input_filenames) / 8.), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    ksize = args.ksize
    table_size = args.min_hashsize
    n_tables = args.n_hashes

    inputlist = args.input_filenames
    readsfile = args.read_filename

    # (nodegraph, output file handle) pairs, in the same order as inputlist.
    query_list = []
    try:
        for inp_name in inputlist:
            # create a nodegraph data structure
            ht = khmer.Nodegraph(ksize, table_size, n_tables)

            outfile = os.path.basename(inp_name) + '.sweep3'
            outfp = open(outfile, 'w')
            query_list.append((ht, outfp))

        for inp_name, (ht, _) in zip(inputlist, query_list):
            # load contigs, connect into N partitions
            print('loading input reads from', inp_name)
            ht.consume_fasta(inp_name)

        print('starting sweep.')

        # NOTE(review): m is never incremented anywhere, so the second
        # number on the progress line is always 0 -- confirm the intent.
        m = 0
        for n, record in enumerate(screed.open(readsfile)):
            # Reads shorter than k contain no k-mers to look up.
            if len(record.sequence) < ksize:
                continue

            if n % 10000 == 0:
                print('...', n, m)

            for ht, outfp in query_list:
                count = ht.get_median_count(record.sequence)[0]
                if count:
                    outfp.write(output_single(record))
    finally:
        # Close every output file that was opened, even on failure, so
        # buffered reads are flushed and the handles are not leaked.
        for _, outfp in query_list:
            outfp.close()
def test_add_stop_tag():
    """A stop tag added to a nodegraph is reported by get_stop_tags()."""
    stop_kmer = 'AATAAG'

    graph = khmer.Nodegraph(6, 1, 1)
    graph.add_stop_tag(stop_kmer)

    # Echo the tags so they show up in the captured output on failure.
    print(graph.get_stop_tags())
    assert graph.get_stop_tags() == [stop_kmer]
def setup(self):
    """Attach a fresh nodegraph to the fixture as ``self.ht``."""
    ksize, table_size, n_tables = 12, 1e4, 2
    self.ht = khmer.Nodegraph(ksize, table_size, n_tables)
def main():
    """Build a khmer nodegraph of a query file's k-mers and save it.

    The output is written to
    ``<out_dir>/<basename(in_file)>.NodeGraph.K<k_size>``.  Without
    ``-i`` the whole query file is consumed using ``--threads`` threads.
    With ``-i`` only the k-mers that are also present in the given
    intersect nodegraph are inserted, in a single-threaded loop.

    Raises:
        Exception: if the intersect nodegraph file does not exist, cannot
            be loaded, or was built with a different k-mer size.
    """
    parser = argparse.ArgumentParser(
        description="This script will create node graph for a given k-mer size and query file (can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument(
        '-i', '--intersect_nodegraph',
        help="Location of Node Graph. Will only insert query k-mers in bloom filter if they appear anywhere in the training"
             " database. Note that the Jaccard estimates will now be "
             "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
             "but will use significantly less space (unfortunately will also disable threading)."
    )
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size",
                        default=21)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))

    intersect_nodegraph_file = args.intersect_nodegraph
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        # Only the load itself is guarded: the k-mer size check below must
        # not be swallowed and re-reported as a load failure.
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
        except Exception as err:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file) from err
        if intersect_nodegraph.ksize() != ksize:
            raise Exception(
                "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                   ksize))

    # Size the bloom filter from an estimate of the query's distinct k-mers.
    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)

    if intersect_nodegraph is None:
        # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                       res.num_htables)
        # All threads pull reads from the same parser.
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:
        # Otherwise, only put a k-mer in the bloom filter if it's in the
        # intersect list.
        # WARNING: this will cause the Jaccard index to be calculated in
        # terms of J(query\intersect hash_list, training) instead of
        # J(query, training).
        # TODO: fix this after khmer is updated.  The original author notes
        # that n_unique_kmers() "doesnt work due to khmer bug" and attaches
        # the same remark to n_occupied(), which is what is used here.
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
            # At max, we have as many k-mers as in the union of the
            # training database, so the smaller count sizes the filter.
            # (Original author's note: "But makes this always return 0".)
            res = optimal_size(intersect_nodegraph_kmer_count,
                               fp_rate=fprate)
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                       res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)

    # Save the sample_kmers
    sample_kmers.save(node_graph_out)
def main():
    """Extract a compacted graph from sequence files and save it as GML.

    All sequences are loaded into a nodegraph, the high-degree nodes are
    collected, and each becomes a segment in a ``Pathfinder``.  Every
    neighbour of a high-degree node is then either recorded as an
    adjacency (if it is itself high-degree) or walked as a linear path.
    The resulting segments and adjacencies are written to ``--output``.

    Exits via ``parser.error`` when ``--ksize`` is even or ``--output`` is
    missing, and via ``sys.exit(0)`` when no high-degree nodes are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()

    # Validate explicitly rather than with assert, which is stripped when
    # Python runs with -O and would let bad arguments through.
    if not args.ksize % 2:
        parser.error("ksize must be odd")
    if not args.output:
        parser.error("you probably want an output file")

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.
    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if too small set of graphs was used.  The returned rate is
    # not needed afterwards; the call is made for the check it performs.
    khmer.calc_expected_collisions(graph, args.force, max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical
            # sequences.
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)
    # The loading-phase stop filter is not used past this point.
    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree nodes into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path.  also
    # track minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML; the with-block closes (and flushes) the file when done.
    print('saving to', args.output)
    with open(args.output, 'w') as fp:
        w = GmlWriter(fp, [], [])

        for k, v in pathy.segments.items():
            w.add_vertex(k, v, [])

        for k, v in pathy.adjacencies.items():
            for edge in v:
                w.add_edge(k, edge, [])
def setup(self):
    """Attach a deliberately small nodegraph to the fixture as ``self.ht``."""
    ksize = 12
    table_size = 4**3 + 1
    n_tables = 2
    self.ht = khmer.Nodegraph(ksize, table_size, n_tables)
def main():  # pylint: disable=too-many-locals,too-many-statements
    """Run the whole partitioning pipeline on the input sequence files.

    Steps, in order:

    1. load every input file into a tagged nodegraph;
    2. split the tags into subsets and partition them in worker threads,
       which are expected to leave ``<graphbase>.subset.*.pmap`` files;
    3. merge those pmap files into a fresh nodegraph (optionally deleting
       them afterwards);
    4. write ``<basename>.part`` for each input file with its partitions.

    Progress is reported on stderr.
    """
    info('do-partition.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for filename in args.input_filenames:
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    # Sentinel so the last subset also has an "end" entry at index + 1.
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for subset_n in range(n_subsets):
        start = divvy[subset_n]
        end = divvy[subset_n + 1]
        worker_q.put((nodegraph, subset_n, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    # Use a with-block so the info file is flushed and closed right away
    # instead of relying on garbage collection of an anonymous handle.
    with open('%s.info' % args.graphbase, 'w') as info_fp:
        info_fp.write('%d subsets total\n' % (n_subsets))

    # No point in starting more threads than there are tasks.
    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    # NOTE(review): this counts every live thread in the process, so it
    # can fail if a worker finishes before this line runs or if unrelated
    # threads exist -- confirm whether the check should stay.
    assert threading.active_count() == args.threads + 1

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for cur_thread in threads:
        cur_thread.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' %
          (args.graphbase, ), file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    # A minimal nodegraph that only serves as the merge target.
    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def __call__(self):
    """Return a new nodegraph built from this object's stored settings."""
    graph_args = (self.ksize, self.starting_size, self.n_tables)
    return khmer.Nodegraph(*graph_args)
def create_nodegraph():
    """Return a new nodegraph using the enclosing scope's size settings."""
    fresh_graph = khmer.Nodegraph(ksize, starting_size, n_tables)
    return fresh_graph
def test_KmerDegreeFunction():
    """KmerDegreeFunction evaluates to 0 for the only k-mer in the graph."""
    lone_kmer = Kmer('ACCTA')

    graph = khmer.Nodegraph(5, 1e4, 4)
    graph.add(str(lone_kmer))

    degree_fn = KmerDegreeFunction(graph)
    observed = degree_fn.evaluate_kmer(lone_kmer)
    assert observed == 0
def test_KmerCountFunction():
    """KmerCountFunction evaluates to 1 for a k-mer added to the graph once."""
    probe = Kmer('AAAAA')

    graph = khmer.Nodegraph(5, 1e4, 4)
    graph.add(str(probe))

    count_fn = KmerCountFunction(graph)
    observed = count_fn.evaluate_kmer(probe)
    assert observed == 1