def test_partition_graph_nojoin_stoptags():  # test with stoptags
    """A loaded stop tag should break the random-20-a graph in two."""
    # Build the graph, then record one stop tag and persist it.
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
    graph = khmer.load_hashbits(graphbase + '.pt')
    graph.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    graph.save_stop_tags(stoptags_file)
    del graph

    # Partition with the stoptags option, then merge the subset pmaps.
    utils.runscript(scriptpath('partition-graph.py'),
                    ['--stoptags', stoptags_file, graphbase])
    utils.runscript(scriptpath('merge-partitions.py'),
                    [graphbase, '-k', str(20)])

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    graph = khmer.load_hashbits(graphbase + '.pt')
    graph.load_tagset(graphbase + '.tagset')
    graph.load_partitionmap(final_pmap_file)
    counts = graph.count_partitions()
    assert counts == (2, 0), counts  # should be 2 partitions
def test_partition_graph_nojoin_stoptags(): # test with stoptags graphbase = _make_graph(utils.get_test_data('random-20-a.fa')) in_dir = os.path.dirname(graphbase) # add in some stop tags ht = khmer.load_hashbits(graphbase + '.ht') ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') stoptags_file = graphbase + '.stoptags' ht.save_stop_tags(stoptags_file) del ht # run script with stoptags option script = scriptpath('partition-graph.py') args = ['--stoptags', stoptags_file, graphbase] (status, out, err) = runscript(script, args) assert status == 0 script = scriptpath('merge-partitions.py') args = [graphbase, '-k', str(20)] (status, out, err) = runscript(script, args) print out print err assert status == 0 final_pmap_file = graphbase + '.pmap.merged' assert os.path.exists(final_pmap_file) ht = khmer.load_hashbits(graphbase + '.ht') ht.load_partitionmap(final_pmap_file) x = ht.count_partitions() assert x == (2, 0) # should be 2 partitions
def test_partition_graph_nojoin_stoptags():  # test with stoptags
    """Partitioning honours stop tags: expect exactly two partitions."""
    graphbase = _make_graph(utils.get_test_data("random-20-a.fa"))
    in_dir = os.path.dirname(graphbase)

    # add in one stop tag and save it alongside the graph
    hashgraph = khmer.load_hashbits(graphbase + ".ht")
    hashgraph.add_stop_tag("TTGCATACGTTGAGCCAGCG")
    stoptags_file = graphbase + ".stoptags"
    hashgraph.save_stop_tags(stoptags_file)
    del hashgraph

    # partition with the stoptags, then merge the resulting pmaps
    runscript(scriptpath("partition-graph.py"),
              ["--stoptags", stoptags_file, graphbase])
    runscript(scriptpath("merge-partitions.py"), [graphbase, "-k", str(20)])

    final_pmap_file = graphbase + ".pmap.merged"
    assert os.path.exists(final_pmap_file)

    hashgraph = khmer.load_hashbits(graphbase + ".ht")
    hashgraph.load_partitionmap(final_pmap_file)
    assert hashgraph.count_partitions() == (2, 0)  # should be 2 partitions
def test_partition_graph_nojoin_k21(): # test with K=21 graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), K=21) in_dir = os.path.dirname(graphbase) script = scriptpath('partition-graph.py') args = [graphbase] (status, out, err) = runscript(script, args) assert status == 0 script = scriptpath('merge-partitions.py') args = [graphbase, '-k', str(21)] (status, out, err) = runscript(script, args) print out print err assert status == 0 final_pmap_file = graphbase + '.pmap.merged' assert os.path.exists(final_pmap_file) ht = khmer.load_hashbits(graphbase + '.ht') ht.load_partitionmap(final_pmap_file) x = ht.count_partitions() assert x == (99, 0) # should be 99 partitions at K=21
def main():
    # Usage: <script> <hashbits file> <sequence files...>
    # Trims each input read at sum-of-degree (sodd) positions above
    # MAX_SODD and writes survivors to <basename>.sodd in the cwd.
    htfile = sys.argv[1]
    outfiles = sys.argv[2:]

    print 'loading hashbits'
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        # Per-record filter run by worker threads: drop reads containing
        # N, trim on sodd, keep only reads still at least k long.
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)

        if trim_at >= ht.ksize():
            return name, trim_seq

        return None, None

    for filename in outfiles:
        outpath = os.path.basename(filename) + '.sodd'
        # NOTE(review): outfp is never closed explicitly; presumably
        # tsp.start flushes before returning -- confirm
        outfp = open(outpath, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(filename), outfp)
def main(): already_part = sys.argv[1] new_to_part = sys.argv[2] basename = os.path.basename(new_to_part) pmap_filename = sys.argv[3] # if not os.path.exists(already_part): # print '%s doesn\'t exist! dying.' % already_part # sys.exit(0) # create a fake-ish ht; K matters, but not hashtable size. ht = khmer.load_hashbits(already_part + '.ht') ht.load_tagset(already_part + '.tagset') ht.merge_subset_from_disk(pmap_filename) # find singletons n_singletons = ht.find_unpart( new_to_part, TRAVERSE_ON_UNPART, STOP_BIG_TRAVERSALS) print 'found:', n_singletons print 'saving', basename + '.unpart' n_partitions = ht.output_partitions(new_to_part, basename + '.unpart') print 'saving', basename + '.pmap' ht.save_partitionmap(basename + '.pmap') ### (n_partitions, n_singletons) = ht.count_partitions() print 'output partitions:', n_partitions print 'pmap partitions:', n_partitions print 'singletons:', n_singletons
def main(): already_part = sys.argv[1] new_to_part = sys.argv[2] basename = os.path.basename(new_to_part) pmap_filename = sys.argv[3] # if not os.path.exists(already_part): # print '%s doesn\'t exist! dying.' % already_part # sys.exit(0) # create a fake-ish ht; K matters, but not hashtable size. ht = khmer.load_hashbits(already_part + '.ht') ht.load_tagset(already_part + '.tagset') ht.merge_subset_from_disk(pmap_filename) # find singletons n_singletons = ht.find_unpart(new_to_part, TRAVERSE_ON_UNPART, STOP_BIG_TRAVERSALS) print 'found:', n_singletons print 'saving', basename + '.unpart' n_partitions = ht.output_partitions(new_to_part, basename + '.unpart') print 'saving', basename + '.pmap' ht.save_partitionmap(basename + '.pmap') ### (n_partitions, n_singletons) = ht.count_partitions() print 'output partitions:', n_partitions print 'pmap partitions:', n_partitions print 'singletons:', n_singletons
def test_load_graph():
    """load-graph.py -t should emit a presence table plus a tagset."""
    infile = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', outfile, infile]

    (status, out, err) = utils.runscript(scriptpath('load-graph.py'), args)
    assert 'Total number of k-mers: 3959' in err, err

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file
    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    graph = khmer.load_hashbits(ht_file)
    graph.load_tagset(tagset_file)

    # Partitioning this data set should give a single partition -- a
    # roundabout way of checking that load-graph worked.
    subset = graph.do_subset_partition(0, 0)
    counts = graph.subset_count_partitions(subset)
    assert counts == (1, 0), counts
def test_load_graph():
    """load-graph.py should produce a .ht table and a .tagset file."""
    infile = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')
    args = ['-x', '1e7', '-N', '2', '-k', '20', outfile, infile]

    (status, out, err) = runscript(scriptpath('load-graph.py'), args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file
    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    graph = khmer.load_hashbits(ht_file)
    graph.load_tagset(tagset_file)

    # Everything should land in one partition -- an indirect check that
    # load-graph produced a sane graph.
    subset = graph.do_subset_partition(0, 0)
    counts = graph.subset_count_partitions(subset)
    assert counts == (1, 0), counts
def main(): info('count-overlap.py', ['counting']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') for infile in [args.ptfile, args.fafile]: check_file_status(infile) check_space([args.ptfile, args.fafile]) print 'loading k-mer presence table from', args.ptfile ht1 = khmer.load_hashbits(args.ptfile) kmer_size = ht1.ksize() output = open(args.report_filename, 'w') f_curve_obj = open(args.report_filename + '.curve', 'w') ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables) (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1) printout1 = """\ dataset1(pt file): %s dataset2: %s # of unique k-mers in dataset2: %d # of overlap unique k-mers: %d """ % (args.ptfile, args.fafile, n_unique, n_overlap) output.write(printout1) for i in range(100): to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n' f_curve_obj.write(to_print)
def test_load_graph():
    """Smoke-test load-graph.py: expect a .ht file and a .tagset file."""
    script = scriptpath("load-graph.py")
    infile = utils.get_test_data("random-20-a.fa")
    outfile = utils.get_temp_filename("out")
    runscript(script, ["-x", "1e7", "-N", "2", "-k", "20", outfile, infile])

    ht_file = outfile + ".ht"
    assert os.path.exists(ht_file), ht_file
    tagset_file = outfile + ".tagset"
    assert os.path.exists(tagset_file), tagset_file

    graph = khmer.load_hashbits(ht_file)
    graph.load_tagset(tagset_file)

    # one partition expected -- a roundabout confirmation that the graph
    # written by load-graph is intact
    subset = graph.do_subset_partition(0, 0)
    result = graph.subset_count_partitions(subset)
    assert result == (1, 0), result
def main():
    # Find an initial set of highly connected k-mers (HCKs) in a saved
    # graph and write them out as <graphbase>.stoptags for later runs.
    parser = argparse.ArgumentParser(
        description="Find an initial set of highly connected k-mers.")

    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags', '-S', dest='stoptags', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('graphbase')

    args = parser.parse_args()
    graphbase = args.graphbase

    print 'loading ht %s.ht' % graphbase
    ht = khmer.load_hashbits(graphbase + '.ht')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    print 'loading tagset %s.tagset...' % graphbase
    ht.load_tagset(graphbase + '.tagset')

    K = ht.ksize()
    # counting hash consumed by repartition_largest_partition below
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # divide up into SUBSET_SIZE fragments
    divvy = ht.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print 'doing pre-partitioning from', start, 'to', end
    subset = ht.do_subset_partition(start, end)

    # now, repartition...
    print 'repartitioning to find HCKs.'
    ht.repartition_largest_partition(subset, counting,
                                     EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    print 'saving stop tags'
    ht.save_stop_tags(graphbase + '.stoptags')
def main():
    # make-initial-stoptags: load a saved graph (.pt) plus tagset,
    # partition one subset of tags, and save the highly connected k-mers
    # found there as <graphbase>.stoptags.
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    # counting hash consumed by repartition_largest_partition below
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    # make-initial-stoptags (force-aware variant): partition one subset of
    # the saved graph and write the highly connected k-mers found there to
    # <graphbase>.stoptags.
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    # counting hash consumed by repartition_largest_partition below
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def test_partition_graph_big_traverse():
    """Exhaustive traversal should join the big lump into one partition."""
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True, stop_big_traverse=False)

    merged = graphbase + '.pmap.merged'
    assert os.path.exists(merged)

    graph = khmer.load_hashbits(graphbase + '.ht')
    graph.load_partitionmap(merged)
    # should be exactly one partition.
    assert graph.count_partitions() == (1, 0)
def main():
    # make-initial-stoptags: partition one subset of the saved graph and
    # write the highly connected k-mers found there to
    # <graphbase>.stoptags.
    info("make-initial-stoptags.py", ["graph"])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + ".pt", graphbase + ".tagset"]
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, "loading htable %s.pt" % graphbase
    htable = khmer.load_hashbits(graphbase + ".pt")

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, "loading stoptags from", args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, "loading tagset %s.tagset..." % graphbase
    htable.load_tagset(graphbase + ".tagset")

    ksize = htable.ksize()
    # counting hash used while repartitioning to find the HCKs
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, "doing pre-partitioning from", start, "to", end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, "repartitioning to find HCKs."
    htable.repartition_largest_partition(
        subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
        EXCURSION_KMER_COUNT_THRESHOLD
    )

    print >>sys.stderr, "saving stop tags"
    htable.save_stop_tags(graphbase + ".stoptags")

    print >>sys.stderr, "wrote to:", graphbase + ".stoptags"
def test_partition_graph_no_big_traverse():
    """With big traversals truncated, the lump stays in four pieces."""
    # do NOT exhaustively traverse
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True, stop_big_traverse=True)

    merged = graphbase + '.pmap.merged'
    assert os.path.exists(merged)

    graph = khmer.load_hashbits(graphbase + '.ht')
    graph.load_partitionmap(merged)
    parts = graph.count_partitions()
    assert parts == (4, 0), parts  # should be four partitions, broken at knot.
def test_load_graph_no_tags():
    """With -n, load-graph.py writes the table but skips the tagset."""
    infile = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')
    runscript(scriptpath('load-graph.py'),
              ['-x', '1e7', '-N', '2', '-k', '20', '-n', outfile, infile])

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file
    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    # the table itself must still be loadable
    assert khmer.load_hashbits(ht_file)
def test_load_graph_no_tags():
    """With -n, load-graph.py emits a .ht table and no .tagset file."""
    script = scriptpath("load-graph.py")
    args = ["-x", "1e7", "-N", "2", "-k", "20", "-n"]

    outfile = utils.get_temp_filename("out")
    infile = utils.get_test_data("random-20-a.fa")

    args.extend([outfile, infile])

    runscript(script, args)

    ht_file = outfile + ".ht"
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + ".tagset"
    assert not os.path.exists(tagset_file), tagset_file

    # FIX: assert the load result (matching the sibling variants of this
    # test) instead of binding it to an unused local, so a falsy/failed
    # load is reported rather than silently ignored
    assert khmer.load_hashbits(ht_file)
def test_load_graph_no_tags():
    """-n suppresses tagset output; the presence table must still load."""
    infile = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')
    utils.runscript(scriptpath('load-graph.py'),
                    ['-x', '1e7', '-N', '2', '-k', '20', '-n',
                     outfile, infile])

    table_file = outfile + '.pt'
    assert os.path.exists(table_file), table_file
    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    assert khmer.load_hashbits(table_file)
def main():
    """Report k-mer overlap between a saved presence table and a FASTA file.

    Writes a summary to args.report_filename and a 100-point curve (plain
    text, or CSV with --csv) to args.report_filename + '.curve'.
    """
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    # second table at the same k, used to stream the FASTA file against ht1
    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)
    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)

    # FIX: context managers close both report files even on error; the
    # original never closed either handle.
    with open(args.report_filename, 'w') as output:
        output.write(printout1)

    with open(args.report_filename + '.curve', 'w') as f_curve_obj:
        if args.csv:
            f_curve_obj_csv = csv.writer(f_curve_obj)
            # write headers:
            f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

        for i in range(100):
            if args.csv:
                f_curve_obj_csv.writerow([list_curve[100 + i],
                                          list_curve[i]])
            else:
                print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
def test_partition_graph_1():
    """Partition + merge on random-20-a should give one partition."""
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    runscript(scriptpath('partition-graph.py'), [graphbase])
    runscript(scriptpath('merge-partitions.py'), [graphbase, '-k', str(20)])

    merged = graphbase + '.pmap.merged'
    assert os.path.exists(merged)

    graph = khmer.load_hashbits(graphbase + '.ht')
    graph.load_partitionmap(merged)
    # should be exactly one partition.
    assert graph.count_partitions() == (1, 0)
def test_partition_graph_1():
    """End-to-end partition/merge: the whole graph is one partition."""
    graphbase = _make_graph(utils.get_test_data("random-20-a.fa"))
    in_dir = os.path.dirname(graphbase)

    runscript(scriptpath("partition-graph.py"), [graphbase])
    runscript(scriptpath("merge-partitions.py"), [graphbase, "-k", str(20)])

    final_pmap_file = graphbase + ".pmap.merged"
    assert os.path.exists(final_pmap_file)

    hashgraph = khmer.load_hashbits(graphbase + ".ht")
    hashgraph.load_partitionmap(final_pmap_file)
    # should be exactly one partition.
    assert hashgraph.count_partitions() == (1, 0)
def test_partition_graph_nojoin_k21():  # test with K=21
    """At K=21 the test graph breaks into 99 partitions."""
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), ksize=21)

    runscript(scriptpath('partition-graph.py'), [graphbase])
    runscript(scriptpath('merge-partitions.py'), [graphbase, '-k', str(21)])

    merged = graphbase + '.pmap.merged'
    assert os.path.exists(merged)

    graph = khmer.load_hashbits(graphbase + '.pt')
    graph.load_tagset(graphbase + '.tagset')
    graph.load_partitionmap(merged)
    parts = graph.count_partitions()
    assert parts == (99, 0), parts  # should be 99 partitions at K=21
def test_partition_graph_nojoin_k21():  # test with K=21
    """Partition/merge at K=21: expect 99 partitions."""
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), ksize=21)

    utils.runscript(scriptpath('partition-graph.py'), [graphbase])
    utils.runscript(scriptpath('merge-partitions.py'),
                    [graphbase, '-k', str(21)])

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    hashgraph = khmer.load_hashbits(graphbase + '.pt')
    hashgraph.load_tagset(graphbase + '.tagset')
    hashgraph.load_partitionmap(final_pmap_file)
    result = hashgraph.count_partitions()
    assert result == (99, 0), result  # should be 99 partitions at K=21
def main():
    # Command-line driver for partitioning: split the tagged graph into
    # subsets and partition each subset in its own worker thread, writing
    # one .subset.N.pmap file per subset.
    parser = argparse.ArgumentParser(description="Partition a graph.")

    parser.add_argument('basename')
    parser.add_argument('--stoptags', '-S', dest='stoptags', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--threads', '-T', dest='n_threads',
                        default=DEFAULT_N_THREADS,
                        help='Number of simultaneous threads to execute')

    args = parser.parse_args()
    basename = args.basename

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    if args.stoptags:
        print 'stoptag file:', args.stoptags
    print '--'

    print 'loading ht %s.ht' % basename
    ht = khmer.load_hashbits(basename + '.ht')
    ht.load_tagset(basename + '.tagset')

    # retrieve K
    K = ht.ksize()

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = ht.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)  # sentinel end coordinate for the final subset

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for i in range(0, n_subsets):
        start = divvy[i]
        end = divvy[i + 1]
        worker_q.put((ht, i, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    # never start more threads than there are subsets
    n_threads = int(args.n_threads)
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % n_threads
    print '---'

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=worker, args=(worker_q, basename,
                                                  stop_big_traversals))
        threads.append(t)
        t.start()

    print 'done starting threads'

    # wait for threads
    for t in threads:
        t.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (basename,)
def main():
    # find-knots: repartition each saved subset pmap to locate highly
    # connected k-mers ("knots"), accumulating them into
    # <graphbase>.stoptags; processed pmaps are renamed *.processed.
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, False)

    check_space(infiles, False)

    print >>sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >>sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >>sys.stderr, '---'
    print >>sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >>sys.stderr, '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >>sys.stderr, '** repartitioning, round 2... %s' % subset_file
        # second pass over the merged map (subset=None)
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** repartitioned size:', size

        # save after every pmap so progress survives interruption
        print >>sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print >>sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >>sys.stderr, 'done!'
def main():
    # partition-graph.py driver: divide the tagged graph into subsets and
    # partition each subset in its own worker thread, producing one
    # .subset.N.pmap file per subset.
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    filenames = [basename + '.pt', basename + '.tagset']
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print >> sys.stderr, '--'
    print >> sys.stderr, 'SUBSET SIZE', args.subset_size
    print >> sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >> sys.stderr, 'stoptag file:', args.stoptags
    print >> sys.stderr, '--'

    print >> sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >> sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)  # sentinel end coordinate for the final subset

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >> sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    # never start more threads than there are subsets
    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print >> sys.stderr, 'starting %d threads' % n_threads
    print >> sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >> sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >> sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
def main():
    # find-knots: repartition each saved subset pmap to find highly
    # connected k-mers ("knots") and accumulate them into
    # <graphbase>.stoptags; processed pmaps are renamed *.processed.
    info("find-knots.py", ["graph"])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + ".pt", graphbase + ".tagset"]
    if os.path.exists(graphbase + ".stoptags"):
        infiles.append(graphbase + ".stoptags")
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, "loading k-mer presence table %s.pt" % graphbase
    htable = khmer.load_hashbits(graphbase + ".pt")

    print >>sys.stderr, "loading tagset %s.tagset..." % graphbase
    htable.load_tagset(graphbase + ".tagset")

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + ".stoptags"):
        print >>sys.stderr, "loading stoptags %s.stoptags" % graphbase
        htable.load_stop_tags(graphbase + ".stoptags")
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + ".subset.*.pmap")

    print >>sys.stderr, "loading %d pmap files (first one: %s)" % (
        len(pmap_files), pmap_files[0])
    print >>sys.stderr, "---"
    print >>sys.stderr, "output stoptags will be in", graphbase + ".stoptags"
    if initial_stoptags:
        print >>sys.stderr, \
            "(these output stoptags will include the already-loaded set)"
    print >>sys.stderr, "---"

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, "<-", subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, "** repartitioning subset... %s" % subset_file
        htable.repartition_largest_partition(
            subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD
        )

        print >>sys.stderr, "** merging subset... %s" % subset_file
        htable.merge_subset(subset)

        print >>sys.stderr, "** repartitioning, round 2... %s" % subset_file
        # second pass over the merged map (subset=None)
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD
        )

        print >>sys.stderr, "** repartitioned size:", size

        # save after every pmap so progress survives interruption
        print >>sys.stderr, "saving stoptags binary"
        htable.save_stop_tags(graphbase + ".stoptags")
        os.rename(subset_file, subset_file + ".processed")
        print >>sys.stderr, "(%d of %d)\n" % (index, len(pmap_files))

    print >>sys.stderr, "done!"
def main():
    # Count overlapping unique k-mers between a saved hashbits table and a
    # FASTA file; write a summary report plus a 100-point curve file.
    parser = argparse.ArgumentParser(
        description='Use bloom filter to count overlap k-mers')

    # defaults may be overridden through the environment
    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_HASHSIZE)

    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('--ksize', '-k', type=int, dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use '
                        '(should be the same as in htfile)')
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='hashsize',
                        default=env_hashsize, help='hashsize to use')
    parser.add_argument('htfile')
    parser.add_argument('fafile')
    parser.add_argument('report_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.hashsize == DEFAULT_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default! " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - hashsize = %-5.2g \t(-x)' % args.hashsize
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes (n_hashes x hashsize / 8)' \
            % (args.n_hashes * args.hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.hashsize
    N_HT = args.n_hashes

    htfile = args.htfile
    fafile = args.fafile
    output_filename = args.report_filename
    curve_filename = output_filename + '.curve'

    print 'loading hashbits from', htfile
    ht1 = khmer.load_hashbits(htfile)
    # the k stored in the file wins over the -k argument
    K = ht1.ksize()

    # NOTE(review): neither report handle is closed explicitly
    output = open(output_filename, 'w')
    f_curve_obj = open(curve_filename, 'w')

    # fresh table at the same k, used to stream fafile against ht1
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    (n_unique, n_overlap, list) = ht2.count_overlap(fafile, ht1)

    printout1 = """\
dataset1(ht file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (htfile, fafile, n_unique, n_overlap)
    output.write(printout1)

    figure_list1 = []
    figure_list2 = []

    # 100-point curve: pairs taken from list[100+i] and list[i];
    # NOTE(review): the meaning of the two halves comes from
    # count_overlap's implementation -- confirm before relying on it
    for i in range(100):
        to_print = str(list[100 + i]) + ' ' + str(list[i]) + '\n'
        f_curve_obj.write(to_print)
def main(): parser = argparse.ArgumentParser(description="Partition a graph.") parser.add_argument('basename') parser.add_argument('--stoptags', '-S', dest='stoptags', default='', help="Use stoptags in this file during partitioning") parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, dest='subset_size', type=float, help='Set subset size (usually 1e5-1e6 is good)') parser.add_argument('--no-big-traverse', dest='no_big_traverse', action='store_true', default=False, help='Truncate graph joins at big traversals') parser.add_argument('--threads', '-T', dest='n_threads', default=DEFAULT_N_THREADS, help='Number of simultaneous threads to execute') args = parser.parse_args() basename = args.basename print '--' print 'SUBSET SIZE', args.subset_size print 'N THREADS', args.n_threads if args.stoptags: print 'stoptag file:', args.stoptags print '--' print 'loading ht %s.ht' % basename ht = khmer.load_hashbits(basename + '.ht') ht.load_tagset(basename + '.tagset') # retrieve K K = ht.ksize() # do we want to load stop tags, and do they exist? if args.stoptags: print 'loading stoptags from', args.stoptags ht.load_stop_tags(args.stoptags) # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print '** This script brakes for lumps: stop_big_traversals is true.' else: print '** Traverse all the things: stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = ht.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for i in range(0, n_subsets): start = divvy[i] end = divvy[i + 1] worker_q.put((ht, i, start, end)) print 'enqueued %d subset tasks' % n_subsets open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets)) n_threads = int(args.n_threads) if n_subsets < n_threads: n_threads = n_subsets # start threads! 
print 'starting %d threads' % n_threads print '---' threads = [] for n in range(n_threads): t = threading.Thread(target=worker, args=(worker_q, basename, stop_big_traversals)) threads.append(t) t.start() print 'done starting threads' # wait for threads for t in threads: t.join() print '---' print 'done making subsets! see %s.subset.*.pmap' % (basename, )
def main():
    """Find and save highly-connected k-mers ("knots") as stoptags.

    Loads <graphbase>.pt and <graphbase>.tagset, then walks every
    <graphbase>.subset.*.pmap file: each subset is repartitioned around
    its largest partition, merged, and the accumulated stoptags are
    written to <graphbase>.stoptags after every file.

    Fixes vs. previous revision:
      * progress line printed "(0 of N)".."(N-1 of N)"; now 1-based.
      * a missing pmap glob produced a bare IndexError on pmap_files[0];
        now reported explicitly with a non-zero exit.
    """
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print >> sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')
    print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >> sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    # fail clearly rather than with IndexError on pmap_files[0] below
    if not pmap_files:
        print >> sys.stderr, \
            'ERROR: no %s.subset.*.pmap files found' % args.graphbase
        sys.exit(1)

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >> sys.stderr, '---'
    print >> sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >> sys.stderr, '---'

    # create counting hash used by the repartitioning excursions
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge each subset, persisting stoptags after every file so
    # an interrupted run keeps its progress
    for index, subset_file in enumerate(pmap_files):
        print >> sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >> sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >> sys.stderr, '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** repartitioned size:', size

        print >> sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        # mark this subset as done so a rerun skips it
        os.rename(subset_file, subset_file + '.processed')
        # 1-based progress: "(1 of N)" .. "(N of N)"
        print >> sys.stderr, '(%d of %d)\n' % (index + 1, len(pmap_files))

    print >> sys.stderr, 'done!'
# Merge an on-disk partition map into an already-partitioned graph and use
# it to partition the reads in a new file; reads that do not connect to
# the existing graph are traversed and given their own partitions.
#
# Usage: <script> already_part new_to_part pmap_filename
import glob

# traverse out from unpartitioned reads instead of leaving them orphaned
TRAVERSE_ON_UNPART = True
# give up on traversals that balloon into very large lumps
STOP_BIG_TRAVERSALS = True

already_part = sys.argv[1]
new_to_part = sys.argv[2]
basename = os.path.basename(new_to_part)
pmap_filename = sys.argv[3]

# if not os.path.exists(already_part):
#    print '%s doesn\'t exist! dying.' % already_part
#    sys.exit(0)

# create a fake-ish ht; K matters, but not hashtable size.
ht = khmer.load_hashbits(already_part + '.ht')
ht.load_tagset(already_part + '.tagset')
ht.merge_subset_from_disk(pmap_filename)

# find singletons -- reads with no connection to the existing graph
n_singletons = ht.find_unpart(
    new_to_part, TRAVERSE_ON_UNPART, STOP_BIG_TRAVERSALS)
print 'found:', n_singletons

# write the partition assignments for the new file
print 'saving', basename + '.unpart'
n_partitions = ht.output_partitions(new_to_part, basename + '.unpart')

# persist the merged + extended partition map
print 'saving', basename + '.pmap'
ht.save_partitionmap(basename + '.pmap')

###
import glob TRAVERSE_ON_UNPART = True STOP_BIG_TRAVERSALS = True already_part = sys.argv[1] new_to_part = sys.argv[2] basename = os.path.basename(new_to_part) pmap_filename = sys.argv[3] # if not os.path.exists(already_part): # print '%s doesn\'t exist! dying.' % already_part # sys.exit(0) # create a fake-ish ht; K matters, but not hashtable size. ht = khmer.load_hashbits(already_part + '.ht') ht.load_tagset(already_part + '.tagset') ht.merge_subset_from_disk(pmap_filename) # find singletons n_singletons = ht.find_unpart(new_to_part, TRAVERSE_ON_UNPART, STOP_BIG_TRAVERSALS) print 'found:', n_singletons print 'saving', basename + '.unpart' n_partitions = ht.output_partitions(new_to_part, basename + '.unpart') print 'saving', basename + '.pmap' ht.save_partitionmap(basename + '.pmap') ###
def main(): parser = argparse.ArgumentParser( description="Find all highly connected k-mers.") parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=DEFAULT_COUNTING_HT_N, help='number of counting hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize', default=DEFAULT_COUNTING_HT_SIZE, help='lower bound on counting hashsize to use') parser.add_argument('graphbase') args = parser.parse_args() graphbase = args.graphbase print 'loading ht %s.ht' % graphbase ht = khmer.load_hashbits(graphbase + '.ht') print 'loading tagset %s.tagset...' % graphbase ht.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print 'loading stoptags %s.stoptags' % graphbase ht.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print 'loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]) print '---' print 'output stoptags will be in', graphbase + '.stoptags' if initial_stoptags: print '(these output stoptags will include the already-loaded set)' print '---' # create counting hash K = ht.ksize() counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes) # load & merge for n, subset_file in enumerate(pmap_files): print '<-', subset_file subset = ht.load_subset_partitionmap(subset_file) print '** repartitioning subset... %s' % subset_file ht.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print '** merging subset... %s' % subset_file ht.merge_subset(subset) print '** repartitioning, round 2... 
%s' % subset_file size = ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print '** repartitioned size:', size print 'saving stoptags binary' ht.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print '(%d of %d)\n' % (n, len(pmap_files)) print 'done!'
def main():
    """Partition the tagged graph <basename>.pt across worker threads.

    Splits the tag set into chunks of roughly subset_size tags and hands
    each chunk to a worker thread; every worker writes a
    <basename>.subset.*.pmap partition-map file.
    """
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    # verify the inputs exist and there is disk space for the output
    filenames = [basename + '.pt', basename + '.tagset']
    for infile in filenames:
        check_file_status(infile, args.force)
    check_space(filenames, args.force)

    print >>sys.stderr, '--'
    print >>sys.stderr, 'SUBSET SIZE', args.subset_size
    print >>sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >>sys.stderr, 'stoptag file:', args.stoptags
    print >>sys.stderr, '--'

    print >>sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # optionally block traversal at previously computed stop tags
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # carve the tag set into contiguous [start, end) ranges; the trailing
    # 0 sentinel closes the final range
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # queue one worker task per tag range
    worker_q = Queue.Queue()
    for subset_id, (start, end) in enumerate(zip(divvy, divvy[1:])):
        worker_q.put((htable, subset_id, start, end))

    print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    # no point running more threads than there are tasks
    n_threads = min(args.threads, n_subsets)

    # start threads!
    print >>sys.stderr, 'starting %d threads' % n_threads
    print >>sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker,
                                    args=(worker_q, basename,
                                          stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >>sys.stderr, 'done starting threads'

    # wait for every worker to drain its share of the queue
    for cur_thrd in threads:
        cur_thrd.join()

    print >>sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
def main(): parser = argparse.ArgumentParser( description="Find all highly connected k-mers.") parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=DEFAULT_COUNTING_HT_N, help='number of counting hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize', default=DEFAULT_COUNTING_HT_SIZE, help='lower bound on counting hashsize to use') parser.add_argument('graphbase') args = parser.parse_args() graphbase = args.graphbase print 'loading ht %s.ht' % graphbase ht = khmer.load_hashbits(graphbase + '.ht') print 'loading tagset %s.tagset...' % graphbase ht.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print 'loading stoptags %s.stoptags' % graphbase ht.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print 'loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]) print '---' print 'output stoptags will be in', graphbase + '.stoptags' if initial_stoptags: print '(these output stoptags will include the already-loaded set)' print '---' # create counting hash K = ht.ksize() counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes) # load & merge for n, subset_file in enumerate(pmap_files): print '<-', subset_file subset = ht.load_subset_partitionmap(subset_file) print '** repartitioning subset... %s' % subset_file ht.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print '** merging subset... %s' % subset_file ht.merge_subset(subset) print '** repartitioning, round 2... 
%s' % subset_file size = ht.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print '** repartitioned size:', size print 'saving stoptags binary' ht.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print '(%d of %d)\n' % (n, len(pmap_files)) print 'done!'