def test_bloom_c_2():  # simple one
    K = 4
    HT_SIZE = 10   # use 11
    N_HT1 = 1      # hashtable size = 11
    N_HT2 = 2      # hashtable size = 11,13

    # use only 1 hashtable, no bloom filter
    ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT1)
    ht1.count("AAAA")  # 00 00 00 00 = 0
    ht1.count("ACTG")  # 00 10 01 11 =
    assert ht1.n_unique_kmers() == 2
    ht1.count("AACG")  # 00 00 10 11 = 11  # collision with 1st kmer
    assert ht1.n_unique_kmers() == 2
    ht1.count("AGAC")  # 00 11 00 10  # collision with 2nd kmer
    assert ht1.n_unique_kmers() == 2

    # use two hashtables with 11,13
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT2)
    ht2.count("AAAA")  # 00 00 00 00 = 0
    ht2.count("ACTG")  # 00 10 01 11 = 2*16 + 4 + 3 = 39
    assert ht2.n_unique_kmers() == 2
    ht2.count("AACG")  # 00 00 10 11 = 11  # collision with only 1st kmer
    assert ht2.n_unique_kmers() == 3
    ht2.count("AGAC")  # 00 11 00 10 = 3*16 + 2 = 50  # collision with both 2nd and 3rd kmers
    assert ht2.n_unique_kmers() == 3
def test_random_20_a_succ_IV_save(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    savefile_ht = utils.get_temp_filename('ht')
    savefile_tags = utils.get_temp_filename('tags')
    outfile = filename + utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)
    ht.save(savefile_ht)
    ht.save_tagset(savefile_tags)

    del ht
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    ht.load(savefile_ht)
    ht.load_tagset(savefile_tags)

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    subsets = []
    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions
def test_random_20_a_succ_IV_save(self):
    ht = khmer.new_hashbits(20, 4 ** 13 + 1)
    filename = os.path.join(thisdir, 'test-data/random-20-a.fa')

    savefile_ht = filename + '.ht'
    savefile_tags = filename + '.tags'
    outfile = filename + '.out'

    total_reads, _ = ht.consume_fasta_and_tag(filename)
    ht.save(savefile_ht)
    ht.save_tagset(savefile_tags)
    del ht

    ht = khmer.new_hashbits(20, 4 ** 13 + 1)
    ht.load(savefile_ht)
    ht.load_tagset(savefile_tags)

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    subsets = []
    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions
def count_overlap(K, HT_SIZE, N_HT, filename, filename2, file_result, file_curve):
    if file_curve != 'N':
        count = 0
        for n, record in enumerate(screed.open(filename2)):
            count = count + 1
        max_count = count / 100
        file3 = open(file_curve, 'w')

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # consume first dataset, counting k-mers not seen before
    n_unique = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if not ht.get(kmer):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'

    # bloom filter false positive rate: (1 - e^(-n/m))^k, with k = N_HT tables
    fpr = (1 - math.exp(-float(n_unique) / HT_SIZE)) ** N_HT
    printout1 = "%s:\n# of unique kmers: %d\n# of occupied bin: %d\n" \
                "false positive rate: %f" \
                % (filename, n_unique, ht.n_occupied(), fpr)

    # consume second dataset
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    seq_count = 0
    for n, record in enumerate(screed.open(filename2)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if not ht2.get(kmer):
                n_unique += 1
                if ht.get(kmer):
                    n_overlap += 1
            ht2.count(kmer)

        if file_curve != 'N':
            seq_count = seq_count + 1
            if seq_count == max_count:
                # n_occu = ht2.n_occupied
                string = str(n_unique) + ' ' + str(n_overlap) + '\n'
                file3 = open(file_curve, 'a')
                file3.write(string)
                file3.close()
                seq_count = 0
    print filename2, 'has been consumed.'

    fpr = (1 - math.exp(-float(n_unique) / HT_SIZE)) ** N_HT
    printout2 = "%s:\n# of unique k-mers: %d\n# of occupied bin: %d\n" \
                "false positive rate: %f\n===============\n" \
                "# of overlap unique k-mers: %d\n" \
                % (filename2, n_unique, ht2.n_occupied(), fpr, n_overlap)

    file_result_object = open(file_result, 'w')
    file_result_object.write(printout1)
    file_result_object.write(printout2)
def test_count_within_radius_big():
    inpfile = utils.get_test_data('random-20-a.fa')
    ht = khmer.new_hashbits(20, 1e5, 4)

    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
    assert n == 3960

    ht = khmer.new_hashbits(21, 1e5, 4)
    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
    assert n == 39
def test_count_within_radius_big():
    inpfile = os.path.join(thisdir, "test-data", "random-20-a.fa")
    ht = khmer.new_hashbits(20, 1e6, 4)

    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius("CGCAGGCTGGATTCTAGAGG", 1e6)
    assert n == 3960

    ht = khmer.new_hashbits(21, 1e6, 4)
    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius("CGCAGGCTGGATTCTAGAGGC", 1e6)
    assert n == 39
def run_no_curve(K, HT_SIZE, N_HT, filename, filename2, file_result):
    file_result_object = open(file_result, 'w')
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if not ht.get(kmer):
                n_unique += 1
            ht.count(kmer)

    print filename, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht.n_occupied()

    printout = filename + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht.n_occupied()) + '\n'

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if not ht2.get(kmer):
                n_unique += 1
                if ht.get(kmer):
                    n_overlap += 1
            ht2.count(kmer)

    print filename2, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht2.n_occupied()
    print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2

    printout = printout + filename2 + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht2.n_occupied()) + '\n'
    printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n'

    file_result_object.write(printout)
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
def main():
    parser = argparse.ArgumentParser(
        description="Annotate seqs with partitions.")
    parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('graphbase')
    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    K = args.ksize
    ht = khmer.new_hashbits(K, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    print 'loading partition map from:', partitionmap_file
    ht.load_partitionmap(partitionmap_file)

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        n = ht.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (n, infile)
        print 'partitions are in', outfile
def main(dir1, dir2, n_threads):
    # detect all of the relevant partitionmap files
    subset_filenames = glob.glob(os.path.join(dir1, '*.pmap'))

    # create empty hashtable structure
    ht = khmer.new_hashbits(K, 1, 1)

    # put jobs on queue
    merge_queue = Queue.Queue()
    for filename in subset_filenames:
        merge_queue.put((ht, filename))

    print 'starting threads'

    threads = []
    for n in range(n_threads):
        t = threading.Thread(target=pull_pair, args=(merge_queue,))
        threads.append(t)
        t.start()

    # wait for threads
    for t in threads:
        t.join()

    # done!
    if merge_queue.qsize() == 1:
        ht, merge_file = merge_queue.get()
        print 'copying', merge_file
        shutil.copy(merge_file,
                    os.path.join(dir2, os.path.basename(merge_file)))

    assert merge_queue.qsize() == 0
def diff(ht, filename):
    genome = khmer.new_hashbits(K, 4 ** K, 1)

    found = 0
    not_found = 0

    for n, record in enumerate(screed.fasta.fasta_iter(open(filename))):
        read = record['sequence']
        name = record['name']

        if 'N' in read:
            continue
        if len(read) < K:
            continue

        seq_len = len(read)
        for n in range(0, seq_len + 1 - K):
            kmer = read[n:n + K]
            if not genome.get(kmer):
                genome.consume(kmer)
                if ht.get(kmer):
                    found += 1
                else:
                    not_found += 1

    return found, not_found
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)
    ht.save_tagset('/tmp/goodversion-k32.tagset')

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    fp = open(outfile, 'wb')
    fp.write(data[:26])
    fp.close()

    # try loading it...
    try:
        ht.load_tagset(outfile)
        assert 0, "this test should fail"
    except IOError:
        pass
def test_consume_absentfasta_with_reads_parser():
    presencetable = khmer.new_hashbits(31, 1, 1)
    try:
        presencetable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def main():
    parser = argparse.ArgumentParser(description="Merge pmap files.")
    parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K,
                        help="k-mer size (default: %d)" % DEFAULT_K)
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase')
    args = parser.parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    K = args.ksize
    ht = khmer.new_hashbits(K, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        ht.merge_subset_from_disk(pmap_file)

    print 'saving merged to', output_file
    ht.save_partitionmap(output_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    parser = build_common_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)
    open(base + '.info', 'w').write('through end: %s' % filename)
def main():
    ht = khmer.new_hashbits(K, 1, 1)

    x = [0] * 255
    y = [0] * 255

    ht.load_stop_tags(sys.argv[1])
    for n, record in enumerate(screed.open(sys.argv[2])):
        if n % 10000 == 0:
            sys.stderr.write('... %d\n' % n)

        s, p = ht.trim_on_stoptags(record.sequence)

        if len(s) == len(record.sequence):
            continue

        if p == 0:
            p = 31
        else:
            p += 1

        x[p] += 1
        y[len(record.sequence)] += 1

    for i, (n, m) in enumerate(zip(x, y)):
        if m:
            print '%d,%d,%d' % (i, n, m)
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_file_status(partitionmap_file)
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
def test_extract_unique_paths_2():
    kh = khmer.new_hashbits(10, 4, 4)

    kh.consume('ATGGAGAGAC')

    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    print x
    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    for i in range(len(data)):
        fp = open(outfile, 'wb')
        fp.write(data[:i])
        fp.close()

        # try loading it...
        try:
            ht.load_tagset(outfile)
            assert 0, "this test should fail"
        except IOError as err:
            print str(err), i
def main():
    parser = build_common_args()
    parser.add_argument("output_filename")
    parser.add_argument("input_filenames", nargs="+")

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print "Saving hashtable to %s" % base
    print "Loading kmers from sequences in %s" % repr(filenames)

    ###

    print "making hashtable"
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print "consuming input", filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print "mid-save", base
            ht.save(base)
            open(base + ".info", "w").write("through %s" % filename)

    print "saving", base
    ht.save(base)
    open(base + ".info", "w").write("through end: %s" % filename)
def test_tag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 1e4  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two partition;
    # with, it is one partition.
    ht.add_stop_tag('CCGAATATATAACAGCGACG')

    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = ht.do_subset_partition(0, 0)
    n, _ = ht.count_partitions()
    assert n == 99  # reads only connected by traversal...

    n, _ = ht.subset_count_partitions(subset)
    assert n == 2  # but need main to cross stoptags.

    ht.merge_subset(subset)

    n, _ = ht.count_partitions()  # ta-da!
    assert n == 1, n
def test_save_load_merge_nexist(self):
    ht = khmer.new_hashbits(20, 1)
    try:
        a = ht.load_subset_partitionmap('this does not exist')
        assert 0, "this should not succeed"
    except IOError as e:
        print str(e)
def test__get_set_tag_density():
    ht = khmer.new_hashbits(32, 1, 1)

    orig = ht._get_tag_density()
    assert orig != 2

    ht._set_tag_density(2)
    assert ht._get_tag_density() == 2
def test_save_merge_from_disk_2(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')
    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

    subset_size = total_reads // 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_save_merge_from_disk(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.new_hashbits(32, 8e1, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

    test_tiny_real_partitions.runme = True
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        htable.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
def main(filename):
    global ht

    basename = os.path.basename(filename)

    print 'input file to partition: %s' % filename
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print '--'

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    ht.consume_fasta(filename)

    counting = khmer.new_counting_hash(K, COUNTING_SIZE, N_HT)
    ht.traverse_from_reads(filename, 100, 5000, 5, counting)

    print 'saving stoptags binary'
    ht.save_stop_tags(basename + '.stoptags')
    print 'saving stoptags text'
    ht.print_stop_tags(basename + '.stoptags.txt')

    sys.exit(0)
def test_save_load_merge_on_graph():
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print(divvy)
    assert len(divvy) == 3
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_partitionmap(outfile1)  # <-- this is different
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_3_merge_013(self):
    ht = khmer.new_hashbits(20, 4 ** 14 + 1)

    filename = utils.get_test_data('test-graph2.fa')
    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    (a, b, c) = ht.divide_tags_into_subsets(1)

    x = ht.do_subset_partition(a, a)
    ht.merge_subset(x)

    y = ht.do_subset_partition(b, 0)
    ht.merge_subset(y)

    outfile = utils.get_temp_filename('out')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_stop_traverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 1e4  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two partition;
    # with, it is one partition.
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    ht.consume_fasta_and_tag(filename)  # DO NOT join reads across stoptags
    subset = ht.do_subset_partition(0, 0, True)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 2, n
def test_notag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 1e4  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # connecting k-mer at the beginning/end of a read: breaks up into two.
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    ht.consume_fasta_and_tag_with_stoptags(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 2, n
def test_find_all_tags_kmersize(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)

    a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
    b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
    c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT"

    print ht.consume(a)

    try:
        ppi = ht.find_all_tags(c[:19])
        assert False, "should raise a ValueError for wrong k-mer size"
    except ValueError:
        pass

    try:
        ppi = ht.find_all_tags(c[:21])
        assert False, "should raise a ValueError for wrong k-mer size"
    except ValueError:
        pass
def test_random_20_a_succ_IV(self):
    ht = khmer.new_hashbits(20, 4 ** 13 + 1)
    filename = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    subsets = []

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
def main():
    global done, worker_count
    done = False
    worker_count = 0

    infile = sys.argv[1]
    outfile = infile + '.graphsize2'

    print 'creating ht'
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, 1)
    print 'eating fa', infile
    total_reads, n_consumed = ht.consume_fasta(infile)

    outfp = open(outfile, 'w')

    inqueue = Queue.Queue(50)
    outqueue = Queue.Queue(50)

    # worker and writer threads
    for i in range(WORKER_THREADS):
        t = threading.Thread(target=process, args=(inqueue, outqueue, ht))
        worker_count += 1
        t.start()

    threading.Thread(target=write, args=(outqueue, outfp)).start()

    # main thread
    x = []
    i = 0
    for n, record in enumerate(screed.fasta.fasta_iter(open(infile))):
        if n % 10000 == 0:
            print '...', n

        x.append(record)
        i += 1

        if i > GROUPSIZE:
            inqueue.put(x)
            x = []
            i = 0

    inqueue.put(x)

    done = True
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-k', default=DEFAULT_K, type=int, help='k-mer size',
                        dest='ksize')
    parser.add_argument('stoptags_file')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    K = args.ksize
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    print 'loading stop tags, with K', K
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def test_save_load_tagset():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile)  # implicitly => clear_tags=True
    ht.save_tagset(outfile)

    # if tags have been cleared on load, only the single loaded tag remains
    # and the new tagfile is smaller (26 bytes); otherwise it would be
    # larger (34 bytes).
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()
    assert len(data) == 26, len(data)
def test_find_unpart_fail():
    filename = utils.get_test_data('random-20-a.odd.fa')
    filename2 = utils.get_test_data('random-20-a.odd.fa')  # <- switch to odd

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 49

    ht.find_unpart(filename2, True, False)
    n, _ = ht.count_partitions()
    assert n == 49, n  # only 49 sequences worth of tags
def test_find_unpart_notraverse():
    filename = utils.get_test_data('random-20-a.odd.fa')
    filename2 = utils.get_test_data('random-20-a.even.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n, _ = ht.count_partitions()
    assert n == 49

    ht.find_unpart(filename2, False, False)  # <-- don't traverse
    n, _ = ht.count_partitions()
    assert n == 99, n  # all sequences disconnected
def test_save_merge_from_disk_file_not_exist(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')

    # fail to create file... => failure expected
    try:
        ht.merge_subset_from_disk(outfile1)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_save_load_tagset_noclear():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile, False)  # set clear_tags => False; keep existing tags
    ht.save_tagset(outfile)

    # if tags have NOT been cleared on load, both tags remain and the new
    # tagfile is larger (34 bytes); if they had been cleared it would be
    # smaller (26 bytes).
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()
    assert len(data) == 34, len(data)
def main():
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if not ht.get(kmer):
                n_unique += 1
            ht.count(kmer)

    print n_unique
    print ht.n_occupied()
    print ht.n_unique_kmers()
def test_ordered_connect(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)

    a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
    b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
    c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT"

    print ht.consume(a)
    ppi = ht.find_all_tags(a[:20])
    pid = ht.assign_partition_id(ppi)
    assert pid == 0, pid

    print ht.consume(b)
    ppi = ht.find_all_tags(b[:20])
    pid = ht.assign_partition_id(ppi)
    assert pid == 0, pid

    print ht.consume(c)
    ppi = ht.find_all_tags(c[:20])
    pid = ht.assign_partition_id(ppi)
    assert pid == 2, pid
def deg(filename, ht):
    kmers = khmer.new_hashbits(K, 4 ** K, 1)
    degs = {}

    for n, record in enumerate(screed.fasta.fasta_iter(open(filename))):
        read = record['sequence']
        name = record['name']

        if len(read) < K:
            continue
        if 'N' in read:
            continue

        get_all_kmers(ht, read[0:K], K, kmers, degs)

    n_occ = kmers.n_occupied()
    del kmers

    return n_occ, degs
def test_output_partitions():
    filename = utils.get_test_data('test-output-partitions.fa')

    ht = khmer.new_hashbits(10, 1, 1)
    ht.set_partition_id('TTAGGACTGC', 2)
    ht.set_partition_id('TGCGTTTCAA', 3)
    ht.set_partition_id('ATACTGTAAA', 4)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert parts[0] == '2'
    assert parts[1] == '3'
    assert parts[2] == '4'
def main():
    readsfile = sys.argv[1]
    contigfile = sys.argv[2]
    outfile = os.path.basename(readsfile) + '.sweep'
    if len(sys.argv) == 4:
        outfile = sys.argv[3]

    # create a hashbits data structure
    ht = khmer.new_hashbits(K, 1, 1)

    # tag every k-mer in the contigs
    ht._set_tag_density(0)

    # load contigs, connect into N partitions
    print 'loading contigs from', contigfile
    ht.consume_fasta_and_tag(contigfile)
    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    print 'outputting contig-partitioned reads to', outfile
    ht.output_partitions(readsfile, outfile, True)
def test_random_20_a_succ_III(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')
    outfile = utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    subset_size = total_reads / 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)
    assert len(divvy) == 4, len(divvy)

    x = ht.do_subset_partition(divvy[0], divvy[2])
    y = ht.do_subset_partition(divvy[2], 0)

    ht._validate_subset_partitionmap(x)
    ht._validate_subset_partitionmap(y)

    ht.merge_subset(y)
    ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def test_badget():
    hbts = khmer.new_hashbits(6, 1e6, 1)

    dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"

    hbts.consume(dna)

    assert hbts.get("AGCTTT") == 1
    assert hbts.get("GATGAG") == 0

    try:
        hbts.get(b"AGCTT")
        assert 0, "this should fail"
    except ValueError as err:
        print(str(err))

    try:
        hbts.get(u"AGCTT")
        assert 0, "this should fail"
    except ValueError as err:
        print(str(err))
def test_small_real_partitions():
    filename = utils.get_test_data('real-partition-small.fa')

    ht = khmer.new_hashbits(32, 2e2, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 6, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])
def test_filter_sodd():
    K = 32
    HASHTABLE_SIZE = int(8e7)
    N_HT = 4
    MAX_SODD = 3

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    filename = utils.get_test_data('../../data/high-sodd.fa')
    ht.consume_fasta(filename)

    seq = "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGTTTTGCCAAAAATTTTTACAGTTAGAAATTATTCACAAAGTTGCACCGGAATTCGGTTACAAACGTCATTCTAACTAAT"
    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
    assert trim_seq == "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGT"

    seq = "ACAAAATTCCACATATAGTCATAATTGTGGGCAATTTTCGTCCCAAATTAGTTAGAATGACGTTTGTAACCGAATTCCGGTGCAACTTTGTGAATAATTTCTAACTGTAAAAAT"
    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
    assert trim_seq == "ACAAAATTCCACATATAGTCATAATTGTGGGCAATT"

    seq = "GCACGCAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG"
    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
    assert trim_seq == seq
def main():
    K = 8
    n = 50
    add_kmers = 50
    total_kmers = n + add_kmers

    print "\"FPR\",\"LOWER\",\"AVG\",\"UPPER\""

    for p in [x / 200.0 + .01 for x in range(59)]:
        diam_lens = []
        for j in range(500):
            seq = gen_circ_chrom(n, K)

            m = calc_m(total_kmers, p)
            k = opt_ht(m, total_kmers)
            HT_SIZE = calc_ht_size(m, k)

            ht = khmer.new_hashbits(K, HT_SIZE, k)
            ht.consume(seq)

            for i in range(add_kmers):
                ht.consume(generate_read(K))

            real_kmers = get_real_kmers(seq, K)
            out_len = []

            # step one: find the "outbranch" lengths for each real k-mer
            for kmer in real_kmers:
                out_len.append(get_level(ht, kmer, real_kmers, K))

            # step two: find the shortest longest path using the info
            # from step 1
            diam_lens.append(max(out_len))

        # avg = numpy.mean(diam_lens)
        # se = numpy.std(diam_lens) / numpy.sqrt(len(diam_lens))
        # lim = se * 1.96
        # print str(p) + "," + str(avg-lim) + "," + str(avg) + "," + str(avg+lim)
        low, med, upp = estimate_mean(diam_lens)
        print str(p) + "," + str(low) + "," + str(med) + "," + str(upp)
def main(filename):
    global ht

    basename = os.path.basename(filename)

    print 'input file to partition: %s' % filename
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print '--'

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    counting = khmer.new_counting_hash(K, COUNTING_SIZE, N_HT)
    ht.consume_fasta_and_traverse(filename, 100, 500, 5, counting)

    print 'saving stoptags binary'
    ht.save_stop_tags(basename + '.stoptags')
    print 'saving stoptags text'
    ht.print_stop_tags(basename + '.stoptags.txt')

    sys.exit(0)
def test_combine_pe():
    inpfile = utils.get_test_data('combine_parts_1.fa')
    ht = khmer.new_hashbits(32, 1, 1)

    ht.consume_partitioned_fasta(inpfile)
    assert ht.count_partitions() == (2, 0)

    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    pid1 = ht.get_partition_id(s1)

    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
    pid2 = ht.get_partition_id(s2)

    assert pid1 == 2
    assert pid2 == 80293

    ht.join_partitions(pid1, pid2)

    pid1 = ht.get_partition_id(s1)
    pid2 = ht.get_partition_id(s2)

    assert pid1 == pid2
    assert ht.count_partitions() == (1, 0)
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if not ht2.get(kmer):
                n_unique += 1
            ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3882
    assert ht2.n_unique_kmers() == 3960  # this number equals n_unique
def main():
    filename1 = sys.argv[1]
    filename2 = sys.argv[2]
    uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')

    kh = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    for n, record in enumerate(screed.open(filename1)):
        if n % 10000 == 0:
            print '...', filename1, n
        seq = record.sequence.upper().replace('N', 'A')
        kh.consume(seq)

    path_n = 0
    for n, record in enumerate(screed.open(filename2)):
        if n % 10000 == 0:
            print '...', filename2, n
        seq = record.sequence.upper().replace('N', 'A')
        paths = kh.extract_unique_paths(seq, UNIQUE_LEN, UNIQUE_F)
        kh.consume(seq)
        for path in paths:
            path_n += 1
            print >> uniq2, '>%s from:%s\n%s' % (path_n, record.name, path)
def main():
    info('merge-stoptags.py')
    args = get_parser().parse_args()

    stdbase = args.stdbase

    # @RamRS: This might need some more work
    infiles = []
    for _ in glob.glob(stdbase + "*/*.stoptags"):
        if os.path.exists(_):
            check_input_files(_, False)
            infiles.append(_)
    check_space(infiles, False)

    ht = khmer.new_hashbits(args.ksize, 1, 1)
    for _ in infiles:
        print >> sys.stderr, 'loading stoptags %s' % _
        ht.load_stop_tags(_, 0)

    print >> sys.stderr, 'writing file merge.stoptags'
    ht.save_stop_tags('merge.stoptags')
    print >> sys.stderr, 'done!'
def main(subset_filenames):
    print 'K', K
    print 'MIN SIZE', MIN_PARTITION_SIZE
    print '--'

    # create an empty hashtable & load in the tags
    ht = khmer.new_hashbits(32, 1, 1)
    tagmap = ht.new_tagmap()

    # find the maximum partition size for each tag, across all subsets
    for filename in subset_filenames:
        print 'maxifying:', filename
        subset = ht.load_subset_partitionmap(filename)
        ht.subset_maxify_partition_size(subset, tagmap)
        del subset
        gc.collect()

    # filter tags based on the max partition size to which they belong
    print 'discarding'
    ht.discard_tags(tagmap, MIN_PARTITION_SIZE)

    # finally, filter each subset filename and save.
    for filename in subset_filenames:
        print 'loading x 2', filename
        subset = ht.load_subset_partitionmap(filename)

        print 'filtering', filename
        ht.subset_filter_against_tags(subset, tagmap)

        dir = os.path.dirname(filename)
        new_filename = 'filtered_' + os.path.basename(filename)
        new_filename = os.path.join(dir, new_filename)

        print 'saving', new_filename
        ht.save_subset_partitionmap(subset, new_filename)

        del subset
        gc.collect()