def test_bloom_c_2():  # simple one
    """Compare unique-kmer counting with one vs. two hashtables."""
    ksize = 4
    table_size = 10          # rounded up internally: 11
    one_table = 1            # hashtable size = 11
    two_tables = 2           # hashtable sizes = 11, 13

    # a single hashtable: no bloom filtering, collisions go undetected
    single = khmer.LabelHash(ksize, table_size, one_table)
    single.count('AAAA')     # 00 00 00 00 = 0
    single.count('ACTG')     # 00 10 01 11 =
    assert single.n_unique_kmers() == 2
    single.count('AACG')     # 00 00 10 11 = 11  # collision with 1st kmer
    assert single.n_unique_kmers() == 2
    single.count('AGAC')     # 00 11 00 10  # collision with 2nd kmer
    assert single.n_unique_kmers() == 2

    # two hashtables (11, 13): a kmer must collide in both to be missed
    double = khmer.LabelHash(ksize, table_size, two_tables)
    double.count('AAAA')     # 00 00 00 00 = 0
    double.count('ACTG')     # 00 10 01 11 = 2*16 + 4 + 3 = 39
    assert double.n_unique_kmers() == 2
    double.count('AACG')     # 00 00 10 11 = 11  # collision with only 1st kmer
    assert double.n_unique_kmers() == 3
    double.count('AGAC')     # 00 11 00 10  3*16 + 2 = 50
    # collision with both 2nd and 3rd kmers
    assert double.n_unique_kmers() == 3
def test_count_within_radius_big():
    """count_kmers_within_radius over a whole random-sequence file."""
    seqfile = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(20, 1e6, 4)
    graph.consume_fasta(seqfile)
    found = graph.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
    assert found == 3960

    graph = khmer.LabelHash(21, 1e6, 4)
    graph.consume_fasta(seqfile)
    found = graph.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
    assert found == 39
def test_tag_across_stoptraverse():
    """Tagging with stoptags still joins partitions via the main table."""
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(20, 100000, 3)  # K, table size, n tables

    # without tagging/joining across consume, this breaks into two partition;
    # with, it is one partition.
    graph.add_stop_tag('CCGAATATATAACAGCGACG')

    graph.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = graph.do_subset_partition(0, 0)
    num, _ = graph.count_partitions()
    assert num == 99  # reads only connected by traversal...

    num, _ = graph.subset_count_partitions(subset)
    assert num == 2  # but need main to cross stoptags.

    graph.merge_subset(subset)

    num, _ = graph.count_partitions()  # ta-da!
    assert num == 1, num
def test__get_set_tag_density():
    """Tag density must round-trip through the private setter/getter."""
    graph = khmer.LabelHash(32, 1, 1)

    original = graph._get_tag_density()
    assert original != 2
    graph._set_tag_density(2)
    assert graph._get_tag_density() == 2
def test_extract_unique_paths_2(): kh = khmer.LabelHash(10, 1e5, 4) kh.consume('ATGGAGAGAC') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) print x assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer
def test_find_radius_for_volume():
    """Radius search in an all-A graph depends only on the volume target."""
    seqfile = utils.get_test_data('all-A.fa')
    graph = khmer.LabelHash(4, 1e6, 2)
    graph.consume_fasta(seqfile)

    assert graph.find_radius_for_volume('AAAA', 0, 100) == 0
    assert graph.find_radius_for_volume('AAAA', 1, 100) == 0
    assert graph.find_radius_for_volume('AAAA', 2, 100) == 100
def test_find_stoptags():
    """Stoptags are located in forward and reverse-complement orientation."""
    graph = khmer.LabelHash(5, 1, 1)
    graph.add_stop_tag("AAAAA")

    assert graph.identify_stoptags_by_position("AAAAA") == [0]
    assert graph.identify_stoptags_by_position("AAAAAA") == [0, 1]
    assert graph.identify_stoptags_by_position("TTTTT") == [0]
    assert graph.identify_stoptags_by_position("TTTTTT") == [0, 1]
def test_count_kmer_degree():
    """kmer_degree reports how many neighbor k-mers exist in the graph."""
    seqfile = utils.get_test_data('all-A.fa')
    graph = khmer.LabelHash(4, 1e6, 2)
    graph.consume_fasta(seqfile)

    assert graph.kmer_degree('AAAA') == 2
    assert graph.kmer_degree('AAAT') == 1
    assert graph.kmer_degree('AATA') == 0
    assert graph.kmer_degree('TAAA') == 1
def test_extract_unique_paths_0():
    """An unseen sequence is one unique path; a consumed one yields none."""
    graph = khmer.LabelHash(10, 1e5, 4)

    query = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'

    # nothing consumed yet: the whole sequence is a single unique path
    paths = graph.extract_unique_paths(query, 10, 1)
    assert paths == [query]

    # after consuming it, no part of the sequence is unique any more
    graph.consume(query)
    paths = graph.extract_unique_paths(query, 10, 1)
    assert not paths
def test_count_within_radius_simple(): inpfile = utils.get_test_data('all-A.fa') ht = khmer.LabelHash(4, 1e6, 2) print ht.consume_fasta(inpfile) n = ht.count_kmers_within_radius('AAAA', 1) assert n == 1 n = ht.count_kmers_within_radius('AAAA', 10) assert n == 1
def test_extract_unique_paths_4(): kh = khmer.LabelHash(10, 1e5, 4) kh.consume('ATGGAGAGAC') kh.consume('AGTGGCGATG') kh.consume('ATAGACAGGA') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) print x assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
def test_filter_if_present():
    """filter_if_present drops reads whose k-mers appear in the mask graph."""
    graph = khmer.LabelHash(32, 1e6, 2)

    mask_path = utils.get_test_data('filter-test-A.fa')
    reads_path = utils.get_test_data('filter-test-B.fa')
    out_path = utils.get_temp_filename('filter')

    graph.consume_fasta(mask_path)
    graph.filter_if_present(reads_path, out_path)

    survivors = list(fasta_iter(open(out_path)))
    assert len(survivors) == 1
    assert survivors[0]['name'] == '3'
def test_load_partitioned():
    """Consuming a partitioned fasta loads both partitions and all k-mers."""
    partfile = utils.get_test_data('combine_parts_1.fa')
    graph = khmer.LabelHash(32, 1, 1)

    graph.consume_partitioned_fasta(partfile)
    assert graph.count_partitions() == (2, 0)

    # every representative k-mer should now be present in the graph
    for kmer in ("CATGCAGAAGTTCCGCAACCATACCGTTCAGT",
                 "CAAATGTACATGCACTTAAAATCATCCAGCCG",
                 "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]):
        assert graph.get(kmer)
def test_n_occupied_1():
    """n_occupied over a real file matches an independently computed value."""
    filename = utils.get_test_data('random-20-a.fa')

    # test modified c++ n_occupied code -- single hashtable
    graph = khmer.LabelHash(20, 100000, 1)

    for record in fasta_iter(open(filename)):
        graph.consume(record['sequence'])

    # this number calculated independently
    assert graph.n_occupied() == 3877
def test_simple_median(): hi = khmer.LabelHash(6, 1e6, 2) (median, average, stddev) = hi.get_median_count("AAAAAA") print median, average, stddev assert median == 0 assert average == 0.0 assert stddev == 0.0 hi.consume("AAAAAA") (median, average, stddev) = hi.get_median_count("AAAAAA") print median, average, stddev assert median == 1 assert average == 1.0 assert stddev == 0.0
def test_bloom_c_1():
    # test c++ code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(20, 100000, 3)  # K=20, three ~100000-slot tables

    for record in fasta_iter(open(filename)):
        graph.consume(record['sequence'])

    assert graph.n_occupied() == 3882
    assert graph.n_unique_kmers() == 3960
def test_circumference():
    """count_kmers_on_radius grows as neighbors of GATG are added."""
    graph = khmer.LabelHash(4, 1e6, 2)

    graph.count('ATGC')
    graph.count('GATG')
    graph.count('ATGG')

    circum = graph.count_kmers_on_radius('GATG', 1, 200)
    assert circum == 2

    graph.count('ATGA')
    circum = graph.count_kmers_on_radius('GATG', 1, 200)
    assert circum == 3, circum

    graph.count('TGAT')
    circum = graph.count_kmers_on_radius('GATG', 1, 200)
    assert circum == 4, circum
def test_n_occupied_2():  # simple one
    """Hand-picked collisions leave n_occupied stuck at 2 in an 11-slot table."""
    graph = khmer.LabelHash(4, 10, 1)  # K=4, size 10 -> 11, single table

    graph.count('AAAA')  # 00 00 00 00 = 0
    assert graph.n_occupied() == 1

    graph.count('ACTG')  # 00 10 01 11 =
    assert graph.n_occupied() == 2

    graph.count('AACG')  # 00 00 10 11 = 11  # collision 1
    assert graph.n_occupied() == 2

    graph.count('AGAC')  # 00 11 00 10  # collision 2
    assert graph.n_occupied() == 2
def test_notag_across_stoptraverse():
    """A stoptag on the connecting k-mer keeps the reads in two partitions."""
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(20, 100000, 3)

    # connecting k-mer at the beginning/end of a read: breaks up into two.
    graph.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    graph.consume_fasta_and_tag_with_stoptags(filename)

    subset = graph.do_subset_partition(0, 0)
    graph.merge_subset(subset)

    num, _ = graph.count_partitions()
    assert num == 2, num
def test_stop_traverse():
    """Partitioning with stop_big_traversals honors the stoptag boundary."""
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(20, 100000, 3)

    # without tagging/joining across consume, this breaks into two partition;
    # with, it is one partition.
    graph.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    graph.consume_fasta_and_tag(filename)  # DO NOT join reads across stoptags

    subset = graph.do_subset_partition(0, 0, True)
    graph.merge_subset(subset)

    num, _ = graph.count_partitions()
    assert num == 2, num
def test_save_load_tagset():
    """load_tagset() with default clear_tags=True discards existing tags.

    Saves one tag, adds a second, reloads the one-tag file (clearing the
    in-memory set), then re-saves and checks the resulting file size.
    """
    ht = khmer.LabelHash(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile)  # implicitly => clear_tags=True
    ht.save_tagset(outfile)

    # FIX: the original comment stated this backwards.  Because load_tagset()
    # cleared the in-memory tags, only the single re-loaded tag is written
    # back, giving the SMALLER file (26 bytes); had the tags not been cleared,
    # both tags would be saved (34 bytes).
    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 26, len(data)
def test_save_load_tagset_noclear():
    """load_tagset(..., False) merges loaded tags into the existing set.

    Mirror of test_save_load_tagset, but with clear_tags=False so both tags
    survive the reload and are saved back.
    """
    ht = khmer.LabelHash(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile, False)  # set clear_tags => False; zero tags
    ht.save_tagset(outfile)

    # FIX: the original comment stated this backwards.  The tags were NOT
    # cleared, so both tags are saved, giving the LARGER file (34 bytes);
    # clearing would have left only the re-loaded tag (26 bytes).
    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 34, len(data)
def test_find_unpart_fail():
    """find_unpart over the already-partitioned file finds nothing new."""
    filename = utils.get_test_data('random-20-a.odd.fa')
    filename2 = utils.get_test_data('random-20-a.odd.fa')  # <- switch to odd

    graph = khmer.LabelHash(20, 100000, 3)
    graph.consume_fasta_and_tag(filename)

    subset = graph.do_subset_partition(0, 0)
    graph.merge_subset(subset)

    num, _ = graph.count_partitions()
    assert num == 49

    graph.find_unpart(filename2, True, False)
    num, _ = graph.count_partitions()
    assert num == 49, num  # only 49 sequences worth of tags
def test_find_unpart_notraverse():
    """find_unpart without traversal leaves the new reads disconnected."""
    odd_file = utils.get_test_data('random-20-a.odd.fa')
    even_file = utils.get_test_data('random-20-a.even.fa')

    graph = khmer.LabelHash(20, 100000, 3)
    graph.consume_fasta_and_tag(odd_file)

    subset = graph.do_subset_partition(0, 0)
    graph.merge_subset(subset)

    num, _ = graph.count_partitions()
    assert num == 49

    graph.find_unpart(even_file, False, False)  # <-- don't traverse
    num, _ = graph.count_partitions()
    assert num == 99, num  # all sequences disconnected
def test_bloom_python_1():
    """Count unique k-mers in Python and compare against the C++ counters.

    FIX: the inner k-mer index previously reused (shadowed) the outer
    ``enumerate`` variable ``n``; the unused ``enumerate`` is dropped and the
    k-mer offset gets its own name.  Behavior is unchanged.
    """
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.LabelHash(K, HT_SIZE, N_HT)

    n_unique = 0
    for record in fasta_iter(open(filename)):
        sequence = record['sequence']
        seq_len = len(sequence)
        for start in range(0, seq_len + 1 - K):
            kmer = sequence[start:start + K]
            # a k-mer not yet seen by any table is unique
            if not ht2.get(kmer):
                n_unique += 1
            ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3882
    assert ht2.n_unique_kmers() == 3960  # this number equals to n_unique
def test_combine_pe():
    """join_partitions merges two partitions into one."""
    partfile = utils.get_test_data('combine_parts_1.fa')
    graph = khmer.LabelHash(32, 1, 1)

    graph.consume_partitioned_fasta(partfile)
    assert graph.count_partitions() == (2, 0)

    seq1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    seq2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"

    pid1 = graph.get_partition_id(seq1)
    pid2 = graph.get_partition_id(seq2)
    assert pid1 == 2
    assert pid2 == 80293

    graph.join_partitions(pid1, pid2)

    pid1 = graph.get_partition_id(seq1)
    pid2 = graph.get_partition_id(seq2)
    assert pid1 == pid2
    assert graph.count_partitions() == (1, 0)
def main():
    """Sweep read files against a labeled k-mer graph and bucket by label.

    Phase 1: consume (and label) the partitioned fasta/fastq ``input_fastp``,
    labeling by partition id, by sequence, or by fixed-size groups.
    Phase 2: sweep each read file for label neighborhoods and queue each read
    into a per-label output buffer ('multi' for multiply-labeled reads,
    'orphaned' for reads with no label).  Finally write summary stats.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    # clamp user-supplied sizes up to the script minimums
    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    # default output directory: same directory as the input fastp
    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >> sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            # one label per partition id, read from the partitioned fasta
            print >> sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            # one label per input sequence (label == sequence ordinal)
            print >> sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            # one label per group of args.group_size sequences; also split
            # the input into one '<pref>_base_<g>.<ext>' file per group
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension),
                             'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            # NOTE(review): the previous outfp is rebound
                            # without being closed -- confirm intentional
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    if hasattr(record, 'accuracy'):
                        # NOTE(review): no '\n+\n' separator between sequence
                        # and quality string -- this looks like malformed
                        # FASTQ; confirm the intended record format
                        outfp.write('@{name}\n{seq}+{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name, seq=record.sequence))
            except IOError as e:
                print >> sys.stderr, '!! ERROR !!', e
                print >> sys.stderr, '...error splitting input. exiting...'
    except IOError as e:
        print >> sys.stderr, '!! ERROR: !!', e
        print >> sys.stderr, '...error consuming \
{i}. exiting...'.format(i=input_fastp)

    print >> sys.stderr, 'done consuming input sequence. \
added {t} tags and {l} \
labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    # per-label read counts, and per-read distribution of label counts
    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >> sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            # unreadable read file: report and move on to the next one
            print >> sys.stderr, '!! ERROR: !!', error
            print >> sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    # periodic progress + timing report
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
{no} orphaned] \
** {sec}s ({sect}s total)' \
                        .format(n=_, nc=n_labeled, no=n_orphaned,
                                sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    # sweep rejected this read (presumably too short for the
                    # k-size -- TODO confirm); skip it entirely
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            # multiply-labeled reads go to the 'multi' bucket
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        # no labels within traversal_range: orphaned
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >> sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >> sys.stderr, '! WARNING: Sweep finished with errors !'
        print >> sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >> sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >> sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >> sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >> sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def test_find_stoptags2():
    """Both ATGC and its reverse complement GCAT are reported as stoptags."""
    graph = khmer.LabelHash(4, 1, 1)
    graph.add_stop_tag("ATGC")

    positions = graph.identify_stoptags_by_position("ATGCATGCGCAT")
    assert positions == [0, 2, 4, 8], positions
def test_get_ksize():
    """ksize() echoes the k value given to the constructor."""
    graph = khmer.LabelHash(22, 1, 1)
    assert graph.ksize() == 22
def test_get_hashsizes():
    """hashsizes() reports table sizes rounded to primes at or above 100."""
    graph = khmer.LabelHash(22, 100, 4)
    assert graph.hashsizes() == [101, 103, 107, 109], graph.hashsizes()