예제 #1
0
def test_load_gz():
    """Load a hand-gzipped copy of a saved counting table.

    The table is saved uncompressed, the saved file is compressed with
    ``gzip``, and the compressed file is loaded into a second table.  Both
    tables must then report the same abundance distribution for the input.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer.CountingHash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress; the context managers close both handles even when the copy
    # raises part-way through.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    ht = khmer.CountingHash(12, sizes)
    ht.load(loadpath)

    # each distribution gets its own, fresh tracking table
    tracking = khmer._Hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
예제 #2
0
def test_bloom_c_2():  # simple one
    """Compare ``n_unique_kmers`` for one table versus two tables.

    With a single table of size 11 the third and fourth k-mers collide with
    earlier ones and are not counted as new.  With tables of sizes 11 and 13
    only the fourth k-mer is hidden by collisions.
    """
    k = 4

    # a single table (size 11), i.e. no bloom filter
    single = khmer._Hashbits(k, [11])
    single.count('AAAA')  # 00 00 00 00 = 0
    single.count('ACTG')  # 00 10 01 11 = 39
    assert single.n_unique_kmers() == 2
    # AACG: 00 00 10 11 = 11 -> collides with the 1st k-mer
    # AGAC: 00 11 00 10 = 50 -> collides with the 2nd k-mer
    for colliding in ('AACG', 'AGAC'):
        single.count(colliding)
        assert single.n_unique_kmers() == 2

    # two tables, sizes 11 and 13
    double = khmer._Hashbits(k, [11, 13])
    double.count('AAAA')  # 00 00 00 00 = 0
    double.count('ACTG')  # 00 10 01 11 = 2*16 +4 +3 = 39
    assert double.n_unique_kmers() == 2
    # AACG (= 11) collides with only the 1st k-mer, so it is counted as new;
    # AGAC (3*16 +2 = 50) collides with both the 2nd and 3rd k-mers, so the
    # unique count stays at 3.
    for kmer in ('AACG', 'AGAC'):
        double.count(kmer)
        assert double.n_unique_kmers() == 3
예제 #3
0
def test_load_gz():
    """Check that a gzip-compressed counting table file can be loaded.

    A table is saved uncompressed, the file is gzip-compressed by this test,
    and the compressed copy is loaded into a new table.  The abundance
    distributions computed from both tables must match.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer.CountingHash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.  ``with`` guarantees the handles are released on any error.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    ht = khmer.CountingHash(12, sizes)
    ht.load(loadpath)

    # a fresh tracking table is used for each distribution
    tracking = khmer._Hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
예제 #4
0
def test_bloom_c_2():  # simple one
    """Check unique k-mer counting under collisions, one vs. two tables."""
    ksize = 4

    # one table only (size 11): no bloom filter
    one_table = khmer._Hashbits(ksize, [11])
    for first_kmer in ('AAAA', 'ACTG'):  # values 0 and 39
        one_table.count(first_kmer)
    assert one_table.n_unique_kmers() == 2
    # AACG = 11 collides with the 1st k-mer, AGAC collides with the 2nd, so
    # neither raises the unique count.
    for later_kmer in ('AACG', 'AGAC'):
        one_table.count(later_kmer)
        assert one_table.n_unique_kmers() == 2

    # two tables, sizes 11 and 13
    two_tables = khmer._Hashbits(ksize, [11, 13])
    for first_kmer in ('AAAA', 'ACTG'):  # 0 and 2*16 +4 +3 = 39
        two_tables.count(first_kmer)
    assert two_tables.n_unique_kmers() == 2

    # 00 00 10 11 = 11: collision with only the 1st k-mer -> counted
    two_tables.count('AACG')
    assert two_tables.n_unique_kmers() == 3

    # 00 11 00 10 = 3*16 +2 = 50: collision with both the 2nd and 3rd k-mers
    two_tables.count('AGAC')
    assert two_tables.n_unique_kmers() == 3
예제 #5
0
def test_save_load_gz():
    """Round-trip a counting table through a path ending in ``.gz``.

    The table is saved to and re-loaded from the same ``.gz`` filename; the
    load must not raise ``IOError`` and both tables must yield the same
    abundance distribution.
    """
    fasta = utils.get_test_data('random-20-a.fa')
    gz_path = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m) + [1000005]

    saved = khmer.CountingHash(12, table_sizes)
    saved.consume_fasta(fasta)
    saved.save(gz_path)

    loaded = khmer.CountingHash(12, table_sizes)
    try:
        loaded.load(gz_path)
    except IOError as err:
        assert 0, 'Should not produce an IOError: ' + str(err)

    # one fresh tracking table per distribution
    dist_saved, dist_loaded = [
        table.abundance_distribution(fasta, khmer._Hashbits(12, table_sizes))
        for table in (saved, loaded)
    ]

    assert sum(dist_saved) == 3966, sum(dist_saved)
    assert dist_saved == dist_loaded, (dist_saved, dist_loaded)
예제 #6
0
def test_save_load_gz():
    """Save a counting table to a ``.gz`` path and load it back.

    Loading must not raise ``OSError``, and the re-loaded table must produce
    the same abundance distribution as the table that was saved.
    """
    source_fa = utils.get_test_data('random-20-a.fa')
    table_path = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m) + [1000005]

    original = khmer._CountingHash(12, table_sizes)
    original.consume_fasta(source_fa)
    original.save(table_path)

    restored = khmer._CountingHash(12, table_sizes)
    try:
        restored.load(table_path)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # each distribution is computed against its own empty tracking table
    before = original.abundance_distribution(
        source_fa, khmer._Hashbits(12, table_sizes))
    after = restored.abundance_distribution(
        source_fa, khmer._Hashbits(12, table_sizes))

    assert sum(before) == 3966, sum(before)
    assert before == after, (before, after)
예제 #7
0
def test_bloom_c_2():  # simple one
    """Exercise ``n_unique_kmers`` when k-mers collide in the tables.

    A single table of size 11 treats the colliding k-mers as already seen;
    adding a second table of size 13 lets one of them be counted as new.
    """
    K = 4

    # single table, no bloom filter
    lone = khmer._Hashbits(K, [11])
    lone.count('AAAA')  # 00 00 00 00 = 0
    lone.count('ACTG')  # 00 10 01 11 = 39
    assert lone.n_unique_kmers() == 2
    for kmer in (
            'AACG',  # 00 00 10 11 = 11, collision with 1st k-mer
            'AGAC',  # 00 11 00 10, collision with 2nd k-mer
    ):
        lone.count(kmer)
        assert lone.n_unique_kmers() == 2

    # two tables with sizes 11, 13
    pair = khmer._Hashbits(K, [11, 13])
    pair.count('AAAA')  # 00 00 00 00 = 0
    pair.count('ACTG')  # 00 10 01 11 = 2*16 +4 +3 = 39
    assert pair.n_unique_kmers() == 2
    for kmer in (
            'AACG',  # = 11, collision with only the 1st k-mer
            'AGAC',  # 3*16 +2 = 50, collision with both 2nd and 3rd k-mers
    ):
        pair.count(kmer)
        assert pair.n_unique_kmers() == 3
예제 #8
0
def test_bloom_c_2():  # simple one
    """Verify unique k-mer counts for a 1-table and a 2-table ``_Hashbits``."""
    ksize = 4

    def count_all(table, kmers):
        # count each k-mer in order; the return values are not used
        for kmer in kmers:
            table.count(kmer)

    # only one table (size 11), so no bloom filter
    narrow = khmer._Hashbits(ksize, [11])
    count_all(narrow, ["AAAA", "ACTG"])  # 0 and 39
    assert narrow.n_unique_kmers() == 2
    narrow.count("AACG")  # 11: collision with the 1st k-mer
    assert narrow.n_unique_kmers() == 2
    narrow.count("AGAC")  # collision with the 2nd k-mer
    assert narrow.n_unique_kmers() == 2

    # two tables with sizes 11 and 13
    wide = khmer._Hashbits(ksize, [11, 13])
    count_all(wide, ["AAAA", "ACTG"])  # 0 and 2*16 +4 +3 = 39
    assert wide.n_unique_kmers() == 2
    wide.count("AACG")  # 11: collision with only the 1st k-mer
    assert wide.n_unique_kmers() == 3
    wide.count("AGAC")  # 3*16 +2 = 50: collides with both 2nd and 3rd k-mers
    assert wide.n_unique_kmers() == 3
예제 #9
0
def test_extract_unique_paths_2():
    """After consuming the first 10-mer, the path excludes only that k-mer."""
    sequence = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'
    table = khmer._Hashbits(10, [5, 7, 11, 13])

    table.consume('ATGGAGAGAC')
    paths = table.extract_unique_paths(sequence, 10, 1)
    print(paths)
    # all but the 1st k-mer
    assert paths == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']
예제 #10
0
def test_abund_dist_gz_bigcount():
    """Load a gzip-compressed bigcount table and check abundances above 255.

    ``load-into-counting.py`` builds the table, this test compresses the
    result with ``gzip``, and the compressed file is loaded.  The test passes
    when the abundance distribution has a non-zero entry at an index greater
    than 255.
    """
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)

    # compress the created bigcount table; ``with`` closes both handles
    with open(htfile, 'rb') as f_in:
        data = f_in.read()
    with gzip.open(outfile, 'wb') as f_out:
        f_out.write(data)

    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    # calculate abundance distribution for compressed bigcount table
    abundances = counting_hash.abundance_distribution(infile, tracking)

    # check if abundance is > 255
    # if ok, gzipped bigcount was loaded correctly
    flag = False
    for abundance, n_kmers in enumerate(abundances):
        # %-formatting prints "index value" under both Python 2 and 3
        print('%s %s' % (abundance, n_kmers))
        if abundance > 255 and n_kmers > 0:
            flag = True
            break
    assert flag
예제 #11
0
def test_save_load_tagset_trunc():
    """Loading a truncated tagset file must raise ``OSError``.

    A tagset with two tags is saved, then the file is rewritten with every
    proper prefix of its bytes.  ``load_tagset`` must reject each one.
    """
    htable = khmer._Hashbits(32, [1])

    outfile = utils.get_temp_filename('tagset')

    htable.add_tag('A' * 32)
    htable.add_tag('G' * 32)
    htable.save_tagset(outfile)

    # read the complete file so truncated copies can be written back
    with open(outfile, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # truncate tagset file to its first i bytes...
        with open(outfile, 'wb') as fp:
            fp.write(data[:i])

        # try loading it...
        try:
            htable.load_tagset(outfile)
            assert 0, "this test should fail"
        except OSError as err:
            print(str(err), i)

    # the file still holds the last truncation written by the loop;
    # try loading it once more...
    try:
        htable.load_tagset(outfile)
        assert 0, "this test should fail"
    except OSError:
        pass
예제 #12
0
def main():
    """Write a CSV row (k-mer, count) for each k-mer first seen in the inputs.

    A counting table is loaded from the file named on the command line.  Each
    input sequence has ``N`` replaced by ``A`` and is split into overlapping
    k-mers.  A tracking table ensures each k-mer is written only once.
    Output goes to ``args.output_file``, or stdout when that is ``None``.
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    table_filename = args.input_counting_table_filename
    print('hashtable from', table_filename, file=sys.stderr)
    counting_hash = khmer.load_counting_hash(table_filename)

    ksize = counting_hash.ksize()
    # remembers which k-mers have already been written out
    seen = khmer._Hashbits(  # pylint: disable=protected-access
        ksize, counting_hash.hashsizes())

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for start in range(len(seq) - ksize + 1):
                kmer = seq[start:start + ksize]
                if seen.get(kmer):
                    continue  # already reported
                seen.count(kmer)
                writer.writerow([kmer, str(counting_hash.get(kmer))])

    summary = 'Total number of unique k-mers: {0}'.format(
        counting_hash.n_unique_kmers())
    print(summary, file=sys.stderr)
예제 #13
0
 def test_save_load_merge_nexist(self):
     """Loading a subset partition map from a missing file raises IOError."""
     ht = khmer._Hashbits(20, [1])
     try:
         # the return value is irrelevant; only the exception matters
         ht.load_subset_partitionmap('this does not exist')
     except IOError as e:
         print(str(e))
     else:
         assert 0, "this should not succeed"
예제 #14
0
def test_abund_dist_gz_bigcount():
    """Load a gzip-compressed bigcount table and check abundances above 255.

    The table is created by ``load-into-counting.py``, compressed here with
    ``gzip`` and loaded again.  Loading must not raise ``IOError``, and the
    abundance distribution must have a non-zero entry at an index > 255.
    """
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)

    # compress the created bigcount table; ``with`` closes both handles even
    # if reading or writing fails
    with open(htfile, 'rb') as f_in:
        data = f_in.read()
    with gzip.open(outfile, 'wb') as f_out:
        f_out.write(data)

    # load the compressed bigcount table
    try:
        counting_hash = khmer.load_counting_hash(outfile)
    except IOError as err:
        assert 0, 'Should not produce IOError: ' + str(err)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    # calculate abundance distribution for compressed bigcount table
    abundances = counting_hash.abundance_distribution(infile, tracking)

    # check if abundance is > 255
    # if ok, gzipped bigcount was loaded correctly
    flag = False
    for abundance, n_kmers in enumerate(abundances):
        print(abundance, n_kmers)
        if abundance > 255 and n_kmers > 0:
            flag = True
            break
    assert flag
예제 #15
0
def test_extract_unique_paths_2():
    """Consume the leading 10-mer, then expect the rest back as one path."""
    full_seq = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'
    expected = ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer

    hashbits = khmer._Hashbits(10, [5, 7, 11, 13])
    hashbits.consume('ATGGAGAGAC')

    result = hashbits.extract_unique_paths(full_seq, 10, 1)
    print(result)
    assert result == expected
예제 #16
0
def test__get_set_tag_density():
    """Setting the tag density to 2 is reflected by the getter."""
    table = khmer._Hashbits(32, [1])

    # the starting density must differ from 2, or the check proves nothing
    initial_density = table._get_tag_density()
    assert initial_density != 2

    table._set_tag_density(2)
    updated_density = table._get_tag_density()
    assert updated_density == 2
예제 #17
0
def test_save_load_tagset_trunc():
    """``load_tagset`` must raise ``OSError`` for every truncated tagset file.

    The saved tagset is rewritten with each proper prefix of its contents and
    a load is attempted after every rewrite.
    """
    ht = khmer._Hashbits(32, [1])

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)

    # keep the complete contents so prefixes of it can be written back
    with open(outfile, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # truncate tagset file to i bytes; ``with`` closes the handle even
        # if the write fails
        with open(outfile, 'wb') as fp:
            fp.write(data[:i])

        # try loading it...
        try:
            ht.load_tagset(outfile)
            assert 0, "this test should fail"
        except OSError as err:
            print(str(err), i)

    # try loading it once more; the file still holds the final truncation
    try:
        ht.load_tagset(outfile)
        assert 0, "this test should fail"
    except OSError:
        pass
예제 #18
0
 def test_save_load_merge_nexist(self):
     """Loading a subset partition map from a missing file raises OSError."""
     ht = khmer._Hashbits(20, [1])
     try:
         # only the raised exception is of interest, not the return value
         ht.load_subset_partitionmap('this does not exist')
     except OSError as e:
         print(str(e))
     else:
         assert 0, "this should not succeed"
예제 #19
0
def main():
    """Emit one CSV row per distinct k-mer with its count from a saved table.

    The counting table is loaded from ``args.input_counting_table_filename``.
    Every record of every input file is scanned k-mer by k-mer, after ``N``
    is replaced with ``A``.  K-mers already marked in the tracking table are
    skipped.  Rows are written to ``args.output_file``, defaulting to stdout.
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_counting_table_filename,
          file=sys.stderr)
    counts = khmer.load_counting_hash(args.input_counting_table_filename)

    k = counts.ksize()
    table_sizes = counts.hashsizes()
    # presence table used to avoid writing the same k-mer twice
    already_written = khmer._Hashbits(  # pylint: disable=protected-access
        k, table_sizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    csv_out = csv.writer(args.output_file)

    for path in args.input_sequence_filenames:
        for read in screed.open(path):
            sequence = read.sequence.replace('N', 'A')
            n_kmers = len(sequence) - k + 1
            for offset in range(n_kmers):
                kmer = sequence[offset:offset + k]
                if already_written.get(kmer):
                    continue
                already_written.count(kmer)
                csv_out.writerow([kmer, str(counts.get(kmer))])

    total_message = 'Total number of unique k-mers: {0}'.format(
        counts.n_unique_kmers())
    print(total_message, file=sys.stderr)
예제 #20
0
def test_extract_unique_paths_1():
    """After consuming the last 10-mer, the path excludes only that k-mer."""
    sequence = "ATGGAGAGACACAGATAGACAGGAGTGGCGATG"
    table = khmer._Hashbits(10, [5, 7, 11, 13])

    table.consume("AGTGGCGATG")
    paths = table.extract_unique_paths(sequence, 10, 1)
    print(paths)
    # all but the last k-mer
    assert paths == ["ATGGAGAGACACAGATAGACAGGAGTGGCGAT"]
예제 #21
0
def test__get_set_tag_density():
    """``_set_tag_density`` changes the value ``_get_tag_density`` returns."""
    new_density = 2
    hashbits = khmer._Hashbits(32, [1])

    # precondition: the table does not already report the new density
    before = hashbits._get_tag_density()
    assert before != new_density

    hashbits._set_tag_density(new_density)
    assert hashbits._get_tag_density() == new_density
예제 #22
0
def test_find_stoptags():
    """``identify_stoptags_by_position`` reports each stop-tag position.

    The stop tag ``AAAAA`` is found in all-A and in all-T sequences
    (presumably because T-runs are its reverse complement; confirm).
    """
    table = khmer._Hashbits(5, [1])
    table.add_stop_tag("AAAAA")

    expected_positions = (
        ("AAAAA", [0]),
        ("AAAAAA", [0, 1]),
        ("TTTTT", [0]),
        ("TTTTTT", [0, 1]),
    )
    for sequence, positions in expected_positions:
        assert table.identify_stoptags_by_position(sequence) == positions
예제 #23
0
    def test_count_A(self):
        """The all-A file yields exactly one k-mer, at abundance 10."""
        fasta_path = utils.get_test_data('all-A.fa')

        seen = khmer._Hashbits(4, [5])
        distribution = self.kh.abundance_distribution(fasta_path, seen)

        # a single entry in total, located at index 10
        assert sum(distribution) == 1
        assert distribution[10] == 1
예제 #24
0
    def test_count_A(self):
        """Abundance distribution of ``all-A.fa`` has one entry, at index 10."""
        all_a_path = utils.get_test_data('all-A.fa')
        tracker = khmer._Hashbits(4, [5])

        abundances = self.kh.abundance_distribution(all_a_path, tracker)

        total = sum(abundances)
        assert total == 1
        assert abundances[10] == 1
예제 #25
0
def test_find_stoptags():
    """Stop tag ``AAAAA`` is located at every k-mer position of A/T runs."""
    hashbits = khmer._Hashbits(5, [1])
    hashbits.add_stop_tag("AAAAA")

    find = hashbits.identify_stoptags_by_position
    # one 5-mer -> position 0; a 6-base run holds two 5-mers -> 0 and 1
    for base in ("A", "T"):
        assert find(base * 5) == [0]
        assert find(base * 6) == [0, 1]
예제 #26
0
def test_count_kmer_degree():
    """Check ``kmer_degree`` for four k-mers after consuming ``all-A.fa``."""
    fasta_path = utils.get_test_data('all-A.fa')
    table = khmer._Hashbits(4, [3, 5])
    table.consume_fasta(fasta_path)

    expected_degrees = (
        ('AAAA', 2),
        ('AAAT', 1),
        ('AATA', 0),
        ('TAAA', 1),
    )
    for kmer, degree in expected_degrees:
        assert table.kmer_degree(kmer) == degree
예제 #27
0
def test_tagset_ksize_check():
    """A k=31 table must refuse to load the ``k32`` tagset file."""
    table = khmer._Hashbits(31, [1])
    tagset_path = utils.get_test_data('goodversion-k32.tagset')

    try:
        table.load_tagset(tagset_path)
    except OSError as exc:
        print(str(exc))
    else:
        # reaching here means the mismatched file was accepted
        assert 0, "this should fail"
예제 #28
0
def test_tagset_ksize_check():
    """Loading ``goodversion-k32.tagset`` into a k=31 table raises OSError."""
    hashbits = khmer._Hashbits(31, [1])
    k32_tagset = utils.get_test_data("goodversion-k32.tagset")

    try:
        hashbits.load_tagset(k32_tagset)
    except OSError as error:
        print(str(error))
    else:
        assert 0, "this should fail"
예제 #29
0
def test_count_kmer_degree():
    """``kmer_degree`` returns the expected values for ``all-A.fa`` input."""
    hashbits = khmer._Hashbits(4, [3, 5])
    hashbits.consume_fasta(utils.get_test_data("all-A.fa"))

    degree_of = hashbits.kmer_degree
    assert degree_of("AAAA") == 2
    assert degree_of("AAAT") == 1
    assert degree_of("AATA") == 0
    assert degree_of("TAAA") == 1
예제 #30
0
def test_extract_unique_paths_0():
    """An unseen sequence is returned whole; a consumed one yields nothing."""
    sequence = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'
    table = khmer._Hashbits(10, [5, 7, 11, 13])

    # nothing consumed yet: the entire sequence is one unique path
    fresh = table.extract_unique_paths(sequence, 10, 1)
    assert fresh == [sequence]

    # after consuming it, no unique path remains
    table.consume(sequence)
    remaining = table.extract_unique_paths(sequence, 10, 1)
    assert not remaining
예제 #31
0
def test_stop_tags_filetype_check():
    """``load_stop_tags`` must reject a tagset file with IOError."""
    table = khmer._Hashbits(31, [1])
    wrong_type_path = utils.get_test_data('goodversion-k32.tagset')

    try:
        table.load_stop_tags(wrong_type_path)
    except IOError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #32
0
def test_count_kmer_degree():
    """Verify ``kmer_degree`` for several 4-mers once ``all-A.fa`` is loaded."""
    source = utils.get_test_data('all-A.fa')
    presence = khmer._Hashbits(4, [3, 5])
    presence.consume_fasta(source)

    # k-mers and expected degrees, checked in this order
    kmers = ['AAAA', 'AAAT', 'AATA', 'TAAA']
    degrees = [2, 1, 0, 1]
    for kmer, expected in zip(kmers, degrees):
        assert presence.kmer_degree(kmer) == expected
예제 #33
0
def test_extract_unique_paths_0():
    """``extract_unique_paths`` before and after consuming the sequence."""
    seq = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'
    hashbits = khmer._Hashbits(10, [5, 7, 11, 13])

    # empty table: the full sequence comes back as the only path
    assert hashbits.extract_unique_paths(seq, 10, 1) == [seq]

    # once consumed, the result is empty
    hashbits.consume(seq)
    assert not hashbits.extract_unique_paths(seq, 10, 1)
예제 #34
0
def test_tagset_filetype_check():
    """``load_tagset`` must reject a stoptags file with OSError."""
    table = khmer._Hashbits(31, [1])
    stoptags_path = utils.get_test_data('goodversion-k32.stoptags')

    try:
        table.load_tagset(stoptags_path)
    except OSError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #35
0
def test_save_load_tagset_notexist():
    """Loading a tagset from a temp path never written to raises OSError."""
    table = khmer._Hashbits(32, [1])
    missing_path = utils.get_temp_filename('tagset')

    try:
        table.load_tagset(missing_path)
    except OSError as exc:
        print(str(exc))
    else:
        assert 0, "this test should fail"
예제 #36
0
def test_tagset_filetype_check():
    """Feeding a stoptags file to ``load_tagset`` raises OSError."""
    hashbits = khmer._Hashbits(31, [1])
    not_a_tagset = utils.get_test_data('goodversion-k32.stoptags')

    try:
        hashbits.load_tagset(not_a_tagset)
    except OSError as error:
        print(str(error))
    else:
        # the wrong file type was accepted
        assert 0, "this should fail"
예제 #37
0
def test_save_load_tagset_notexist():
    """``load_tagset`` raises OSError for a temp filename with no file."""
    hashbits = khmer._Hashbits(32, [1])
    # only a filename is obtained here; this test writes nothing to it
    tagset_path = utils.get_temp_filename('tagset')

    try:
        hashbits.load_tagset(tagset_path)
    except OSError as error:
        print(str(error))
    else:
        assert 0, "this test should fail"
예제 #38
0
def test_count_within_radius_simple():
    """Only one k-mer is found around ``AAAA`` at radius 1 and at radius 10."""
    fasta_path = utils.get_test_data('all-A.fa')
    table = khmer._Hashbits(4, [3, 5])

    print(table.consume_fasta(fasta_path))

    for radius in (1, 10):
        found = table.count_kmers_within_radius('AAAA', radius)
        assert found == 1
예제 #39
0
def test_stoptags_file_version_check():
    """A stoptags file with a bad version must be rejected with OSError."""
    table = khmer._Hashbits(32, [1])
    bad_version_path = utils.get_test_data('badversion-k32.stoptags')

    try:
        table.load_stop_tags(bad_version_path)
    except OSError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #40
0
def test_hashbits_file_version_check():
    """Loading ``badversion-k12.htable`` into a k=12 table raises OSError."""
    table = khmer._Hashbits(12, [1])
    bad_version_path = utils.get_test_data('badversion-k12.htable')

    try:
        table.load(bad_version_path)
    except OSError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #41
0
def test_stoptags_file_version_check():
    """``load_stop_tags`` raises OSError for ``badversion-k32.stoptags``."""
    hashbits = khmer._Hashbits(32, [1])
    stoptags_file = utils.get_test_data('badversion-k32.stoptags')

    try:
        hashbits.load_stop_tags(stoptags_file)
    except OSError as error:
        print(str(error))
    else:
        # the bad-version file was loaded without complaint
        assert 0, "this should fail"
예제 #42
0
def test_tagset_file_version_check():
    """A tagset file with a bad version must be rejected with OSError."""
    table = khmer._Hashbits(32, [1])
    bad_version_path = utils.get_test_data('badversion-k32.tagset')

    try:
        table.load_tagset(bad_version_path)
    except OSError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #43
0
def test_hashbits_file_version_check():
    """``load`` raises OSError for the bad-version hashbits test file."""
    hashbits = khmer._Hashbits(12, [1])
    table_file = utils.get_test_data('badversion-k12.htable')

    try:
        hashbits.load(table_file)
    except OSError as error:
        print(str(error))
    else:
        # the bad-version file was loaded without complaint
        assert 0, "this should fail"
예제 #44
0
def test_count_within_radius_simple():
    """``count_kmers_within_radius('AAAA', r)`` is 1 for r = 1 and r = 10."""
    hashbits = khmer._Hashbits(4, [3, 5])
    consumed = hashbits.consume_fasta(utils.get_test_data('all-A.fa'))
    print(consumed)

    small_radius = hashbits.count_kmers_within_radius('AAAA', 1)
    assert small_radius == 1

    large_radius = hashbits.count_kmers_within_radius('AAAA', 10)
    assert large_radius == 1
예제 #45
0
def test_tagset_file_version_check():
    """``load_tagset`` raises IOError for ``badversion-k32.tagset``."""
    hashbits = khmer._Hashbits(32, [1])
    tagset_file = utils.get_test_data('badversion-k32.tagset')

    try:
        hashbits.load_tagset(tagset_file)
    except IOError as error:
        print(str(error))
    else:
        assert 0, "this should fail"
예제 #46
0
def test_fakelump_load_stop_tags_notexist():
    """Loading stop tags from a temp path never written to raises OSError."""
    missing_stoptags = utils.get_temp_filename('fakelump.fa.stopfoo')

    table = khmer._Hashbits(32, [5, 7, 11, 13])

    # ok, now try loading these stop tags; should fail.
    try:
        table.load_stop_tags(missing_stoptags)
    except OSError:
        pass  # the expected outcome
    else:
        assert 0, "this test should fail"
예제 #47
0
def test_fakelump_load_stop_tags_notexist():
    """``load_stop_tags`` raises OSError for a temp filename with no file."""
    hashbits = khmer._Hashbits(32, [5, 7, 11, 13])
    stoptags_path = utils.get_temp_filename("fakelump.fa.stopfoo")

    # this test writes nothing to the path, so the load is expected to fail
    try:
        hashbits.load_stop_tags(stoptags_path)
    except OSError:
        return
    assert 0, "this test should fail"
예제 #48
0
def test_hashbits_file_type_check():
    """A ``_Hashbits`` table must refuse a file saved by ``_CountingHash``."""
    # write a counting-table file to load with the wrong table type
    counting = khmer._CountingHash(12, [1])
    counting_path = utils.get_temp_filename('tempcountingsave0.ct')
    counting.save(counting_path)

    presence = khmer._Hashbits(12, [1])

    try:
        presence.load(counting_path)
    except OSError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #49
0
def test_hashbits_file_type_check():
    """Loading a saved ``_CountingHash`` file into ``_Hashbits`` raises OSError."""
    saved_file = utils.get_temp_filename('tempcountingsave0.ct')
    # the file on disk is written by a counting table...
    khmer._CountingHash(12, [1]).save(saved_file)

    # ...and then offered to a hashbits table
    hashbits = khmer._Hashbits(12, [1])
    try:
        hashbits.load(saved_file)
    except OSError as error:
        print(str(error))
    else:
        assert 0, "this should fail"
예제 #50
0
def test_filter_if_present():
    """``filter_if_present`` leaves exactly one record, named ``"3"``.

    The table consumes the mask file, then filters the input file into a
    temporary output file which is read back and checked.
    """
    ht = khmer._Hashbits(32, [3, 5])

    maskfile = utils.get_test_data("filter-test-A.fa")
    inputfile = utils.get_test_data("filter-test-B.fa")
    outfile = utils.get_temp_filename("filter")

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    # read every record before the handle is closed by the ``with`` block
    with open(outfile) as filtered:
        records = list(fasta_iter(filtered))
    assert len(records) == 1
    assert records[0]["name"] == "3"
예제 #51
0
def test_save_load():
    """A saved and re-loaded counting table gives the same distribution."""
    fasta = utils.get_test_data('random-20-a.fa')
    table_path = utils.get_temp_filename('tempcountingsave0.ht')

    table_sizes = list(PRIMES_1m) + [1000005]

    saved = khmer.CountingHash(12, table_sizes)
    saved.consume_fasta(fasta)
    saved.save(table_path)

    loaded = khmer.CountingHash(12, table_sizes)
    loaded.load(table_path)

    # one fresh tracking table per distribution
    dist_saved, dist_loaded = [
        table.abundance_distribution(fasta, khmer._Hashbits(12, table_sizes))
        for table in (saved, loaded)
    ]

    assert sum(dist_saved) == 3966, sum(dist_saved)
    assert dist_saved == dist_loaded, (dist_saved, dist_loaded)
예제 #52
0
def test_filter_if_present():
    """Filtering B against a table built from A leaves only record ``'3'``."""
    ht = khmer._Hashbits(32, [3, 5])

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    # ``with`` closes the output file once all records have been read
    with open(outfile) as filtered:
        records = list(fasta_iter(filtered))
    assert len(records) == 1
    assert records[0]['name'] == '3'
예제 #53
0
def test_save_load():
    """Save a counting table, load it elsewhere, compare distributions."""
    source_fa = utils.get_test_data('random-20-a.fa')
    save_file = utils.get_temp_filename('tempcountingsave0.ht')

    table_sizes = list(PRIMES_1m) + [1000005]

    original = khmer.CountingHash(12, table_sizes)
    original.consume_fasta(source_fa)
    original.save(save_file)

    restored = khmer.CountingHash(12, table_sizes)
    restored.load(save_file)

    # each distribution is computed against its own empty tracking table
    before = original.abundance_distribution(
        source_fa, khmer._Hashbits(12, table_sizes))
    after = restored.abundance_distribution(
        source_fa, khmer._Hashbits(12, table_sizes))

    assert sum(before) == 3966, sum(before)
    assert before == after, (before, after)
예제 #54
0
def test_filter_if_present():
    """After ``filter_if_present`` the output holds one record, named '3'."""
    htable = khmer._Hashbits(32, [3, 5])

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    htable.consume_fasta(maskfile)
    htable.filter_if_present(inputfile, outfile)

    # read the filtered records inside ``with`` so the handle is closed
    with open(outfile) as filtered:
        records = list(fasta_iter(filtered))
    assert len(records) == 1
    assert records[0]['name'] == '3'
예제 #55
0
def main():
    """Count k-mers in one sequence file and write them out as CSV.

    A counting table is built from ``args.input_sequence_filename`` using
    ``args.threads`` worker threads that share one ``ReadParser``.  The file
    is then re-read with screed and a (k-mer, count) row is written for each
    k-mer not yet marked in the tracking table.  Output goes to
    ``args.output_file``, or stdout when that is ``None``.
    """
    info('count-kmers-single.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_sequence_filename, False)

    print('making k-mer counting table', file=sys.stderr)
    counting_hash = khmer.CountingHash(
        args.ksize, args.max_tablesize, args.n_tables)
    # @CTB counting_hash.set_use_bigcount(args.bigcount)

    ksize = counting_hash.ksize()
    table_sizes = counting_hash.hashsizes()
    # presence table recording which k-mers were already written
    seen = khmer._Hashbits(  # pylint: disable=protected-access
        ksize, table_sizes)

    print('kmer_size: %s' % counting_hash.ksize(), file=sys.stderr)
    print('k-mer counting table sizes: %s' % (counting_hash.hashsizes(),),
          file=sys.stderr)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    workers = []
    print('consuming input, round 1 -- %s' % (args.input_sequence_filename),
          file=sys.stderr)
    for _ in range(args.threads):
        worker = threading.Thread(
            target=counting_hash.consume_fasta_with_reads_parser,
            args=(rparser, ))
        workers.append(worker)
        worker.start()

    # wait until every worker has finished consuming
    for worker in workers:
        worker.join()

    for record in screed.open(args.input_sequence_filename):
        seq = record.sequence.replace('N', 'A')
        for start in range(len(seq) - ksize + 1):
            kmer = seq[start:start + ksize]
            if seen.get(kmer):
                continue  # already reported
            seen.count(kmer)
            writer.writerow([kmer, str(counting_hash.get(kmer))])

    summary = 'Total number of unique k-mers: {0}'.format(
        counting_hash.n_unique_kmers())
    print(summary, file=sys.stderr)
예제 #56
0
def test_load_partitioned():
    """Consuming a partitioned FASTA records partitions and its k-mers."""
    fasta_path = utils.get_test_data('combine_parts_1.fa')
    table = khmer._Hashbits(32, [1])

    table.consume_partitioned_fasta(fasta_path)
    assert table.count_partitions() == (2, 0)

    # three 32-mers expected to be present; the last is the final 32 bases
    # of a longer sequence
    expected_kmers = (
        "CATGCAGAAGTTCCGCAACCATACCGTTCAGT",
        "CAAATGTACATGCACTTAAAATCATCCAGCCG",
        "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:],
    )
    for kmer in expected_kmers:
        assert table.get(kmer)
예제 #57
0
def test_consume_absentfasta_with_reads_parser():
    """``consume_fasta_with_reads_parser`` fails on bad input.

    With no argument it raises ``TypeError``.  With a parser for the
    ``empty-file`` test data, an ``OSError`` or ``ValueError`` must be raised
    while creating the parser or consuming from it.
    """
    table = khmer._Hashbits(31, [1])

    # case 1: the required parser argument is missing
    try:
        table.consume_fasta_with_reads_parser()
    except TypeError as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"

    # case 2: parser built on an empty file
    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        table.consume_fasta_with_reads_parser(parser)
    except (OSError, ValueError) as exc:
        print(str(exc))
    else:
        assert 0, "this should fail"
예제 #58
0
def test_load_partitioned():
    """Check partition counts and k-mer presence after a partitioned load."""
    hashbits = khmer._Hashbits(32, [1])
    hashbits.consume_partitioned_fasta(
        utils.get_test_data('combine_parts_1.fa'))

    assert hashbits.count_partitions() == (2, 0)

    first_kmer = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    second_kmer = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
    # trailing 32 bases of a longer sequence
    tail_kmer = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]

    for kmer in [first_kmer, second_kmer, tail_kmer]:
        assert hashbits.get(kmer)
예제 #59
0
    def test_not_output_unassigned(self):
        """``output_partitions`` with ``False`` writes no records.

        The input file itself must contain records.  The third argument is
        presumably the output-unassigned flag, going by the test name; confirm.
        """
        import screed

        fasta_path = utils.get_test_data('random-20-a.fa')

        table = khmer._Hashbits(21, [5, 7, 11, 13])
        table.consume_fasta_and_tag(fasta_path)

        partitioned_path = utils.get_temp_filename('parttest')
        table.output_partitions(fasta_path, partitioned_path, False)

        n_input = len(list(screed.open(fasta_path)))
        n_output = len(list(screed.open(partitioned_path)))

        assert n_input > 0
        assert n_output == 0, n_output