Exemplo n.º 1
0
def test_get_primes():
    """get_n_primes_near_x returns ints even when the bound is a float."""
    expected = [19, 17, 13, 11, 7, 5, 3]

    primes = khmer.get_n_primes_near_x(7, 20)
    assert primes == expected

    primes_not_float = khmer.get_n_primes_near_x(7, 20.)
    assert primes_not_float == expected
    assert all(isinstance(p, int) for p in primes_not_float)
Exemplo n.º 2
0
def test_get_primes():
    """An int bound and a float bound give the same all-int prime list."""
    for bound in (20, 20.):
        primes = khmer.get_n_primes_near_x(7, bound)
        assert primes == [19, 17, 13, 11, 7, 5, 3]
    # Results from the float bound must still be plain ints.
    assert all(isinstance(p, int) for p in primes)
Exemplo n.º 3
0
def test_get_primes_fal():
    """Asking for more primes than exist below the bound must raise."""
    try:
        khmer.get_n_primes_near_x(5, 5)
    except AssertionError:
        raise
    except Exception as err:
        # Any (non-assertion) failure is fine as long as the message matches.
        assert "unable to find 5 prime numbers < 5" in str(err)
    else:
        assert 0, "previous statement should fail"
Exemplo n.º 4
0
def test_save_load_large(ctfile):
    """A countgraph with >2**31 buckets survives a save/load round trip."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    table_sizes = khmer.get_n_primes_near_x(1, 2**31 + 1000)

    original = khmer._Countgraph(12, table_sizes)
    original.consume_seqfile(inpath)
    original.save(savepath)

    reloaded = khmer.load_countgraph(savepath)

    count_before = original.n_occupied()
    count_after = reloaded.n_occupied()
    assert count_before == 3966, count_before
    assert count_after == count_before, count_after
Exemplo n.º 5
0
    def do_test(ctfile):
        """Round-trip a >2**31-bucket countgraph through save/load."""
        inpath = utils.get_test_data('random-20-a.fa')
        savepath = utils.get_temp_filename(ctfile)

        table_sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)

        graph = khmer._Countgraph(12, table_sizes)
        graph.consume_fasta(inpath)
        graph.save(savepath)

        copy = khmer.load_countgraph(savepath)

        want = graph.n_occupied()
        got = copy.n_occupied()
        assert want == 3966, want
        assert got == want, got
Exemplo n.º 6
0
def test_load_truncated():
    """Loading any truncated prefix of a saved countgraph raises OSError.

    Saves a small countgraph, then rewrites every proper prefix of its
    bytes to a second file and checks that loading each prefix fails
    with OSError rather than crashing or succeeding silently.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # Context manager: the original left this handle open (leak).
    with open(savepath, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # Rewrite the truncated file each round; 'with' guarantees the
        # data is flushed and closed before load_countgraph reads it.
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
Exemplo n.º 7
0
def test_load_truncated():
    """Loading any truncated prefix of a saved countgraph raises OSError.

    Variant using consume_fasta. Saves a small countgraph, then writes
    every proper prefix of its bytes to a second file and checks that
    loading each prefix fails with OSError.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # Context manager: the original left this handle open (leak).
    with open(savepath, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # 'with' guarantees the truncated bytes are flushed and the
        # handle closed before load_countgraph reads the file.
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
Exemplo n.º 8
0
def test_get_primes():
    """The seven primes nearest below 20 come back in descending order."""
    result = khmer.get_n_primes_near_x(7, 20)
    assert result == [19, 17, 13, 11, 7, 5, 3]
Exemplo n.º 9
0
def test_get_primes_fal():
    """Only three primes exist below 5, so asking for five must fail."""
    raised = False
    try:
        khmer.get_n_primes_near_x(5, 5)
    except RuntimeError as err:
        raised = True
        assert "unable to find 5 prime numbers < 5" in str(err)
    assert raised, "previous statement should fail"
Exemplo n.º 10
0
def test_get_primes_fal():
    """Requesting 5 primes below 5 raises RuntimeError with a clear message."""
    try:
        khmer.get_n_primes_near_x(5, 5)
    except RuntimeError as err:
        assert "unable to find 5 prime numbers < 5" in str(err)
    else:
        assert 0, "previous statement should fail"
Exemplo n.º 11
0
def test_init_with_primes(sketchtype):
    """Explicit primes passed at construction become the hash sizes."""
    upper = random.randint(1000, 2000)
    wanted = khmer.get_n_primes_near_x(4, upper)
    sketch = sketchtype(31, 1, 1, primes=wanted)
    assert sketch.hashsizes() == wanted
Exemplo n.º 12
0
def test_get_primes():
    """Seven primes just below 20, largest first."""
    assert khmer.get_n_primes_near_x(7, 20) == [19, 17, 13, 11, 7, 5, 3]
Exemplo n.º 13
0
def main(filename):
    """Partition the sequence file *filename* N_HT-ways, five times over.

    Python 2 script (print statements, Queue module). Each run draws a
    fresh slice of N_HT hashtable sizes from a shuffled pool of primes,
    partitions the tagged graph with worker threads, merges the
    per-subset partition maps, and appends the partition count to
    log.txt. Relies on module-level names not visible here: K, N_HT,
    HASHTABLE_SIZE, SUBSET_SIZE, N_THREADS, load_ht, save_ht,
    stop_after_n_subsets, save_merged_pmap, remove_orig_pmap, worker.
    """
    global ht

    # Number of independent partitioning runs.
    n = 5

    basename = os.path.basename(filename)

    # NOTE(review): never closed on an exception path; a with-statement
    # would be safer.
    fd = open("log.txt", "w")

    primes = []

    # Pool N_HT * n primes from below and above HASHTABLE_SIZE, then
    # shuffle so each run uses a different random selection of sizes.
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)

    primes = below + above
    random.shuffle(primes)

    for run in range(n):

        print primes[run * N_HT:run * N_HT + N_HT]

        # Build this run's graph from its slice of N_HT table sizes.
        ht = khmer._new_hashbits(K, primes[run * N_HT:run * N_HT + N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')

            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(
                HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

        # did we just want to load the ht/tagset?
        if stop_after_n_subsets == 0:
            sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))

        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        # Sentinel so divvy[i + 1] is valid for the final subset below.
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i + 1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename,
             'w').write('%d subsets total\n' % (n_subsets))

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        # Drop the large graph before merging to release its memory.
        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i, )
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i, )
                os.unlink(pmap_file)

        # output partitions!
        # NOTE(review): the return value of output_partitions is
        # immediately overwritten by count_partitions on the next line.
        n_partitions = ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions

        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())

        # Remove this run's intermediate files from the working directory.
        # ('file' shadows the builtin; left unchanged in this doc pass.)
        for file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(file)

    fd.close()
Exemplo n.º 14
0
def main(filename):
    """Partition the sequence file *filename* N_HT-ways, five times over.

    Python 2 script (print statements, Queue module); duplicate variant
    of the previous example with the original, unwrapped formatting.
    Each run draws a fresh slice of N_HT hashtable sizes from a shuffled
    pool of primes, partitions the tagged graph with worker threads,
    merges the per-subset partition maps, and appends the partition
    count to log.txt. Relies on module-level names not visible here:
    K, N_HT, HASHTABLE_SIZE, SUBSET_SIZE, N_THREADS, load_ht, save_ht,
    stop_after_n_subsets, save_merged_pmap, remove_orig_pmap, worker.
    """
    global ht
    
    # Number of independent partitioning runs.
    n = 5

    basename = os.path.basename(filename)

    # NOTE(review): never closed on an exception path; a with-statement
    # would be safer.
    fd = open("log.txt", "w")
    
    primes = []

    # Pool N_HT * n primes from below and above HASHTABLE_SIZE, then
    # shuffle so each run uses a different random selection of sizes.
    below = khmer.get_n_primes_near_x(N_HT * n, HASHTABLE_SIZE)
    above = khmer.get_n_primes_above_x(N_HT * n, HASHTABLE_SIZE)

    primes = below + above
    random.shuffle(primes)

    for run in range(n):

        print primes[run*N_HT:run*N_HT+N_HT]

        # Build this run's graph from its slice of N_HT table sizes.
        ht = khmer._new_hashbits(K, primes[run*N_HT:run*N_HT+N_HT])
        #ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    
        # populate the hash table and tag set
        if not load_ht:
            ht.consume_fasta_and_tag(filename)

            # save to a file (optional)
            if save_ht:
                ht.save(basename + '.ht')
                ht.save_tagset(basename + '.tagset')
            
            # calculate the hashtable occupancy
            print '---'
            print 'hashtable occupancy:', ht.n_occupied() / float(HASHTABLE_SIZE)
            print '---'
        else:
            ht.load(basename + '.ht')
            ht.load_tagset(basename + '.tagset')

        # did we just want to load the ht/tagset?
        if stop_after_n_subsets == 0:
            sys.exit(0)

        #stop_tags = pickle.load(open(sys.argv[2]))

        #for stop_tag in stop_tags:
        #    ht.add_stop_tag(stop_tag)

        # divide the tags up into subsets
        divvy = ht.divide_tags_into_subsets(SUBSET_SIZE)
        n_subsets = len(divvy)
        # Sentinel so divvy[i+1] is valid for the final subset below.
        divvy.append(0)

        # build a queue of tasks:
        worker_q = Queue.Queue()

        for i in range(0, n_subsets):
            if stop_after_n_subsets is not None and i >= stop_after_n_subsets:
                break

            start = divvy[i]
            end = divvy[i+1]
            worker_q.put((ht, i, start, end))

        open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

        threads = []
        for th in range(N_THREADS):
            t = threading.Thread(target=worker, args=(worker_q, basename))
            threads.append(t)
            t.start()

        # wait for threads
        for t in threads:
            t.join()

        ###

        # Drop the large graph before merging to release its memory.
        del ht
        gc.collect()

        # create a new, empty ht object for merging; K matters, but not
        # hashtable size.
        ht = khmer.new_hashbits(K, 1, 1)

        # load & merge all pmap files
        for i in range(0, n_subsets):
            pmap_file = basename + '.subset.%d.pmap' % (i,)
            ht.merge_subset_from_disk(pmap_file)

        # save merged partitionmap
        if save_merged_pmap:
            ht.save_partitionmap(basename + '.pmap.merged')

        if remove_orig_pmap:
            for i in range(0, n_subsets):
                pmap_file = basename + '.subset.%d.pmap' % (i,)
                os.unlink(pmap_file)

        # output partitions!
        # NOTE(review): the return value of output_partitions is
        # immediately overwritten by count_partitions on the next line.
        n_partitions = ht.output_partitions(filename, basename + '.part')
        (n_partitions, n_singletons) = ht.count_partitions()
        print n_partitions

        fd.write(str(n_partitions) + "\n")
        #print os.listdir(os.getcwd())

        # Remove this run's intermediate files from the working directory.
        # ('file' shadows the builtin; left unchanged in this doc pass.)
        for file in glob.glob(os.getcwd() + "/*pmap*"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.info"):
            os.remove(file)
        for file in glob.glob(os.getcwd() + "/*.part"):
            os.remove(file)

    fd.close()
Exemplo n.º 15
0
def test_init_with_primes(sketchtype):
    """hashsizes() reports exactly the primes given at construction."""
    chosen = khmer.get_n_primes_near_x(4, random.randint(1000, 2000))
    assert sketchtype(31, 1, 1, primes=chosen).hashsizes() == chosen