Example #1
def count_hash_incremental_move(output=True, decimals=4):
    """
    For all English words, starting with a hashtable of size 1,023 and
    a load factor of 0.75, time every put() and report each operation
    that is more costly than any before it, first for a normal dynamic
    hashtable and then for incremental resizing with shrinking delta.
    """
    import time
    from algs.table import DataTable
    from resources.english import english_words
    from ch03.book import CountableHash
    from ch03.hashtable_linked import DynamicHashtable
    from ch03.challenge import DynamicHashtableIncrementalResizing

    print(
        'Each emitted row contains an operation more costly than any before...'
    )
    ht_dynamic = DynamicHashtable(1023)
    tbl = DataTable([20, 10, 10], ['Word', 'N', 'cost'],
                    output=output,
                    decimals=decimals)
    tbl.format('Word', 's')
    tbl.format('N', ',d')

    max_cost = 0
    now = time.time()
    for w in english_words():
        before = time.time()
        ht_dynamic.put(CountableHash(w), w)
        cost = time.time() - before
        if cost > max_cost:
            max_cost = cost
            tbl.row([w, ht_dynamic.N, cost])
    total_normal = time.time() - now
    print('Normal:{}'.format(total_normal))

    for delta in [512, 256, 128, 64, 32, 16, 8, 4]:
        ht = DynamicHashtableIncrementalResizing(1023, delta=delta)

        tbl = DataTable([20, 10, 10], ['Word', 'N', 'cost'],
                        output=output,
                        decimals=decimals)
        tbl.format('Word', 's')
        tbl.format('N', ',d')

        max_cost = 0
        now = time.time()
        for w in english_words():
            before = time.time()
            ht.put(CountableHash(w), w)
            cost = time.time() - before
            if cost > max_cost:
                max_cost = cost
                tbl.row([w, ht.N, cost])
        total_delta = time.time() - now
        print('delta={}, Incremental:{}'.format(delta, total_delta))
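
The delta parameter above controls how many entries migrate per operation. The following is a minimal sketch of the idea, not the book's ch03.challenge implementation: incremental resizing allocates the larger bucket array immediately but moves only delta entries on each subsequent put(), so no single operation pays the full rehashing cost. The 0.75 load factor and doubling rule here are illustrative assumptions.

class IncrementalDict:
    """Sketch: separate-chaining map that spreads rehashing across many put()s."""
    def __init__(self, m=8, delta=4):
        self.table = [[] for _ in range(m)]
        self.old = None        # old bucket array, drained a little at a time
        self.old_idx = 0       # next old bucket to migrate
        self.delta = delta
        self.n = 0

    def _migrate(self):
        moved = 0
        while self.old is not None and moved < self.delta:
            if self.old_idx == len(self.old):
                self.old = None            # migration complete
                break
            for k, v in self.old[self.old_idx]:
                self.table[hash(k) % len(self.table)].append((k, v))
                moved += 1
            self.old[self.old_idx] = []
            self.old_idx += 1

    def put(self, k, v):
        self._migrate()
        for buckets in ([self.old] if self.old is not None else []) + [self.table]:
            b = buckets[hash(k) % len(buckets)]
            for i, (bk, _) in enumerate(b):
                if bk == k:
                    b[i] = (k, v)          # update in place, wherever key lives
                    return
        self.table[hash(k) % len(self.table)].append((k, v))
        self.n += 1
        if self.old is None and self.n >= 0.75 * len(self.table):
            self.old, self.old_idx = self.table, 0
            self.table = [[] for _ in range(2 * len(self.old))]

    def get(self, k):
        for buckets in ([self.old] if self.old is not None else []) + [self.table]:
            for bk, bv in buckets[hash(k) % len(buckets)]:
                if bk == k:
                    return bv
        return None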
Example #2
    def test_challenge(self):
        from ch03.challenge import DynamicHashtableIncrementalResizing as Hashtable
        from resources.english import english_words
        ht = Hashtable(31, 5)
        for w in english_words():
            ht.put(w, w)

        # make sure all still present
        for w in english_words():
            self.assertEqual(w, ht.get(w))

        # now remove them one at a time
        for w in english_words():
            self.assertEqual(w, ht.remove(w))
Example #3
def time_results_linked(output=True, decimals=3):
    """Average time (in microseconds) to insert a key into a growing separate-chaining hashtable."""
    import timeit
    from algs.table import DataTable, comma
    from resources.english import english_words

    sizes = [8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
    tbl = DataTable([8] + [8] * len(sizes),
                    ['N'] + [comma(sz) for sz in sizes],
                    output=output,
                    decimals=decimals)
    # Time inserting num_to_add words into tables of each fixed size.
    # num_to_add starts at 32 and doubles up to 16,384.
    words = english_words()
    for num_to_add in [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]:
        all_words = words[:num_to_add]

        line = [num_to_add]
        for size in sizes:
            time1 = min(
                timeit.repeat(stmt='''
table = Hashtable({})
for word in words:
    table.put(word, 99)'''.format(size),
                              setup='''
from ch03.hashtable_linked import Hashtable
words={}'''.format(all_words),
                              repeat=1,
                              number=100))
            line.append(1000000 * time1 / (100 * num_to_add))  # avg microseconds per put
        tbl.row(line)
    return tbl
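
A note on the timeit pattern used throughout these timings: number controls how many times the statement runs per measurement, repeat controls how many independent measurements are taken, and taking min() over the repeats is the conventional way to suppress scheduler noise. A small self-contained illustration:

import timeit

# number=100: each measurement executes the statement 100 times;
# repeat=5: take five independent measurements and keep the fastest.
total = min(timeit.repeat(stmt='sum(range(1000))', repeat=5, number=100))
per_call = total / 100          # seconds for one execution of the statement
print('{:.3e} seconds per call'.format(per_call))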
Example #4
    def test_ordered_pq(self):
        from ch04.ordered import PQ
        from resources.english import english_words
        words = english_words()[:10000]
        pair = self.priority_queue_stress_test(PQ(len(words)), len(words))
        # Note: we cannot guarantee individual words BUT we can guarantee length
        self.assertEqual((len('acetylphenylhydrazine'), len('a')),
                         (len(pair[0]), len(pair[1])))
Example #5
    def test_builtin_heap_pq(self):
        from ch04.builtin import PQ
        from resources.english import english_words
        words = english_words()[:1000]
        pair = self.priority_queue_stress_test(PQ(len(words)), len(words))
        # Note: we cannot guarantee individual words BUT we can guarantee length
        self.assertEqual((len('abdominohysterectomy'), len('a')),
                         (len(pair[0]), len(pair[1])))
Example #6
def compare_dynamic_build_and_access_time(repeat=25, num=10, output=True):
    """Generate tables for build and access for AVL trees."""
    import math
    import timeit
    from ch06.symbol import BinaryTree
    from resources.english import english_words
    bt = BinaryTree()
    for w in english_words():
        bt.put(w, w)
    total = len(english_words())

    if output:
        print('This will take several minutes...')
        print('total number of words =', total)
        print('height of AVL tree for all English words =', bt.root.height)
        print('must be at least =', math.log(total + 1) / math.log(2) - 1)

    # When 'ht = BinaryTree()' is inside the stmt, we measure BUILD time.
    # When it is moved into the setup, we measure ACCESS time.
    t_build = min(
        timeit.repeat(stmt='''
ht = BinaryTree()
for w in words:
    ht.put(w,w)''',
                      setup='''
from ch06.symbol import BinaryTree
from resources.english import english_words
words = english_words()''',
                      repeat=repeat,
                      number=num)) / num

    t_access = min(
        timeit.repeat(stmt='''
for w in words:
    ht.get(w)''',
                      setup='''
from ch06.symbol import BinaryTree
from resources.english import english_words
ht = BinaryTree()
words = english_words()
for w in words:
    ht.put(w,w)''',
                      repeat=repeat,
                      number=num)) / num

    if output:
        print('Build-time =', t_build, ', Access-time = ', t_access)
    return (t_build, t_access)
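
The lower bound printed above follows from counting nodes: a binary tree of height h holds at most 2^(h+1) - 1 nodes, so any tree with n nodes has height at least log2(n+1) - 1. A quick check; the figure 321,129 is the dictionary word count the book reports, treated here as an assumption:

import math

def min_binary_tree_height(n):
    # n <= 2**(h+1) - 1  implies  h >= log2(n+1) - 1
    return math.ceil(math.log2(n + 1)) - 1

print(min_binary_tree_height(321129))   # 18 for a dictionary of this size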
Example #7
    def test_binary_tree_from_chapter_06(self):
        from ch06.pq import PQ
        from ch04.test import TestChapter4

        from resources.english import english_words
        words = english_words()
        pair = TestChapter4().priority_queue_stress_test(PQ(), len(words))
        # Note: we cannot guarantee individual words BUT we can guarantee length
        self.assertEqual((len('formaldehydesulphoxylate'), len('a')),
                         (len(pair[0]), len(pair[1])))
Example #8
def count_collisions(num_rows=0, output=True, decimals=1):
    """Generate table counting collisions."""
    from algs.table import DataTable, SKIP
    from resources.english import english_words
    from ch03.hashtable_linked import Hashtable as HL
    from ch03.hashtable_linked import stats_linked_lists
    from ch03.hashtable_open import Hashtable as OHL
    from ch03.hashtable_open import stats_open_addressing

    all_words = english_words()
    N = len(all_words)

    tbl = DataTable([10,8,8,8,8], ['M', 'Avg LL', 'Max LL', 'Avg OA', 'Max OA'],
                    output=output, decimals=decimals)
    tbl.format('Max LL', 'd')
    tbl.format('Max OA', 'd')

    M = 20*N
    hl = HL(M)
    ohl = OHL(M)
    for w in all_words:
        hl.put(w, 1)
        ohl.put(w, 1)
    avg_size_linked = stats_linked_lists(hl)
    avg_size_open = stats_open_addressing(ohl)
    tbl.row([M, avg_size_linked[0], avg_size_linked[1], avg_size_open[0], avg_size_open[1]])

    M = 2*N
    while M > N/16:
        hl = HL(M)
        ohl = OHL(M)
        for w in all_words:
            hl.put(w, 1)
            if M > N:               # open addressing fails once the table fills
                ohl.put(w, 1)
        avg_size_linked = stats_linked_lists(hl)

        if N < M:
            avg_size_open = stats_open_addressing(ohl)
        else:
            tbl.format('Avg OA', 's')
            tbl.format('Max OA', 's')
            avg_size_open = [SKIP, SKIP]

        num_rows -= 1
        tbl.row([M, avg_size_linked[0], avg_size_linked[1], avg_size_open[0], avg_size_open[1]])

        # While above N, shrink slowly (5% per step); once below, drop at a 60% clip
        if M > N:
            M = (M * 95) // 100
        else:
            M = (M * 6) // 10

        # To allow for testing, simple way to break out after a number of rows are generated.
        if num_rows == 0:
            break
    return tbl
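
For context on the numbers this table produces: with separate chaining the average chain length is simply the load factor alpha = N/M, while for open addressing the classic uniform-hashing approximation puts a successful search at about (1/alpha) * ln(1/(1 - alpha)) probes, which diverges as alpha approaches 1. That is why ohl.put() is skipped once M is no longer larger than N. A sketch of these standard estimates:

import math

def avg_chain_length(n, m):
    return n / m                      # separate chaining: alpha = N/M

def probes_successful(alpha):
    # uniform-hashing approximation for open addressing (Knuth)
    return (1 / alpha) * math.log(1 / (1 - alpha))

print(avg_chain_length(100, 200))     # alpha = 0.5
print(probes_successful(0.5))         # about 1.39 probes
print(probes_successful(0.95))        # about 3.15 probes -- degrades sharply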
Example #9
def check_for_duplicates():
    """
    Determine if there are any hash() clashes on the words in the English language.

    Because Python uses 64-bit hashcodes the likelihood is tremendously small.
    Also remember that Python now salts hash code values, so they are not the
    same from one run to the next.

    The Python code below finds no clashes on hash() values.

    The following Java code finds 11 clashes::

        import java.util.*;
        public class EnglishClash {
            public static void main(String[] args) throws Exception {
                java.io.File f = new java.io.File("words.english.txt");
                Scanner sc = new Scanner(f);
                Hashtable<Integer,String> ht = new Hashtable<>();
                while (sc.hasNextLine()) {
                    String s = sc.nextLine();
                    int i = s.hashCode();
                    if (ht.containsKey(i)) {
                        System.out.println("clash on " + s + " and " + ht.get(i));
                    } else {
                        ht.put(i, s);
                    }
                }
                sc.close();
            }
        }

    The above code reports the following clashes::

        clash on hazardless and agarwal
        clash on hierarch and crinolines
        clash on isohel and epistolaries
        clash on kindergartener and acouasm
        clash on misused and horsemints
        clash on poised and dentinalgia
        clash on proselytized and nonguard
        clash on righto and buzzards
        clash on unapprehending and fineable
        clash on unheavenly and hypoplankton
        clash on variants and gelato
    """
    hash_values = {}
    clashes = 0
    for w in english_words():
        hc = hash(w)
        if hc in hash_values:
            print('clash on', w, 'and', hash_values[hc])
            clashes += 1
        hash_values[hc] = w
    print('Number of duplicate hashcodes found for dictionary:', clashes)
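
Since the docstring notes that Python salts its string hashes, the following experiment (assuming PYTHONHASHSEED is not pinned in the environment) makes the non-determinism visible by computing the same hash in two fresh interpreter processes:

import subprocess
import sys

# hash('hazardless') computed in two separate interpreter processes will
# almost always differ, because each process draws a fresh random salt.
cmd = [sys.executable, '-c', "print(hash('hazardless'))"]
print(subprocess.run(cmd, capture_output=True, text=True).stdout.strip())
print(subprocess.run(cmd, capture_output=True, text=True).stdout.strip())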
Example #10
def count_collisions_dynamic(num_rows=0, output=True, decimals=2):
    """Generate data counting collisions for dynamic hashtables. Not used in book."""
    from algs.table import DataTable
    from resources.english import english_words
    from ch03.hashtable_linked import DynamicHashtable as DHL
    from ch03.hashtable_linked import stats_linked_lists
    from ch03.hashtable_open import DynamicHashtable as ODHL
    from ch03.hashtable_open import stats_open_addressing

    all_words = english_words()
    # start twice as big as the number of words, and reduce steadily, counting collisions
    N = len(all_words)
    M = 2*N

    tbl = DataTable([10,8,8,8,8], ['M', 'Avg LL', 'Max LL', 'Avg OA', 'Max OA'],
                    output=output, decimals=decimals)
    tbl.format('Max LL', 'd')
    tbl.format('Max OA', 'd')
    while M > N/16:
        dhl = DHL(M)
        odhl = ODHL(M)
        for w in all_words:
            dhl.put(w, 1)
            odhl.put(w, 1)

        avg_size_linked_dynamic = stats_linked_lists(dhl)
        avg_size_open_dynamic = stats_open_addressing(odhl)

        num_rows -= 1
        tbl.row([M, avg_size_linked_dynamic[0], avg_size_linked_dynamic[1],
                 avg_size_open_dynamic[0], avg_size_open_dynamic[1]])

        # While above N, shrink slowly (5% per step); once below, drop at a 60% clip
        if M > N:
            M = (M * 95) // 100
        else:
            M = (M * 6) // 10

        # To allow for testing, simple way to break out after a number of rows are generated.
        if num_rows == 0:
            break
    return tbl
Example #11
def count_hash(output=True, decimals=2):
    """
    For all English words, starting with a hashtable of size 1,023 and
    a load factor of 0.75, count how many times the hash code (i.e., %)
    is invoked.
    """
    from algs.table import DataTable
    from resources.english import english_words
    from ch03.book import CountableHash
    from ch03.hashtable_linked import DynamicHashtable

    ht = DynamicHashtable(1023)
    tbl = DataTable([20,10,10,10,10],['Word', 'M', 'N', '#insert', 'average'],
                    output=output, decimals=decimals)
    tbl.format('Word', 's')
    tbl.format('N', ',d')
    tbl.format('M', ',d')
    tbl.format('#insert', ',d')

    last_word = None
    for w in english_words():
        last_word = w
        last_m = ht.M
        last = CountableHash.hash_count
        ht.put(CountableHash(w), w)
        if CountableHash.hash_count != last + 1:
            tbl.row([w, last_m, ht.N, CountableHash.hash_count, CountableHash.hash_count/ht.N])

    tbl.row([last_word, last_m, ht.N, CountableHash.hash_count, CountableHash.hash_count/ht.N])

    # determine when next resize event would occur...
    for i in range(1, 200000):
        last = CountableHash.hash_count
        last_m = ht.M
        ht.put(CountableHash(last_word + str(i)), last_word)
        if CountableHash.hash_count != last + 1:
            tbl.row([last_word + str(i), last_m, ht.N,
                     CountableHash.hash_count, CountableHash.hash_count/ht.N])
            break

    return tbl
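
The final loop above probes for the next resize event empirically. Assuming the growth scheme the rows suggest (grow when N exceeds 0.75 * M, with M going to 2*M + 1 so sizes stay of the form 2^k - 1: 1023, 2047, 4095, ...), the resize points can also be predicted directly:

# Predicted resize thresholds; the 0.75 load factor and M -> 2*M + 1 growth
# rule are assumptions matching the behavior observed above.
M = 1023
for _ in range(8):
    print('with M = {:>9,}, resize triggers once N > {:>9,}'.format(M, int(0.75 * M)))
    M = 2 * M + 1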
Example #12
    def priority_queue_stress_test(self, pq, max_length=None):
        """
        Given an empty priority queue, add words from the English dictionary,
        where priority is the length of the word. Because some PQ
        implementations are so inefficient, max_length lets a caller
        restrict how many words are added.
        """
        from resources.english import english_words
        words = english_words()
        if max_length:
            words = words[:max_length]
        for w in words:
            pq.enqueue(w, len(w))

        # First word out is the longest; last one out is the shortest
        first = pq.dequeue()
        last = first                # in case only one word was enqueued
        while pq:
            last = pq.dequeue()

        # Should be drained
        with self.assertRaises(RuntimeError):
            pq.dequeue()

        return (first, last)
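
The invariant this stress test relies on (longest word dequeued first, shortest last, when priority is word length) can be seen in a few lines against Python's built-in heapq, negating priorities to get max-first behavior:

import heapq

pq = []
for w in ['a', 'tree', 'formaldehyde', 'of']:
    heapq.heappush(pq, (-len(w), w))    # negate: heapq is min-oriented

first = heapq.heappop(pq)[1]            # 'formaldehyde' -- longest out first
last = None
while pq:
    last = heapq.heappop(pq)[1]         # ends at 'a' -- shortest out last
print(first, last)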
Example #13
def time_results_open_addressing(num_rows=0, output=True, decimals=3):
    """Average time to insert a key in growing hashtable_open (in microseconds)."""
    sizes = [8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
    headers = [comma(s) for s in sizes]
    headers.insert(0,'N')
    tbl = DataTable([8,8,8,8,8,8,8,8,10], headers, output=output, decimals=decimals)

    # Time inserting num_to_add words into tables of each fixed size.
    # num_to_add starts at 32 and doubles up to 32,768.
    for num_to_add in [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]:
        all_words = english_words()[:num_to_add]

        line = [len(all_words)]
        for size in sizes:
            try:
                tbl.format(comma(size), '.3f')
                timing = min(timeit.repeat(stmt='''
table = Hashtable({})
for word in all_words:
    table.put(word, 99)'''.format(size), setup='''
from ch03.hashtable_open import Hashtable
from resources.english import english_words
all_words=english_words()[:{}]'''.format(num_to_add),repeat=1,number=100))
                timing = (1000000.0 * timing) / (100 * num_to_add)  # avg microseconds per put
            except RuntimeError:
                timing = SKIP

            line.append(timing)
        num_rows -= 1
        tbl.row(line)

        # Provide effective way to terminate early for testing.
        if num_rows == 0:
            break

    return tbl
Example #14

#######################################################################
if __name__ == '__main__':
    print(
        'Average insert times with separate chaining hashtables (in microseconds)')
    time_results_linked()
    print()

    print('Probability of failure')
    probability_of_failure()
    print()

    print('Statistics from the perfect hash on just a few words')
    ewords = english_words()
    simple_stats(ewords[:10])
    print()

    print('Compare performance of perfect hash with regular hashtable')
    random.shuffle(ewords)
    compare_time(ewords)
    print()

    print(
        'Trying to find two words in the dictionary with the same Python hash() value'
    )
    check_for_duplicates()
    print()
Example #15
    def test_bad_timing(self):
        from resources.english import english_words
        from ch03.challenge import bad_timing

        tbl = bad_timing(english_words()[:100], output=False)
        self.assertTrue(tbl.entry('Good', 'Max Len') > 0)
Example #16
def measure_performance_resize(max_d=50, output=True):
    """Generate table of statistics for table resizing up to (but not including) max_d."""
    from algs.table import DataTable
    from resources.english import english_words
    from ch03.hashtable_linked import DynamicHashtable
    from ch03.challenge import DynamicHashtableIncrementalResizing

    try:
        # Added in Python 3.7
        from time import time_ns
        timing = time_ns
    except ImportError:
        from time import time
        timing = time

    if output:
        print('Dynamic Resizing Hashtable')
    tbl = DataTable([8, 15, 15, 10, 10],
                    ['idx', 'word', 'time', 'old-size', 'new-size'],
                    output=output,
                    decimals=2)
    tbl.format('idx', 'd')
    tbl.format('word', 's')
    tbl.format('old-size', ',d')
    tbl.format('new-size', ',d')

    ht = DynamicHashtable(1023)
    idx = 1
    last = None
    average = 0
    words = english_words()
    for w in words:
        before = timing()
        old_size = len(ht.table)
        ht.put(w, w)
        new_size = len(ht.table)
        after = timing()
        average += (after - before)
        if last:
            if after - before > last:
                last = after - before
                tbl.row([idx, w, last, old_size, new_size])
        else:
            last = after - before
        idx += 1

    average /= len(words)
    ht = None
    if output:
        print('Average was ', average)
        print('Incremental Resizing Hashtable')

    tbl_ir = DataTable([8, 15, 15, 10, 10],
                       ['idx', 'word', 'time', 'old-size', 'new-size'],
                       output=output,
                       decimals=2)
    tbl_ir.format('idx', 'd')
    tbl_ir.format('word', 's')
    tbl_ir.format('old-size', ',d')
    tbl_ir.format('new-size', ',d')
    ht = DynamicHashtableIncrementalResizing(1023, 10)
    idx = 1
    last = None
    average = 0
    words = english_words()
    for w in words:
        before = timing()
        old_size = len(ht.table)
        ht.put(w, w)
        new_size = len(ht.table)
        after = timing()
        average += (after - before)
        if last:
            if after - before > last:
                last = after - before
                tbl_ir.row([idx, w, last, old_size, new_size])
        else:
            last = after - before
        idx += 1

    ht = None

    average /= len(words)
    if output:
        print('Average was ', average)
        print('Incremental Resizing dependent on Delta')
        print()

    tbl_d = DataTable([8, 10], ['Delta', 'Average'], output=output)
    tbl_d.format('Delta', 'd')
    for delta in range(1, max_d):
        ht = DynamicHashtableIncrementalResizing(1023, delta)
        average = 0
        words = english_words()
        for w in words:
            before = timing()
            ht.put(w, w)
            after = timing()
            average += (after - before)

        average /= len(words)
        tbl_d.row([delta, average])

    return (tbl, tbl_ir, tbl_d)
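
The spikes this function surfaces are the resize events; the reason the reported averages stay low is geometric amortization. With doubling, a table that ends with N elements performed rehashes of size roughly 2 + 4 + ... + N, which sums to less than 2N, i.e., constant work per insert on average. A worked check:

# Total elements rehashed across all doublings, relative to N inserts.
N = 2 ** 20
total_rehashed = sum(2 ** k for k in range(1, 21))   # 2 + 4 + ... + 2**20
print(total_rehashed / N)   # just under 2 rehash operations per insert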
Example #17
#######################################################################
if __name__ == '__main__':
    chapter = 3

    with ExerciseNum(1) as exercise_number:
        exercise_triangle_number_probing()
        print(caption(chapter, exercise_number), 'Fragment evaluation')

    with ExerciseNum(2) as exercise_number:
        evaluate_hashtable_sorted_chains()
        print(caption(chapter, exercise_number),
              'Hashtable with sorted linked list chains')

    # For the full exercise, remove the "[:5000]" below; with all words it takes too long for the book.
    with ExerciseNum(3) as exercise_number:
        bad_timing(english_words()[:5000])
        print(caption(chapter, exercise_number), 'ValueBadHash exercise')

    with ExerciseNum(4) as exercise_number:
        prime_number_difference(english_words())
        print(caption(chapter, exercise_number), 'Prime Number exercise')

    with ExerciseNum(5) as exercise_number:
        evaluate_DynamicHashtablePlusRemove()
        print(caption(chapter, exercise_number),
              'Open Addressing with Marked Elements as deleted.')

    with ExerciseNum(6) as exercise_number:
        count_hash_incremental_move()
        print(
            caption(chapter, exercise_number),