def construct_tree_inline(root_table, key_list):
    """Build a BST from a precomputed root-index table, without recursion.

    root_table[i][j] holds the index (into key_list) of the optimal root
    for the key range [i, j]; an explicit stack stands in for recursion.
    Returns the root BSTTree node.
    """
    lo, hi = 0, len(root_table) - 1
    top = root_table[lo][hi]
    root = BSTTree(key_list[top])

    # Each pending entry is (range lo, range hi, parent node to attach to).
    pending = []
    if top + 1 <= hi:
        pending.append((top + 1, hi, root))
    if lo <= top - 1:
        pending.append((lo, top - 1, root))

    while pending:
        lo, hi, parent = pending.pop()
        split = root_table[lo][hi]
        child = BSTTree(key_list[split])
        # Attach on the side dictated by BST ordering.
        if child.value < parent.value:
            parent.left = child
        else:
            parent.right = child
        if split + 1 <= hi:
            pending.append((split + 1, hi, child))
        if lo <= split - 1:
            pending.append((lo, split - 1, child))

    return root
def find_optimal_tree_ordering(beta_list, alpha_list, beta_length, key_list):
    """
    Returns a binary search tree ordered according to Rule 1 — the key with
    the highest access probability becomes the root of each subtree — given
    key probabilities (beta_list) and gap probabilities (alpha_list).
    key_list holds the (sorted) keys; beta_length is len(beta_list).
    """
    root = None
    # Holds subdivisions of the beta value array: (left, right, parent node).
    section_queue = Queue()
    section_queue.put((0, beta_length - 1, root))
    # Iteratively find the best root for each subtree
    while not section_queue.empty():
        (left_index, right_index, parent) = section_queue.get()
        # Skip empty / out-of-range sections produced by the splits below.
        if (left_index < 0 or right_index >= beta_length
                or left_index > right_index):
            continue
        if left_index == right_index:
            # Single-key range: attach a leaf directly.
            node = BSTTree(key_list[left_index])
            if parent is None:
                root = node
            # BUG FIX: compare the key value (not the raw index) against the
            # parent's key, matching the multi-key branch below.
            elif key_list[left_index] < parent.value:
                parent.left = node
            else:
                parent.right = node
            continue
        # Scan the range for the key(s) with maximum probability.
        max_prob = 0
        best_splits = [left_index]
        for i in range(left_index, right_index + 1):
            if beta_list[i] > max_prob:
                max_prob = beta_list[i]
                best_splits = [i]
            # BUG FIX: elif — previously a fresh maximum satisfied the
            # equality test too and was appended twice, skewing the
            # median tie-break below.
            elif beta_list[i] == max_prob:
                best_splits.append(i)
        # Tie-break: take the middle candidate. // keeps integer indexing
        # under both Python 2 and Python 3.
        best_split = best_splits[len(best_splits) // 2]
        node = BSTTree(key_list[best_split])
        if parent is None:
            root = node
        elif key_list[best_split] < parent.value:
            parent.left = node
        else:
            parent.right = node
        # Recurse (iteratively) on both sides of the chosen root.
        section_queue.put((left_index, best_split - 1, node))
        section_queue.put((best_split + 1, right_index, node))
    return root
def test():
    """Benchmark driver: builds several BST variants over a word corpus and
    reports build time, average search time, and average search depth.

    Command line: python benchmarks.py corpus.txt num_words test_type
    where test_type selects the dataset (small/medium/large/leaf/key/uniform).
    """
    # Quick arity check so sys.argv[1] below is safe to read.
    if len(sys.argv) < 2:
        print "Usage: python benchmarks.py corpus.txt num_words test_type"
        exit(1)
    corpusfile = sys.argv[1]
    # Defaults; num_words / test_type are overridden from argv below.
    num_words = 10000
    small_words = 10000
    big_words = 160000
    num_searches = 100000  # NOTE(review): never read again — dead variable
    test_type = "small"
    if len(sys.argv) != 4:
        print "Usage: python benchmarks.py corpus.txt num_words test_type"
        exit(1)
    else:
        num_words = int(sys.argv[2])
        test_type = sys.argv[3]
    even_dist_keys = 3000  # key count for the synthetic "uniform" dataset
    random_repeats = 3     # rebuild count for the randomized trees (AVL, naive BST)
    # Convert text document of English words into Python list of strings/words
    corpus = [word for line in open(corpusfile, 'r') for word in line.split()]
    standard_corp = corpus[:num_words]
    # Number of passes over the search list when timing lookups
    corpus_repeats = 2
    # Exactly one dataset is appended, depending on test_type.
    # Each entry is (label, generate_probs*-result, key list); the loop below
    # unpacks the middle element as (alphas, betas, beta_values) — presumably
    # gap probs, key probs, key values; verify against generate_probs.
    datasets = []
    if test_type == "small":
        # small dataset
        datasets.append(("small dataset", generate_probs(corpus[:small_words]),
                         corpus[:small_words]))
    elif test_type == "medium":
        # medium dataset
        datasets.append(
            ("medium dataset", generate_probs(standard_corp), standard_corp))
    elif test_type == "large":
        # large dataset
        datasets.append(("large dataset", generate_probs(corpus[:big_words]),
                         corpus[:big_words]))
    elif test_type == "leaf":
        # high leaf (gap) probabilities
        datasets.append(
            ("high leaf dataset", generate_probs_high_leaf(standard_corp),
             standard_corp))
    elif test_type == "key":
        # high key probabilities (half-size corpus; py2 integer division)
        datasets.append(
            ("high key ds", generate_probs_high_key(standard_corp[:num_words / 2]),
             standard_corp[:num_words / 2]))
    elif test_type == "uniform":
        # uniform probabilities over synthetic integer keys
        datasets.append(("uniform ds", generate_probs_uniform(even_dist_keys),
                         [i for i in range(even_dist_keys)]))
    for (name, (alphas, betas, beta_values), corpora) in datasets:
        print "========================================"
        print "running", name
        print len(beta_values)
        # Shuffled insertion order for the randomized (AVL / naive BST) builds.
        insert_indices = [i for i in range(len(beta_values))]
        shuffle(insert_indices)
        searches = corpora
        # MEHLHORN: nearly-optimal build; min(betas)/2 is presumably the
        # EPSILON tolerance — confirm against Nlogn_build's signature.
        print
        print "MEHLHORN KNUTH"
        start = time.time()
        nlogntree = Nlogn_build(betas, alphas, len(betas), beta_values,
                                min(betas) / 2)
        end = time.time()
        cons_time = end - start
        # find(k)[1] is taken as the depth at which k was found.
        depths = []
        start = time.time()
        for x in range(corpus_repeats):
            for k in searches:
                depths.append(nlogntree.find(k)[1])
        end = time.time()
        print "BUILD TIME:", cons_time, "AVG SEARCH TIME:", (end - start) / (
            len(searches) * corpus_repeats), "AVG DEPTH:", float(
                sum(depths)) / len(depths)
        # KNUTH OPTION 1: Rule-1 (highest-probability root) heuristic.
        print
        print "KNUTH ROOT METHOD"
        start = time.time()
        root_tree = Knuth_Rule1(betas, alphas, len(betas), beta_values)
        end = time.time()
        cons_time = end - start
        depths = []
        start = time.time()
        for x in range(corpus_repeats):
            for k in searches:
                depths.append(root_tree.find(k)[1])
        end = time.time()
        print "BUILD TIME:", cons_time, "AVG SEARCH TIME:", (end - start) / (
            len(searches) * corpus_repeats), "AVG DEPTH:", float(
                sum(depths)) / len(depths)
        # AVL: averaged over random_repeats shuffled rebuilds.
        # NOTE(review): the inner timing loop reuses loop variable x,
        # shadowing the outer repeat counter (harmless here, but fragile).
        print
        print "AVL"
        avg_build_time = 0.0
        avg_depth = 0.0
        for x in range(random_repeats):
            start = time.time()
            avl_tree = AVLTree(None)
            # AVL insert returns the (possibly new) root, hence the rebind.
            for v in insert_indices:
                avl_tree = avl_tree.insert(beta_values[v])
            end = time.time()
            cons_time = end - start
            depths = []
            start = time.time()
            for x in range(corpus_repeats):
                for k in searches:
                    depths.append(avl_tree.find(k)[1])
            end = time.time()
            avg_build_time += cons_time
            avg_depth += float(sum(depths)) / len(depths)
        # NOTE(review): search time shown is from the LAST repeat only;
        # build time and depth are averaged.
        print "BUILD TIME:", avg_build_time / random_repeats, "AVG SEARCH TIME:", (
            end - start) / (len(searches) * corpus_repeats
                            ), "AVG DEPTH:", avg_depth / random_repeats
        # NAIVE BST: same protocol; insert mutates in place (no rebind).
        print
        print "NAIVE BST"
        avg_build_time = 0.0
        avg_depth = 0.0
        for x in range(random_repeats):
            start = time.time()
            bst_tree = BSTTree(None)
            for v in insert_indices:
                bst_tree.insert(beta_values[v])
            end = time.time()
            cons_time = end - start
            depths = []
            start = time.time()
            for x in range(corpus_repeats):
                for k in searches:
                    depths.append(bst_tree.find(k)[1])
            end = time.time()
            avg_build_time += cons_time
            avg_depth += float(sum(depths)) / len(depths)
        print "BUILD TIME:", avg_build_time / random_repeats, "AVG SEARCH TIME:", (
            end - start) / (len(searches) *
                            corpus_repeats
                            ), "AVG DEPTH:", avg_depth / random_repeats
        # KNUTH: exact optimal BST (expected cost + root table), then build.
        print
        print "OPTIMAL KNUTH"
        start = time.time()
        (exp, root) = Knuth_find(betas, alphas, len(betas))
        Knuth_tree = Knuth_build(root, beta_values)
        end = time.time()
        cons_time = end - start
        depths = []
        start = time.time()
        for x in range(corpus_repeats):
            for k in searches:
                depths.append(Knuth_tree.find(k)[1])
        end = time.time()
        print "BUILD TIME:", cons_time, "AVG SEARCH TIME:", (end - start) / (
            len(searches) * corpus_repeats), "AVG DEPTH:", float(
                sum(depths)) / len(depths)
    # Peak RSS; ru_maxrss units are platform-dependent (kB on Linux,
    # bytes on macOS) — the /1000000 scaling assumes one of these.
    # NOTE(review): "megaytes" typo is in the runtime string; left untouched.
    print "\nMemory:", float(
        resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss) / 1000000, "megaytes used"
def __init__(self, value):
    """Create a node holding *value*, delegating storage to BSTTree.

    Adds the AVL-style bookkeeping fields on top of the base node.
    """
    BSTTree.__init__(self, value)
    # Fresh node: no parent link yet, subtrees trivially balanced.
    self.parent = None
    self.balance = 0
# bsttests.py # COMP 150 # Created by Alex King # 3/31/2017 # Testing the functionality, correctness, and general runtime of BSTTree from BSTTree import BSTTree from random import randint import time import sys def print_value(x): sys.stdout.write(str(x) + "\n") myTree = BSTTree(None) values = [] for i in range(30): random = randint(0, 500) values.append(random) myTree.insert(random) print myTree # Ensure insertion works as expected for v in values: if not myTree.find(v): print "Inserted value %d not found in tree!" % v exit(1)
def find_optimal_tree_ordering(beta_list, alpha_list, beta_length, key_list,
                               EPSILON):
    """
    Returns nearly optimal binary search tree given key probabilities
    (beta_list) and gap probabilities (alpha_list).

    For each range it walks a pointer inward from each end, tracking the
    probability mass on either side of a candidate split, and roots the
    subtree at the first split where the left/right imbalance stops
    improving by more than EPSILON (a weight-balancing heuristic in the
    style of Mehlhorn's approximation — not an exact optimum).
    key_list holds the sorted keys; beta_length is len(beta_list).
    """
    root = None
    # holds subdivisions of beta value array
    section_queue = Queue()
    # Each queue entry: (left index, right index, total probability mass of
    # the section, parent node to attach the chosen root to).
    section_queue.put((0, beta_length - 1, 1, root))
    # Iteratively find the best root for each subtree
    while not section_queue.empty():
        # find best split
        (left_index, right_index, prob_sum, parent) = section_queue.get()
        # Discard empty/out-of-range sections produced by the splits below.
        if (left_index < 0 or right_index >= beta_length
                or left_index > right_index):
            continue
        if left_index == right_index:
            # Single-key section: attach it directly as a leaf.
            node = BSTTree(key_list[left_index])
            if parent is None:
                root = node
            elif key_list[left_index] < parent.value:
                parent.left = node
            else:
                parent.right = node
            continue
        # Mass strictly left of the leftmost key / right of the rightmost key.
        left_prob_sum = alpha_list[left_index]
        right_prob_sum = alpha_list[right_index + 1]
        # Imbalance if we rooted at the extreme left / extreme right key.
        left_last_diff = abs(left_prob_sum -
                             (prob_sum - left_prob_sum - beta_list[left_index]))
        right_last_diff = abs(right_prob_sum -
                              (prob_sum - right_prob_sum -
                               beta_list[right_index]))
        # Alternate one inward step from each end per iteration; whichever
        # side first stops improving fixes the split and breaks out.
        for i in xrange(1, (right_index - left_index + 1)):
            # Move lefthand pointer inwards and calculate new split
            left_prob_sum += beta_list[left_index + i - 1] + \
                alpha_list[left_index + i]
            new_diff = abs(left_prob_sum -
                           (prob_sum - left_prob_sum -
                            beta_list[left_index + i]))
            if (new_diff < left_last_diff) and (abs(new_diff - left_last_diff)
                                                > EPSILON):
                # Still improving meaningfully: keep walking inward.
                left_last_diff = new_diff
            else:
                # Imbalance stopped improving: root at the previous position.
                best_split = left_index + i - 1
                node = BSTTree(key_list[best_split])
                if parent is None:
                    root = node
                elif key_list[best_split] < parent.value:
                    parent.left = node
                else:
                    parent.right = node
                # Undo the last step to get the mass left of best_split,
                # then enqueue both child sections with their masses.
                prev_left_prob_sum = left_prob_sum - beta_list[
                    left_index + i - 1] - alpha_list[left_index + i]
                section_queue.put(
                    (left_index, best_split - 1, prev_left_prob_sum, node))
                section_queue.put(
                    (best_split + 1, right_index,
                     (prob_sum - prev_left_prob_sum - beta_list[best_split]),
                     node))
                break
            # Move righthand pointer inwards and calculate new split
            right_prob_sum += (beta_list[right_index + 1 - i] +
                               alpha_list[right_index + 1 - i])
            new_diff = abs(right_prob_sum -
                           (prob_sum - right_prob_sum -
                            beta_list[right_index - i]))
            if new_diff < right_last_diff and abs(new_diff -
                                                  right_last_diff) > EPSILON:
                # Still improving meaningfully: keep walking inward.
                right_last_diff = new_diff
            else:
                # Mirror of the left-pointer case, approaching from the right.
                best_split = right_index - i + 1
                node = BSTTree(key_list[best_split])
                if parent is None:
                    root = node
                elif key_list[best_split] < parent.value:
                    parent.left = node
                else:
                    parent.right = node
                prev_right_prob_sum = right_prob_sum - beta_list[
                    right_index + 1 - i] - alpha_list[right_index + 1 - i]
                section_queue.put(
                    (best_split + 1, right_index, prev_right_prob_sum, node))
                section_queue.put(
                    (left_index, best_split - 1,
                     (prob_sum - prev_right_prob_sum - beta_list[best_split]),
                     node))
                break
    return root