def test_resize_table_1(self):
    """
    Test resize_table() with Example #1 from the guidelines.

    :passed: yes
    """
    print("--- EXAMPLE 1 ---")
    hash_map = HashMap(20, hash_function_1)
    hash_map.put('key1', 10)
    # State before the resize: size, capacity, and lookups for the key.
    print(hash_map.size, hash_map.capacity, hash_map.get('key1'),
          hash_map.contains_key('key1'))
    hash_map.resize_table(30)
    # The stored pair must survive the resize unchanged.
    print(hash_map.size, hash_map.capacity, hash_map.get('key1'),
          hash_map.contains_key('key1'))
def test_contains_key_3(self):
    """
    Test contains_key with an empty hash map.

    :passed: yes
    """
    # An empty map must report every key as absent.
    empty_map = HashMap(3, hash_function_2)
    print(empty_map)
    for absent_key in ("cat", " "):
        self.assertFalse(empty_map.contains_key(absent_key))
def test_contains_key_4(self):
    """
    Test contains_key with an empty hash map.

    :passed: yes
    """
    # A zero-capacity map can never contain a key.
    empty_map = HashMap(0, hash_function_2)
    print("hash_m:", empty_map)
    for absent_key in ("blue", "a", " "):
        self.assertFalse(empty_map.contains_key(absent_key))
    print(empty_map.contains_key("a"))
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()                         # Every distinct lowercase word seen.
    ht = HashMap(2500, hash_function_2)  # Maps word -> occurrence count.

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                lw = w.lower()           # Lowercase for case-insensitive counting.
                keys.add(lw)
                if ht.contains_key(lw):
                    # Word already tracked: bump its count by one.
                    ht.put(lw, ht.get(lw) + 1)
                else:
                    # First occurrence of this word.
                    ht.put(lw, 1)

    # Build (word, count) pairs through the map's public get() instead of
    # re-hashing each key and walking the private bucket chains (the
    # original accessed ht._hash_function and ht._buckets directly, which
    # breaks encapsulation for no benefit).
    keys_list = [(word, ht.get(word)) for word in keys]

    # Sort by count, most frequent first, and return the requested slice.
    keys_list.sort(key=lambda tup: tup[1], reverse=True)
    return keys_list[0:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()   # case-insensitive key
                if ht.contains_key(word):
                    # Known word: increment its stored count.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # New word: start counting at 1.
                    ht.put(word, 1)

    # Delegate sorting to the map, then keep only the requested amount.
    tup = ht.sorted_tup()
    return tup[:number]
def test_contains_key_5(self):
    """
    Test contains_key with Example #1 from the guidelines.

    :passed: yes
    """
    print("--- EXAMPLE 1 ---")
    m = HashMap(50, hash_function_1)
    print(m.contains_key('key1'))   # False: map is still empty
    for key, value in (('key1', 10), ('key2', 20), ('key3', 30)):
        m.put(key, value)
    print(m.contains_key('key1'))
    print(m.contains_key('key4'))   # never inserted
    print(m.contains_key('key2'))
    print(m.contains_key('key3'))
    m.remove('key3')
    print(m.contains_key('key3'))   # removed just above
def test_contains_key_2(self):
    """
    Test contains_key with a hash map of 1 bucket.

    :passed: yes
    """
    # Build a chain of three nodes by hand.
    chain = LinkedList()
    chain.add_front("cot", 3)
    chain.add_front("box", 2)
    chain.add_front("axe", 1)
    # print("ll_1:", chain)
    # Install the chain directly into bucket 6 of a fresh map.
    hash_m = HashMap(7, hash_function_2)
    hash_m._buckets[6] = chain
    # Every planted key must be found...
    for present in ("axe", "box", "cot"):
        self.assertTrue(hash_m.contains_key(present))
    # ...and lookups are exact-match / case-sensitive only.
    for absent in ("Axe", "aXe", "axE", "AXE", "boxx", "cat", "verb"):
        self.assertFalse(hash_m.contains_key(absent))
def test_contains_key_6(self):
    """
    Test contains_key with Example #2 from the guidelines.

    :passed: yes
    """
    print("--- EXAMPLE 2 ---")
    m = HashMap(75, hash_function_2)
    keys = [i for i in range(1, 1000, 20)]
    for key in keys:
        m.put(str(key), key * 42)
    print(m.size, m.capacity)
    # Every inserted key must be present and every neighboring
    # (never-inserted) key must be absent.
    result = all(
        m.contains_key(str(key)) and not m.contains_key(str(key + 1))
        for key in keys
    )
    print(result)
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Reads a file one word as a time and
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()   # lowercase for case-insensitive comparisons
                if ht.contains_key(word):
                    # Existing word: add 1 to its running count.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # New word: initialize its count to 1.
                    ht.put(word, 1)
                keys.add(word)     # remember every distinct word

    # (count, word) pairs sorted largest-to-smallest; sorting count-first
    # tuples reproduces the original ordering, including its tie-breaking
    # on the word itself.
    ranked = sorted(((ht.get(key), key) for key in keys), reverse=True)
    # Swap back to (word, count) and keep only the requested amount.
    return [(word, count) for count, word in ranked[:number]]
def test_resize_table_2(self):
    """
    Test resize_table() with Example #2 from the guidelines.

    :passed: yes
    """
    print("--- EXAMPLE 2 ---")
    m = HashMap(75, hash_function_2)
    keys = [i for i in range(1, 1000, 13)]
    for key in keys:
        m.put(str(key), key * 42)
    print(m.size, m.capacity)
    # Resize repeatedly; after each resize every inserted key must still
    # be present and no neighboring (never-inserted) key may appear.
    for capacity in range(111, 1000, 117):
        m.resize_table(capacity)
        result = all(
            m.contains_key(str(key)) and not m.contains_key(str(key + 1))
            for key in keys
        )
        print(capacity, result, m.size, m.capacity,
              round(m.table_load(), 2))
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()   # lowercase to make counting case-insensitive
                if ht.contains_key(word):
                    # Known word: increment the stored count.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # New word: remember it and start its count at 1.
                    keys.add(word)
                    ht.put(word, 1)
                # Double the capacity whenever the load factor exceeds 8
                # to keep chains short.
                if ht.table_load() > 8:
                    ht.resize_table(2 * ht.capacity)

    # (word, count) tuples sorted by count, most frequent first.
    ranked = [(key, ht.get(key)) for key in keys]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    # Only the requested number of results.
    return ranked[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):      # every word on the line
                word = w.lower()             # store lowercased words only
                if ht.contains_key(word):
                    # Word already in the map: bump its count by one.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # First sighting: count starts at 1.
                    ht.put(word, 1)
                    keys.add(word)

    # One (key, count) tuple per distinct word.
    occurrences = [(key, ht.get(key)) for key in keys]
    # Sort on the count (second tuple element), descending.
    # Source consulted by the original author:
    # stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value
    occurrences.sort(key=lambda pair: pair[1], reverse=True)
    # Slice off the top `number` entries requested by the caller.
    return occurrences[:number]

# print(top_words("alice.txt",10)) # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    pairs = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word at a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()   # lowercase before insertion
                if ht.contains_key(word):
                    # Existing entry: store the incremented count.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # Otherwise create a new entry.
                    ht.put(word, 1)

    # Collect every (key, value) pair by walking each bucket's chain.
    for bucket in ht.get_buckets():
        node = bucket.head
        while node is not None:
            pairs.add((node.key, node.value))
            node = node.next

    # Ascending sort by count, then read the list backwards to take the
    # `number` most frequent words.
    ranked = list(pairs)
    ranked.sort(key=lambda pair: pair[1])
    return ranked[::-1][:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                # Lowercase every word so counting is case-insensitive.
                lower_case = w.lower()
                keys.add(lower_case)
                if ht.contains_key(lower_case):
                    # Word already counted: increment its value.
                    ht.put(lower_case, ht.get(lower_case) + 1)
                else:
                    # First occurrence: initial count of 1.
                    ht.put(lower_case, 1)

    # Build (word, count) tuples via the map's public get() instead of
    # re-hashing each key and walking the private bucket chain (the
    # original used ht._hash_function and ht._buckets directly, which
    # breaks encapsulation without any benefit).
    word_list = [(k, ht.get(k)) for k in keys]

    # Sort by count, most frequent first, and return the top slice.
    word_list.sort(key=lambda tup: tup[1], reverse=True)
    return word_list[0:number]

# print(top_words("alice.txt",10)) # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                # Place word in hash map or update value by one
                w = w.lower()
                if not ht.contains_key(w):
                    keys.add(w)
                    ht.put(w, 1)
                else:
                    ht.put(w, ht.get(w) + 1)

    # Insertion-sort each word into a list ordered by descending count.
    sorted_words = []
    for word in keys:
        next_word = (word, ht.get(word))
        for index, value in enumerate(sorted_words):
            if next_word[1] >= value[1]:
                sorted_words.insert(index, next_word)
                break
        else:
            # BUG FIX: the original only inserted into the middle of the
            # list and silently DROPPED any word whose count was smaller
            # than everything already placed (its inner loop finished
            # without inserting). Appending in the no-break case keeps
            # every word, so the bottom of the ranking is now correct.
            # (This branch also handles the initially-empty list.)
            sorted_words.append(next_word)

    return sorted_words[:number]

# print(top_words("alice.txt",10)) # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
def test_contains_key(self):
    """Tests the HashMap contains_key method"""
    test_values = [("test_5", 5), ("test_-5", -5), ("test_5_", 5),
                   ("diff_word", 15), ("another_word", 20), ("set", 10),
                   ("anotha_one", -7), ("completely_different", 5),
                   ("getting_there", -1)]
    student_map = HashMap(10, hash_function_1)
    # After each insertion, a brute-force scan of every bucket must agree
    # with contains_key about the key's presence.
    for key, val in test_values:
        student_map.put(key, val)
        found = any(bucket.contains(key)
                    for bucket in student_map._buckets)
        self.assertEqual(found, student_map.contains_key(key))
def top_words(source, number):
    """
    Take a plain text file and count the number of occurrences of case
    insensitive words. Return the top `number` of words in a list of
    tuples of the form (word, count).

    :param source: the file name containing the text
    :param number: the number of top results to return (e.g. 5 would
                   return the 5 most common words)
    :return: a list of tuples of the form (word, count), sorted by most
             common word (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # Read the file one word at a time and put the word in `w`
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                # Lowercase the word to enforce case insensitivity.
                word_lower = w.lower()
                if ht.contains_key(word_lower):
                    # Existing word: fetch and store the updated count.
                    ht.put(word_lower, ht.get(word_lower) + 1)
                else:
                    # New word: count starts at 1.
                    ht.put(word_lower, 1)

    # All (key, value) pairs in the table, highest count first.
    tuple_list = ht.get_tuples()
    tuple_list.sort(key=get_count, reverse=True)
    # Return only `number` tuples.
    return tuple_list[0:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                w = w.lower()   # case-insensitive counting
                if ht.contains_key(w):
                    # Known word: increment the stored count.
                    ht.put(w, ht.get(w) + 1)
                else:
                    # New word: start at 1.
                    ht.put(w, 1)

    # (word, count) pairs for everything in the table.
    temp_list = ht.word_count_list()
    # Sort by count, descending. The original used a hand-rolled O(n^2)
    # bubble sort; list.sort() is O(n log n) and equally stable.
    temp_list.sort(key=lambda pair: pair[1], reverse=True)
    # Slice instead of indexing a fixed range: the original's
    # `for k in range(0, number)` raised IndexError whenever `number`
    # exceeded the number of distinct words.
    return temp_list[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    result = []
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word at a time
    # and add it to the hash map
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()
                if ht.contains_key(word):
                    # Seen before: bump the stored count by 1.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # First sighting: value of 1.
                    ht.put(word, 1)

    # Walk every bucket chain and collect (key, value) pairs.
    for bucket in ht._buckets:
        node = bucket.head
        while node is not None:
            result.append((node.key, node.value))
            node = node.next

    print(ht.table_load())
    print(ht.empty_buckets())
    # Order by count, then keep only the requested amount.
    sort_words(result)
    return result[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(25, hash_function_2)
    tuple_list = []

    # This block of code will read a file one word as a time and
    # put the word in `w`.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()
                if ht.contains_key(word):
                    ht.put(word, ht.get(word) + 1)   # inc count
                else:
                    ht.put(word, 1)                  # start count

    # Gather (key, value) pairs from every bucket chain.
    for bucket in ht._buckets:
        node = bucket.head
        while node is not None:
            tuple_list.append((node.key, node.value))
            node = node.next

    # Order by count, then keep only the requested amount.
    sort_tuples(tuple_list)
    return tuple_list[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                w = w.lower()   # case-insensitive counting
                if ht.contains_key(w):
                    # Known word: increment its stored count.
                    ht.put(w, ht.get(w) + 1)
                else:
                    # New word: start at 1 and remember the key.
                    ht.put(w, 1)
                    keys.add(w)

    # (word, count) tuples for every distinct word, highest count first.
    key_value_arr = [(word, ht.get(word)) for word in keys]
    key_value_arr.sort(key=lambda x: x[1], reverse=True)

    # Slice rather than index a fixed range: the original's
    # `for i in range(number)` raised IndexError whenever `number`
    # exceeded the number of distinct words in the file.
    return key_value_arr[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                w = w.lower()
                if ht.contains_key(w):
                    # Only fetch the count once we know the key exists.
                    # The original called ht.get(w) unconditionally
                    # BEFORE the membership test, wasting a lookup and
                    # relying on get() tolerating a missing key.
                    ht.put(w, ht.get(w) + 1)
                else:
                    ht.put(w, 1)

    # Add all items in hash table to list (renamed loop vars so the
    # builtins `list` and `tuple` are no longer shadowed).
    pairs = []
    for bucket in ht.get_buckets():
        node = bucket.head
        while node is not None:
            pairs.append((node.key, node.value))
            node = node.next

    # Sort list by value in descending order. Return given number of
    # 'top-words'. Slicing avoids the IndexError the original raised
    # when `number` exceeded the number of distinct words.
    pairs = sorted(pairs, key=get_second, reverse=True)
    return pairs[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                # BUG FIX: the original stored counts under the word's
                # ORIGINAL case (ht.put(w, ...)) while collecting
                # lowercase words in `keys`, so "The" and "the" were
                # counted separately and ht.get(lowercase) could return
                # nothing for words that only appear capitalized.
                # Lowercasing before every table access makes counting
                # genuinely case-insensitive.
                w = w.lower()
                # if word is not yet in the map, create it with value 1
                if ht.contains_key(w) is False:
                    ht.put(w, 1)
                # otherwise, value += 1
                else:
                    ht.put(w, ht.get(w) + 1)
                # put the word in the set keys
                keys.add(w)

    # create empty array, push pair values of keys in
    pairs = [(word, ht.get(word)) for word in keys]
    # sort the array, slice the array by number given
    pairs = sorted(pairs, key=lambda x: x[1], reverse=True)
    return pairs[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    tuple of entries of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A tuple of (word, count) entries, sorted by most common word.
        (Tuple return kept for compatibility with existing callers.)
    """
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                word = w.lower()   # case-insensitive counting
                if ht.contains_key(word):
                    # Known word: store the incremented count.
                    ht.put(word, ht.get(word) + 1)
                else:
                    # New word: initial count of 1.
                    ht.put(word, 1)

    # Every node in the bucket list, sorted by count (highest first).
    list_words = ht.bucket_keys()
    list_words.sort(key=get_value, reverse=True)

    # Slice instead of indexing a fixed range: the original's
    # `for i in range(number)` raised IndexError whenever `number`
    # exceeded the number of distinct words.
    new_list = [node.returnNode() for node in list_words[:number]]
    # Convert list into tuple (the original's return type).
    return tuple(new_list)
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()
                # Check if we have found the key before and update the
                # count; otherwise add the new key with a count of 1.
                if ht.contains_key(word):
                    ht.put(word, ht.get(word) + 1)
                else:
                    ht.put(word, 1)

    # grab the top x results
    return sort_function(ht, number)
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            for w in rgx.findall(line):
                word = w.lower()
                # If a link with this key exists, increment its value;
                # otherwise add it as a new link with a count of 1.
                if ht.contains_key(word):
                    ht.put(word, ht.get(word) + 1)
                else:
                    ht.put(word, 1)

    # One tuple per link in the table, sorted from highest to lowest count.
    word_list = ht.list_of_links()
    word_list.sort(key=get_count, reverse=True)
    # Only as many words as the caller requested.
    return word_list[:number]
# NOTE(review): this fragment continues a demo script; `m` is assumed to be
# a HashMap created before this excerpt, and the script appears to continue
# after it — confirm against the full file.
for i in range(150):
    m.put('str' + str(i), i * 100)
    # Report table statistics every 25 insertions.
    if i % 25 == 24:
        print(m.empty_buckets(), m.table_load(), m.size, m.capacity)

print("--- EXAMPLE 2 ---")
m = HashMap(40, hash_function_2)
for i in range(50):
    # i // 3 deliberately reuses keys, so later puts overwrite earlier values.
    m.put('str' + str(i // 3), i * 100)
    # Report table statistics every 10 insertions.
    if i % 10 == 9:
        print(m.empty_buckets(), m.table_load(), m.size, m.capacity)

"""
CONTAINS KEY
"""
print("\n\n******** CONTAINS_KEY() ********")
print("--- EXAMPLE 1 ---")
m = HashMap(50, hash_function_1)
# Lookup on an empty map: expected False.
print(m.contains_key('key1'))
m.put('key1', 10)
m.put('key2', 20)
m.put('key3', 30)
print(m.contains_key('key1'))
# 'key4' was never inserted: expected False.
print(m.contains_key('key4'))
print(m.contains_key('key2'))
print(m.contains_key('key3'))
m.remove('key3')
# 'key3' was just removed: expected False.
print(m.contains_key('key3'))
print("--- EXAMPLE 2 ---")
m = HashMap(75, hash_function_2)
keys = [i for i in range(1, 1000, 20)]
for key in keys:
    m.put(str(key), key * 42)
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                w = w.lower()
                keys.add(w)   # track every distinct lowercase word
                if ht.contains_key(w):
                    # Use the public get/put API to bump the count. The
                    # original re-hashed the key (get_index/get_chain)
                    # and walked the private linked list by hand just to
                    # increment one node's value.
                    ht.put(w, ht.get(w) + 1)
                else:
                    # First occurrence of this word.
                    ht.put(w, 1)

    # One (word, count) tuple per distinct word.
    wordcount_tuples = [(key, ht.get(key)) for key in keys]
    # Sort by count, descending. Replaces the original's hand-rolled
    # O(n^2) bubble sort with list.sort(), which is O(n log n).
    wordcount_tuples.sort(key=lambda pair: pair[1], reverse=True)
    # Slice instead of indexing a fixed range: the original's
    # `for i in range(number)` raised IndexError whenever `number`
    # exceeded the number of distinct words.
    return wordcount_tuples[:number]
def top_words(source, number):
    """
    Takes a plain text file and counts the number of occurrences
    of case insensitive words. Returns the top `number` of words in a
    list of tuples of the form (word, count).

    Args:
        source: the file name containing the text
        number: the number of top results to return (e.g. 5 would
                return the 5 most common words)
    Returns:
        A list of tuples of the form (word, count), sorted by most
        common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
    """
    keys = set()
    ht = HashMap(2500, hash_function_2)

    # This block of code will read a file one word as a time and
    # put the word in `w`. It should be left as starter code.
    with open(source) as f:
        for line in f:
            words = rgx.findall(line)
            for w in words:
                low = w.lower()   # case-insensitive counting
                keys.add(low)
                if ht.contains_key(low):
                    ht.put(low, ht.get(low) + 1)
                else:
                    ht.put(low, 1)

    # Repeatedly select the most frequent remaining word, `number` times.
    top_words_array = []
    while number > 0 and keys:
        # BUG FIX: the original kept looping even after `keys` was empty,
        # so whenever `number` exceeded the distinct word count it padded
        # the result with (None, None) tuples. The `and keys` guard stops
        # as soon as there are no words left.
        largest_key = max(keys, key=ht.get)
        keys.discard(largest_key)
        top_words_array.append((largest_key, ht.get(largest_key)))
        number = number - 1

    return top_words_array

# print(top_words("alice.txt",10)) # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
def test_contains_key_1(self):
    """
    Test contains_key with a hash map of 2 buckets.

    :passed: yes
    """
    # Build two chains by hand, front-loading each node.
    ll_1 = LinkedList()
    for key, val in (("cat", 3), ("bin", 2), ("ape", 1)):
        ll_1.add_front(key, val)
    # print("ll_1:", ll_1)
    ll_2 = LinkedList()
    for key, val in (("fin", 3), ("ewe", 2), ("dim", 1)):
        ll_2.add_front(key, val)
    # print("ll_2:", ll_2)
    # Install the chains directly into the first two buckets.
    hash_m = HashMap(4, hash_function_1)
    hash_m._buckets[0] = ll_1
    hash_m._buckets[1] = ll_2
    print(hash_m)
    print(hash_m.contains_key("ape"))
    # Every planted key must be found...
    for present in ("ape", "bin", "cat", "dim", "ewe", "fin"):
        self.assertTrue(hash_m.contains_key(present))
    # ...and near-misses (typos, case changes, unrelated words) must not.
    for absent in ("aqe", "Bin", "BIN", "bat", "diM", "ew", "fIn",
                   "blue", "mop"):
        self.assertFalse(hash_m.contains_key(absent))