def make_trie_helper(text, word=None, phrase=None):
    """
    Shared helper for make_word_trie and make_phrase_trie.

    Splits text into sentences with tokenize_sentences and counts keys in a
    new Trie:

    - If word is truthy, every space-separated word of every sentence is a
      key.
    - Otherwise, if phrase is truthy, each sentence (as a tuple of its
      space-separated words) is a key.
    - If neither flag is truthy, the Trie is returned empty.

    Parameters:
        text: the text to process, as a single string
        word: truthy to count individual words
        phrase: truthy to count whole sentences (only consulted when word
            is falsy)

    Returns:
        a Trie mapping each key to the number of times it occurs
    """
    t = Trie()
    for sentence in tokenize_sentences(text):
        tokens = sentence.split(' ')
        if word:
            # The loop variable must not be named `word`: rebinding the flag
            # would change which branch is taken for later sentences.
            for token in tokens:
                if token in t:
                    t[token] += 1
                else:
                    t[token] = 1
        elif phrase:
            key = tuple(tokens)
            if key in t:
                t[key] += 1
            else:
                t[key] = 1
    return t
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text

    >>> phrase = 'hello my name is darius'
    >>> word_trie = make_word_trie(phrase)
    >>> 'hello' in word_trie
    True
    >>> 'darius' in word_trie
    True
    >>> 'charles' in word_trie
    False
    """
    counts = Trie(str)
    for sentence in tokenize_sentences(text):  # each sentence is one string
        final_index = len(sentence) - 1
        current = ''
        for index, char in enumerate(sentence):
            at_space = char == ' '
            if not at_space:
                # Still inside a word: keep accumulating its letters
                current = current + char
            if at_space or index == final_index:
                # A space or the end of the sentence closes the current word
                if current in counts:
                    counts[current] += 1
                else:
                    counts[current] = 1
                current = ''
    return counts
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.

    >>> phrase = 'hello my name is darius. I am a MIT student. I like to robots. hello my name is darius. I am on the crew team'
    >>> phrase_trie = make_phrase_trie(phrase)
    >>> ('hello','my','name','is','darius') in phrase_trie
    True
    """
    counts = Trie(tuple)
    for sentence in tokenize_sentences(text):  # each sentence is one string
        final_index = len(sentence) - 1
        pieces = []
        current = ''
        for index, char in enumerate(sentence):
            at_space = char == ' '
            if not at_space:
                # Still inside a word: keep accumulating its letters
                current = current + char
            if at_space or index == final_index:
                # A space or the end of the sentence closes the current word
                pieces.append(current)
                current = ''
        key = tuple(pieces)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts
def frequency_sen(text):
    """
    Count how often each sentence occurs in text.

    Returns a dict mapping every sentence string produced by
    tokenize_sentences(text) to its number of occurrences.
    """
    counts = {}
    for sentence in tokenize_sentences(text):
        counts[sentence] = counts.get(sentence, 0) + 1
    return counts
def frequency_word(text):
    """
    Count how often each word occurs in text.

    Every sentence produced by tokenize_sentences(text) is split on single
    spaces; the result is a dict mapping each piece to its number of
    occurrences.
    """
    counts = {}
    for sentence in tokenize_sentences(text):
        for token in sentence.split(' '):
            counts[token] = counts.get(token, 0) + 1
    return counts
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    counts = Trie()
    for line in tokenize_sentences(text):
        for token in line.split():
            if token in counts:
                counts[token] += 1
            else:
                # First time this word is seen
                counts[token] = 1
    return counts
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    counts = Trie()
    for line in tokenize_sentences(text):
        key = tuple(line.split())
        if key in counts:
            counts[key] += 1
        else:
            # First time this sentence is seen
            counts[key] = 1
    return counts
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, return a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.

    The counting is delegated to frequency_sen, which tokenizes the text
    itself, so the text is tokenized only once and each distinct sentence
    is stored exactly once.
    """
    t = Trie()
    for sentence, count in frequency_sen(text).items():
        # Splitting on a single space keeps the sentence -> tuple mapping
        # one-to-one, so distinct sentences never share a key.
        t[tuple(sentence.split(' '))] = count
    return t
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, return a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    phrase_trie = Trie()
    for sentence in tokenize_sentences(text):
        key = tuple(sentence.split())
        if key not in phrase_trie:
            phrase_trie[key] = 1
        else:
            phrase_trie[key] += 1
    return phrase_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    sentences = tokenize_sentences(text)
    # text_data is expected to expose, under 'word_count', a mapping from
    # each word to the number of times it appears (defined elsewhere)
    word_counts = text_data(sentences)['word_count']
    word_trie = Trie()
    for key in word_counts:
        # Copy every counted word into the trie
        word_trie[key] = word_counts[key]
    return word_trie
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    phrase_trie = Trie()
    for line in tokenize_sentences(text):
        phrase = tuple(line.split())
        if phrase not in phrase_trie:
            phrase_trie[phrase] = 1
        else:
            # Seen before: bump the sentence's count by one
            phrase_trie[phrase] += 1
    return phrase_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    word_trie = Trie()
    for line in tokenize_sentences(text):
        for token in line.split():
            if token not in word_trie:
                word_trie[token] = 1
            else:
                # Seen before: bump the word's count by one
                word_trie[token] += 1
    return word_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, return a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    word_trie = Trie()
    for sentence in tokenize_sentences(text):
        for token in sentence.split():
            if token not in word_trie:
                # First occurrence of this word
                word_trie[token] = 1
                continue
            word_trie[token] += 1
    return word_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    word_trie = Trie()
    sentences = tokenize_sentences(text)
    for line in sentences:
        # A sentence is a string of whitespace-separated words
        tokens = line.split()
        for token in tokens:
            if token in word_trie:
                word_trie[token] += 1
            else:
                word_trie[token] = 1
    return word_trie
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    sentences = tokenize_sentences(text)
    phrase_trie = Trie()
    for line in sentences:
        key = tuple(line.split())
        if key not in phrase_trie:
            phrase_trie[key] = 1
        else:
            phrase_trie[key] += 1
    return phrase_trie
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.

    Sentences are counted in a single pass. Counting the word tuples
    (rather than the raw sentence strings) means sentences that differ only
    in whitespace are accumulated under their shared key instead of
    overwriting one another.
    """
    from collections import Counter  # local import: only needed here

    phrase_counts = Counter(
        tuple(sentence.split()) for sentence in tokenize_sentences(text)
    )
    phrase_trie = Trie()
    # Counter iterates in first-seen order, so insertion order is stable
    for phrase, count in phrase_counts.items():
        phrase_trie.set(phrase, count)
    return phrase_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text

    Words are counted in a single pass over the tokenized sentences instead
    of calling list.count once per distinct word.
    """
    from collections import Counter  # local import: only needed here

    word_counts = Counter(
        word
        for sentence in tokenize_sentences(text)
        for word in sentence.split()
    )
    word_trie = Trie()
    # Counter iterates in first-seen order, so insertion order is stable
    for word, count in word_counts.items():
        word_trie.set(word, count)
    return word_trie
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    phrase_trie = Trie()
    tallies = {}
    # First tally every sentence tuple in a plain dict ...
    for line in tokenize_sentences(text):
        phrase = tuple(line.split(' '))
        tallies[phrase] = tallies.get(phrase, 0) + 1
    # ... then store each tally in the trie
    for phrase, count in tallies.items():
        phrase_trie.set(phrase, count)
    return phrase_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    word_trie = Trie()
    tallies = {}
    # First tally every word in a plain dict ...
    for line in tokenize_sentences(text):
        for token in line.split(' '):
            tallies[token] = tallies.get(token, 0) + 1
    # ... then store each tally in the trie
    for token, count in tallies.items():
        word_trie.set(token, count)
    return word_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    sentences = tokenize_sentences(text)
    word_trie = Trie()
    for line in sentences:
        for token in line.split():
            if token not in word_trie:
                word_trie[token] = 1
            else:
                word_trie[token] += 1
    return word_trie
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    # text_data (defined elsewhere) is expected to provide, for the
    # tokenized sentences, a count per sentence under 'sentence_count' and
    # the word-by-word form of each sentence under 'sentence_lists'
    summary = text_data(tokenize_sentences(text))
    phrase_counts = summary['sentence_count']
    phrases = summary['sentence_lists']
    phrase_trie = Trie()
    for phrase in phrases:
        # Store each sentence with its precomputed count
        phrase_trie[phrase] = phrase_counts[phrase]
    return phrase_trie
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text

    >>> t = make_word_trie("code code and more code")
    >>> t['and']
    1
    >>> t['code']
    3
    """
    counts = Trie(str)
    for sentence in tokenize_sentences(text):
        for token in sentence.split(" "):
            if token not in counts:
                counts[token] = 1
            else:
                counts[token] += 1
    return counts
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.

    >>> t = make_phrase_trie("I like waffles. I like french toast.")
    >>> t[('i', 'like', 'waffles')]
    1
    >>> t[('i', 'like', 'french', 'toast')]
    1
    """
    counts = Trie(tuple)
    for sentence in tokenize_sentences(text):
        # get_sentence_tuple (defined elsewhere) turns the sentence into its key
        key = get_sentence_tuple(sentence)
        if key not in counts:
            counts[key] = 1
        else:
            counts[key] += 1
    return counts
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.

    >>> text = 'I am a test. Test.'
    >>> x = make_phrase_trie(text)
    >>> x[('test',)]
    1
    """
    lines = tokenize_sentences(text)
    counts = Trie(tuple)
    for line in lines:
        key = tuple(line.split(' '))
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts
def make_phrase_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the sentences in the text (as tuples of individual words) and whose
    values are the number of times the associated sentence appears in the
    text.
    """
    # Turn every sentence into a tuple of its words
    phrases = [tuple(line.split()) for line in tokenize_sentences(text)]

    root = Trie()
    for phrase in phrases:
        if root.contains(phrase):
            # Known phrase: store its frequency plus one
            root.set(phrase, root.get(phrase) + 1)
        else:
            # New phrase: start its frequency at 1
            root.set(phrase, 1)
    return root
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text

    >>> text = 'I am a test.'
    >>> x = make_word_trie(text)
    >>> x['test']
    1
    >>> text = 'I am a test test.'
    >>> x = make_word_trie(text)
    >>> x['test']
    2
    """
    lines = tokenize_sentences(text)
    counts = Trie(str)
    for line in lines:
        for token in line.split(' '):
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
    return counts
def make_word_trie(text):
    """
    Given a piece of text as a single string, create a Trie whose keys are
    the words in the text, and whose values are the number of times the
    associated word appears in the text
    """
    sentences = tokenize_sentences(text)

    root = Trie()
    for line in sentences:
        for token in line.split():
            if root.contains(token):
                # Known word: store its frequency plus one
                root.set(token, root.get(token) + 1)
            else:
                # New word: start its frequency at 1
                root.set(token, 1)
    return root
return [(word, freq) for word, freq in trie if is_match(word, pattern, 0, 0)] # you can include test cases of your own in the block below. if __name__ == '__main__': import doctest doctest.testmod() with open("alice.txt", encoding="utf-8") as f: text = f.read() trie = make_phrase_trie(text) print(autocomplete(trie, tuple(), 6)) [('said', 'alice'), ('thought', 'alice'), ('wow',), ('said', 'the', 'caterpillar'), ('said', 'the', 'march', 'hare'), ('beauootiful', 'soooop')] print(len(tokenize_sentences(text))) print(len(list(trie))) trie = make_word_trie(text) print(autocorrect(trie, 'hear', 12)) with open("metamorphosis.txt", encoding='utf-8') as f: text = f.read() trie = make_word_trie(text) print(autocomplete(trie, 'gre', 6)) print(word_filter(trie, 'c*h')) with open("twocities.txt", encoding="utf-8") as f: text = f.read() trie = make_word_trie(text)
def get_words(text):
    """
    Return the sentences of text as a list of tuples of words.

    Sentences come from tokenize_sentences(text, True) and each one is split
    on whitespace. The meaning of the second argument is defined by
    tokenize_sentences, which is not visible here.
    """
    sentences = tokenize_sentences(text, True)
    return [tuple(sentence.split()) for sentence in sentences]