def test_frequency():
    """Check the frequency Listogram reports for each word in `fish_words`."""
    histogram = Listogram(fish_words)
    # Expected frequency count of every word, checked in this order
    expected_counts = (
        ('one', 1),
        ('two', 1),
        ('red', 1),
        ('blue', 1),
        ('fish', 4),
    )
    for word, count in expected_counts:
        assert histogram.frequency(word) == count
def test_tokens(self):
    """Check the total token count before and after re-adding every word."""
    words = self.fish_words
    histogram = Listogram(words)
    # The fixture holds 8 word tokens and the histogram must agree
    assert len(words) == 8
    assert histogram.tokens == 8
    # Counting every word a second time should double the token total
    for word in words:
        histogram.add_count(word)
    assert histogram.tokens == 8 * 2
def test_types(self):
    """Check the distinct-word count before and after re-adding every word."""
    words = self.fish_words
    histogram = Listogram(words)
    # The fixture holds 5 distinct words and the histogram must agree
    assert len(set(words)) == 5
    assert histogram.types == 5
    # Re-adding words that are already present must not create new types
    for word in words:
        histogram.add_count(word)
    assert histogram.types == 5
def test_entries():
    """Check how many entries the Listogram stores for `fish_words`."""
    # NOTE: This test assumes Listogram is implemented as a list of tuples,
    # but if you implement it as a list of lists (or a list of count-lists)
    # you should modify the fish_list fixture and/or this test (only)
    listogram = Listogram(fish_words)
    entries = listogram.list_histogram
    # One entry like (word, count) is expected per distinct word; only the
    # number of entries is compared, so item order is ignored
    assert len(entries) == 5
    assert len(entries) == len(fish_list)
def test_contains(self):
    """Check `in` / `not in` on a Listogram built from `self.fish_words`."""
    histogram = Listogram(self.fish_words)
    expected_present = self.fish_words
    expected_absent = ('fishy', 'food')
    # Every word the histogram was built from should be found
    for word in expected_present:
        assert word in histogram
    # Words that were never added should not be found
    for word in expected_absent:
        assert word not in histogram
def test_contains(self):
    """Check `in` / `not in` on a Listogram built from `self.fish_words`.

    The leftover debugging print of the histogram and the commented-out
    `__contains__` assertion were removed; the assertions are unchanged.
    """
    histogram = Listogram(self.fish_words)
    # All of these words should be found
    for word in self.fish_words:
        assert word in histogram
    # None of these words should be found
    for word in ('fishy', 'food'):
        assert word not in histogram
def test_entries(self):
    """Check that a Listogram converts to a dict with the expected words."""
    # NOTE: This test assumes Listogram is implemented as a list of tuples,
    # but if you implement it as a list of lists (or a list of count-lists)
    # you should modify the fish_list fixture and/or this test (only)
    listogram = Listogram(self.fish_words)
    # Build a dictionary of entries like {word: count} from the histogram
    dictogram = dict(listogram)
    assert len(dictogram) == 5
    # Item order is ignored.
    # NOTE(review): assuming fish_dict is a dict, iterating both arguments
    # yields keys, so this compares the words only, not their counts.
    self.assertCountEqual(dictogram, self.fish_dict)
def test_contains(self):
    """Check `in` / `not in` on a Listogram built from `self.fish_words`.

    The debugging prints (fixture, histogram, and each word inside the loop)
    were removed; the assertions are unchanged.
    """
    histogram = Listogram(self.fish_words)
    # All of these words should be found
    for word in self.fish_words:
        assert word in histogram
    # None of these words should be found
    for word in ('fishy', 'food'):
        assert word not in histogram
def test_contains(self):
    """Check membership of known entries and absence of unseen words.

    The absence check used a hand-maintained flag compared with `== False`;
    it is now an `any()` expression. The test passes or fails under exactly
    the same conditions as before.
    """
    histogram = Listogram(self.fish_words)
    # All of these should be found.
    # NOTE(review): this iterates self.fish_list, presumably (word, count)
    # entries rather than bare words -- confirm against the fixture.
    for entry in self.fish_list:
        assert entry in histogram
    # None of these words should appear as the first item of any entry
    for word in ('fishy', 'food'):
        assert not any(word == item[0] for item in histogram)
def generate_words():
    """Load the words in ``words.txt`` and return ``random_sentence(5, words)``.

    Samples are also drawn from a Listogram and a Dictogram built on the same
    words. NOTE(review): those results are only collected in a local dict and
    are never used or returned -- confirm whether they are still needed.
    """
    with open('words.txt', 'r') as source:
        # Split on single spaces only, exactly as the text is stored
        words = source.read().split(' ')

    # Listogram samples are drawn first, then Dictogram samples
    histograms = {
        0: Listogram(words).listogram_samples(10),
        1: Dictogram(words).dictogram_samples(10),
    }
    return random_sentence(5, words)
def test_entries(self):
    """Check Listogram entries as a list and as a dictionary."""
    # NOTE: The fish_list fixture holds tuples; its entries are converted to
    # lists here, so this test expects Listogram entries to compare equal to
    # lists like [word, count]
    listogram = Listogram(self.fish_words)
    expected_entries = [list(entry) for entry in self.fish_list]
    # Verify histogram as list of entries (item order is ignored)
    assert len(listogram) == 5
    self.assertCountEqual(listogram, expected_entries)
    # Verify histogram as dictionary of entries like {word: count}
    # (item order is ignored)
    dictogram = dict(listogram)
    assert len(dictogram) == 5
    self.assertCountEqual(dictogram, self.fish_dict)
def test_frequency(self):
    """Check frequency counts for every known word and for an unseen word."""
    histogram = Listogram(self.fish_words)
    # Expected frequency count of each word, checked in this order;
    # 'food' was never added, so its frequency must be 0
    expected_counts = (
        ('one', 1),
        ('two', 1),
        ('red', 1),
        ('blue', 1),
        ('fish', 4),
        ('food', 0),
    )
    for word, count in expected_counts:
        assert histogram.frequency(word) == count
def test_sample(self):
    """Check that sampled word frequencies track the observed frequencies.

    Draws 10,000 samples and requires each word's share of the samples to be
    within 10% (relative) of its share of the original tokens.
    """
    histogram = Listogram(self.fish_words)
    # Draw the samples and tally them in a second histogram
    drawn_words = [histogram.sample() for _ in range(10000)]
    tally = Listogram(drawn_words)
    # Compare shares for each entry of the original histogram
    for word, count in histogram:
        expected_share = count / histogram.tokens
        actual_share = tally.frequency(word) / tally.tokens
        floor = expected_share * 0.9    # 10% below the observed share
        ceiling = expected_share * 1.1  # 10% above the observed share
        assert floor <= actual_share <= ceiling
def __init__(self, word_list, passed_text_list):
    """Store the word list, build its histograms, and fill the chain.

    `word_list` is used as given when `passed_text_list == True`; otherwise
    it is first passed through `create_list`. Each pair of consecutive words
    is then stored on the instance (by item assignment) as a key whose value
    is a Dictogram counting the (next_word, word_after_next) pairs seen
    after it.
    """
    self.passed_text_list = passed_text_list
    # The `== True` comparison is kept as written: it is stricter than a
    # plain truthiness check
    self.word_list = (
        word_list
        if self.passed_text_list == True
        else create_list(word_list)
    )
    self.dictionary_histogram = Dictogram(self.word_list)
    self.listogram = Listogram(self.word_list)

    # Creating the Markov Chain
    # Stop two short of the end so index + 2 is always a valid position
    words = self.word_list
    for index in range(len(words) - 2):
        word, next_word, word_after_next = (
            words[index],
            words[index + 1],
            words[index + 2],
        )
        state = (word, next_word)
        transition = (next_word, word_after_next)
        if state in self:
            self[state].add_count(transition)
        else:
            self[state] = Dictogram([transition])
import re

from listogram import Listogram

# Split the corpus into sentences on ., ? or ! (plus any closing quote or
# bracket and surrounding spaces)
corpus = "This is a sentence. And this is also a sentence. One fish two fish and all that."
# re.split yields a trailing empty string when the corpus ends with a
# terminator; drop empty pieces so no histogram is built from [''].
sentences = [
    sentence
    for sentence in re.split(r' *[\.\?!][\'"\)\]]* *', corpus)
    if sentence
]
print(sentences)

bag_of_words = []
# Make a Listogram of each sentence and put them all into a list
for sentence in sentences:
    words = sentence.split(" ")
    hist = Listogram(words)
    print(hist)
    bag_of_words.append(hist)
print(bag_of_words)
cume = 0 for word in histogram_in: cume += histogram_in[word] if (cume > value): return word def sample_list_O_stuff(histogram_in): # stuff means tuples or lists cap = 0 i = 0 while i < len(histogram_in): cap += histogram_in[i][1] i += 1 value = random.randint(0, cap) cume = 0 index = 0 while index < len(histogram_in): cume += histogram_in[index][1] if cume > value: return histogram_in[index][0] index += 1 if __name__ == "__main__": text = clean_text('book_1.txt') hist = Dictogram(text) list_hist = Listogram(text) print(sample_list_O_stuff(list_hist)) print(dictionary_sample(hist))
def test_add_count(self):
    """Check frequencies, types and tokens after adding counts in bulk.

    The leftover debugging print of `histogram.types` was removed; the
    assertions are unchanged.
    """
    histogram = Listogram(self.fish_words)
    # Add more words to update frequency counts
    histogram.add_count('two', 2)
    histogram.add_count('blue', 3)
    histogram.add_count('fish', 4)
    histogram.add_count('food', 5)
    # Verify updated frequency count of all words
    assert histogram.frequency('one') == 1
    assert histogram.frequency('two') == 3
    assert histogram.frequency('red') == 1
    assert histogram.frequency('blue') == 4
    assert histogram.frequency('fish') == 8
    assert histogram.frequency('food') == 5
    # Verify count of distinct word types ('food' is the one new type)
    assert histogram.types == 6
    # Verify total count of all word tokens (2 + 3 + 4 + 5 = 14 were added)
    assert histogram.tokens == 8 + 14
def test_tokens():
    """Check that a Listogram of `fish_words` counts 8 word tokens."""
    expected_tokens = 8
    # The fixture itself must hold the expected number of tokens
    assert len(fish_words) == expected_tokens
    listogram = Listogram(fish_words)
    # The histogram's total count of all word tokens must match
    assert listogram.tokens == expected_tokens
def test_types():
    """Check that a Listogram of `fish_words` counts 5 distinct word types."""
    expected_types = 5
    # The fixture itself must hold the expected number of distinct words
    assert len(set(fish_words)) == expected_types
    listogram = Listogram(fish_words)
    # The histogram's count of distinct word types must match
    assert listogram.types == expected_types
def lists():
    """Return `generate_sentence(20)` from a Listogram of a fixed phrase."""
    words = "I DO AND YOU KNOW I DO AND YOU KNOW THAT I AM WITH YOU FOREVER".split()
    histogram = Listogram(words)
    return histogram.generate_sentence(20)
def lists():
    """Return `generate_sentence(10)` from a Listogram of the fish phrase."""
    words = 'one fish two fish red fish blue fish'.split()
    histogram = Listogram(words)
    return histogram.generate_sentence(10)