Пример #1
0
def test_frequency():
    """Check that Listogram reports the expected count for each fish word."""
    # Expected count per word, checked in the same order as listed
    expected_counts = {'one': 1, 'two': 1, 'red': 1, 'blue': 1, 'fish': 4}
    histogram = Listogram(fish_words)
    for word, expected in expected_counts.items():
        assert histogram.frequency(word) == expected
 def test_tokens(self):
     """Check the total token count, then that re-adding every word doubles it."""
     expected_total = 8
     histogram = Listogram(self.fish_words)
     # The fixture itself and the fresh histogram must both hold 8 tokens
     assert len(self.fish_words) == expected_total
     assert histogram.tokens == expected_total
     # Feed every word in a second time; the token total should double
     for repeated_word in self.fish_words:
         histogram.add_count(repeated_word)
     assert histogram.tokens == expected_total * 2
 def test_types(self):
     """Check the distinct-type count, then that re-adding words leaves it unchanged."""
     expected_types = 5
     histogram = Listogram(self.fish_words)
     # The fixture has 5 unique words and the histogram must agree
     assert len(set(self.fish_words)) == expected_types
     assert histogram.types == expected_types
     # Seeing the same words again adds tokens but no new types
     for repeated_word in self.fish_words:
         histogram.add_count(repeated_word)
     assert histogram.types == expected_types
Пример #4
0
def test_entries():
    """Check that the histogram holds one entry per distinct fish word."""
    # NOTE: This test assumes Listogram is implemented as a list of tuples,
    # but if you implement it as a list of lists (or a list of count-lists)
    # you should modify the fish_list fixture above and/or this test (only)
    listogram = Listogram(fish_words)
    # Entries are expected to look like [(word, count)]
    entry_total = len(listogram.list_histogram)
    assert entry_total == 5
    # Only the number of entries is compared with fish_list, so order is ignored
    assert entry_total == len(fish_list)
 def test_contains(self):
     """Check membership: every fish word is found, unseen words are not."""
     histogram = Listogram(self.fish_words)
     # Collect any fixture word the histogram fails to report as a member
     missing = [word for word in self.fish_words if word not in histogram]
     assert not missing
     # Collect any never-added word the histogram wrongly reports as a member
     unexpected = [word for word in ('fishy', 'food') if word in histogram]
     assert not unexpected
Пример #6
0
 def test_contains(self):
     """Check membership: every fish word is found, unseen words are not."""
     histogram = Listogram(self.fish_words)
     # Debug aid: show the histogram under test
     print(f"histogram in test: {histogram}")
     # Collect any fixture word the histogram fails to report as a member
     missing = [word for word in self.fish_words if word not in histogram]
     assert not missing
     # Collect any never-added word the histogram wrongly reports as a member
     unexpected = [word for word in ('fishy', 'food') if word in histogram]
     assert not unexpected
    def test_entries(self):
        """Check that the histogram converts to the expected {word: count} dict."""
        # NOTE: This test assumes Listogram is implemented as a list of tuples,
        # but if you implement it as a list of lists (or a list of count-lists)
        # you should modify the fish_list fixture above and/or this test (only)
        listogram = Listogram(self.fish_words)
        # Each entry is a (word, count) pair, so dict() maps word -> count
        dictogram = dict(listogram)
        assert len(dictogram) == 5
        # assertCountEqual iterates its arguments, which for dicts yields keys
        # only, so the counts would go unchecked. assertDictEqual compares
        # both keys and counts and, like any dict comparison, ignores order.
        self.assertDictEqual(dictogram, self.fish_dict)
 def test_contains(self):
     """Check membership while echoing the fixture, histogram and each found word."""
     words = self.fish_words
     histogram = Listogram(words)
     # Debug output: the raw fixture, then the histogram built from it
     print("\n", words)
     print(histogram)
     # Every fixture word must be a member; echo each one once it passes
     for known_word in words:
         assert known_word in histogram
         print([known_word])
     # Words that were never added must not be reported as members
     unexpected = [word for word in ('fishy', 'food') if word in histogram]
     assert not unexpected
Пример #9
0
 def test_contains(self):
     """Check that fish_list items are members and unseen words match no entry."""
     histogram = Listogram(self.fish_words)
     # Every item of the fish_list fixture must be reported as contained
     missing = [item for item in self.fish_list if item not in histogram]
     assert not missing
     # Scan every histogram entry for each unseen word; an entry matches
     # when its first element equals the word. No match may be found.
     matches = [
         word
         for word in ('fishy', 'food')
         for entry in histogram
         if word == entry[0]
     ]
     assert not matches
Пример #10
0
def generate_words():
    """Read words.txt and return random_sentence(5, words) built from its words.

    The file content is split on single spaces. Ten samples are also drawn
    from a Listogram and a Dictogram of the words, but those results are
    currently not part of the return value.
    """
    with open('words.txt', 'r') as source:
        words = source.read().split(' ')

    # Sample sets keyed by index (0: list-based, 1: dict-based); built in
    # that order and kept only locally for now.
    histograms = {
        0: Listogram(words).listogram_samples(10),
        1: Dictogram(words).dictogram_samples(10),
    }

    return random_sentence(5, words)
Пример #11
0
 def test_entries(self):
     """Check the histogram entries as a list of pairs and as a {word: count} dict."""
     # NOTE: This test assumes Listogram is implemented as a list of tuples,
     # but if you implement it as a list of lists (or a list of count-lists)
     # you should modify the fish_list fixture above and/or this test (only)
     listogram = Listogram(self.fish_words)
     # Convert each fish_list entry to a list so it can be compared with
     # list-typed histogram entries (loop name avoids shadowing builtin tuple)
     fish_list_lists = [list(entry) for entry in self.fish_list]
     # Verify histogram as list of entries like [[word, count]]
     assert len(listogram) == 5
     self.assertCountEqual(listogram, fish_list_lists)  # Ignore item order
     # Verify histogram as dictionary of entries like {word: count}
     dictogram = dict(listogram)
     assert len(dictogram) == 5
     # assertCountEqual iterates its arguments, which for dicts yields keys
     # only, so the counts would go unchecked. assertDictEqual compares
     # both keys and counts and, like any dict comparison, ignores order.
     self.assertDictEqual(dictogram, self.fish_dict)
 def test_frequency(self):
     """Check the count reported for every fish word and for an unseen word."""
     # Expected count per word; 'food' was never added, so it must report 0
     expected_counts = {
         'one': 1,
         'two': 1,
         'red': 1,
         'blue': 1,
         'fish': 4,
         'food': 0,
     }
     histogram = Listogram(self.fish_words)
     for word, expected in expected_counts.items():
         assert histogram.frequency(word) == expected
 def test_sample(self):
     """Check that sampled word frequencies stay within 10% of observed ones."""
     source_hist = Listogram(self.fish_words)
     # Draw 10,000 samples and count them in a second histogram
     draws = []
     for _ in range(10000):
         draws.append(source_hist.sample())
     drawn_hist = Listogram(draws)
     # Compare every (word, count) entry of the source histogram
     for word, count in source_hist:
         # Share of all tokens this word has in the source histogram
         observed = count / source_hist.tokens
         # Share of all draws that produced this word
         sampled = drawn_hist.frequency(word) / drawn_hist.tokens
         # Accept anything from 90% to 110% of the observed share
         assert observed * 0.9 <= sampled <= observed * 1.1
Пример #14
0
    def __init__(self, word_list, passed_text_list):
        """Build the word histograms and a second-order Markov chain.

        word_list: used as-is when passed_text_list equals True; otherwise
            it is handed to create_list() to produce the word list.
        passed_text_list: flag selecting between those two cases.
        """
        self.passed_text_list = passed_text_list
        # The flag is compared with `== True` on purpose: truthy values that
        # are not equal to True still go through create_list().
        self.word_list = (
            word_list if passed_text_list == True else create_list(word_list)
        )

        words = self.word_list
        self.dictionary_histogram = Dictogram(words)
        self.listogram = Listogram(words)

        # Markov chain: slide a three-word window over the text. The first
        # two words form the state key; the last two form the transition
        # that is counted for that state. Stopping two words before the end
        # keeps the window inside the list.
        for start in range(len(words) - 2):
            state = (words[start], words[start + 1])
            transition = (words[start + 1], words[start + 2])
            if state in self:
                self[state].add_count(transition)
            else:
                self[state] = Dictogram([transition])
Пример #15
0
from listogram import Listogram

# split the corpus into sentences
import re

corpus = "This is a sentence. And this is also a sentence. One fish two fish and all that."

# Split on a sentence terminator (. ? !), optionally followed by closing
# quotes/brackets, and swallow the surrounding spaces. The corpus ends with a
# terminator, so re.split yields a trailing empty string; empty fragments are
# dropped so they do not turn into a histogram of the empty word.
sentences = [
    fragment
    for fragment in re.split(r' *[\.\?!][\'"\)\]]* *', corpus)
    if fragment
]
print(sentences)
bag_of_words = []
# make a Listogram of each sentence
for sentence in sentences:
    sentence = sentence.split(" ")
    # put them all into a list
    hist = Listogram(sentence)
    print(hist)
    bag_of_words.append(hist)

print(bag_of_words)
Пример #16
0
    cume = 0
    for word in histogram_in:
        cume += histogram_in[word]
        if (cume > value):
            return word


def sample_list_O_stuff(histogram_in):  # stuff means tuples or lists
    """Return a random word from a list-style histogram, weighted by count.

    histogram_in: sequence of entries whose element 0 is the word and whose
        element 1 is its count (tuples or lists).

    Returns the chosen word, or None when the histogram is empty or its
    counts do not add up to a positive total.
    """
    total = sum(entry[1] for entry in histogram_in)
    if total <= 0:
        return None

    # Draw from 0 .. total-1. An inclusive upper bound of `total` must be
    # avoided: no running sum ever exceeds it, so the scan below would find
    # no word and the function would return None for a non-empty histogram.
    value = random.randrange(total)

    # Walk the entries, returning the first word whose running total
    # passes the drawn value
    running = 0
    for entry in histogram_in:
        running += entry[1]
        if running > value:
            return entry[0]
    return None


if __name__ == "__main__":
    # Script entry point: runs only when this file is executed directly.
    # Load the text via clean_text() from 'book_1.txt'
    text = clean_text('book_1.txt')
    # Build both histogram flavours from the same text
    hist = Dictogram(text)
    list_hist = Listogram(text)
    # Print one result from the list-based sampler, then one from
    # dictionary_sample (presumably its dict-based counterpart; it is
    # defined outside this view)
    print(sample_list_O_stuff(list_hist))
    print(dictionary_sample(hist))
 def test_add_count(self):
     """Check frequencies, type count and token count after adding extra counts."""
     histogram = Listogram(self.fish_words)
     # Extra counts to add: three known words and one new word ('food')
     additions = (('two', 2), ('blue', 3), ('fish', 4), ('food', 5))
     for word, extra in additions:
         histogram.add_count(word, extra)
     # Expected frequency of every word after the additions
     expected_counts = {
         'one': 1,
         'two': 3,
         'red': 1,
         'blue': 4,
         'fish': 8,
         'food': 5,
     }
     for word, expected in expected_counts.items():
         assert histogram.frequency(word) == expected
     # 'food' is new, so there is one more distinct type than before
     print(' *********************** ', histogram.types)
     assert histogram.types == 6
     # 8 original tokens plus 2 + 3 + 4 + 5 = 14 added ones
     assert histogram.tokens == 8 + 14
Пример #18
0
def test_tokens():
    """Check that the histogram's token total matches the 8 fish words."""
    expected_total = 8
    listogram = Listogram(fish_words)
    # Both the raw word list and the histogram must account for 8 tokens
    assert len(fish_words) == expected_total
    assert listogram.tokens == expected_total
Пример #19
0
def test_types():
    """Check that the histogram's type count matches the 5 distinct fish words."""
    expected_types = 5
    listogram = Listogram(fish_words)
    # Both the de-duplicated word list and the histogram must report 5 types
    assert len(set(fish_words)) == expected_types
    assert listogram.types == expected_types
Пример #20
0
def lists():
    """Return generate_sentence(20) from a Listogram of a fixed sample phrase."""
    # Whitespace-split the hard-coded phrase into the word list
    words = "I DO AND YOU KNOW I DO AND YOU KNOW THAT I AM WITH YOU FOREVER".split()
    return Listogram(words).generate_sentence(20)
Пример #21
0
def lists():
    """Return generate_sentence(10) from a Listogram of the fish phrase."""
    # Whitespace-split the hard-coded phrase into the word list
    words = 'one fish two fish red fish blue fish'.split()
    return Listogram(words).generate_sentence(10)