def _generate_words_letters(number_of_words=100, order=1, max_transitions=30, words_cache=None):
    """Generate new letter-sequence words from a Markov tree and cache them.

    A Markov tree of the given ``order`` is built from the letter sequences
    of the filtered dictionary entries. Candidate words are then generated
    until ``number_of_words`` candidates have been accepted. A candidate is
    accepted only if it is neither a word of the unfiltered dictionary nor
    already stored in the cache under the ``'letters'`` key.

    Args:
        number_of_words: How many new words to accept before returning.
        order: Passed through to ``generate_markov_tree`` as ``order``.
        max_transitions: Passed through to ``generate_new_sequence``.
        words_cache: Cache object receiving the accepted words. A fresh
            ``WordsCache()`` is created when this is ``None``.

    Returns:
        The cache object (the one passed in, or the newly created one).

    Note:
        The loop only ends once enough new words were accepted; there is no
        cap on the number of rejected candidates.
    """
    if words_cache is None:
        words_cache = WordsCache()

    entries = entries_from_cmudict()

    # Collected from the *unfiltered* entries so that any dictionary word is
    # rejected, not only the ones that survive the filter below. A set keeps
    # the per-candidate membership test constant-time.
    existing_words = set(word for word, _phones in entries)

    # NOTE(review): 'Austen' presumably selects a corpus to filter by --
    # confirm against filter_entries.
    entries = filter_entries(entries, 'Austen')
    sequences = [list(word) for word, _phones in entries]
    markov_tree = generate_markov_tree(sequences, order=order)

    number_generated = 0
    while number_generated < number_of_words:
        # Re-read on every pass: whether get_words returns a live view or a
        # snapshot is not visible from here.
        cached_words = words_cache.get_words('letters')
        new_sequence = generate_new_sequence(markov_tree, max_transitions)
        new_word = ''.join(new_sequence)

        # Reject words already in the corpus or already generated.
        if new_word in existing_words or new_word in cached_words:
            continue

        words_cache.add_word('letters', new_word)
        # Single-argument parenthesised form prints identically on Python 2
        # and Python 3.
        print("New word: %s" % new_word)
        number_generated += 1

    return words_cache
def test_first_order(self):
    """A first-order tree with a single path yields exactly that path."""
    # Every state has one successor with weight 1, so the walk is forced:
    # START -> B -> UH -> STOP.
    tree = {
        'START': {'B': 1},
        'B': {'UH': 1},
        'UH': {'STOP': 1},
    }
    expected = ['B', 'UH']

    generated = generate_new_sequence(tree, 10)

    self.assertEqual(generated, expected)
def test_cutoff(self):
    """With a transition limit of 1 only the first symbol is produced."""
    # The only path through this tree is START -> B -> UH -> L -> STOP;
    # the limit passed below is smaller than that path.
    tree = {
        'START': {'B': 1},
        'B': {'UH': 1},
        'UH': {'L': 1},
        'L': {'STOP': 1},
    }
    expected = ['B']

    generated = generate_new_sequence(tree, 1)

    self.assertEqual(generated, expected)
def test_second_order(self):
    """A second-order (two-level) tree with a single path yields that path."""
    # The tree is nested two levels deep; each innermost dict holds one
    # successor with weight 1.
    tree = {
        'START': {
            'START': {'B': 1},
            'B': {'UH': 1},
        },
        'B': {
            'UH': {'STOP': 1},
        },
        'UH': {
            'STOP': {'STOP': 1},
        },
    }
    expected = ['B', 'UH']

    generated = generate_new_sequence(tree, 10)

    self.assertEqual(generated, expected)