Пример #1
0
# Rewrite every poem as a stream of syllables framed by POEMSTART / POEMEND
# markers; a bare newline token is replaced by the LINEEND marker.
for text in texts:
    # The opening marker carries its own trailing space, and every later
    # piece is prefixed with a space, so the first piece ends up preceded
    # by two spaces.
    pieces = ["POEMSTART "]
    tokens = text_to_word_sequence(
        text, filters=text_filter, lower=True, split=" ")
    for token in tokens:
        if token == "\n":
            # Line breaks are kept as a single marker, never hyphenated.
            pieces.append(" LINEEND")
        else:
            pieces.extend(" " + syllable for syllable in hyphenate_word(token))
    pieces.append(" POEMEND")
    # Assemble the final string in one pass instead of repeated `+=`.
    splitted_text = "".join(pieces)
    splitted_texts.append(splitted_text)
    progressbar.count()
print("")

# First-pass tokenizer: same filter set as the splitting step, lower-cased
# input, tokens separated by single spaces, character-level mode disabled.
text_tokenizer = Tokenizer(
    char_level=False,
    split=" ",
    lower=True,
    filters=text_filter,
)
# Fit it on the syllable-split poems collected in `splitted_texts`.
text_tokenizer.fit_on_texts(splitted_texts)

# Two buckets for the fitted vocabulary, split by occurrence count relative
# to a threshold n: words seen less often and words seen more often.
# Both start empty; presumably the loop that follows fills them -- TODO confirm.
less_occurring_words, more_occurring_words = [], []
# Progress bar sized to the number of (word, count) entries in the tokenizer.
progressbar = ProgressBar(
    len(text_tokenizer.word_counts.items())
)
for word in text_tokenizer.word_counts.items():