import re
import string
from nltk import FreqDist

def get_most_frequent_words(text):
    # Keep tokens that are not stopwords or punctuation and contain a letter.
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list
                 if word not in stopwords and word not in string.punctuation
                 and re.search("[a-zA-Z]", word)]
    freq_dist = FreqDist(word_list)
    # Return the 200 most frequent words, without their counts.
    top_200 = freq_dist.most_common(200)
    return [word for (word, count) in top_200]
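This function relies on a tokenize_nltk helper and a stopwords collection that are defined elsewhere on this page; a minimal sketch of those assumptions, with an illustrative call (the sample text is made up):

import nltk

# Hypothetical definitions: the original example defines these elsewhere.
# Both need the corresponding NLTK data (nltk.download('punkt'),
# nltk.download('stopwords')).
stopwords = set(nltk.corpus.stopwords.words('english'))

def tokenize_nltk(text):
    return nltk.word_tokenize(text)

sample_text = "The quick brown fox jumps over the lazy dog. The dog sleeps."
print(get_most_frequent_words(sample_text))  # e.g. ['dog', 'quick', ...]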
def get_stats(text):
    # Keep non-stopword tokens that contain at least one letter.
    word_list = tokenize_nltk(text)
    word_list = [
        word for word in word_list
        if word not in stopwords and re.search("[A-Za-z]", word)
    ]
    # Print the 200 most frequent words and return the full distribution.
    freq_dist = FreqDist(word_list)
    print(freq_dist.most_common(200))
    return freq_dist
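Unlike get_most_frequent_words, get_stats returns the full FreqDist, so callers can query counts directly; for example, reusing the sketch above:

freq_dist = get_stats(sample_text)
print(freq_dist['dog'])  # frequency of a single word
print(freq_dist.N())     # total number of tokens kept after filtering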
Example #3
import string

from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer

def create_vectorizers(data_dict):
    # Build one CountVectorizer per topic, restricted to the 200 most
    # frequent non-stopword, non-punctuation words in that topic's texts.
    topic_list = list(data_dict.keys())
    vectorizer_dict = {}
    for topic in topic_list:
        text_array = data_dict[topic]
        text = " ".join(text_array)
        word_list = tokenize_nltk(text)
        word_list = [word for word in word_list if word not in stopwords]
        freq_dist = FreqDist(word_list)
        top_200 = freq_dist.most_common(200)
        vocab = [
            wtuple[0] for wtuple in top_200 if wtuple[0] not in stopwords
            and wtuple[0] not in string.punctuation
        ]
        vectorizer_dict[topic] = CountVectorizer(vocabulary=vocab)
    return vectorizer_dict
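Because each vectorizer is fixed to its topic's vocabulary, it can transform new text without further fitting; a sketch with made-up topic data (the keys and documents are purely illustrative):

data_dict = {
    "sports": ["the team won the game", "players score goals in matches"],
    "finance": ["the market fell as stocks dropped", "investors buy shares"],
}
vectorizer_dict = create_vectorizers(data_dict)
counts = vectorizer_dict["sports"].transform(["the team plays a game"])
print(counts.toarray())  # one row of counts over the sports vocabulary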
import pickle

from gensim.models import KeyedVectors
from gensim.test.utils import datapath

def main():
    # books_dir, word2vec_model_path, pretrained_model_path, and the helper
    # functions are assumed to be defined elsewhere in the example.
    sentences = get_all_book_sentences(books_dir)
    sentences = [tokenize_nltk(s.lower()) for s in sentences]
    #model = train_word2vec(sentences)
    #test_model()
    with open(word2vec_model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    #accuracy_list = evaluate_model(model, evaluation_file)
    #print(accuracy_list)
    # Score the trained model on the standard Google analogy test set.
    (analogy_score, word_list) = model.wv.evaluate_word_analogies(
        datapath('questions-words.txt'))
    print(analogy_score)
    # Compare against a pretrained model stored in word2vec binary format.
    pretrained_model = KeyedVectors.load_word2vec_format(pretrained_model_path,
                                                         binary=True)
    (analogy_score, word_list) = pretrained_model.evaluate_word_analogies(
        datapath('questions-words.txt'))
    print(analogy_score)
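train_word2vec is referenced above but not shown; a plausible sketch using gensim, assuming it trains on the tokenized sentences and pickles the model to word2vec_model_path (the hyperparameter values here are illustrative, not the original's):

from gensim.models import Word2Vec

def train_word2vec(sentences):
    # sentences: a list of token lists, as built in main().
    model = Word2Vec(sentences, vector_size=100, window=5,
                     min_count=2, workers=4)
    with open(word2vec_model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    return model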
Example #5
import nltk

def pos_tag_nltk(text):
    # Tokenize the text, then tag each token with its part of speech.
    words = tokenize_nltk(text)
    words_with_pos = nltk.pos_tag(words)
    return words_with_pos
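An illustrative call (nltk.pos_tag needs the averaged_perceptron_tagger resource downloaded; exact tags can vary by NLTK version):

print(pos_tag_nltk("The quick brown fox jumps over the lazy dog."))
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ...]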