import pickle
import re
import string

import nltk
from nltk import FreqDist
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from sklearn.feature_extraction.text import CountVectorizer

# `tokenize_nltk`, `stopwords`, and the path/directory variables referenced below
# are assumed to be defined elsewhere in the accompanying code.


def get_most_frequent_words(text):
    # Tokenize, then drop stopwords, punctuation, and tokens without any letters.
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list
                 if word not in stopwords and word not in string.punctuation
                 and re.search("[a-zA-Z]", word)]
    # Return the 200 most frequent remaining words.
    freq_dist = FreqDist(word_list)
    top_200 = freq_dist.most_common(200)
    top_200 = [word[0] for word in top_200]
    return top_200
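# Hypothetical usage sketch (not part of the original code): the file name below is an
# assumption; any plain-text file works, provided the helpers above are in scope.
def example_most_frequent_words():
    text = open("sherlock_holmes.txt", encoding="utf-8").read()  # assumed sample file
    frequent_words = get_most_frequent_words(text)
    print(frequent_words[:20])  # inspect the top of the 200-word list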
def main():
    # Gather and tokenize the corpus sentences (only needed when training from scratch).
    sentences = get_all_book_sentences(books_dir)
    sentences = [tokenize_nltk(s.lower()) for s in sentences]
    #model = train_word2vec(sentences)
    #test_model()
    # Load the previously trained word2vec model instead of retraining it.
    model = pickle.load(open(word2vec_model_path, 'rb'))
    #accuracy_list = evaluate_model(model, evaluation_file)
    #print(accuracy_list)
    # Score the trained model on the standard word analogy task.
    (analogy_score, word_list) = model.wv.evaluate_word_analogies(
        datapath('questions-words.txt'))
    print(analogy_score)
    # Compare against a pretrained word2vec model loaded from disk.
    pretrained_model = KeyedVectors.load_word2vec_format(pretrained_model_path,
                                                         binary=True)
    (analogy_score, word_list) = pretrained_model.evaluate_word_analogies(
        datapath('questions-words.txt'))
    print(analogy_score)
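# Hypothetical companion sketch (not part of main()): sanity-check a loaded model by
# listing the nearest neighbours of a word; words absent from the training vocabulary
# raise KeyError, which is caught here.
def inspect_similar_words(model, word="river", topn=5):
    try:
        for neighbour, score in model.wv.most_similar(word, topn=topn):
            print(f"{neighbour}\t{score:.3f}")
    except KeyError:
        print(f"'{word}' is not in the model vocabulary")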
def create_vectorizers(data_dict):
    # Build one CountVectorizer per topic, restricted to that topic's
    # 200 most frequent non-stopword, non-punctuation words.
    topic_list = list(data_dict.keys())
    vectorizer_dict = {}
    for topic in topic_list:
        text_array = data_dict[topic]
        text = " ".join(text_array)
        word_list = tokenize_nltk(text)
        word_list = [word for word in word_list if word not in stopwords]
        freq_dist = FreqDist(word_list)
        top_200 = freq_dist.most_common(200)
        vocab = [wtuple[0] for wtuple in top_200
                 if wtuple[0] not in stopwords
                 and wtuple[0] not in string.punctuation]
        vectorizer_dict[topic] = CountVectorizer(vocabulary=vocab)
    return vectorizer_dict
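# Hypothetical usage sketch (not from the original code): encode a new document with one
# of the per-topic vectorizers. Because the vocabulary is fixed at construction time,
# transform() can be called without fitting first.
def vectorize_for_topic(vectorizer_dict, topic, document):
    vectorizer = vectorizer_dict[topic]
    counts = vectorizer.transform([document])  # sparse matrix: 1 row, one column per vocab word
    return counts.toarray()[0]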
def get_stats(text, num_words=200):
    # Print and return the frequency distribution of the alphabetic,
    # non-stopword tokens in the text.
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list
                 if word not in stopwords and re.search("[A-Za-z]", word)]
    freq_dist = FreqDist(word_list)
    print(freq_dist.most_common(num_words))
    return freq_dist
def pos_tag_nltk(text):
    words = tokenize_nltk(text)
    words_with_pos = nltk.pos_tag(words)
    return words_with_pos
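# Hypothetical usage sketch: tag a single sentence. nltk.pos_tag requires the
# 'averaged_perceptron_tagger' resource (nltk.download('averaged_perceptron_tagger')).
def example_pos_tags():
    sentence = "The quick brown fox jumps over the lazy dog."
    print(pos_tag_nltk(sentence))  # list of (token, Penn Treebank tag) tuples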