for regr in regrs:
    regr.fit(X_train, Y_train)
    predict = regr.predict(X_test)
    np.seterr(invalid="ignore")
    true, pred = Y_test, predict
    MAE = mean_absolute_error(np.array(true), np.array(pred))
    MSE = mean_squared_error(np.array(true), np.array(pred))
    Pearson_r = pearsonr(np.array(true), np.array(pred))
    decimal = 4
    print("|%s|%s|%s|" % (round(MAE, decimal), round(MSE, decimal), round(Pearson_r[0], decimal)))


if __name__ == "__main__":
    words, valence, arousal = load_anew("./resources/Lexicon/ANEW.txt")
    # Exclude "glamour" and "skijump" from the lexicon before building word vectors.
    remove_idx = []
    for i, w in enumerate(words):
        if w in {"glamour", "skijump"}:
            remove_idx.append(i)
    for i in remove_idx[::-1]:
        words.pop(i)
        valence.pop(i)
        arousal.pop(i)
    # for i, j, k in zip(words, valence, arousal):
    #     print(i, j, k)
    vecs = build_amended_anew_vectors(words)
    print(vecs.shape)
    regression(vecs, np.array(arousal))
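# A minimal, hypothetical sketch of the regression() helper invoked above, assuming
# scikit-learn models and a simple train/test split. The evaluation loop mirrors the
# fragment shown; the regressor list, split ratio, and random_state are illustrative
# assumptions, not the original implementation.
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split


def regression(X, Y, test_size=0.2):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=10)
    regrs = [LinearRegression(), Ridge(alpha=1.0)]  # illustrative choice of models
    print("|MAE|MSE|Pearson r|")
    for regr in regrs:
        regr.fit(X_train, Y_train)
        pred = regr.predict(X_test)
        MAE = mean_absolute_error(Y_test, pred)
        MSE = mean_squared_error(Y_test, pred)
        r, _ = pearsonr(Y_test, pred)
        print("|%s|%s|%s|" % (round(MAE, 4), round(MSE, 4), round(r, 4)))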
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
# Shift the ratings by +5 so they lie on a positive, ANEW-style scale.
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))
vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
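# A hedged sketch of what build_embedding_matrix() above may do, following the common
# convention of reserving row 0 as a zero padding vector and filling rows 1..V with word
# vectors (random vectors for out-of-vocabulary words). The call signature is taken from
# the snippet above; the internals and the mapping-style `embeddings` object (e.g. a dict
# or gensim KeyedVectors) are assumptions, not the original code.
import numpy as np


def build_embedding_matrix(embeddings, vocab, k=300):
    word_idx_map = {}
    W = np.zeros((len(vocab) + 1, k), dtype='float32')  # row 0 = padding vector
    for i, word in enumerate(vocab, start=1):
        word_idx_map[word] = i
        if word in embeddings:
            W[i] = embeddings[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, k)  # random vector for OOV words
    return W, word_idx_map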
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


if __name__ == '__main__':
    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    # Build a word -> valence lexicon from ANEW.
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]
    # # The following can be used to check which words the corpus and the lexicon have in common.
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()
    # valence_mean, valence_true = linear_fusion(corpus, lexicon, mark)
    # print('start.....')
    # cv(valence_mean, valence_true, multivariant=False)
    # print('OK')
def generate_anew_synsets_data():
    anew_words, _, _ = load_anew('./resource/ANEW.txt')
    build_synsets(anew_words)
    print('Saved.')
    print(anew_words)
        occur_times = (word_count >= 1).sum()
        if occur_times > 0:
            avg = np.average(np.array(valence)[word_count >= 1])
        else:
            avg = -1
        texts_scores.append(avg)
    return texts_scores


if __name__ == '__main__':
    from load_data import load_selected_data, load_anew, load_extend_anew

    # print(count_matching(texts, words))
    texts, labels = load_selected_data(data_type='train', stem=False)
    words, valence, _ = load_anew()
    # select_matching(texts, labels, words)
    # exit()
    words, valence = np.array(words), np.array(valence)
    # Split the lexicon at its mean valence (avg = 5.1511713456) into positive and negative word sets.
    words_pos, valence_pos = words[valence > np.average(valence)], valence[valence > np.average(valence)]
    words_neg, valence_neg = words[valence < np.average(valence)], valence[valence < np.average(valence)]
    pos = avg_valence(texts, words_pos, valence_pos)
    neg = avg_valence(texts, words_neg, valence_neg)
    from visualization import draw_scatter_with_color

    draw_scatter_with_color(pos, neg, labels, 'pos', 'neg')
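# A hypothetical reconstruction of avg_valence(), whose tail appears at the top of the
# previous snippet: for each text it averages the valence of the lexicon words that occur
# in it, or returns -1 when none occur. The CountVectorizer-based word counting is an
# assumption; only the per-text scoring logic is taken from the fragment above.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def avg_valence(texts, words, valence):
    vectorizer = CountVectorizer(vocabulary=list(words))
    counts = vectorizer.transform(texts).toarray()  # one row of lexicon-word counts per text
    texts_scores = []
    for word_count in counts:
        if (word_count >= 1).sum() > 0:
            texts_scores.append(np.average(np.array(valence)[word_count >= 1]))
        else:
            texts_scores.append(-1)  # no lexicon word found in this text
    return texts_scores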
__author__ = 'NLP-PC'
from load_data import load_anew, load_extend_anew
from visualization import draw_scatter_with_labels

# Extended ANEW: plot every word at (arousal, valence).
words, arousal, valence = load_extend_anew()
draw_scatter_with_labels(arousal, valence, words, 'arousal', 'valence')

# Original ANEW: load_anew() returns (words, valence, arousal), so unpack in that order
# to keep the axes labelled correctly.
word, v, a = load_anew()
draw_scatter_with_labels(a, v, word, 'arousal', 'valence')
__author__ = "NLP-PC" from load_data import load_anew from visualization import draw_scatter_with_labels from load_data import load_extend_anew words, arousal, valence = load_extend_anew() draw_scatter_with_labels(arousal, valence, words, "arousal", "valence") word, a, v = load_anew() draw_scatter_with_labels(a, v, word, "arousal", "valence")
    regr.fit(X_train, Y_train)
    predict = regr.predict(X_test)
    np.seterr(invalid='ignore')
    true, pred = Y_test, predict
    MAE = mean_absolute_error(np.array(true), np.array(pred))
    MSE = mean_squared_error(np.array(true), np.array(pred))
    Pearson_r = pearsonr(np.array(true), np.array(pred))
    decimal = 4
    print('|%s|%s|%s|' % (round(MAE, decimal), round(MSE, decimal), round(Pearson_r[0], decimal)))


if __name__ == '__main__':
    words, valence, arousal = load_anew('./resources/Lexicon/ANEW.txt')
    remove_idx = []
    for i, w in enumerate(words):
        if w in {'glamour', 'skijump'}:
            remove_idx.append(i)
    for i in remove_idx[::-1]:
        words.pop(i)
        valence.pop(i)
        arousal.pop(i)
    # for i, j, k in zip(words, valence, arousal):
    #     print(i, j, k)
    vecs = build_ori_anew_vectors(words)
    print(vecs.shape)
    print("Valence")
    regression(vecs, np.array(valence))
__author__ = "NLP-PC" # coding: utf-8 import goslate from load_data import load_lexicon from file_name import get_file_path from load_data import load_anew import os os.chdir("..") print(load_lexicon(get_file_path("lexicon"))) words, valence, arousal = load_anew(get_file_path("anew")) gs = goslate.Goslate() for tw_text in gs.translate(words, "zh-tw"): print(tw_text) print(gs.translate("Hi", "zh-TW")) # You could get all supported language list through get_languages languages = gs.get_languages() print(languages["zh-TW"])