Example #1
        for regr in regrs:
            regr.fit(X_train, Y_train)
            predict = regr.predict(X_test)
            np.seterr(invalid="ignore")

            true, pred = Y_test, predict
            MAE = mean_absolute_error(np.array(true), np.array(pred))
            MSE = mean_squared_error(np.array(true), np.array(pred))
            Pearson_r = pearsonr(np.array(true), np.array(pred))

            decimal = 4
            print("|%s|%s|%s|" % (round(MAE, decimal), round(MSE, decimal), round(Pearson_r[0], decimal)))


if __name__ == "__main__":
    words, valence, arousal = load_anew("./resources/Lexicon/ANEW.txt")
    remove_idx = []
    for i, w in enumerate(words):
        if w in {"glamour", "skijump"}:
            remove_idx.append(i)

    for i in remove_idx[::-1]:
        words.pop(i)
        valence.pop(i)
        arousal.pop(i)
    # for i,j,k in zip(words, valence, arousal):
    #     print(i,j,k)
    vecs = build_amended_anew_vectors(words)
    print(vecs.shape)
    regression(vecs, np.array(arousal))
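# Note: load_anew is an external helper that is not shown in these examples.
# A minimal sketch of what it presumably does, assuming ANEW.txt is a
# tab-separated file with a header row and word / mean-valence / mean-arousal
# columns (the exact column layout is an assumption):
def load_anew(path='./resources/Lexicon/ANEW.txt'):
    words, valence, arousal = [], [], []
    with open(path, encoding='utf-8') as f:
        next(f)  # skip the assumed header row
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue
            words.append(fields[0])
            valence.append(float(fields[1]))
            arousal.append(float(fields[2]))
    return words, valence, arousal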
Example #2

            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
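# Note: make_idx_data is not defined here. A rough sketch of the CNN-style
# preprocessing it presumably performs -- mapping each sentence to word
# indices, left-padding by kernel_size - 1 and padding/truncating to a fixed
# width (all details of the real helper are assumptions):
import numpy as np

def make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5):
    pad = kernel_size - 1
    total_len = max_len + 2 * pad
    rows = []
    for sent in corpus:
        idx = [0] * pad  # left padding so the first word sits under a full kernel
        idx += [word_idx_map[w] for w in sent.split() if w in word_idx_map]
        idx = idx[:total_len]
        idx += [0] * (total_len - len(idx))  # right padding up to the fixed width
        rows.append(idx)
    return np.array(rows, dtype='int32')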
Example #3
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


if __name__ == '__main__':

    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true = linear_fusion(corpus, lexicon, mark)
    # print('start.....')
    # cv(valence_mean, valence_true, multivariant=False)
    # print('OK')
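# Note: linear_regression_multivariant(..., cost_fun='Ridge_Regression') is an
# external helper whose body is not shown. A rough scikit-learn equivalent of
# what it presumably does (the alpha value and the reported metrics are assumptions):
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

def ridge_regression_baseline(X_train, X_test, Y_train, Y_test, alpha=1.0):
    model = Ridge(alpha=alpha).fit(X_train, Y_train)
    pred = model.predict(X_test)
    mae = mean_absolute_error(Y_test, pred)
    mse = mean_squared_error(Y_test, pred)
    print('MAE: %.4f  MSE: %.4f' % (mae, mse))
    return pred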
Example #4

def generate_anew_synsets_data():
    anew_words, _, _ = load_anew('./resource/ANEW.txt')
    build_synsets(anew_words)
    print('Saved.')
    print(anew_words)
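# Note: build_synsets is an external helper. Assuming it expands each ANEW word
# into its WordNet synonyms, a minimal sketch with NLTK (requires the WordNet
# corpus; keeping only lemma names is an assumption about the real output):
from nltk.corpus import wordnet as wn

def build_synsets(words):
    synsets = {}
    for word in words:
        lemmas = set()
        for synset in wn.synsets(word):
            for lemma in synset.lemmas():
                lemmas.add(lemma.name())
        synsets[word] = sorted(lemmas)
    return synsets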
Example #5
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(
    ['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                         vocab,
                                         k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
Example #6

        occur_times = (word_count >= 1).sum()
        if occur_times > 0:
            avg = np.average(np.array(valence)[word_count >= 1])
        else:
            avg = -1
        texts_scores.append(avg)
    return texts_scores


if __name__ == '__main__':
    from load_data import load_selected_data, load_anew, load_extend_anew

    # # print(count_matching(texts, words))

    texts, labels = load_selected_data(data_type='train', stem=False)
    words, valence, _ = load_anew()

    # select_matching(texts,labels, words)
    # exit()
    words, valence = np.array(words), np.array(valence)
    avg = np.average(valence)  # avg = 5.1511713456
    words_pos, valence_pos = words[valence > avg], valence[valence > avg]
    words_neg, valence_neg = words[valence < avg], valence[valence < avg]

    pos = avg_valence(texts, words_pos, valence_pos)
    neg = avg_valence(texts, words_neg, valence_neg)

    from visualization import draw_scatter_with_color

    draw_scatter_with_color(pos, neg, labels, 'pos', 'neg')
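# Note: the avg_valence fragment above starts mid-function. A self-contained
# reconstruction, assuming texts are whitespace-tokenisable strings and
# words/valence are parallel sequences (the tokenisation is an assumption):
import numpy as np

def avg_valence(texts, words, valence):
    texts_scores = []
    for text in texts:
        tokens = text.split()
        word_count = np.array([tokens.count(w) for w in words])
        occur_times = (word_count >= 1).sum()
        if occur_times > 0:
            avg = np.average(np.array(valence)[word_count >= 1])
        else:
            avg = -1  # sentinel for texts with no lexicon hits
        texts_scores.append(avg)
    return texts_scores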
Example #7
__author__ = 'NLP-PC'
from load_data import load_anew
from visualization import draw_scatter_with_labels
from load_data import load_extend_anew
words, arousal, valence = load_extend_anew()

draw_scatter_with_labels(arousal, valence, words, 'arousal', 'valence')

word, a, v = load_anew()
draw_scatter_with_labels(a, v, word, 'arousal', 'valence')
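# Note: draw_scatter_with_labels comes from the project's visualization module.
# A plausible matplotlib sketch (figure styling is an assumption):
import matplotlib.pyplot as plt

def draw_scatter_with_labels(x, y, labels, x_name, y_name):
    plt.scatter(x, y, s=10)
    for xi, yi, label in zip(x, y, labels):
        plt.annotate(label, (xi, yi), fontsize=8)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.show()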
Example #8

        if occur_times > 0:
            avg = np.average(np.array(valence)[word_count >= 1])
        else:
            avg = -1
        texts_scores.append(avg)
    return texts_scores


if __name__ == '__main__':
    from load_data import load_selected_data, load_anew, load_extend_anew

    # # print(count_matching(texts, words))


    texts, labels = load_selected_data(data_type='train', stem=False)
    words, valence, _ = load_anew()

    # select_matching(texts,labels, words)
    # exit()
    words, valence = np.array(words), np.array(valence)
    avg = np.average(valence)  # avg = 5.1511713456
    words_pos, valence_pos = words[valence > avg], valence[valence > avg]
    words_neg, valence_neg = words[valence < avg], valence[valence < avg]

    pos = avg_valence(texts, words_pos, valence_pos)
    neg = avg_valence(texts, words_neg, valence_neg)

    from visualization import draw_scatter_with_color

    draw_scatter_with_color(pos, neg, labels, 'pos', 'neg')
Example #9

                                       cost_fun='Ridge_Regression')


if __name__ == '__main__':

    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true = linear_fusion(corpus, lexicon, mark)
    # print('start.....')
    # cv(valence_mean, valence_true, multivariant=False)
    # print('OK')
Example #10

__author__ = "NLP-PC"
from load_data import load_anew
from visualization import draw_scatter_with_labels
from load_data import load_extend_anew

words, arousal, valence = load_extend_anew()

draw_scatter_with_labels(arousal, valence, words, "arousal", "valence")

word, a, v = load_anew()
draw_scatter_with_labels(a, v, word, "arousal", "valence")
Example #11

def generate_anew_synsets_data():
    anew_words, _, _ = load_anew('./resource/ANEW.txt')
    build_synsets(anew_words)
    print('Saved.')
    print(anew_words)
Example #12
            regr.fit(X_train, Y_train)
            predict = regr.predict(X_test)
            np.seterr(invalid='ignore')

            true, pred = Y_test, predict
            MAE = mean_absolute_error(np.array(true), np.array(pred))
            MSE = mean_squared_error(np.array(true), np.array(pred))
            Pearson_r = pearsonr(np.array(true), np.array(pred))

            decimal = 4
            print('|%s|%s|%s|' % (round(MAE, decimal), round(
                MSE, decimal), round(Pearson_r[0], decimal)))


if __name__ == '__main__':
    words, valence, arousal = load_anew('./resources/Lexicon/ANEW.txt')
    remove_idx = []
    for i, w in enumerate(words):
        if w in {'glamour', 'skijump'}:
            remove_idx.append(i)

    for i in remove_idx[::-1]:
        words.pop(i)
        valence.pop(i)
        arousal.pop(i)
    # for i,j,k in zip(words, valence, arousal):
    #     print(i,j,k)
    vecs = build_ori_anew_vectors(words)
    print(vecs.shape)
    print("Valence")
    regression(vecs, np.array(valence))
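# Note: build_ori_anew_vectors (and build_amended_anew_vectors in Example #1)
# are project helpers. As a rough sketch, assuming they stack a pre-trained
# word2vec vector for each ANEW word (the gensim loading path and the
# zero-vector fallback for missing words are assumptions):
import numpy as np
from gensim.models import KeyedVectors

def build_ori_anew_vectors(words, model_path='./resources/GoogleNews-vectors-negative300.bin'):
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    vecs = [model[w] if w in model else np.zeros(model.vector_size) for w in words]
    return np.vstack(vecs)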
Example #13
__author__ = "NLP-PC"
# coding: utf-8
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

os.chdir("..")
print(load_lexicon(get_file_path("lexicon")))
words, valence, arousal = load_anew(get_file_path("anew"))
gs = goslate.Goslate()
for tw_text in gs.translate(words, "zh-tw"):
    print(tw_text)
print(gs.translate("Hi", "zh-TW"))
# You could get all supported language list through get_languages
languages = gs.get_languages()
print(languages["zh-TW"])
Example #14
__author__ = 'NLP-PC'
# coding: utf-8
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os
os.chdir('..')
print(load_lexicon(get_file_path('lexicon')))
words, valence, arousal = load_anew(get_file_path('anew'))
gs = goslate.Goslate()
for tw_text in gs.translate(words, 'zh-tw'):
    print(tw_text)
print(gs.translate('Hi', 'zh-TW'))
# You could get all supported language list through get_languages
languages = gs.get_languages()
print(languages['zh-TW'])