Example #1
def train_docvecs(Sentences):
    model = Doc2Vec(min_count=2, window=10, size=50, sample=1e-5, negative=5, workers=7)
    model.build_vocab(Sentences.to_array())
    for epoch in range(100):
        print('epoch: %s' % epoch)
        model.train(Sentences.sentences_rand())
    model.save(get_file_path('docvecs_CVAT'))
    print('Training model complete, saved successfully.')
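A rough companion sketch (old-gensim API, as used throughout these examples): reload the saved model and look up a per-document vector by its tag; the SENT_23 tag is only illustrative.

from gensim.models import Doc2Vec
from file_name import get_file_path

model = Doc2Vec.load(get_file_path('docvecs_CVAT'))  # reload the model saved above
vec = model.docvecs['SENT_23']                       # 50-dim vector of one tagged document
print(vec.shape)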
Example #2
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
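A minimal usage sketch, assuming the paths registered in get_file_path exist on disk; the query word is purely illustrative.

model = load_embeddings('CVAT')   # 50-dim word vectors trained on CVAT
word = '天氣'                     # hypothetical query word
if word in model.vocab:           # old-gensim vocabulary membership check
    print(model[word][:5])        # first five dimensions of its embedding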
Example #3
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(
            get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(
            get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
Example #4
def log_performance(MSE, MAE, Pearson_r, R2, Spearman_r, sqrt_MSE):
    # create a file handler
    handler = logging.FileHandler(get_file_path('log'))
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    logger.info('MSE: %s, MAE: %s, Pearson_r: %s, R2: %s, Spearman_r: %s, sqrt_MSE: %s', MSE, MAE, Pearson_r, R2, Spearman_r, sqrt_MSE)
    logger.removeHandler(handler) # remove the Handler after you finish your job
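A sketch of one plausible way to compute the six metrics this helper expects, using standard sklearn/scipy calls; y_true and y_pred are hypothetical arrays, not project data.

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr

y_true = np.array([5.2, 6.1, 4.8, 7.0])  # hypothetical gold ratings
y_pred = np.array([5.0, 6.4, 4.5, 6.6])  # hypothetical predictions
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
pearson_r, _ = pearsonr(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
spearman_r, _ = spearmanr(y_true, y_pred)
log_performance(mse, mae, pearson_r, r2, spearman_r, np.sqrt(mse))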
Example #5
def log_state(msg):
    # create a file handler
    handler = logging.FileHandler(get_file_path('log'))
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    logger.info(msg)
    logger.removeHandler(handler)
Example #6
def train_wordvecs(Sentence, save_path=None):
    model = Word2Vec(size=50, min_count=1)
    model.build_vocab(Sentence.toarray())
    for epoch in range(10):
        print('epoch: %s' % epoch)
        model.train(Sentence.rand())
    if save_path is None:
        model.save(get_file_path('wordvecs_CVAT'))
    else:
        model.save(save_path)
    print('Training model complete, saved successfully.')
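A small follow-up sketch (old-gensim API) for sanity-checking the trained vectors; the query word is illustrative and must be in the vocabulary.

from gensim.models import Word2Vec
from file_name import get_file_path

model = Word2Vec.load(get_file_path('wordvecs_CVAT'))  # reload the 50-dim model
print(model.most_similar('好', topn=5))                # five nearest neighbours of a sample word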
Example #7
def train_wordvecs(Sentence, save_path=None):
    model = Word2Vec(size=50, min_count=1)
    model.build_vocab(Sentence.toarray())
    for epoch in range(10):
        print('epoch: %s' % epoch)
        model.train(Sentence.rand())
    if save_path is None:
        model.save(get_file_path('wordvecs_CVAT'))
    else:
        model.save(save_path)
    print('Training model complete, saved successfully.')
Example #8
def log_state(msg):
    # create a file handler
    handler = logging.FileHandler(get_file_path('log'))
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    logger.info(msg)
    logger.removeHandler(handler)
Example #9
def train_docvecs(Sentences):
    model = Doc2Vec(min_count=2,
                    window=10,
                    size=50,
                    sample=1e-5,
                    negative=5,
                    workers=7)
    model.build_vocab(Sentences.to_array())
    for epoch in range(100):
        print('epoch: %s' % epoch)
        model.train(Sentences.sentences_rand())
    model.save(get_file_path('docvecs_CVAT'))
    print('Training model complete, saved successfully.')
Example #10
def log_performance(MSE, MAE, Pearson_r, R2, Spearman_r, sqrt_MSE):
    # create a file handler
    handler = logging.FileHandler(get_file_path('log'))
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    logger.info(
        'MSE: %s, MAE: %s, Pearson_r: %s, R2: %s, Spearman_r: %s, sqrt_MSE: %s',
        MSE, MAE, Pearson_r, R2, Spearman_r, sqrt_MSE)
    logger.removeHandler(
        handler)  # remove the Handler after you finish your job
Example #11
from collections import defaultdict

import numpy as np


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
# print(corpus[:2])
# vocab = get_vocab(corpus)
# dump_picle(vocab, get_file_path('CVAT_Vocab'))
# print('OK')
vocab = load_pickle(get_file_path('CVAT_Vocab'))
# for i in vocab:
#     print(i)
# print(len(vocab))

# W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')
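get_vocab just counts token occurrences over an already tokenised corpus; a toy sketch of its behaviour (the sentences are illustrative):

toy_corpus = [['我', '喜歡', '你'], ['我', '想', '你']]
toy_vocab = get_vocab(toy_corpus)  # prints 4, the number of distinct tokens
print(toy_vocab['我'])              # 2.0 -- counts are floats because of defaultdict(float)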
Example #12
    from positive_negative_split import get_pos_neg_va

    def cv(data, target, multivariant=False):
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
            data, target, test_size=0.2, random_state=0)
        if multivariant is False:
            linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
        else:
            linear_regression_multivariant(X_train,
                                           X_test,
                                           Y_train,
                                           Y_test,
                                           cost_fun='Ridge_Regression')

    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
    mark = load_mark(get_file_path('normalized_onezero_mark'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(
        corpus, lexicon, mark)
Example #13
    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
    exit()
    # make word index map end

    word_idx_map = load_pickle(get_file_path('word_idx_map'))
    print(len(word_idx_map))
    for i in word_idx_map:
        print(i)
    exit()

    word_idx_map = load_pickle(get_file_path('word_idx_map'))
    data, pos_length, neg_length = prepare_data(file_dir, word_idx_map)
    dump_picle([data, pos_length, neg_length], get_file_path('imdb_processed_data'))
Example #14
        if i % 10 == 0:
            logger.info("evaluate for text : %i/%i..." % (i, num))

    evaluate(valence_true, valence_pred, 'valence')
    evaluate(arousal_true, arousal_pred, 'arousal')


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    logger.info(r"loading lexicon form : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
Example #15
        if i % 10 == 0:
            logger.info("evaluate for text : %i/%i..." % (i, num))

    evaluate(valence_true, valence_pred, 'valence')
    evaluate(arousal_true, arousal_pred, 'arousal')


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    logger.info(r"loading lexicon form : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
Example #16
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
Example #17
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv
from word2vec_fn import build_doc_vector
# '''
model = load_embeddings('CVAT_docvecs')
print(model.docvecs[1])
print(model.docvecs['SENT_23'])
print(len(model.vocab.keys()))

corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))
vecs = build_doc_vector(corpus, model)

valence, arousal = gold_valence_arousal(corpus, mark)
cv(vecs, valence, multivariant=True)
cv(vecs, arousal, multivariant=True)
# '''
# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')
#
# # print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# # print(model.doesnt_match("breakfast cereal dinner lunch".split()))
# # print(model.similarity('woman', 'man'))
# # print(model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'], topn=10))
Example #18
                for item_no, line in enumerate(fin):
                    self.sentences.append(
                        LabeledSentence(words=utils.to_unicode(line).split(), tags=[prefix + '_%s' % str(item_no)]))
        return self.sentences

    def sentences_rand(self):
        # out = numpy.random.permutation(self.sentences)
        # return out
        random.shuffle(self.sentences)
        return self.sentences


sources = {'test-neg.txt': 'TEST_NEG', 'test-pos.txt': 'TEST_POS', 'train-neg.txt': 'TRAIN_NEG',
           'train-pos.txt': 'TRAIN_POS', 'train-unsup.txt': 'TRAIN_UNS'}
##########################################################
dir = get_file_path('test_doc2vec')
keys = list(sources.keys())
for old_key in keys:
    sources[dir + '/' + old_key] = sources.pop(old_key)
##############################################################
############################# model training #################
'''
sentences = LabeledLineSentence(sources)
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())
for epoch in range(10):
    print('epoch: %s' % epoch)
    model.train(sentences.sentences_rand())

print(model.most_similar('good'))
# print(model['TRAIN_NEG_0'])
Example #19
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="ordinary_least_squares")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Ridge_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Bayesian_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="SVR")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="KNN_Reg")


if __name__ == "__main__":
    normalize = True
    corpus = load_corpus(get_file_path("cn_corpus"))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path("mark"))
    lexicon = combine_lexicon(get_file_path("lexicon"), get_file_path("neural_cand"))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print("start.....")
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print("OK")
Example #20
    def sentences_rand(self):
        # out = numpy.random.permutation(self.sentences)
        # return out
        random.shuffle(self.sentences)
        return self.sentences


sources = {
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
    'train-neg.txt': 'TRAIN_NEG',
    'train-pos.txt': 'TRAIN_POS',
    'train-unsup.txt': 'TRAIN_UNS'
}
##########################################################
dir = get_file_path('test_doc2vec')
keys = list(sources.keys())
for old_key in keys:
    sources[dir + '/' + old_key] = sources.pop(old_key)
##############################################################
############################# model training #################
'''
sentences = LabeledLineSentence(sources)
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())
for epoch in range(10):
    print('epoch: %s' % epoch)
    model.train(sentences.sentences_rand())

print(model.most_similar('good'))
# print(model['TRAIN_NEG_0'])
Example #21
__author__ = "NLP-PC"
# coding: utf-8
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

os.chdir("..")
print(load_lexicon(get_file_path("lexicon")))
words, valence, arousal = load_anew(get_file_path("anew"))
gs = goslate.Goslate()
for tw_text in gs.translate(words, "zh-tw"):
    print(tw_text)
print(gs.translate("Hi", "zh-TW"))
# You could get all supported language list through get_languages
languages = gs.get_languages()
print(languages["zh-TW"])
Example #22

def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


# Note: this file builds the CNN input data for the CVAT corpus
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'),
                                         vocab,
                                         k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
Example #23
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model


def load_vader(name):
    def load_text(filename):
        with open(filename, 'r', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            texts, ratings = [], []
            for line in reader:
                texts.append(line[2])
                ratings.append(float(line[1]))
        return texts, ratings

    texts, ratings = [], []
    for filename in name:
        text, rating = load_text('./data/corpus/vader/' + filename + '.txt')
        texts.extend(text)
        ratings.extend(rating)
    return texts, ratings


if __name__ == '__main__':
    from file_name import get_file_path

    words = load_corpus(get_file_path('cn_corpus'))
    print(words[719])
    # for i, w in enumerate(words):
    #     print(i,w)
Example #24
                                       cost_fun='Ridge_Regression')


if __name__ == '__main__':

    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true = linear_fusion(corpus, lexicon, mark)
    # print('start.....')
    # cv(valence_mean, valence_true, multivariant=False)
    # print('OK')
Example #25
import numpy as np

from file_name import get_file_path
from load_data import load_corpus, load_vader


def statistic(texts):
    avg_length, vocab = 0, set()
    length_list = []
    for text in texts:
        if type(text) is not list:
            text = text.split()
        length_list.append(len(text))
        vocab = vocab.union(set(text))
        # if len(text)>200:
        #     print(text)
    avg_length = np.average(np.array(length_list))
    return avg_length, len(vocab)


if __name__ == '__main__':
    # (['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
    tweets, _ = load_vader(['tweets'])
    movie, _ = load_vader(['movie_reviews'])
    amazon, _ = load_vader(['product_reviews'])
    NYT, _ = load_vader(['news_articles'])
    cvat = load_corpus(get_file_path('cn_corpus'))

    print(statistic(tweets))
    print(statistic(movie))
    print(statistic(amazon))
    print(statistic(NYT))
    print(statistic(cvat))
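statistic accepts either raw strings or pre-tokenised lists and returns the average length and vocabulary size; a toy sketch with illustrative inputs:

texts = ['this is a short text', ['already', 'tokenised', 'text']]
avg_len, vocab_size = statistic(texts)
print(avg_len, vocab_size)  # 4.0 tokens on average, 7 distinct tokens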
Example #26
    return result


def scaling_onezero(num_list):
    # Note: the type of the parameter is np.array
    # Function: To normalize data
    result = []
    for num in num_list:
        result.append(num / 9)
    return result


if __name__ == '__main__':
    from load_data import load_lexicon
    from load_data import load_mark
    from file_name import get_file_path
    from save_data import save_csv

    lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = np.array(lexicon)
    mark = np.array(mark)
    #####################################
    lexicon[:, 1] = scaling_onezero(np.array(lexicon[:, 1], dtype=float))
    lexicon[:, 2] = scaling_onezero(np.array(lexicon[:, 2], dtype=float))
    mark[:, 1] = scaling_onezero(np.array(mark[:, 1], dtype=float))
    mark[:, 2] = scaling_onezero(np.array(mark[:, 2], dtype=float))
    ######################################
    save_csv(lexicon, get_file_path('normalized_onezero_lexicon'))
    save_csv(mark, get_file_path('normalized_onezero_mark'))
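scaling_onezero simply divides each rating by 9, mapping what is presumably a 1-9 affect scale onto (0, 1]; a quick arithmetic sketch:

import numpy as np

print(scaling_onezero(np.array([1.0, 5.0, 9.0])))  # [0.111..., 0.555..., 1.0]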
Example #27
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(
    ['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                         vocab,
                                         k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
Example #28
    from file_name import get_file_path
    from regression import linear_regression, linear_regression_multivariant
    from positive_negative_split import get_pos_neg_va


    def cv(data, target, multivariant=False):
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.2,
                                                                             random_state=0)
        if multivariant is False:
            linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
        else:
            linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
    mark = load_mark(get_file_path('normalized_onezero_mark'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
Example #29
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


if __name__ == '__main__':
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = combine_lexicon(get_file_path('lexicon'), get_file_path('neural_cand'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print('start.....')
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print('OK')
Example #30
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model


def load_vader(name):
    def load_text(filename):
        with open(filename, 'r', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            texts, ratings = [], []
            for line in reader:
                texts.append(line[2])
                ratings.append(float(line[1]))
        return texts, ratings

    texts, ratings = [], []
    for filename in name:
        text, rating = load_text('./data/corpus/vader/' + filename + '.txt')
        texts.extend(text)
        ratings.extend(rating)
    return texts, ratings


if __name__ == '__main__':
    from file_name import get_file_path

    words = load_corpus(get_file_path('cn_corpus'))
    print(words[719])
    # for i, w in enumerate(words):
    #     print(i,w)
Example #31
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


if __name__ == '__main__':

    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true = linear_fusion(corpus, lexicon, mark)
    # print('start.....')
    # cv(valence_mean, valence_true, multivariant=False)
    # print('OK')
Example #32
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                             vocab,
                                             k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
    exit()
    # make word index map end

    word_idx_map = load_pickle(get_file_path('word_idx_map'))
    print(len(word_idx_map))
    for i in word_idx_map:
        print(i)
    exit()

    word_idx_map = load_pickle(get_file_path('word_idx_map'))
    data, pos_length, neg_length = prepare_data(file_dir, word_idx_map)
    dump_picle([data, pos_length, neg_length],
Example #33
from collections import defaultdict

import numpy as np


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab

# Note: this file builds the CNN input data for the CVAT corpus
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
Example #34
__author__ = 'NLP-PC'
# coding: utf-8
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os
os.chdir('..')
print(load_lexicon(get_file_path('lexicon')))
words, valence, arousal = load_anew(get_file_path('anew'))
gs = goslate.Goslate()
for tw_text in gs.translate(words, 'zh-tw'):
    print(tw_text)
print(gs.translate('Hi', 'zh-TW'))
# You could get all supported language list through get_languages
languages = gs.get_languages()
print(languages['zh-TW'])