def train_docvecs(Sentences):
    model = Doc2Vec(min_count=2, window=10, size=50, sample=1e-5, negative=5, workers=7)
    model.build_vocab(Sentences.to_array())
    for epoch in range(100):
        print('epoch: %s' % epoch)
        model.train(Sentences.sentences_rand())
    model.save(get_file_path('docvecs_CVAT'))
    print('Training model complete, saved successfully.')
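# --- Hedged usage sketch (not part of the original script) ---------------------------------
# train_docvecs() expects a corpus wrapper exposing to_array() and sentences_rand(), like the
# LabeledLineSentence class used elsewhere in this repo. The ToySentences class and the toy
# texts below are illustrative assumptions only.
import random
from gensim.models.doc2vec import LabeledSentence


class ToySentences(object):
    def __init__(self, texts):
        self.sentences = [LabeledSentence(words=text.split(), tags=['SENT_%s' % i])
                          for i, text in enumerate(texts)]

    def to_array(self):
        return self.sentences

    def sentences_rand(self):
        random.shuffle(self.sentences)
        return self.sentences


# train_docvecs(ToySentences(['this movie is great', 'this movie is boring']))
# --------------------------------------------------------------------------------------------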
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
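# Hedged usage sketch (assumption: the trained model files referenced above exist locally).
# The query word '高兴' is only an illustration; whether it is in the vocabulary depends on
# the corpus the vectors were trained on.
# model = load_embeddings('CVAT')
# print(model['高兴'])                       # 50-dim vector, per the comment above
# print(model.most_similar('高兴', topn=5))  # nearest neighbours in the embedding space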
def log_performance(MSE, MAE, Pearson_r, R2, Spearman_r, sqrt_MSE):
    # create a file handler
    handler = logging.FileHandler(get_file_path('log'))
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    logger.info('MSE: %s, MAE: %s, Pearson_r: %s, R2: %s, Spearman_r: %s, sqrt_MSE: %s',
                MSE, MAE, Pearson_r, R2, Spearman_r, sqrt_MSE)
    logger.removeHandler(handler)  # remove the handler after the job is done
def log_state(msg):
    # create a file handler
    handler = logging.FileHandler(get_file_path('log'))
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
    logger.info(msg)
    logger.removeHandler(handler)
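# Self-contained sketch of the attach/log/detach pattern used by log_state() and
# log_performance() above. The logger name 'demo' and the path './data/tmp/demo.log'
# are illustrative assumptions, not paths used by this repo.
import logging

demo_logger = logging.getLogger('demo')
demo_logger.setLevel(logging.INFO)


def log_once(msg, path='./data/tmp/demo.log'):
    handler = logging.FileHandler(path)
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    demo_logger.addHandler(handler)
    demo_logger.info(msg)
    demo_logger.removeHandler(handler)  # detach so repeated calls do not stack handlers
    handler.close()


# log_once('MSE: 0.42, MAE: 0.35')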
def train_wordvecs(Sentence, save_path=None):
    model = Word2Vec(size=50, min_count=1)
    model.build_vocab(Sentence.toarray())
    for epoch in range(10):
        print('epoch: %s' % epoch)
        model.train(Sentence.rand())
    if save_path is None:
        model.save(get_file_path('wordvecs_CVAT'))
    else:
        model.save(save_path)
    print('Training model complete, saved successfully.')
import numpy as np


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab

########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
# print(corpus[:2])
# vocab = get_vocab(corpus)
# dump_picle(vocab, get_file_path('CVAT_Vocab'))
# print('OK')
vocab = load_pickle(get_file_path('CVAT_Vocab'))
# for i in vocab:
#     print(i)
# print(len(vocab))
# W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')
# get vocab and save to pickle
vocab = get_vocab(file_dir)
dump_picle(vocab, './data/tmp/vocab.p')
print('OK')
a = load_pickle('./data/tmp/vocab.p')
for i in a:
    print(i)
print(len(a))
exit()
# end

# make word index map
vocab = load_pickle('./data/tmp/vocab.p')
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
dump_picle(word_idx_map, get_file_path('word_idx_map'))
print('dump word_idx_map successful')
dump_picle(W, '/home/hs/Data/embedding_matrix.p')
print('OK')
exit()
# make word index map end

word_idx_map = load_pickle(get_file_path('word_idx_map'))
print(len(word_idx_map))
for i in word_idx_map:
    print(i)
exit()

word_idx_map = load_pickle(get_file_path('word_idx_map'))
data, pos_length, neg_length = prepare_data(file_dir, word_idx_map)
dump_picle([data, pos_length, neg_length], get_file_path('imdb_processed_data'))
        if i % 10 == 0:
            logger.info("evaluate for text : %i/%i..." % (i, num))
    evaluate(valence_true, valence_pred, 'valence')
    evaluate(arousal_true, arousal_pred, 'arousal')


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)
    lexicon_name = get_file_path('lexicon')
    logger.info(r"loading lexicon from : " + lexicon_name)
    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)
    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))
vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
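# Presumably the VADER ratings are on a [-4, 4] scale, so adding 5 (as above) shifts them
# onto the 1..9 ANEW-style scale used elsewhere in this repo. A tiny check of that shift:
import numpy as np

demo_ratings = np.array([-4.0, 0.0, 2.5, 4.0])
print(demo_ratings + np.ones(len(demo_ratings), dtype=float) * 5)  # [1.  5.  7.5 9. ]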
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv
from word2vec_fn import build_doc_vector

# '''
model = load_embeddings('CVAT_docvecs')
print(model.docvecs[1])
print(model.docvecs['SENT_23'])
print(len(model.vocab.keys()))
corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))
vecs = build_doc_vector(corpus, model)
valence, arousal = gold_valence_arousal(corpus, mark)
cv(vecs, valence, multivariant=True)
cv(vecs, arousal, multivariant=True)
# '''

# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')

# print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# print(model.doesnt_match("breakfast cereal dinner lunch".split()))
# print(model.similarity('woman', 'man'))
# print(model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'], topn=10))
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    LabeledSentence(words=utils.to_unicode(line).split(),
                                    tags=[prefix + '_%s' % str(item_no)]))
        return self.sentences

    def sentences_rand(self):
        # out = numpy.random.permutation(self.sentences)
        # return out
        random.shuffle(self.sentences)
        return self.sentences


sources = {'test-neg.txt': 'TEST_NEG', 'test-pos.txt': 'TEST_POS', 'train-neg.txt': 'TRAIN_NEG',
           'train-pos.txt': 'TRAIN_POS', 'train-unsup.txt': 'TRAIN_UNS'}

##########################################################
dir = get_file_path('test_doc2vec')
keys = list(sources.keys())
for old_key in keys:
    sources[dir + '/' + old_key] = sources.pop(old_key)
##############################################################

############################# model training #################
'''
sentences = LabeledLineSentence(sources)
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())
for epoch in range(10):
    print('epoch: %s' % epoch)
    model.train(sentences.sentences_rand())

print(model.most_similar('good'))
# print(model['TRAIN_NEG_0'])
__author__ = "NLP-PC" # coding: utf-8 import goslate from load_data import load_lexicon from file_name import get_file_path from load_data import load_anew import os os.chdir("..") print(load_lexicon(get_file_path("lexicon"))) words, valence, arousal = load_anew(get_file_path("anew")) gs = goslate.Goslate() for tw_text in gs.translate(words, "zh-tw"): print(tw_text) print(gs.translate("Hi", "zh-TW")) # You could get all supported language list through get_languages languages = gs.get_languages() print(languages["zh-TW"])
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model


def load_vader(name):
    def load_text(filename):
        with open(filename, 'r', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            texts, ratings = [], []
            for line in reader:
                texts.append(line[2])
                ratings.append(float(line[1]))
        return texts, ratings

    texts, ratings = [], []
    for filename in name:
        text, rating = load_text('./data/corpus/vader/' + filename + '.txt')
        texts.extend(text)
        ratings.extend(rating)
    return texts, ratings


if __name__ == '__main__':
    from file_name import get_file_path

    words = load_corpus(get_file_path('cn_corpus'))
    print(words[719])
    # for i, w in enumerate(words):
    #     print(i, w)
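# The nested load_text() above assumes tab-separated lines of the form
#     <id> \t <rating> \t <text>
# (layout inferred from line[1] / line[2]; the exact vader corpus format is an assumption
# here). A minimal in-memory round trip of that parsing:
import csv
import io

demo_tsv = io.StringIO('t1\t2.5\tgreat movie\nt2\t-1.0\tterrible plot\n')
for row in csv.reader(demo_tsv, delimiter='\t'):
    print(float(row[1]), row[2])  # rating, text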
import numpy as np
from load_data import load_vader, load_corpus
from file_name import get_file_path


def statistic(texts):
    avg_length, vocab = 0, set()
    length_list = []
    for text in texts:
        if type(text) is not list:
            text = text.split()
        length_list.append(len(text))
        vocab = vocab.union(set(text))
        # if len(text) > 200:
        #     print(text)
    avg_length = np.average(np.array(length_list))
    return avg_length, len(vocab)


if __name__ == '__main__':
    # corpus names: 'tweets', 'movie_reviews', 'product_reviews', 'news_articles'
    tweets, _ = load_vader(['tweets'])
    movie, _ = load_vader(['movie_reviews'])
    amazon, _ = load_vader(['product_reviews'])
    NYT, _ = load_vader(['news_articles'])
    cvat = load_corpus(get_file_path('cn_corpus'))
    print(statistic(tweets))
    print(statistic(movie))
    print(statistic(amazon))
    print(statistic(NYT))
    print(statistic(cvat))
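# Quick self-contained check of statistic() on made-up data: it should report the average
# token count and the size of the union vocabulary across all texts.
demo_texts = ['a b c', ['a', 'd'], 'b c e f']
print(statistic(demo_texts))  # expected: (3.0, 6)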
    return result


def scaling_onezero(num_list):
    # Note: the parameter is an np.array
    # Function: normalize the data to a zero-one scale (divide by 9)
    result = []
    for num in num_list:
        result.append(num / 9)
    return result


if __name__ == '__main__':
    from load_data import load_lexicon
    from load_data import load_mark
    from file_name import get_file_path
    from save_data import save_csv

    lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = np.array(lexicon)
    mark = np.array(mark)
    #####################################
    lexicon[:, 1] = scaling_onezero(np.array(lexicon[:, 1], dtype=float))
    lexicon[:, 2] = scaling_onezero(np.array(lexicon[:, 2], dtype=float))
    mark[:, 1] = scaling_onezero(np.array(mark[:, 1], dtype=float))
    mark[:, 2] = scaling_onezero(np.array(mark[:, 2], dtype=float))
    ######################################
    save_csv(lexicon, get_file_path('normalized_onezero_lexicon'))
    save_csv(mark, get_file_path('normalized_onezero_mark'))
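# Worked example of the one-zero scaling above: dividing the 1..9 affective ratings by 9
# maps them into (0, 1]. The values below are made up for illustration.
import numpy as np

demo_scores = np.array([1.0, 4.5, 9.0])
print(scaling_onezero(demo_scores))  # [0.111..., 0.5, 1.0]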
from file_name import get_file_path
from regression import linear_regression, linear_regression_multivariant
from positive_negative_split import get_pos_neg_va


def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.2,
                                                                         random_state=0)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


normalize = True
corpus = load_corpus(get_file_path('cn_corpus'))
lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
mark = load_mark(get_file_path('normalized_onezero_mark'))

# the following can be used to check the common words in corpus and lexicon
# from visualization import show_common_term
# show_common_term(corpus, lexicon)
# exit()

# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1,
                                                                         random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


if __name__ == '__main__':
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = combine_lexicon(get_file_path('lexicon'), get_file_path('neural_cand'))

    # the following can be used to check the common words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print('start.....')
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print('OK')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


if __name__ == '__main__':
    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]

    # the following can be used to check the common words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # valence_mean, valence_true = linear_fusion(corpus, lexicon, mark)
    # print('start.....')
    # cv(valence_mean, valence_true, multivariant=False)
    # print('OK')
import numpy as np


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab

# Note: this is the script that builds the CNN input data for CVAT.
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
__author__ = 'NLP-PC'
# coding: utf-8
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

os.chdir('..')
print(load_lexicon(get_file_path('lexicon')))
words, valence, arousal = load_anew(get_file_path('anew'))
gs = goslate.Goslate()
for tw_text in gs.translate(words, 'zh-tw'):
    print(tw_text)
print(gs.translate('Hi', 'zh-TW'))

# You can get the full list of supported languages through get_languages()
languages = gs.get_languages()
print(languages['zh-TW'])