def train_doc2vec():
    # def isEnglish(s):
    #     try:
    #         s.encode('ascii')
    #     except UnicodeEncodeError:
    #         return False
    #     else:
    #         return True
    labeled_data, _ = load_vader('./resource/tweets.txt')
    # for i, d in enumerate(labeled_data):
    #     print(i)
    #     if not isEnglish(d):
    #         print('*' * 111)
    #         print(i, d)
    #         exit()
    unlabeled_data, _ = load_sentiment140('/home/hs/Data/Corpus/training.csv')
    labeled_data = preprocess(labeled_data, replace=True)
    dump_picle(labeled_data, './data/acc/labeled_data.p')
    unlabeled_data = preprocess(unlabeled_data, replace=True)
    dump_picle(unlabeled_data, './data/acc/unlabeled_data.p')
    # labeled_data = load_pickle('./data/acc/labeled_data.p')
    # unlabeled_data = load_pickle('./data/acc/unlabeled_data.p')
    sentence = TaggedLineSentence(labeled_data, unlabeled_data)
    train_docvecs(sentence)
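
# A minimal sketch of the two helpers used above, assuming gensim's Doc2Vec API.
# TaggedLineSentence and train_docvecs are not defined in this excerpt, so the
# names, the 'DOC_%d' tag scheme, and the hyperparameters below are illustrative
# assumptions, not the repository's actual implementation.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def tagged_line_sentence(labeled_data, unlabeled_data):
    # Wrap every preprocessed sentence in a TaggedDocument so Doc2Vec can learn
    # one vector per document (labeled tweets first, then unlabeled Sentiment140).
    docs = list(labeled_data) + list(unlabeled_data)
    return [TaggedDocument(words=d if isinstance(d, list) else d.split(),
                           tags=['DOC_%d' % i]) for i, d in enumerate(docs)]


def train_docvecs_sketch(sentences, vec_dim=300):
    model = Doc2Vec(vector_size=vec_dim, window=10, min_count=1, workers=4, epochs=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    model.save('./data/acc/docvecs.model')  # hypothetical output path
    return model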
def run_build_docvecs():
    model = load_embeddings('twitter')
    simple_evaluate(model)
    _, ratings = load_vader('./resource/tweets.txt')
    # To skip the 1240th and 3516th items:
    # r = ratings[:1240] + ratings[1241:3516] + ratings[3517:]
    build_docvecs(model, ratings)
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=0)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test,
                                       cost_fun='Ridge_Regression')


if __name__ == '__main__':
    from load_data import load_vader

    normalize = True
    corpus, ratings = load_vader(['movie_reviews'])
    corpus = process(corpus)
    # lexicon = load_lexicon(get_file_path('lexicon'))
    from load_data import load_anew
    from file_name import get_file_path
    import numpy as np

    words, valences, _ = load_anew(get_file_path('anew'))
    mark = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]
    # The following can be used to check which words the corpus and the lexicon share:
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
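
# A minimal sketch of the multivariate branch called by cv() above, assuming
# scikit-learn's Ridge regression. The real linear_regression_multivariant is not
# shown in this excerpt, so the function name, alpha value, and reported metrics
# are illustrative assumptions.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


def ridge_regression_sketch(X_train, X_test, Y_train, Y_test, alpha=1.0):
    # Fit an L2-regularized linear model and report test MSE plus the Pearson
    # correlation between predicted and gold valence ratings.
    model = Ridge(alpha=alpha)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(Y_test, predictions)
    r = np.corrcoef(predictions, Y_test)[0, 1]
    print('MSE: %.4f, Pearson r: %.4f' % (mse, r))
    return predictions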
from collections import defaultdict

import numpy as np


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300

############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))
vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
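
# A minimal sketch of build_embedding_matrix as used above, assuming a gensim
# KeyedVectors-style model loaded by load_embeddings('google_news'). The real
# implementation is not shown in this excerpt; reserving row 0 for padding and
# drawing random vectors for out-of-vocabulary words are illustrative assumptions.
import numpy as np


def build_embedding_matrix_sketch(embedding_model, vocab, k=300):
    word_idx_map = dict()
    W = np.zeros((len(vocab) + 1, k), dtype='float32')  # row 0 stays zero for padding
    for idx, word in enumerate(vocab, start=1):
        word_idx_map[word] = idx
        if word in embedding_model:
            W[idx] = embedding_model[word]
        else:
            W[idx] = np.random.uniform(-0.25, 0.25, k)  # random init for OOV words
    return W, word_idx_map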
import logging
import os
import sys

import numpy as np


def process(corpus):
    return [clean_str(sent) for sent in corpus]


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ' '.join(sys.argv))

    from load_data import load_vader

    # corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
    corpus, ratings = load_vader(['news_articles'])
    lexicon_name = get_file_path('anew')
    logger.info(r"loading lexicon from: " + lexicon_name)
    words, valences, _ = load_anew(lexicon_name)
    corpus, ratings = screen_data(corpus, ratings, words)
    ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    print(len(corpus))
    # for i in corpus[:100]:
    #     print(i)
    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]
    mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos = calculate_ratings(
        corpus, ratings, lexicon)
    dump_picle([
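
# A minimal sketch of a lexicon-based rating in the spirit of calculate_ratings
# above, which appears to produce arithmetic and geometric means (plain, tf-weighted,
# and tf-idf-weighted) of ANEW valences per document. Only the unweighted variants
# are sketched here; the function name and the neutral fallback value of 5.0 are
# illustrative assumptions, not the repository's actual implementation.
import numpy as np


def lexicon_ratings_sketch(corpus, lexicon):
    mean_ratings, geo_ratings = [], []
    for doc in corpus:
        tokens = doc if isinstance(doc, list) else doc.split()
        valences = [lexicon[w] for w in tokens if w in lexicon]
        if not valences:
            valences = [5.0]  # neutral midpoint of the 1-9 ANEW valence scale
        mean_ratings.append(float(np.mean(valences)))
        geo_ratings.append(float(np.exp(np.mean(np.log(valences)))))  # geometric mean
    return mean_ratings, geo_ratings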
import numpy as np


def statistic(texts):
    avg_length, vocab = 0, set()
    length_list = []
    for text in texts:
        if type(text) is not list:
            text = text.split()
        length_list.append(len(text))
        vocab = vocab.union(set(text))
        # if len(text) > 200:
        #     print(text)
    avg_length = np.average(np.array(length_list))
    return avg_length, len(vocab)


if __name__ == '__main__':
    # (['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
    tweets, _ = load_vader(['tweets'])
    movie, _ = load_vader(['movie_reviews'])
    amazon, _ = load_vader(['product_reviews'])
    NYT, _ = load_vader(['news_articles'])
    cvat = load_corpus(get_file_path('cn_corpus'))
    print(statistic(tweets))
    print(statistic(movie))
    print(statistic(amazon))
    print(statistic(NYT))
    print(statistic(cvat))
from collections import defaultdict

import numpy as np


def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab


def process(corpus):
    return [clean_str(sent) for sent in corpus]


vec_dim = 300

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))
vocab = get_vocab(corpus)
dump_picle(vocab, './data/corpus/vader/vocab_moview_tweets.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print('Vocabulary size: %s' % str(len(vocab)))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_tweets.p')
print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_tweets.p')