import codecs
import os

import numpy as np


def load_data(file_dir):
    """Read every file in file_dir, clean the text, and convert it to index data."""
    file_names = os.listdir(file_dir)
    data = []
    length = len(file_names)
    for file_name in file_names:
        # Join all lines of the file into a single space-separated string.
        with codecs.open(os.path.join(file_dir, file_name), 'r', 'utf-8') as f:
            text = ' '.join(f.readlines())
        data.append(clean_str(text))
    # word_idx_map is the module-level mapping built by build_embedding_matrix below.
    idx_data = make_idx_data(data, word_idx_map, max_len=200, kernel_size=5)
    return idx_data, length
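# --- Hedged sketch (not from this repository) -------------------------------
# make_idx_data is imported from elsewhere and not defined in this file. The
# sketch below shows one common way such a helper works, under the assumption
# that it maps whitespace-split tokens to embedding-matrix rows, left-pads with
# kernel_size - 1 zeros so the first word gets a full convolution window, and
# pads/truncates every text to a fixed width. The name make_idx_data_sketch is
# hypothetical.
def make_idx_data_sketch(texts, word_idx_map, max_len=200, kernel_size=5):
    pad = kernel_size - 1
    width = max_len + 2 * pad  # room for the convolution window at both ends
    rows = []
    for text in texts:
        idx = [word_idx_map.get(w, 0) for w in text.split()][:max_len]  # 0 = padding/unknown
        row = [0] * pad + idx
        row += [0] * (width - len(row))  # right-pad to the fixed width
        rows.append(row)
    return np.array(rows, dtype='int32')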
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

# Build the embedding matrix W and the word -> row-index map from the
# traditional-Chinese embeddings (400 dimensions), then cache both to disk.
W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))

# Pair each CVAT text with its gold valence/arousal ratings and cache the
# index-encoded data for training.
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

# Everything below the exit() above is unreachable scratch code, kept for reference.
word_vecs = load_embeddings('zh_tw')
dim = len(word_vecs['我們'])  # '我們' ("we") is a common word; its vector length is the embedding dimension, 400
# vocab added as the second argument for consistency with the build_embedding_matrix call above.
embedding_matrix, idx_map = build_embedding_matrix(word_vecs, vocab, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])
print(len(word_vecs['我們']))
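# --- Hedged sketch (not from this repository) -------------------------------
# build_embedding_matrix is also external. A common pattern, assumed here: row 0
# of W is reserved as the all-zero padding vector, vocabulary words keep their
# pretrained vectors, and out-of-vocabulary words get small random vectors. The
# name build_embedding_matrix_sketch is hypothetical.
def build_embedding_matrix_sketch(word_vecs, vocab, k=300):
    W = np.zeros((len(vocab) + 1, k), dtype='float32')  # row 0 stays all-zero for padding
    word_idx_map = {}
    for i, word in enumerate(vocab, start=1):
        vec = word_vecs.get(word)
        W[i] = vec if vec is not None else np.random.uniform(-0.25, 0.25, k)
        word_idx_map[word] = i
    return W, word_idx_map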
# Shift VADER ratings into a non-negative range by adding 5 to every score.
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))
vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))

# Build and cache the embedding matrix from the 300-dimensional Google News vectors.
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')

idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])  # valid subset names: tweets, movie_reviews, product_reviews, news_articles
# Filter the corpus against the ANEW lexicon words via screen_data, then shift the ratings as above.
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))
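# --- Hedged sketch (not from this repository) -------------------------------
# screen_data is external as well. The sketch below implements one plausible
# reading of screen_data(corpus, ratings, words): drop every sample whose text
# shares no token with the ANEW word list, so each remaining sample has lexicon
# coverage. It assumes each corpus entry is a whitespace-tokenizable string; the
# name screen_data_sketch is hypothetical.
def screen_data_sketch(corpus, ratings, lexicon_words):
    lexicon = set(lexicon_words)
    kept_texts, kept_ratings = [], []
    for text, rating in zip(corpus, ratings):
        if any(w in lexicon for w in text.split()):
            kept_texts.append(text)
            kept_ratings.append(rating)
    return kept_texts, kept_ratings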