import os

import torch


def GRNN_preprocess(csv_folder, output_folder, save_word2vec_data=True):
    # Read and preprocess train data
    print('Reading and preprocessing train data...\n')
    train_texts, train_labels, train_size = prep_data(csv_folder, 'train')

    # Save raw text data for word2vec training
    if save_word2vec_data:
        torch.save(train_texts,
                   os.path.join(output_folder, 'word2vec_data_grnn.pth.tar'))

    # Read and preprocess val data
    print('Reading and preprocessing val data...\n')
    val_texts, val_labels, val_size = prep_data(csv_folder, 'val')

    # Read and preprocess test data
    print('Reading and preprocessing test data...\n')
    test_texts, test_labels, test_size = prep_data(csv_folder, 'test')

    print('\nTraining word2vec model...')
    train_word2vec_model(data_folder=output_folder, model='grnn')
    print('\nEND TRAINING WORD2VEC MODEL\n')

    # Build word_map and the pretrained embedding matrix from the word2vec model
    embedding, word_map = load_word2vec_embeddings_grnn(output_folder)

    # Encode each split with the word map and save to output_folder
    encode_data('train', train_texts, word_map, train_labels, output_folder)
    encode_data('val', val_texts, word_map, val_labels, output_folder)
    encode_data('test', test_texts, word_map, test_labels, output_folder)

    print('END PREPROCESSING!\n')
    return embedding, word_map, train_size, val_size, test_size
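# Minimal usage sketch for GRNN_preprocess (not part of the original module):
# the folder paths are placeholders, the train/val/test CSV splits are assumed
# to already exist under csv_folder, and word_map is assumed to be a dict
# returned by load_word2vec_embeddings_grnn.
embedding, word_map, train_size, val_size, test_size = GRNN_preprocess(
    csv_folder='./data',
    output_folder='./outdata',
    save_word2vec_data=True)
print('Vocabulary size:', len(word_map))
print('Split sizes:', train_size, val_size, test_size)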
import json
import os

import torch


def HAN_preprocess(csv_folder, output_folder, sentence_limit, word_limit,
                   min_word_count=5, save_word2vec_data=True):
    # Read and preprocess train data
    print('Reading and preprocessing train data...\n')
    train_texts, train_labels, word_counter, n_classes = read_csv(
        csv_folder, 'train', sentence_limit, word_limit)

    # Save raw text data for word2vec training
    if save_word2vec_data:
        torch.save(train_texts,
                   os.path.join(output_folder, 'word2vec_data.pth.tar'))

    # Build word_map (the vocabulary; words rarer than min_word_count are dropped)
    word_map = dict()
    word_map['<pad>'] = 0
    for word, count in word_counter.items():
        if count >= min_word_count:
            word_map[word] = len(word_map)
    word_map['<unk>'] = len(word_map)

    # Save word_map
    with open(os.path.join(output_folder, 'word_map.json'), 'w') as j:
        json.dump(word_map, j)

    split_preprocessing('train', train_texts, train_labels, output_folder,
                        sentence_limit, word_limit, word_map)

    # Read and preprocess val data
    print('Reading and preprocessing val data...\n')
    val_texts, val_labels, _, _ = read_csv(csv_folder, 'val',
                                           sentence_limit, word_limit)
    split_preprocessing('val', val_texts, val_labels, output_folder,
                        sentence_limit, word_limit, word_map)

    # Read and preprocess test data
    print('Reading and preprocessing test data...\n')
    test_texts, test_labels, _, _ = read_csv(csv_folder, 'test',
                                             sentence_limit, word_limit)
    split_preprocessing('test', test_texts, test_labels, output_folder,
                        sentence_limit, word_limit, word_map)

    print('END PREPROCESSING!\n')

    print('\nTraining word2vec model...')
    train_word2vec_model(data_folder=output_folder, model='han')
    print('\nEND TRAINING WORD2VEC MODEL\n')

    return word_map, n_classes
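# Minimal usage sketch for HAN_preprocess (not part of the original module):
# paths are placeholders, and the limits mirror the run scripts further below.
word_map, n_classes = HAN_preprocess(csv_folder='./data',
                                     output_folder='./outdata',
                                     sentence_limit=15,
                                     word_limit=20,
                                     min_word_count=5)
print('Vocabulary size:', len(word_map), '| classes:', n_classes)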
import os

from utils import create_input_files, train_word2vec_model

# `train` and `test` (pandas DataFrames) and `c` (a constants module providing
# c.LABELS) are defined earlier in the original script.
test = test.dropna()

# Keep only the label column and the text body for each split
train_temp = train.loc[:, ['anger', 'body']]
test_temp = test.loc[:, ['anger', 'body']]
train_temp.to_csv('./data/train.csv', index=False, header=False)
test_temp.to_csv('./data/test.csv', index=False, header=False)

create_input_files(csv_folder='./data',
                   output_folder='./outdata',
                   # sentence_limit=15,
                   # word_limit=20,
                   # min_word_count=5)
                   sentence_limit=30,
                   word_limit=100,
                   min_word_count=10)

train_word2vec_model(data_folder='./outdata', algorithm='skipgram')

file1 = open("label.txt", "a")
file2 = open("result.txt", "a")  # create result.txt if it does not exist yet
file2.close()

# For each label: append it to label.txt, then run training and evaluation
for i in c.LABELS:
    print(i)
    file1.write(i)
    file1.write('\n')
    os.system('python3 train.py')
    os.system('python3 eval.py')

file1.close()
from utils import create_input_files, train_word2vec_model

if __name__ == '__main__':
    create_input_files(csv_folder='/users5/yjtian/tyj/demo/Hierarchical-Attention-Network/yahoo_answers_csv',
                       output_folder='/users5/yjtian/tyj/demo/HAN/data',
                       sentence_limit=15,
                       word_limit=20,
                       min_word_count=5)
    train_word2vec_model(data_folder='/users5/yjtian/tyj/demo/HAN/data',
                         algorithm='skipgram')
from utils import create_input_files, train_word2vec_model

if __name__ == '__main__':
    create_input_files(csv_folder='./yahoo_answers_csv',
                       output_folder='/media/ssd/han data',
                       sentence_limit=15,
                       word_limit=20,
                       min_word_count=5)
    train_word2vec_model(data_folder='/media/ssd/han data',
                         algorithm='skipgram')
import sys

from utils import create_input_files_fromdb, train_word2vec_model

if __name__ == '__main__':
    args = sys.argv
    create_input_files_fromdb(output_folder='./data' + args[1],
                              hostname=args[2],
                              database=args[3],
                              sentence_limit=15,
                              word_limit=20,
                              min_word_count=5)
    train_word2vec_model(data_folder='./data' + args[1],
                         algorithm='skipgram')
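# Hypothetical invocation of the script above (the script name and argument
# values are placeholders):
#   python3 run_fromdb.py 1 db.example.com reviews
# reads from the `reviews` database on host `db.example.com` and writes the
# preprocessed files and word2vec model under ./data1.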