def test_train(self):
    """Model training."""
    print('{} test_train {}'.format('-' * 15, '-' * 15))
    # Load the dataset
    test_data_pickle = './data/aclImdb/test_data.pkl'
    test_data = load_serialize_obj(test_data_pickle)
    test_data = test_data[:100]  # The full dataset is large and too slow on CPU, so train on a slice
    test_data_tokenized = get_tokenized_imdb(imdb_data=test_data)
    test_data_vocab = get_tokenized_vocab(test_data_tokenized)
    vocab_size = len(test_data_vocab)
    print('vocab len:{}'.format(vocab_size))  # vocab len:45098
    test_iter = get_imdb_data_iter(test_data,
                                   test_data_vocab,
                                   batch_size=8,
                                   shuffle=True)
    print('test_iter len:{}'.format(len(test_iter)))  # test_iter len:3218
    # Build the model
    net = BiLSTM(vocab_size=vocab_size, labels_size=2)
    print('parameter count:{}'.format(get_parameter_number(
        net)))  # total:436.002 Thousand, trainable:436.002 Thousand
    print(net)
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, net.parameters()))
    loss_func = nn.CrossEntropyLoss()
    # Train
    train_net(net,
              train_iter=test_iter,
              dev_iter=test_iter,
              max_epoch=5,
              optimizer=optimizer,
              loss_func=loss_func)
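
# A minimal sketch of a classifier with the interface BiLSTM is called with above
# (vocab_size, labels_size). The real model's hyperparameters and pooling strategy
# are assumptions here; this only illustrates the bidirectional-LSTM-over-embeddings
# pattern the test relies on.
class SketchBiLSTM(nn.Module):
    def __init__(self, vocab_size, labels_size, embed_size=50, num_hiddens=50):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, num_hiddens,
                               batch_first=True, bidirectional=True)
        # Concatenate the first and last time steps of the bidirectional encoding
        self.decoder = nn.Linear(4 * num_hiddens, labels_size)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)    # (batch, seq_len, embed_size)
        outputs, _ = self.encoder(embeddings)  # (batch, seq_len, 2 * num_hiddens)
        encoding = torch.cat((outputs[:, 0], outputs[:, -1]), dim=-1)
        return self.decoder(encoding)          # (batch, labels_size) logits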
def test_train_use_pretrained_embedding(self):
    """Model training with pretrained embeddings."""
    print('{} test_train_use_pretrained_embedding {}'.format(
        '-' * 15, '-' * 15))
    # Load the dataset
    test_data_pickle = './data/aclImdb/test_data.pkl'
    test_data = load_serialize_obj(test_data_pickle)
    test_data = test_data[:1000]  # The full dataset is large and too slow on CPU, so train on a slice
    test_data_tokenized = get_tokenized_imdb(imdb_data=test_data)
    test_data_vocab = get_tokenized_vocab(test_data_tokenized)
    vocab_size = len(test_data_vocab)
    print('vocab len:{}'.format(vocab_size))  # vocab len:4345
    test_iter = get_imdb_data_iter(test_data,
                                   test_data_vocab,
                                   batch_size=8,
                                   shuffle=True)
    print('test_iter len:{}'.format(len(test_iter)))  # test_iter len:125
    # Build the model
    net = TextCNN(vocab_size=vocab_size, labels_size=2)
    print('parameter count:{}'.format(get_parameter_number(
        net)))  # total:263.152 Thousand, trainable:263.152 Thousand
    # Initialize the embedding layer with pretrained vectors
    glove_embedding = torchtext.vocab.GloVe(name='6B',
                                            dim=50,
                                            cache='./data/torchtext')
    print('glove_embedding contains %d words in total.' %
          len(glove_embedding.stoi))  # contains 400000 words in total.
    words = test_data_vocab.itos
    embed = load_pretrained_embedding(
        words=words,
        pretrained_vocab=glove_embedding)  # There are 1004 oov words.
    net.embedding.weight.data.copy_(embed)
    net.embedding.weight.requires_grad = False  # The weights are loaded pretrained, so they need no updates
    print('parameter count:{}'.format(get_parameter_number(
        net)))  # total:263.152 Thousand, trainable:45.902 Thousand
    print(net)
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, net.parameters()))
    loss_func = nn.CrossEntropyLoss()
    # Train
    train_net(net,
              train_iter=test_iter,
              dev_iter=test_iter,
              max_epoch=2,
              optimizer=optimizer,
              loss_func=loss_func)
    init_file_path('./data/save/text_classify/textcnn/')
    # Save the model
    torch.save(net, f='./data/save/text_classify/textcnn/model.pkl')
    # Save the vocabulary
    save_vocab_words(
        test_data_vocab,
        file_name='./data/save/text_classify/textcnn/vocab_words.txt')
    save_serialize_obj(
        test_data_vocab,
        filename='./data/save/text_classify/textcnn/vocab.pkl')
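
# A hedged sketch of what load_pretrained_embedding plausibly does: copy each
# word's GloVe vector into a fresh weight matrix, leave out-of-vocabulary rows
# at zero, and count them — which would explain the "There are 1004 oov words."
# log above. The real helper may differ in details.
def sketch_load_pretrained_embedding(words, pretrained_vocab):
    """Build an embedding matrix for `words` from a pretrained torchtext vocab."""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1  # word not covered by the pretrained vectors
    print('There are %d oov words.' % oov_count)
    return embed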
def test_tokenized_vocab(self):
    """Build the dataset vocabulary."""
    print('{} test_tokenized_vocab {}'.format('-' * 15, '-' * 15))
    test_data_tokenized_pickle = './data/aclImdb/test_data_tokenized.pkl'
    test_data_tokenized = load_serialize_obj(test_data_tokenized_pickle)
    test_data_vocab = get_tokenized_vocab(test_data_tokenized)
    print('vocab len:{}'.format(len(test_data_vocab)))  # vocab len:45098
    print('overcome id:{}'.format(
        test_data_vocab.stoi.get('overcome', None)))  # overcome id:3753
    print('father id:{}'.format(
        test_data_vocab.stoi.get('father', None)))  # father id:475
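
# A sketch of vocabulary construction in the spirit of get_tokenized_vocab,
# assuming the legacy torchtext Vocab API (the stoi/itos attributes used above);
# min_freq=5 is an illustrative threshold, not necessarily the real one.
import collections

import torchtext.vocab


def sketch_get_tokenized_vocab(tokenized_data, min_freq=5):
    """Count token frequencies and build a Vocab mapping token <-> id."""
    counter = collections.Counter(
        tok for sentence in tokenized_data for tok in sentence)
    return torchtext.vocab.Vocab(counter, min_freq=min_freq)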
def test_get_tokenized_imdb(self):
    """Tokenize the dataset."""
    print('{} test_get_tokenized_imdb {}'.format('-' * 15, '-' * 15))
    test_data_pickle = './data/aclImdb/test_data.pkl'
    test_data_tokenized_pickle = './data/aclImdb/test_data_tokenized.pkl'
    test_data = load_serialize_obj(test_data_pickle)
    test_data_tokenized = get_tokenized_imdb(imdb_data=test_data)
    print('test_data_tokenized len:{}'.format(
        len(test_data_tokenized)))  # test_data_tokenized len:25737
    print('example:{}'.format(test_data_tokenized[:3]))
    # Serialize the tokenized dataset so it can be loaded directly later
    save_serialize_obj(test_data_tokenized, test_data_tokenized_pickle)
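
# A minimal sketch of a tokenizer like get_tokenized_imdb; the (review, label)
# pair structure of imdb_data is an assumption based on how the tests here
# consume it.
def sketch_get_tokenized_imdb(imdb_data):
    """Lowercase each review and split it on whitespace."""
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]

    return [tokenizer(review) for review, _ in imdb_data]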
def test_net_predict(self):
    """Model prediction."""
    print('{} test_net_predict {}'.format('-' * 15, '-' * 15))
    # Load the models
    file_path = './data/save/machine_translation/attention'
    encoder_net = torch.load(os.path.join(file_path, 'encoder_net.pkl'))
    decoder_net = torch.load(os.path.join(file_path, 'decoder_net.pkl'))
    # Load the vocabularies
    in_vocab = load_serialize_obj(os.path.join(file_path, 'in_vocab.pkl'))
    out_vocab = load_serialize_obj(os.path.join(file_path, 'out_vocab.pkl'))
    print('in_vocab len:{}, out_vocab len:{}'.format(
        len(in_vocab), len(out_vocab)))  # in_vocab len:46, out_vocab len:38
    # Predict
    input_seq = 'ils regardent .'
    output_tokens = predict_translate(encoder=encoder_net,
                                      decoder=decoder_net,
                                      input_seq=input_seq,
                                      max_seq_len=7,
                                      in_vocab=in_vocab,
                                      out_vocab=out_vocab)
    print(output_tokens)  # ['they', 'are', 'watching', '.']
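
# A heavily hedged sketch of greedy decoding in the spirit of predict_translate,
# assuming torchtext-style vocabs with <bos>/<eos> specials, an encoder returning
# (outputs, state), and a decoder with a begin_state hook whose forward takes
# (cur_input, state, enc_outputs). The real signatures may well differ.
def sketch_predict_translate(encoder, decoder, input_seq, max_seq_len,
                             in_vocab, out_vocab):
    """Translate input_seq token by token until <eos> or max_seq_len."""
    in_tokens = input_seq.split(' ')
    enc_inputs = torch.tensor([[in_vocab.stoi[tok] for tok in in_tokens]])
    enc_outputs, enc_state = encoder(enc_inputs)
    dec_state = decoder.begin_state(enc_state)
    dec_input = torch.tensor([out_vocab.stoi['<bos>']])
    output_tokens = []
    for _ in range(max_seq_len):
        dec_output, dec_state = decoder(dec_input, dec_state, enc_outputs)
        pred = dec_output.argmax(dim=1)
        pred_token = out_vocab.itos[pred.item()]
        if pred_token == '<eos>':  # stop at the end-of-sequence marker
            break
        output_tokens.append(pred_token)
        dec_input = pred  # feed the prediction back in as the next input
    return output_tokens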
def test_get_imdb_data_iter(self):
    """Preprocess the dataset and build a DataLoader."""
    print('{} test_get_imdb_data_iter {}'.format('-' * 15, '-' * 15))
    test_data_pickle = './data/aclImdb/test_data.pkl'
    test_data = load_serialize_obj(test_data_pickle)
    test_data_tokenized = get_tokenized_imdb(imdb_data=test_data)
    test_data_vocab = get_tokenized_vocab(test_data_tokenized)
    print('vocab len:{}'.format(len(test_data_vocab)))  # vocab len:45098
    test_iter = get_imdb_data_iter(test_data,
                                   test_data_vocab,
                                   batch_size=8,
                                   shuffle=True)
    print('test_iter len:{}'.format(len(test_iter)))  # test_iter len:3218
    for X, y in test_iter:
        print('X', X.shape, 'y', y.shape)  # X torch.Size([8, 500]) y torch.Size([8])
        break
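
# A sketch of the preprocessing get_imdb_data_iter appears to perform, inferred
# from the shapes printed above (every review encoded to exactly 500 ids); the
# pad/unk id of 0 and the helper's exact behavior are assumptions.
import torch.utils.data


def sketch_get_imdb_data_iter(imdb_data, vocab_obj, batch_size, shuffle,
                              max_len=500):
    """Encode reviews as fixed-length id tensors and wrap them in a DataLoader."""
    def pad(ids):
        # Truncate long reviews, pad short ones up to max_len
        return ids[:max_len] + [0] * (max_len - len(ids))

    features = torch.tensor([
        pad([vocab_obj.stoi.get(tok, 0) for tok in review.lower().split(' ')])
        for review, _ in imdb_data
    ])
    labels = torch.tensor([label for _, label in imdb_data])
    dataset = torch.utils.data.TensorDataset(features, labels)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=batch_size,
                                       shuffle=shuffle)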
def test_net_predict(self):
    """Model prediction."""
    print('{} test_net_predict {}'.format('-' * 15, '-' * 15))
    # Load the model
    model_file = './data/save/text_classify/bilstm/model.pkl'
    net = torch.load(model_file)
    print(net)
    # Load the vocabulary
    vocab_file = './data/save/text_classify/bilstm/vocab.pkl'
    vocab_obj = load_serialize_obj(vocab_file)
    print('vocab len:{}'.format(len(vocab_obj)))  # vocab len:624
    # Predict
    label = predict_sentiment(net=net,
                              vocab_obj=vocab_obj,
                              words=['this', 'movie', 'is', 'so', 'good'])
    print(label)
    label = predict_sentiment(
        net=net,
        vocab_obj=vocab_obj,
        words=['terrible', 'movie', 'is', 'so', 'bad'])
    print(label)
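
# A sketch of what predict_sentiment plausibly does: map tokens to ids through
# the vocabulary, run the net, and take the argmax class. The 'positive' and
# 'negative' label strings are illustrative assumptions.
def sketch_predict_sentiment(net, vocab_obj, words):
    """Classify a tokenized sentence and return a human-readable label."""
    net.eval()
    ids = torch.tensor([[vocab_obj.stoi.get(word, 0) for word in words]])
    with torch.no_grad():
        logits = net(ids)  # shape (1, labels_size)
    return 'positive' if logits.argmax(dim=1).item() == 1 else 'negative'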