def evaluate(self, test_data):
    test_loss = 0
    self.nnlm.eval()
    with torch.no_grad():
        for ctx, tgt in batch_iter(test_data, self.args, self.wd_vocab):
            pred, _ = self.nnlm(ctx)
            loss = self._calc_loss(pred, tgt)
            test_loss += loss.data.item()
    print('test data loss: %.3f' % (test_loss / len(test_data)))
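# The _calc_loss helper used above is not shown in this section. A minimal sketch of what
# it might look like for the language model, assuming `pred` holds logits flattened to
# (batch_size * seq_len, vocab_size), `tgt` holds the matching gold word indices, and
# padding uses a dedicated pad index (the helper name and pad_idx below are assumptions):
import torch.nn.functional as F

def calc_lm_loss(pred, tgt, pad_idx=0):
    # Cross-entropy over the vocabulary; padded positions are excluded from the loss.
    return F.cross_entropy(pred, tgt, ignore_index=pad_idx)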
def train(self, train_pairs, enc_optimizer, dec_optimizer, args, src_vocab, tgt_vocab):
    train_loss = 0
    for src_batch, tgt_batch in batch_iter(train_pairs, args, src_vocab, tgt_vocab):
        loss = 0
        # enc_out: (batch_size, seq_len, hidden_size * nb_directions)
        # enc_hidden: (num_layers * nb_directions, batch_size, hidden_size)
        enc_out, enc_hidden = self.encoder(src_batch.src_idxs,
                                           mask=src_batch.non_pad_mask)
        self.encoder.zero_grad()
        self.decoder.zero_grad()
        dec_hidden = enc_hidden
        dec_input = tgt_batch.src_idxs[0].unsqueeze(1)
        if np.random.uniform(0, 1) <= args.teacher_force:
            # Teacher forcing: feed the gold target token as the next decoder input
            for i in range(1, tgt_batch.src_idxs.size(0)):
                dec_out, dec_hidden = self.decoder(dec_input, dec_hidden, enc_out)
                # zero the hidden state of sequences that are already padded at step i
                dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(1).repeat(1, dec_hidden.size(-1))
                loss += self.calc_loss(dec_out, tgt_batch.src_idxs[i])
                dec_input = tgt_batch.src_idxs[i].unsqueeze(1)
        else:
            # No teacher forcing: feed the model's own prediction as the next decoder input
            for i in range(1, tgt_batch.src_idxs.size(0)):
                dec_out, dec_hidden = self.decoder(dec_input, dec_hidden, enc_out)
                dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(1).repeat(1, dec_hidden.size(-1))
                loss += self.calc_loss(dec_out, tgt_batch.src_idxs[i])
                _, top_i = dec_out.data.topk(1)
                dec_input = top_i  # (batch_size, 1)
        # accumulate the summed step losses once per batch
        train_loss += loss.data.item()
        loss.backward()
        nn_utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.encoder.parameters()),
                                 max_norm=5.0)
        nn_utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.decoder.parameters()),
                                 max_norm=5.0)
        enc_optimizer.step()
        dec_optimizer.step()
    return train_loss / len(train_pairs)
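# The in-place `dec_hidden *= tgt_batch.non_pad_mask[i]...` lines above zero the decoder
# state for sequences that have already hit padding at step i. A toy illustration of that
# masking trick with made-up shapes (batch_size=3, hidden_size=4); nothing here comes from
# the actual model:
import torch

hidden = torch.ones(3, 4)             # (batch_size, hidden_size)
non_pad = torch.tensor([1., 1., 0.])  # step-i mask: 1 = real token, 0 = padding
hidden = hidden * non_pad.unsqueeze(1).repeat(1, hidden.size(-1))
print(hidden)                         # the row for the padded sequence is now all zeros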
def validate(self, valid_data):
    val_loss = 0
    self.nnlm.eval()
    with torch.no_grad():
        for ctx, tgt in batch_iter(valid_data, self.args, self.wd_vocab):
            pred, _ = self.nnlm(ctx)
            loss = self._calc_loss(pred, tgt)
            val_loss += loss.data.item()
    val_loss /= len(valid_data)
    return val_loss
def train(self, train_data, dev_data):
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, self._model.parameters()),
                           lr=self._args.lr)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    for ep in range(self._args.epoch):
        self._model.train()
        start = time.time()
        train_loss = 0
        nb_correct, nb_gold, nb_pred = 0, 0, 0
        lr_scheduler.step()
        print(lr_scheduler.get_lr())
        for batch in batch_iter(train_data, self._args.batch_size, self._wd_vocab,
                                self._ch_vocab, device=self._args.device):
            self._model.zero_grad()
            loss = self._model.calc_loss(batch.wd_src, batch.ch_src, batch.tgt)
            # With multi-GPU data parallelism, custom methods are not parallelized (they run
            # on the main GPU), so they have to be called through .module:
            # loss = self._model.module.calc_loss(batch.wd_src, batch.ch_src, batch.tgt)
            train_loss += loss.data.item()
            loss.backward()
            # Gradient clipping - mitigates exploding gradients
            nn.utils.clip_grad_value_(filter(lambda p: p.requires_grad, self._model.parameters()),
                                      clip_value=5.0)  # clip gradient values to [-5, 5]
            optimizer.step()

            pred = self._model(batch.wd_src, batch.ch_src)
            result = self._calc_acc(pred, batch.tgt, batch.mask)
            nb_correct += result[0]
            nb_gold += result[1]
            nb_pred += result[2]
        train_f1 = self._calc_f1(nb_correct, nb_gold, nb_pred)
        print('[Epoch %d] train_loss: %.3f  train_F1: %.3f' % (ep, train_loss, train_f1))
        dev_loss, dev_f1 = self._validate(dev_data)
        end = time.time()
        print('dev_loss: %.3f  dev_F1: %.3f' % (dev_loss, dev_f1))
        print('time cost: %.2f s' % (end - start))
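# _calc_f1 above is not defined in this section. A minimal sketch, assuming it computes
# F1 from the accumulated counts of correct predictions, gold items and predicted items;
# the standalone name and signature below are hypothetical:
def calc_f1(nb_correct, nb_gold, nb_pred, eps=1e-12):
    precision = nb_correct / (nb_pred + eps)
    recall = nb_correct / (nb_gold + eps)
    return 2 * precision * recall / (precision + recall + eps)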
def train_iter(self, train_data, args, vocab, optimizer):
    self.parser_model.train()
    train_loss = 0
    all_arc_acc, all_rel_acc, all_arcs = 0, 0, 0
    start_time = time.time()
    batch_size = args.batch_size // args.update_steps
    nb_batch = int(np.ceil(len(train_data) / batch_size))
    for i, batcher in enumerate(batch_iter(train_data, batch_size, vocab, True)):
        batcher = (x.to(args.device) for x in batcher)
        wd_idx, extwd_idx, tag_idx, true_head_idx, true_rel_idx, non_pad_mask, _ = batcher
        pred_arc_score, pred_rel_score = self.parser_model(wd_idx, extwd_idx, tag_idx, non_pad_mask)
        loss = self.calc_loss(pred_arc_score, pred_rel_score, true_head_idx, true_rel_idx, non_pad_mask)
        if args.update_steps > 1:
            loss = loss / args.update_steps
        loss_val = loss.data.item()
        train_loss += loss_val
        loss.backward()

        arc_acc, rel_acc, total_arcs = self.calc_acc(pred_arc_score, pred_rel_score,
                                                     true_head_idx, true_rel_idx, non_pad_mask)
        all_arc_acc += arc_acc
        all_rel_acc += rel_acc
        all_arcs += total_arcs
        ARC = all_arc_acc * 100. / all_arcs
        REL = all_rel_acc * 100. / all_arcs
        logger.info('Iter%d ARC: %.3f%%, REL: %.3f%%' % (i + 1, ARC, REL))
        logger.info('time cost: %.2fs, train loss: %.2f' % ((time.time() - start_time), loss_val))

        # Gradient accumulation: effectively enlarges the batch size while saving memory
        if (i + 1) % args.update_steps == 0 or (i == nb_batch - 1):
            nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.parser_model.parameters()),
                                     max_norm=5.)
            optimizer.step()
            self.parser_model.zero_grad()

    train_loss /= len(train_data)
    ARC = all_arc_acc * 100. / all_arcs
    REL = all_rel_acc * 100. / all_arcs
    return train_loss, ARC, REL
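# The update_steps logic above is gradient accumulation: gradients from several small
# mini-batches are summed before a single optimizer step, emulating a larger batch without
# the memory cost. A stripped-down sketch of the pattern (model, batches and optimizer are
# stand-ins, not the parser above):
import torch.nn as nn

def accumulate_and_step(model, batches, optimizer, update_steps=4, max_norm=5.0):
    model.zero_grad()
    for i, (x, y) in enumerate(batches):
        loss = nn.functional.cross_entropy(model(x), y) / update_steps  # scale so the sum matches a big batch
        loss.backward()                                                 # grads accumulate across iterations
        if (i + 1) % update_steps == 0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
            optimizer.step()
            model.zero_grad()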
def train(self, train_data, dev_data):
    # Optimizer: updates the model parameters
    optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, self._model.parameters()),
        lr=self._args.learning_rate,           # 1e-2
        weight_decay=self._args.weight_decay)  # 0
    # optimizer = optim.SGD(filter(lambda p: p.requires_grad, self._model.parameters()),
    #                       lr=self._args.learning_rate,  # 1e-2
    #                       momentum=0.9,
    #                       weight_decay=self._args.weight_decay,  # 1e-5
    #                       nesterov=True)
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda ep: 0.95 ** ep)

    # Training loop
    for i in range(self._args.epochs):
        self._model.train()
        start = time.time()
        train_loss = 0
        lr_scheduler.step()
        for batch_data in batch_iter(train_data, self._args.batch_size, self._vocab,
                                     device=self._args.device):
            self._model.zero_grad()
            pred, tgt = self._model(batch_data.wd_src)
            loss = self._calc_loss(pred, tgt)
            train_loss += loss.data.item()
            loss.backward()
            # clip the gradient norm to mitigate exploding gradients
            nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self._model.parameters()),
                                     max_norm=1.0)
            optimizer.step()
        dev_loss = self._validate(dev_data)
        end = time.time()
        print('learning rate:', lr_scheduler.get_lr())
        print('[Epoch %d] train loss: %.3f  dev loss: %.3f' % (i, train_loss, dev_loss))
        print('time cost: %.3f' % (end - start))
def train(self, train_data, valid_data):
    if self.args.optm == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.nnlm.parameters()),
                               lr=self.args.lr)
    else:
        # SGD works better for the n-gram language model!
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.nnlm.parameters()),
                              lr=self.args.lr,
                              momentum=0.9,
                              weight_decay=self.args.weight_decay,
                              nesterov=True)
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda epoch: max(1 - epoch / self.args.epoch, 1e-4))
    # optimizer = nn.DataParallel(optimizer, device_ids=[0, 1])

    for ep in range(self.args.epoch):
        self.nnlm.train()
        start = time.time()
        train_loss = 0
        hidden = None
        lr_scheduler.step()
        for ctx, tgt in batch_iter(train_data, self.args, self.wd_vocab, shuffle=False):
            self.nnlm.zero_grad()
            pred, hidden = self.nnlm(ctx, hidden)
            loss = self._calc_loss(pred, tgt)
            train_loss += loss.data.item()
            # The computation graph is freed after the first backward pass, so it cannot be
            # backpropagated through a second time.
            loss.backward()
            # Detach the carried-over hidden state from that freed graph before the next
            # batch (assuming self.nnlm does not detach it internally).
            if isinstance(hidden, tuple):
                hidden = tuple(h.detach() for h in hidden)
            elif hidden is not None:
                hidden = hidden.detach()
            optimizer.step()
            # optimizer.module.step()  # with the optimizer wrapped in DataParallel, it must be
            #                          # unwrapped back to the plain Adam object to call step
        end = time.time()
        print('[Epoch %d] train loss: %.3f' % (ep + 1, train_loss / len(train_data)))
        val_loss = self.validate(valid_data)
        print('dev loss: %.3f' % val_loss)
        print('lr: ', lr_scheduler.get_lr())
        print('time cost: %.2fs' % (end - start))
def _validate(self, dev_data):
    dev_loss = 0
    nb_correct, nb_gold, nb_pred = 0, 0, 0
    self._model.eval()
    with torch.no_grad():
        for batch in batch_iter(dev_data, self._args.batch_size, self._wd_vocab,
                                self._ch_vocab, device=self._args.device):
            loss = self._model.calc_loss(batch.wd_src, batch.ch_src, batch.tgt)
            dev_loss += loss.data.item()
            pred = self._model(batch.wd_src, batch.ch_src)
            result = self._calc_acc(pred, batch.tgt, batch.mask)
            nb_correct += result[0]
            nb_gold += result[1]
            nb_pred += result[2]
    return dev_loss, self._calc_f1(nb_correct, nb_gold, nb_pred)
def _validate(self, dev_data):
    self._model.eval()
    dev_loss = 0
    cos_sim_lst = []
    with torch.no_grad():  # no gradients are computed or stored, which speeds up inference
        for batch_data in batch_iter(dev_data, self._args.batch_size, self._vocab,
                                     device=self._args.device):
            pred, tgt = self._model(batch_data.wd_src)
            loss = self._calc_loss(pred, tgt)
            dev_loss += loss.data.item()
            _, pred_enc = self._model.encoder(pred, batch_data.non_pad_mask)
            _, tgt_enc = self._model.encoder(tgt, batch_data.non_pad_mask)
            cos_sim_lst.append(self._cosine_sim(pred_enc[0][-1], tgt_enc[0][-1]))
    print('cosine similarity:', sum(cos_sim_lst) / len(cos_sim_lst))
    return dev_loss
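# _cosine_sim above is not shown. A minimal sketch, assuming it averages the cosine
# similarity between the two encoder state tensors over the batch dimension (the
# standalone name and the (batch_size, hidden_size) shapes are assumptions):
import torch.nn.functional as F

def cosine_sim(a, b):
    # a, b: (batch_size, hidden_size); returns the mean cosine similarity as a float
    return F.cosine_similarity(a, b, dim=-1).mean().item()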
def evaluate(self, test_data, args, vocab):
    self.parser_model.eval()
    all_arc_acc, all_rel_acc, all_arcs = 0, 0, 0
    with torch.no_grad():
        for batcher in batch_iter(test_data, args.batch_size, vocab):
            batcher = (x.to(args.device) for x in batcher)
            wd_idx, extwd_idx, tag_idx, true_head_idx, true_rel_idx, non_pad_mask, punc_mask = batcher
            pred_arc_score, pred_rel_score = self.parser_model(wd_idx, extwd_idx, tag_idx, non_pad_mask)
            arc_acc, rel_acc, total_arcs = self.metric_evaluate(pred_arc_score, pred_rel_score,
                                                                true_head_idx, true_rel_idx,
                                                                non_pad_mask, punc_mask)
            all_arc_acc += arc_acc
            all_rel_acc += rel_acc
            all_arcs += total_arcs
    uas = all_arc_acc * 100. / all_arcs
    las = all_rel_acc * 100. / all_arcs
    return uas, las
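# metric_evaluate above returns per-batch counts that feed the UAS/LAS percentages. A
# minimal sketch of how such counts could be computed, assuming pred_heads/pred_rels are
# argmax predictions and token_mask excludes padding and punctuation (all names here are
# hypothetical, not taken from this repo):
def count_arcs(pred_heads, pred_rels, gold_heads, gold_rels, token_mask):
    # token_mask: 1 for tokens that should be scored (non-pad, non-punctuation)
    mask = token_mask.bool()
    arc_correct = ((pred_heads == gold_heads) & mask).sum().item()  # UAS numerator
    rel_correct = ((pred_heads == gold_heads) & (pred_rels == gold_rels) & mask).sum().item()  # LAS numerator
    return arc_correct, rel_correct, mask.sum().item()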
def evaluate(self, test_data):
    test_loss = 0
    nb_correct, nb_gold, nb_pred = 0, 0, 0
    self._model.eval()
    with torch.no_grad():
        for batch in batch_iter(test_data, self._args.batch_size, self._wd_vocab,
                                self._ch_vocab, device=self._args.device):
            loss = self._model.calc_loss(batch.wd_src, batch.ch_src, batch.tgt)
            test_loss += loss.data.item()
            pred = self._model(batch.wd_src, batch.ch_src)
            result = self._calc_acc(pred, batch.tgt, batch.mask)
            nb_correct += result[0]
            nb_gold += result[1]
            nb_pred += result[2]
    test_f1 = self._calc_f1(nb_correct, nb_gold, nb_pred)
    print('======== test_loss: %.3f  test_F1: %.3f ========' % (test_loss, test_f1))
    return test_loss, test_f1
def evaluate(self, test_pairs, args, src_vocab, tgt_vocab):
    self.encoder.eval()
    self.decoder.eval()
    # pred_wds, tgt_wds = [], []
    for src_batch, tgt_batch in batch_iter(test_pairs, args, src_vocab, tgt_vocab):
        # batch_pred_wds, batch_tgt_wds = [], []
        enc_out, enc_hidden = self.encoder(src_batch.src_idxs,
                                           mask=src_batch.non_pad_mask)
        # keep the cumulative (history) scores of the partial hypotheses
        seq_len, batch_size = tgt_batch.src_idxs.size()
        # (bz, beam_size)
        hist_score = torch.zeros((batch_size, args.beam_size), device=args.device)
        # (beam_size, bz, vocab_size)
        beam_score = torch.zeros((args.beam_size, batch_size, tgt_vocab.vocab_size),
                                 device=args.device)
        # (max_len, bz, beam_size)
        best_paths = torch.zeros((MAX_LEN, batch_size, args.beam_size), device=args.device)

        dec_hidden = enc_hidden
        dec_input = tgt_batch.src_idxs[0].unsqueeze(1)
        for i in range(1, min(MAX_LEN, seq_len)):
            if i == 1:
                # dec_input: (bz, 1)
                # dec_out: (bz, vocab_size)
                dec_out, dec_hidden = self.decoder(dec_input, dec_hidden, enc_out)
                dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(1).repeat(1, dec_hidden.size(-1))
                # (bz, beam_size)
                top_prob, top_idxs = dec_out.data.topk(args.beam_size, dim=1)
                hist_score = top_prob
                best_paths[i] = top_idxs
                # (bz, beam_size)
                dec_input = top_idxs
            else:
                # dec_input: (bz, beam_size) -> (beam_size, bz)
                dec_input = dec_input.transpose(0, 1)
                for j in range(args.beam_size):
                    # dec_out: (bz, vocab_size)
                    dec_out, dec_hidden = self.decoder(dec_input[j].unsqueeze(1), dec_hidden, enc_out)
                    dec_hidden *= tgt_batch.non_pad_mask[i].unsqueeze(1).repeat(1, dec_hidden.size(-1))
                    beam_score[j] = dec_out
                # (bz, beam_size, 1) -> (bz, beam_size, vocab_size); the add is out-of-place
                # because expand() returns a view that cannot be written in place
                hist_score = hist_score.unsqueeze(-1).expand((-1, -1, tgt_vocab.vocab_size)) \
                             + beam_score.transpose(0, 1)  # (bz, beam_size, vocab_size)
                # (bz, beam_size * vocab_size)
                hist_score = hist_score.reshape((batch_size, -1))
                # (bz, beam_size)
                top_prob, top_idxs = hist_score.topk(args.beam_size, dim=1)
                hist_score = top_prob
                top_idxs %= tgt_vocab.vocab_size
                best_paths[i] = top_idxs
                dec_input = top_idxs
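# The beam-search loop above scores candidates but does not record which beam each
# surviving candidate came from. In a standard beam search that backpointer is kept at
# every step so the best path can be read back at the end. A schematic sketch of that
# bookkeeping, independent of the decoder above (shapes and names are illustrative only):
import torch

def beam_step(hist_score, step_log_probs, beam_size):
    # hist_score: (batch, beam) cumulative log-probs; step_log_probs: (batch, beam, vocab)
    vocab_size = step_log_probs.size(-1)
    total = hist_score.unsqueeze(-1) + step_log_probs  # (batch, beam, vocab)
    total = total.view(total.size(0), -1)              # (batch, beam * vocab)
    top_score, top_idx = total.topk(beam_size, dim=1)
    prev_beam = top_idx // vocab_size                  # backpointer to the source beam
    next_word = top_idx % vocab_size                   # word chosen on that beam
    return top_score, prev_beam, next_word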