def train_lm(self, epoch):
    """Run one epoch of language-model training over ``self.train_iter``.

    Args:
        epoch: zero-based epoch index, used only for log formatting.

    Returns:
        Mean per-batch training loss for the epoch.
    """
    self.model.train()
    epoch_loss = 0
    for batch_idx, feature in enumerate(self.train_iter):
        # FIX: the log line below reads `secs`, but the original never
        # defined it (NameError on the first step). Time each step the
        # same way the sibling train() method does.
        start_time = time.time()
        self.optimizer.zero_grad()
        utils.feature_to_device(feature, self.device)
        out = self.model(feature)
        # Flatten logits to (tokens, vocab) and targets to (tokens,)
        # for the cross-entropy-style loss.
        loss = self.out_loss_fn(out.view(-1, out.shape[-1]),
                                feature.y.view(-1))
        # utils.print_backward_graph(loss)
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(), self.args.clip_grad)
        self.optimizer.step()

        iloss = loss.item()
        epoch_loss += iloss
        secs = time.time() - start_time
        self.logger.info(
            f'Step {batch_idx+1}/{epoch+1:02} | Train Loss: {iloss:.3f} | Train PPL: {math.exp(iloss):7.3f} | Time: {secs:.3f}s\n'
        )
    return epoch_loss / len(self.train_iter)
def train(self, epoch, data_iter=None):
    """Run one epoch of training with optional gradient accumulation.

    Args:
        epoch: zero-based epoch index, used only for log formatting.
        data_iter: iterable of feature batches; defaults to
            ``self.train_iter`` when ``None``.

    Returns:
        Mean per-batch training loss for the epoch (accumulation-scaled).
    """
    self.model.train()
    if data_iter is None:
        data_iter = self.train_iter
    epoch_loss = 0
    for batch_idx, feature in enumerate(data_iter):
        start_time = time.time()
        utils.feature_to_device(feature, self.device)
        # out, out_lm = torch_cp.checkpoint(self.model, feature)
        # Model returns main-task logits and auxiliary LM logits.
        out, out_lm = self.model(feature)
        loss, loss_lm = models.AR.loss(self.args.auxiliary_task,
                                       self.out_loss_fn, out, out_lm,
                                       feature.resp, feature.lm.y)
        # Blend in the auxiliary LM loss only when the auxiliary task
        # is enabled; `alpha` is its weight.
        if self.args.auxiliary_task is not None:
            loss = loss + self.args.alpha * loss_lm
        # Scale the loss so the summed gradients over an accumulation
        # window match a single large-batch step.
        if self.args.gradient_accumulation > 1:
        # accuracy = accuracy / self.args.gradient_accumulation
            loss = loss / self.args.gradient_accumulation
        # utils.self.logger.info_backward_graph(loss)
        loss.backward()
        iloss = loss.item()
        epoch_loss += iloss
        if self.args.clip_grad is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.clip_grad)
        # self.grad_util.collect(self.model)
        # Step/zero only every `gradient_accumulation` batches so grads
        # from the intermediate batches accumulate.
        # NOTE(review): if len(data_iter) is not a multiple of
        # gradient_accumulation, the trailing partial window never calls
        # optimizer.step() and its gradients leak into the next epoch's
        # first window — confirm whether that is intended.
        if (batch_idx + 1) % self.args.gradient_accumulation == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
            if self.args.use_scheduler:
                # Scheduler is stepped on the batch loss — presumably a
                # ReduceLROnPlateau-style scheduler; confirm its type.
                self.scheduler.step(iloss)
        end_time = time.time()
        secs = end_time - start_time
        self.logger.info(
            f'Step {batch_idx+1}/{epoch+1:02} | Train Loss: {iloss:.3f} | Train PPL: {math.exp(iloss):7.3f} | Time: {secs:.3f}s\n'
        )
    return epoch_loss / len(data_iter)
def eval(self, data_iter):
    """Evaluate the model on ``data_iter`` and return the mean batch loss.

    Runs under ``torch.no_grad()`` with the model in eval mode; when the
    auxiliary task is enabled, its weighted LM loss is folded into the
    reported loss exactly as during training.
    """
    self.model.eval()
    total = 0.0
    with torch.no_grad():
        for feature in data_iter:
            utils.feature_to_device(feature, self.device)
            out, out_lm = self.model(feature)
            batch_loss, lm_loss = models.AR.loss(
                self.args.auxiliary_task, self.out_loss_fn,
                out, out_lm, feature.resp, feature.lm.y)
            if self.args.auxiliary_task is not None:
                batch_loss = batch_loss + self.args.alpha * lm_loss
            total += batch_loss.item()
    return total / len(data_iter)
def train(self, epoch, data_iter=None):
    """Run one epoch of training (no gradient accumulation).

    Args:
        epoch: zero-based epoch index, used only for log formatting.
        data_iter: iterable of feature batches; defaults to
            ``self.train_iter`` when ``None``.

    Returns:
        Mean per-batch training loss for the epoch.
    """
    self.model.train()
    if data_iter is None:
        data_iter = self.train_iter
    epoch_loss = 0
    for batch_idx, feature in enumerate(data_iter):
        start_time = time.time()
        self.optimizer.zero_grad()
        utils.feature_to_device(feature, self.device)
        out, out_lm = self.model(feature)
        # FIX: the other train()/eval() in this file call models.AR.loss
        # with auxiliary_task as the first argument; this call omitted it,
        # shifting every argument one position. Aligned with that
        # convention, and the auxiliary loss is now only added when the
        # auxiliary task is enabled.
        loss, loss_lm = models.AR.loss(self.args.auxiliary_task,
                                       self.out_loss_fn, out, out_lm,
                                       feature.resp, feature.lm.y)
        if self.args.auxiliary_task is not None:
            loss = loss + self.args.alpha * loss_lm
        # utils.print_backward_graph(loss)
        loss.backward()
        # Guard against clip_grad being unset, matching the sibling train().
        if self.args.clip_grad is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.clip_grad)
        self.grad_util.collect(self.model)
        self.optimizer.step()

        iloss = loss.item()
        epoch_loss += iloss
        end_time = time.time()
        secs = end_time - start_time
        self.logger.info(
            f'Step {batch_idx+1}/{epoch+1:02} | Train Loss: {iloss:.3f} | Train PPL: {math.exp(iloss):7.3f} | Time: {secs:.3f}s\n'
        )
    return epoch_loss / len(data_iter)
def run(self):
    """Evaluate on ``self.test_iter``: average loss, BLEU, F1, Dist-1/2, PPL.

    Samples a response for every batch, prints per-batch debug output
    (context / LM inputs / predictions vs. targets), and finally prints
    the corpus-level metrics averaged over all test batches.
    """
    self.model.eval()
    total_bleu = 0
    total_f1 = 0
    total_dist1 = 0
    total_dist2 = 0
    total_loss = 0
    print('Run eval...')
    with torch.no_grad():
        for batch_idx, feature in enumerate(self.test_iter):
            utils.feature_to_device(feature, self.device)
            out, out_lm = self.model(feature)
            # Debug spot-check: greedy token at position 3 of the first
            # batch element for both heads — presumably just a sanity
            # print; confirm the indices are intentional.
            print(self.vocab.itos(out[3, 0].argmax(dim=0).item()),
                  self.vocab.itos(out_lm[3, 0].argmax(dim=0).item()))
            # NOTE(review): this call passes out_loss_fn first, while the
            # train()/eval() methods in this file pass auxiliary_task as
            # the first argument to models.AR.loss — verify the signature.
            loss, loss_lm = models.AR.loss(self.out_loss_fn, out, out_lm,
                                           feature.resp, feature.lm.y)
            print(loss, loss_lm)
            loss = loss + self.model_config.alpha * loss_lm
            total_loss += loss.item()
            # target include w1, w2...[EOS], len: max_seq_length + 1
            # Deep-copied because sample_sequence mutates `feature`.
            target = copy.deepcopy(feature.resp[1:])  # feature will be changed
            pred, pred_padded = utils.sample_sequence(
                feature, self.vocab, self.model, self.args)
            # Token-id sequences -> token strings for the metric helpers.
            pred_tokens = [[self.vocab.itos(k) for k in ks] for ks in pred]
            # Extra nesting: one reference list per hypothesis (BLEU format).
            target_tokens = [[[self.vocab.itos(k) for k in ks]]
                             for ks in target.T.tolist()]
            print('----------------------------------')
            print('Context: ',
                  ''.join([self.vocab.itos(k)
                           for k in feature.context.T.tolist()[0]]))
            print('LM x: ',
                  ''.join([self.vocab.itos(k)
                           for k in feature.lm.x.T.tolist()[0]]))
            print('LM y: ',
                  ''.join([self.vocab.itos(k)
                           for k in feature.lm.y.T.tolist()[0]]))
            # First and last examples of the batch, prediction vs. target.
            print('Pred: ',
                  ''.join([self.vocab.itos(k)
                           for k in pred_padded.T.tolist()[0]]))
            print('Target: ', ''.join(target_tokens[0][0]))
            print('Pred: ',
                  ''.join([self.vocab.itos(k)
                           for k in pred_padded.T.tolist()[-1]]))
            print('Target: ', ''.join(target_tokens[-1][0]))
            print('----------------------------------')
            bleu = metrics.bleu_score(pred_tokens, target_tokens)
            f1 = metrics.f1_score(pred_padded.T.to('cpu'), target.T.to('cpu'))
            # dist1 = metrics.distinct_score([v[:-1] for v in pred])
            dist1 = metrics.distinct_score(pred_tokens)
            dist2 = metrics.distinct_score(pred_tokens, 2)
            total_bleu += bleu
            total_f1 += f1
            total_dist1 += dist1
            total_dist2 += dist2
    # Average every metric over the number of test batches.
    l = len(self.test_iter)
    bleu = total_bleu / l
    f1 = total_f1 / l
    dist1 = total_dist1 / l
    dist2 = total_dist2 / l
    # https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch
    # see per-word perplexity:
    # https://github.com/huggingface/transfer-learning-conv-ai/blob/master/convai_evaluation.py#L161
    # https://github.com/facebookresearch/ParlAI/blob/56d46551190a7ffaedccd13534412d43bc7076e5/parlai/scripts/eval_ppl.py
    # Corpus perplexity from the mean combined loss (per-batch, not
    # per-word — see links above).
    ppl = math.exp(total_loss / l)
    print(f'\tBleu: {bleu:.8f} | F1: {f1:.8f} | '
          f'Dist1: {dist1:.3f} | Dist2: {dist2:.3f} | PPL: {ppl:7.3f}')
def inputs_labels_from_batch(self, batch_data):
    """Move a batch onto the GPU and split it into (inputs, labels).

    The full feature object doubles as the model input; labels are the
    response tensor paired with the LM sub-feature.
    """
    # NOTE(review): device is hard-coded to 'cuda' here while other
    # methods use self.device — confirm this helper only runs on GPU.
    utils.feature_to_device(batch_data, 'cuda')
    labels = (batch_data.resp, batch_data.lm)
    return batch_data, labels