def run_epoch(self, data_iter, nbatches, epoch, split, optimizer=None, tb_writer=None):
    """
    Args:
        data_iter: Pytorch DataLoader
        nbatches: int (number of batches in data_iter)
        epoch: int
        split: str ('train', 'val')
        optimizer: Wrapped optim (e.g. OptWrapper, NoamOpt)
        tb_writer: Tensorboard SummaryWriter

    Returns:
        Average loss across all items in data_iter
    """
    loss_avg = 0
    n_fwds = 0
    for s_idx, (texts, ratings, metadata) in enumerate(data_iter):
        start = time.time()

        # Add special tokens to texts
        x, lengths, labels = self.dataset.prepare_batch(texts, ratings, doc_append_id=EDOC_ID)
        lm_iter = create_lm_data_iter(x, self.hp.lm_seq_len)
        for b_idx, batch_obj in enumerate(lm_iter):
            if optimizer:
                optimizer.optimizer.zero_grad()

            #
            # Forward pass
            #
            if self.hp.model_type == 'mlstm':
                # Note: create_lm_data_iter produces sequences of length hp.lm_seq_len + 1; batch_obj.trg is
                # all but the last token, while batch_obj.trg_y is all but the first token. They're named as
                # such because the Batch class was originally designed for the Encoder-Decoder version of the
                # Transformer, and the trg variables correspond to inputs to the Decoder.
                batch = move_to_cuda(batch_obj.trg)  # it's trg because it doesn't include the last token
                batch_trg = move_to_cuda(batch_obj.trg_y)
                batch_size, seq_len = batch.size()

                if b_idx == 0:
                    h_init, c_init = self.model.module.rnn.state0(batch_size) if self.ngpus > 1 \
                        else self.model.rnn.state0(batch_size)
                    h_init = move_to_cuda(h_init)
                    c_init = move_to_cuda(c_init)

                # Forward steps for lstm
                result = self.model(batch, h_init, c_init)
                hiddens, cells, outputs = zip(*result) if self.ngpus > 1 else result

                # Calculate loss
                loss = 0
                batch_trg = batch_trg.transpose(0, 1).contiguous()  # [seq_len, batch]
                if self.ngpus > 1:
                    for t in range(len(outputs[0])):
                        # length-ngpus list of outputs at that time step
                        loss += self.loss_fn([outputs[i][t] for i in range(len(outputs))], batch_trg[t])
                else:
                    for t in range(len(outputs)):
                        loss += self.loss_fn(outputs[t], batch_trg[t])
                loss_value = loss.item() / self.hp.lm_seq_len

                # We only do bptt until lm_seq_len. Copy the hidden states so that we can continue the sequence
                if self.ngpus > 1:
                    h_init = torch.cat([copy_state(hiddens[i][-1]) for i in range(self.ngpus)], dim=0)
                    c_init = torch.cat([copy_state(cells[i][-1]) for i in range(self.ngpus)], dim=0)
                else:
                    h_init = copy_state(hiddens[-1])
                    c_init = copy_state(cells[-1])

            elif self.hp.model_type == 'transformer':
                # This is the decoder-only version now
                logits = self.model(move_to_cuda(batch_obj.trg), move_to_cuda(batch_obj.trg_mask))
                # logits: [batch, seq_len, vocab]
                loss = self.loss_fn(logits, move_to_cuda(batch_obj.trg_y))
                loss /= move_to_cuda(batch_obj.ntokens.float())  # normalize by number of non-pad tokens
                loss_value = loss.item()
                if self.ngpus > 1:
                    # With the custom DataParallel, there is no gather() and the loss is calculated per
                    # minibatch split on each GPU (see DataParallelCriterion's forward() -- the return
                    # value is divided by the number of GPUs). We simply undo that operation here.
                    # Also, note that the KLDivLoss in LabelSmoothing is already normalized by both
                    # batch and seq_len: we use size_average=False to prevent any automatic normalization,
                    # followed by a manual normalization using batch.ntokens. This oddity is because
                    # KLDivLoss does not support ignore_index=PAD_ID as CrossEntropyLoss does.
                    loss_value *= len(self.opt.gpus.split(','))

            #
            # Backward pass
            #
            gn = -1.0  # dummy for val (norm can't be < 0 anyway)
            if optimizer:
                loss.backward()
                gn = calc_grad_norm(self.model)  # not actually using this, just for printing
                optimizer.step()

            loss_avg = update_moving_avg(loss_avg, loss_value, n_fwds + 1)
            n_fwds += 1

        # Print
        print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                    'loss={:.4f}, loss_avg_so_far={:.4f}, grad_norm={:.4f}'
        if s_idx % self.opt.print_every_nbatches == 0:
            print(print_str.format(epoch, s_idx, nbatches, split, time.time() - start,
                                   loss_value, loss_avg, gn))
            if tb_writer:
                # Step for tensorboard: global steps in terms of number of reviews
                # This accounts for runs with different batch sizes
                step = (epoch * nbatches * self.hp.batch_size) + (s_idx * self.hp.batch_size)
                tb_writer.add_scalar('stats/loss', loss_value, step)

        # Save periodically so we don't have to wait for the epoch to finish
        save_every = nbatches // 10
        if save_every != 0 and s_idx % save_every == 0:
            save_model(self.save_dir, self.model, self.optimizer, epoch, self.opt, 'intermediate')

    print('Epoch={}, split={} --- loss_avg={:.4f}'.format(epoch, split, loss_avg))

    return loss_avg
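# Both run_epoch() variants track their running loss with update_moving_avg(). A minimal sketch of
# what such a helper might look like is given below; the name and incremental-mean formulation are
# assumptions for illustration, not necessarily the repo's own implementation.
def _update_moving_avg_sketch(avg_so_far, new_value, n):
    """Fold the n-th value into the running mean of the previous n-1 values."""
    # incremental mean: avoids storing every per-batch loss
    return avg_so_far + (new_value - avg_so_far) / n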
def run_epoch(self, data_iter, nbatches, epoch, split,
              optimizer=None, tb_writer=None, save_intermediate=True):
    """
    Args:
        data_iter: iterable providing minibatches
        nbatches: int (number of batches in data_iter)
        epoch: int
        split: str ('train', 'val')
        optimizer: Wrapped optim (e.g. OptWrapper)
        tb_writer: Tensorboard SummaryWriter
        save_intermediate: boolean (save intermediate checkpoints)

    Returns:
        tuple of (loss_avg, acc_avg, rating_diff_avg, per_rating_acc) averaged over data_iter
    """
    loss_avg = 0
    acc_avg = 0
    rating_diff_avg = 0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)

    for s, batch in enumerate(data_iter):
        start = time.time()
        if optimizer:
            optimizer.optimizer.zero_grad()

        texts, ratings, metadata = batch
        batch_size = len(texts)
        x, lengths, labels = self.dataset.prepare_batch(texts, ratings)

        #
        # Forward pass
        #
        logits = self.model(x)
        if self.hp.clf_mse:
            logits = logits.squeeze(1)  # [batch, 1] -> [batch]
            loss = self.loss_fn(logits, labels.float())
        else:
            loss = self.loss_fn(logits, labels)
        loss_value = loss.item()
        acc = calc_clf_acc(logits, labels).item()

        #
        # Backward pass
        #
        gn = -1.0  # dummy for val (norm can't be < 0 anyway)
        if optimizer:
            loss.backward()
            gn = calc_grad_norm(self.model)  # not actually using this, just for printing
            optimizer.step()

        #
        # Print etc.
        #
        loss_avg = update_moving_avg(loss_avg, loss_value, s + 1)
        acc_avg = update_moving_avg(acc_avg, acc, s + 1)
        print_str = 'Epoch={}, batch={}/{}, split={}, time={:.4f} --- ' \
                    'loss={:.4f}, loss_avg_so_far={:.4f}, acc={:.4f}, acc_avg_so_far={:.4f}, grad_norm={:.4f}'

        if self.hp.clf_mse:
            rating_diff = (labels - logits.round().long()).float().mean()
            rating_diff_avg = update_moving_avg(rating_diff_avg, rating_diff, s + 1)
            print_str += ', rating_diff={:.4f}, rating_diff_avg_so_far={:.4f}'.format(rating_diff, rating_diff_avg)
            true_ratings = labels + 1
            pred_ratings = logits.round() + 1
            probs = torch.ones(batch_size)  # dummy
            per_rating_counts, per_rating_acc = calc_per_rating_acc(
                pred_ratings, true_ratings, per_rating_counts, per_rating_acc)
        else:
            true_ratings = labels + 1
            probs, max_idxs = torch.max(F.softmax(logits, dim=1), dim=1)
            pred_ratings = max_idxs + 1
            per_rating_counts, per_rating_acc = calc_per_rating_acc(
                pred_ratings, true_ratings, per_rating_counts, per_rating_acc)

        if s % self.opt.print_every_nbatches == 0:
            print(print_str.format(
                epoch, s, nbatches, split, time.time() - start,
                loss_value, loss_avg, acc, acc_avg, gn))
            print('Review: {}'.format(texts[0]))
            print('True rating: {}'.format(true_ratings[0]))
            print('Predicted rating: {}'.format(pred_ratings[0]))
            print('Predicted rating probability: {:.4f}'.format(probs[0]))
            print('Per rating accuracy: {}'.format(dict(per_rating_acc)))

            if tb_writer:
                # Global steps in terms of number of items
                # This accounts for runs with different batch sizes
                step = (epoch * nbatches * self.hp.batch_size) + (s * self.hp.batch_size)
                tb_writer.add_scalar('loss/batch_loss', loss_value, step)
                tb_writer.add_scalar('loss/avg_loss', loss_avg, step)
                tb_writer.add_scalar('acc/batch_acc', acc, step)
                tb_writer.add_scalar('acc/avg_acc', acc_avg, step)
                if self.hp.clf_mse:
                    tb_writer.add_scalar('rating_diff/batch_diff', rating_diff, step)
                    tb_writer.add_scalar('rating_diff/avg_diff', rating_diff_avg, step)
                tb_writer.add_text('predictions/review', texts[0], step)
                tb_writer.add_text('predictions/true_pred_prob',
                                   'True={}, Pred={}, Prob={:.4f}'.format(
                                       true_ratings[0], pred_ratings[0], probs[0]), step)
                for r, acc in per_rating_acc.items():
                    tb_writer.add_scalar('acc/curavg_per_rating_acc_{}'.format(r), acc, step)

        # Save periodically so we don't have to wait for the epoch to finish
        if save_intermediate:
            save_every = nbatches // 10
            if save_every != 0 and s % save_every == 0:
                model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model
                save_model(self.save_dir, model_to_save, self.optimizer, epoch, self.opt, 'intermediate')

    print_str = 'Epoch={}, split={} --- loss_avg={:.4f}, acc_avg={:.4f}, per_rating_acc={}'.format(
        epoch, split, loss_avg, acc_avg, dict(per_rating_acc))
    if self.hp.clf_mse:
        print_str += ', rating_diff_avg={:.4f}'.format(rating_diff_avg)
    print(print_str)

    return loss_avg, acc_avg, rating_diff_avg, per_rating_acc
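# calc_per_rating_acc() above keeps a per-star-rating accuracy alongside the overall accuracy. A minimal
# sketch of such a helper is shown here; the exact signature and bookkeeping are assumptions made for
# illustration and may differ from the repo's implementation.
def _calc_per_rating_acc_sketch(pred_ratings, true_ratings, per_rating_counts, per_rating_acc):
    """Update running per-rating accuracy with one batch of predictions (hypothetical helper)."""
    for pred, true in zip(pred_ratings.tolist(), true_ratings.tolist()):
        n = per_rating_counts[true] + 1
        correct = 1.0 if pred == true else 0.0
        # incremental mean of the 0/1 correctness signal for this rating class
        per_rating_acc[true] += (correct - per_rating_acc[true]) / n
        per_rating_counts[true] = n
    return per_rating_counts, per_rating_acc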
def train(self): """ Main train loop """ # # Get data, setup # self.dataset = SummDatasetFactory.get(self.opt.dataset, '../datasets/yelp_dataset/') subwordenc = self.dataset.subwordenc train_iter = self.dataset.get_data_loader( split='train', n_docs=self.hp.n_docs, sample_reviews=True, batch_size=self.hp.batch_size, shuffle=True) train_nbatches = train_iter.__len__() val_iter = self.dataset.get_data_loader(split='val', n_docs=self.hp.n_docs, sample_reviews=False, batch_size=self.hp.batch_size, shuffle=False) val_nbatches = val_iter.__len__() tb_path = os.path.join(self.save_dir, 'tensorboard/') print('Tensorboard events will be logged to: {}'.format(tb_path)) os.mkdir(tb_path) os.mkdir(tb_path + 'train/') os.mkdir(tb_path + 'val/') self.tb_tr_writer = SummaryWriter(tb_path + 'train/') self.tb_val_writer = SummaryWriter(tb_path + 'val/') # # Get model and loss # if len(self.opt.load_model) > 0: raise NotImplementedError( 'Need to save run to same directory, handle changes in hp, etc.' ) # checkpoint = torch.load(opt.load_model) # self.model = checkpoint['model'] else: if self.hp.model_type == 'mlstm': embed = nn.Embedding(subwordenc.vocab_size, self.hp.emb_size) lstm = StackedLSTM(mLSTM, self.hp.lstm_layers, self.hp.emb_size, self.hp.hidden_size, subwordenc.vocab_size, self.hp.lstm_dropout, layer_norm=self.hp.lstm_ln) self.model = StackedLSTMEncoder(embed, lstm) self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID) elif self.hp.model_type == 'transformer': self.model = make_model(subwordenc.vocab_size, subwordenc.vocab_size, N=self.hp.tsfr_blocks, d_model=self.hp.hidden_size, d_ff=self.hp.tsfr_ff_size, dropout=self.hp.tsfr_dropout, tie_embs=self.hp.tsfr_tie_embs, decoder_only=True) self.loss_fn = LabelSmoothing( size=subwordenc.vocab_size, smoothing=self.hp.tsfr_label_smooth) if torch.cuda.is_available(): self.model.cuda() self.ngpus = 1 if len(self.opt.gpus) > 1: self.ngpus = len(self.opt.gpus.split(',')) self.model = DataParallelModel(self.model) self.loss_fn = DataParallelCriterion(self.loss_fn) n_params = sum([p.nelement() for p in self.model.parameters()]) print('Number of parameters: {}'.format(n_params)) # # Get optimizer # if self.hp.optim == 'normal': self.optimizer = OptWrapper( self.model, self.hp.lm_clip, optim.Adam(self.model.parameters(), lr=self.hp.lm_lr)) elif self.hp.optim == 'noam': d_model = self.model.module.tgt_embed[0].d_model if self.ngpus > 1 else \ self.model.tgt_embed[0].d_model self.optimizer = NoamOpt( d_model, 2, self.hp.noam_warmup, torch.optim.Adam(self.model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)) # # Train epochs # for e in range(hp.max_nepochs): try: self.model.train() loss_avg = self.run_epoch(train_iter, train_nbatches, e, 'train', optimizer=self.optimizer, tb_writer=self.tb_tr_writer) self.tb_tr_writer.add_scalar('overall_stats/loss_avg', loss_avg, e) except KeyboardInterrupt: print('Exiting from training early') self.model.eval() loss_avg = self.run_epoch(val_iter, val_nbatches, e, 'val', optimizer=None) self.tb_val_writer.add_scalar('overall_stats/loss_avg', loss_avg, e) save_model(self.save_dir, self.model, self.optimizer, e, self.opt, loss_avg)
def train(self): """ Main train loop """ # # Get data, setup # # NOTE: Use n_docs=1 so we can classify one review self.dataset = SummDatasetFactory.get(self.opt.dataset, self.opt.dir_path) train_iter = self.dataset.get_data_loader(split='train', sample_reviews=True, n_docs=1, batch_size=self.hp.batch_size, shuffle=True) val_iter = self.dataset.get_data_loader(split='val', sample_reviews=False, n_docs=1, batch_size=self.hp.batch_size, shuffle=False) self.tb_tr_writer = None self.tb_val_writer = None tb_path = os.path.join(self.save_dir, 'tensorboard/') print('Tensorboard events will be logged to: {}'.format(tb_path)) os.mkdir(tb_path) os.mkdir(tb_path + 'train/') os.mkdir(tb_path + 'val/') self.tb_tr_writer = SummaryWriter(tb_path + 'train/') self.tb_val_writer = SummaryWriter(tb_path + 'val/') # # Get model and loss # if len(self.opt.load_train_model) > 0: raise NotImplementedError('Need to save run to same directory, handle changes in hp, etc.') # checkpoint = torch.load(opt.load_model) # self.model = checkpoint['model'] else: if self.hp.model_type == 'cnn': cnn_output_size = self.hp.cnn_n_feat_maps * len(self.hp.cnn_filter_sizes) self.model = TextClassifier(self.dataset.subwordenc.vocab_size, self.hp.emb_size, self.hp.cnn_filter_sizes, self.hp.cnn_n_feat_maps, self.hp.cnn_dropout, cnn_output_size, self.dataset.n_ratings_labels, onehot_inputs=self.hp.clf_onehot, mse=self.hp.clf_mse) if self.hp.clf_mse: self.loss_fn = nn.MSELoss() else: self.loss_fn = nn.CrossEntropyLoss() if torch.cuda.is_available(): self.model.cuda() if len(self.opt.gpus) > 1: self.model = nn.DataParallel(self.model) n_params = sum([p.nelement() for p in self.model.parameters()]) print('Number of parameters: {}'.format(n_params)) # # Get optimizer # self.optimizer = OptWrapper( self.model, self.hp.clf_clip, optim.Adam(self.model.parameters(), lr=self.hp.clf_lr)) # # Train epochs # for e in range(hp.max_nepochs): try: self.model.train() loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch( train_iter, train_iter.__len__(), e, 'train', optimizer=self.optimizer, tb_writer=self.tb_tr_writer) self.tb_tr_writer.add_scalar('overall/loss', loss_avg, e) self.tb_tr_writer.add_scalar('overall/acc', acc_avg, e) self.tb_tr_writer.add_scalar('overall/rating_diff', rating_diff_avg, e) for r, acc in per_rating_acc.items(): self.tb_tr_writer.add_scalar('overall/per_rating_acc_{}_stars'.format(r), acc, e) except KeyboardInterrupt: print('Exiting from training early') self.model.eval() loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch( val_iter, val_iter.__len__(), e, 'val', optimizer=None) self.tb_val_writer.add_scalar('overall/loss', loss_avg, e) self.tb_val_writer.add_scalar('overall/acc', acc_avg, e) self.tb_val_writer.add_scalar('overall/rating_diff', rating_diff_avg, e) for r, acc in per_rating_acc.items(): self.tb_val_writer.add_scalar('overall/per_rating_acc_{}'.format(r), acc, e) fn_str = 'l{:.4f}_a{:.4f}_d{:.4f}'.format(loss_avg, acc_avg, rating_diff_avg) model_to_save = self.model.module if len(self.opt.gpus) > 1 else self.model save_model(self.save_dir, model_to_save, self.optimizer, e, self.opt, fn_str)