def train_val(self, lstm, dual_target=None):
    """
    Run the training and validation loops over all configured epochs.

    Validation runs every ``config['val_freq']`` epochs (and on the last
    epoch); a checkpoint is saved every ``config['snapshot']`` epochs plus
    once at the end; per-batch statistics are printed and, when
    ``config['log_visdom']`` is set, logged to visdom.

    :param lstm: whether the model is an LSTM (selects step_lstm vs
        step_feedfwd for each batch)
    :param dual_target: unused; kept for backward compatibility — the
        dual-target flag is recomputed per batch from the target's type
    :return: None
    """
    for epoch in range(self.start_epoch, self.config['n_epochs']):
        # VALIDATION
        if self.config['do_val'] and (
                (epoch % self.config['val_freq'] == 0) or
                (epoch == self.config['n_epochs'] - 1)):
            val_batch_time = Logger.AverageMeter()
            val_loss = Logger.AverageMeter()
            if self.extra_criterion:
                val_extra_loss = Logger.AverageMeter()
            self.model.eval()
            end = time.time()
            val_data_time = Logger.AverageMeter()
            for batch_idx, (data, target) in enumerate(self.val_loader):
                val_data_time.update(time.time() - end)
                # NOTE(review): validation reuses self.train_criterion —
                # confirm this is intentional (no val criterion used here)
                kwargs = dict(target=target, criterion=self.train_criterion,
                              optim=self.optimizer, train=False)
                if lstm:
                    loss, output = step_lstm(data, self.model,
                                             self.config['cuda'], **kwargs)
                else:
                    loss, output = step_feedfwd(data, self.model,
                                                self.config['cuda'],
                                                **kwargs)
                val_loss.update(loss)
                val_batch_time.update(time.time() - end)
                if self.extra_criterion:
                    extra_loss = self._apply_extra_criterion(output, target)
                    val_extra_loss.update(extra_loss)
                if batch_idx % self.config['print_freq'] == 0:
                    print_string = 'Val {:s}: Epoch {:d}\t' \
                                   'Batch {:d}/{:d}\t' \
                                   'Data Time {:.4f} ({:.4f})\t' \
                                   'Batch Time {:.4f} ({:.4f})\t' \
                                   'Loss {:f}\t' \
                        .format(self.experiment, epoch, batch_idx,
                                len(self.val_loader) - 1, val_data_time.val,
                                val_data_time.avg, val_batch_time.val,
                                val_batch_time.avg, loss)
                    if self.extra_criterion:
                        # literal restored from the intact train-loop copy
                        # (was garbled across an extraction line break)
                        print_string += 'Loss Extra Scale {:f}\t'.format(
                            extra_loss)
                    print(print_string)
                    if self.config['log_visdom']:
                        self.vis.save(envs=[self.vis_env])
                end = time.time()

            print_string = 'Val {:s}: Epoch {:d}, val_loss {:f}' \
                .format(self.experiment, epoch, val_loss.avg)
            if self.extra_criterion:
                print_string += ' val_extra_loss {:f}\t'.format(
                    val_extra_loss.avg)
            print(print_string)

            if self.config['log_visdom']:
                self.vis.line(X=np.asarray([epoch]),
                              Y=np.asarray([val_loss.avg]),
                              win=self.loss_win, name='val_loss',
                              update='append', env=self.vis_env)
                if self.extra_criterion:
                    self.vis.line(X=np.asarray([epoch]),
                                  Y=np.asarray([val_extra_loss.avg]),
                                  win=self.extra_loss_win,
                                  name='val_extra_loss',
                                  update='append', env=self.vis_env)
                self.vis.save(envs=[self.vis_env])

        # SAVE CHECKPOINT
        if epoch % self.config['snapshot'] == 0:
            self.save_checkpoint(epoch)
            print('Epoch {:d} checkpoint saved for {:s}'.format(
                epoch, self.experiment))

        # ADJUST LR
        lr = self.optimizer.adjust_lr(epoch)
        if self.config['log_visdom']:
            # plotted in log10 so LR decay steps are visible
            self.vis.line(X=np.asarray([epoch]),
                          Y=np.asarray([np.log10(lr)]),
                          win=self.lr_win, name='learning_rate',
                          update='append', env=self.vis_env)

        # TRAIN
        self.model.train()
        train_data_time = Logger.AverageMeter()
        train_batch_time = Logger.AverageMeter()
        end = time.time()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            train_data_time.update(time.time() - end)
            kwargs = dict(target=target, criterion=self.train_criterion,
                          optim=self.optimizer, train=True,
                          max_grad_norm=self.config['max_grad_norm'])
            if lstm:
                loss, output = step_lstm(data, self.model,
                                         self.config['cuda'], **kwargs)
            else:
                loss, output = step_feedfwd(data, self.model,
                                            self.config['cuda'], **kwargs)
            if self.extra_criterion:
                extra_loss = self._apply_extra_criterion(output, target)
            train_batch_time.update(time.time() - end)
            if batch_idx % self.config['print_freq'] == 0:
                n_iter = epoch * len(self.train_loader) + batch_idx
                epoch_count = float(n_iter) / len(self.train_loader)
                print_string = 'Train {:s}: Epoch {:d}\t' \
                               'Batch {:d}/{:d}\t' \
                               'Data Time {:.4f} ({:.4f})\t' \
                               'Batch Time {:.4f} ({:.4f})\t' \
                               'Loss {:f}\t' \
                    .format(self.experiment, epoch, batch_idx,
                            len(self.train_loader) - 1, train_data_time.val,
                            train_data_time.avg, train_batch_time.val,
                            train_batch_time.avg, loss)
                if self.extra_criterion:
                    print_string += 'Loss Extra Scale {:f}\t'.format(
                        extra_loss)
                print_string += 'lr: {:f}'.format(lr)
                print(print_string)
                end = time.time()
                if self.config['log_visdom']:
                    self.vis.line(X=np.asarray([epoch_count]),
                                  Y=np.asarray([loss]),
                                  win=self.loss_win, name='train_loss',
                                  update='append', env=self.vis_env)
                    if self.extra_criterion:
                        self.vis.line(X=np.asarray([epoch_count]),
                                      Y=np.asarray([extra_loss]),
                                      win=self.extra_loss_win,
                                      name='train_extra_loss',
                                      update='append', env=self.vis_env)
                    if self.n_criterion_params:
                        # log each learnable loss-weighting parameter
                        for name, v in \
                                self.train_criterion.named_parameters():
                            self.vis.line(X=np.asarray([epoch_count]),
                                          Y=np.asarray([v.item()]),
                                          win=self.criterion_param_win,
                                          name=name, update='append',
                                          env=self.vis_env)
                    self.vis.save(envs=[self.vis_env])
            end = time.time()

    # Save final checkpoint
    epoch = self.config['n_epochs']
    self.save_checkpoint(epoch)
    print('Epoch {:d} checkpoint saved'.format(epoch))
    if self.config['log_visdom']:
        self.vis.save(envs=[self.vis_env])

def _apply_extra_criterion(self, output, target):
    """
    Evaluate ``self.extra_criterion`` on a model output without gradients.

    Handles both single targets and dual (list/tuple) targets, moving the
    target(s) to the GPU first when ``config['cuda']`` is set. Extracted
    from the duplicated val/train inline code.

    :param output: model output as produced by step_lstm/step_feedfwd
    :param target: one target tensor or a list/tuple of target tensors
    :return: the extra loss as a Python float
    """
    dual_target = isinstance(target, (list, tuple))
    with torch.set_grad_enabled(False):
        if self.config['cuda']:
            # `non_blocking` replaces the pre-0.4 `async` kwarg, which is
            # a SyntaxError on Python >= 3.7
            if dual_target:
                target = tuple(t.cuda(non_blocking=True) for t in target)
            else:
                target = target.cuda(non_blocking=True)
        if dual_target:
            target_var = tuple(Variable(t, requires_grad=False)
                               for t in target)
        else:
            target_var = Variable(target, requires_grad=False)
        extra_loss = self.extra_criterion(output, target_var)
    return extra_loss.item()
def train_val(self, lstm):
    """
    Run the training and validation loops over all configured epochs.

    Validation runs every ``config['val_freq']`` epochs (and on the last
    epoch); a checkpoint is saved every ``config['snapshot']`` epochs plus
    once at the end; per-batch statistics are printed and, when
    ``config['log_visdom']`` is set, logged to visdom.

    :param lstm: whether the model is an LSTM (selects step_lstm vs
        step_feedfwd for each batch)
    :return: None
    """
    for epoch in range(self.start_epoch, self.config['n_epochs']):
        # VALIDATION
        if self.config['do_val'] and (
                (epoch % self.config['val_freq'] == 0) or
                (epoch == self.config['n_epochs'] - 1)):
            val_batch_time = Logger.AverageMeter()
            val_loss = Logger.AverageMeter()
            self.model.eval()
            end = time.time()
            val_data_time = Logger.AverageMeter()
            for batch_idx, (data, target) in enumerate(self.val_loader):
                val_data_time.update(time.time() - end)
                kwargs = dict(target=target, criterion=self.val_criterion,
                              optim=self.optimizer, train=False)
                if lstm:
                    loss, _ = step_lstm(data, self.model,
                                        self.config['cuda'], **kwargs)
                else:
                    loss, _ = step_feedfwd(data, self.model,
                                           self.config['cuda'], **kwargs)
                val_loss.update(loss)
                val_batch_time.update(time.time() - end)
                if batch_idx % self.config['print_freq'] == 0:
                    print('Val {:s}: Epoch {:d}\t'
                          'Batch {:d}/{:d}\t'
                          'Data time {:.4f} ({:.4f})\t'
                          'Batch time {:.4f} ({:.4f})\t'
                          'Loss {:f}'
                          .format(self.experiment, epoch, batch_idx,
                                  len(self.val_loader) - 1,
                                  val_data_time.val, val_data_time.avg,
                                  val_batch_time.val, val_batch_time.avg,
                                  loss))
                    if self.config['log_visdom']:
                        self.vis.save(envs=[self.vis_env])
                end = time.time()

            print('Val {:s}: Epoch {:d}, val_loss {:f}'.format(
                self.experiment, epoch, val_loss.avg))
            if self.config['log_visdom']:
                # vis.line(update='append') replaces the removed
                # Visdom.updateTrace API; matches the sibling train_val
                self.vis.line(X=np.asarray([epoch]),
                              Y=np.asarray([val_loss.avg]),
                              win=self.loss_win, name='val_loss',
                              update='append', env=self.vis_env)
                self.vis.save(envs=[self.vis_env])

        # SAVE CHECKPOINT
        if epoch % self.config['snapshot'] == 0:
            self.save_checkpoint(epoch)
            print('Epoch {:d} checkpoint saved for {:s}'.format(
                epoch, self.experiment))

        # ADJUST LR
        lr = self.optimizer.adjust_lr(epoch)
        if self.config['log_visdom']:
            # plotted in log10 so LR decay steps are visible
            self.vis.line(X=np.asarray([epoch]),
                          Y=np.asarray([np.log10(lr)]),
                          win=self.lr_win, name='learning_rate',
                          update='append', env=self.vis_env)

        # TRAIN
        self.model.train()
        train_data_time = Logger.AverageMeter()
        train_batch_time = Logger.AverageMeter()
        end = time.time()
        for batch_idx, (data, target) in enumerate(self.train_loader):
            train_data_time.update(time.time() - end)
            kwargs = dict(target=target, criterion=self.train_criterion,
                          optim=self.optimizer, train=True,
                          max_grad_norm=self.config['max_grad_norm'])
            if lstm:
                loss, _ = step_lstm(data, self.model,
                                    self.config['cuda'], **kwargs)
            else:
                loss, _ = step_feedfwd(data, self.model,
                                       self.config['cuda'], **kwargs)
            train_batch_time.update(time.time() - end)
            if batch_idx % self.config['print_freq'] == 0:
                n_iter = epoch * len(self.train_loader) + batch_idx
                epoch_count = float(n_iter) / len(self.train_loader)
                print('Train {:s}: Epoch {:d}\t'
                      'Batch {:d}/{:d}\t'
                      'Data Time {:.4f} ({:.4f})\t'
                      'Batch Time {:.4f} ({:.4f})\t'
                      'Loss {:f}\t'
                      'lr: {:f}'.format(self.experiment, epoch, batch_idx,
                                        len(self.train_loader) - 1,
                                        train_data_time.val,
                                        train_data_time.avg,
                                        train_batch_time.val,
                                        train_batch_time.avg, loss, lr))
                if self.config['log_visdom']:
                    self.vis.line(X=np.asarray([epoch_count]),
                                  Y=np.asarray([loss]),
                                  win=self.loss_win, name='train_loss',
                                  update='append', env=self.vis_env)
                    if self.n_criterion_params:
                        # log each learnable loss-weighting parameter;
                        # .item() is robust to 0-dim parameters, unlike
                        # .data.cpu().numpy()[0]
                        for name, v in \
                                self.train_criterion.named_parameters():
                            self.vis.line(X=np.asarray([epoch_count]),
                                          Y=np.asarray([v.item()]),
                                          win=self.criterion_param_win,
                                          name=name, update='append',
                                          env=self.vis_env)
                    self.vis.save(envs=[self.vis_env])
            end = time.time()

    # Save final checkpoint
    epoch = self.config['n_epochs']
    self.save_checkpoint(epoch)
    print('Epoch {:d} checkpoint saved'.format(epoch))
    if self.config['log_visdom']:
        self.vis.save(envs=[self.vis_env])