def train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, tb_tracker, step_idx, device="cpu"):
    """Perform one synchronous A2C update on a minibatch.

    Combines the policy-gradient, value-regression and entropy-bonus terms
    into a single loss, backpropagates, clips gradients and steps the
    optimizer, then logs all intermediate quantities to ``tb_tracker``.
    Returns the observation tensor fed to the network.
    """
    optimizer.zero_grad()

    # Advantage = empirical return minus the critic's baseline estimate.
    mb_adv = mb_rewards - mb_values

    adv_v = torch.FloatTensor(mb_adv).to(device)
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_t = torch.LongTensor(mb_actions).to(device)

    logits_v, values_v = net(obs_v)
    log_prob_v = F.log_softmax(logits_v, dim=1)

    # Policy-gradient term: advantage-weighted log-prob of the taken actions.
    chosen_log_probs = log_prob_v[range(len(mb_actions)), actions_t]
    loss_policy_v = -(adv_v * chosen_log_probs).mean()

    # Critic regression towards the empirical returns.
    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)

    # Entropy bonus: sum(p * log p) is the *negative* entropy, so adding it
    # scaled by ENTROPY_BETA pushes the policy towards higher entropy.
    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()

    loss_v = ENTROPY_BETA * entropy_loss_v + VALUE_LOSS_COEF * loss_value_v + loss_policy_v
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    # Same tracker calls as before, in the same order.
    for tag, value in (
        ("advantage", mb_adv),
        ("values", values_v),
        ("batch_rewards", rewards_v),
        ("loss_entropy", entropy_loss_v),
        ("loss_policy", loss_policy_v),
        ("loss_value", loss_value_v),
        ("loss_total", loss_v),
    ):
        tb_tracker.track(tag, value, step_idx)

    return obs_v
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
            state = self.state[p]
            # State initialization (first time this parameter is seen)
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p.data)
            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            beta1, beta2 = group['b1'], group['b2']
            state['step'] += 1
            # Per-parameter gradient clipping
            if group['max_grad_norm'] > 0:
                clip_grad_norm_(p, group['max_grad_norm'])
            # Decay the first and second moment running averages.
            # FIX: use the tensor-first overloads (alpha=/value=); the legacy
            # add_(scalar, tensor) / addcmul_(scalar, t, t) forms are
            # deprecated and rejected by recent PyTorch releases.
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            denom = exp_avg_sq.sqrt().add_(group['e'])
            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']
            # Warmup/decay schedule scales the base learning rate by progress.
            schedule_fct = SCHEDULES[group['schedule']]
            lr_scheduled = group['lr'] * schedule_fct(state['step'] / group['t_total'], group['warmup'])
            step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
            p.data.addcdiv_(exp_avg, denom, value=-step_size)
            # Add weight decay at the end (fixed / decoupled version):
            # applied directly to the weights, not through the moments.
            if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                p.data.add_(p.data, alpha=-lr_scheduled * group['l2'])
    return loss
def on_grad_computed(self, _, named_parameters, **kwargs):
    """Clip gradients in place after backprop, if a clip value is configured.

    No-op when ``self.gradient_clip_value`` is ``None``.
    """
    clip_value = self.gradient_clip_value
    if clip_value is not None:
        params = [param for _name, param in named_parameters]
        clip_grad_norm_(
            params,
            max_norm=clip_value,
            norm_type=self.gradient_clip_norm_type,
        )
def train(args, model, train_data_loader, dev_data_loader, accuracy, device):
    """
    Train the current model (course template: the forward/backward pass is
    left for the student to fill in below).

    Keyword arguments:
    args: arguments (expects grad_clipping, checkpoint, save_model)
    model: model to be trained
    train_data_loader: pytorch build-in data loader output for training examples
    dev_data_loader: pytorch build-in data loader output for dev examples
    accuracy: previous best accuracy
    device: cpu or gpu
    """
    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()
    print_loss_total = 0
    epoch_loss_total = 0
    start = time.time()

    #### modify the following code to complete the training funtion
    for idx, batch in enumerate(train_data_loader):
        question_text = batch['text'].to(device)
        question_len = batch['len']
        labels = batch['labels']
        #### Your code here
        # NOTE(review): the student section above must define `loss`
        # (forward pass + criterion + loss.backward()); as written, `loss`
        # below is unbound and `clip_grad_norm_` runs on zero gradients.
        clip_grad_norm_(model.parameters(), args.grad_clipping)
        # .numpy() assumes the loss tensor lives on CPU — TODO confirm when
        # training on GPU (use loss.item() otherwise).
        print_loss_total += loss.data.numpy()
        epoch_loss_total += loss.data.numpy()
        # Periodically report the running loss and evaluate on the dev set.
        if idx % args.checkpoint == 0 and idx > 0:
            print_loss_avg = print_loss_total / args.checkpoint
            print('number of steps: %d, loss: %.5f time: %.5f' % (idx, print_loss_avg, time.time()- start))
            print_loss_total = 0
            curr_accuracy = evaluate(dev_data_loader, model, device)
            # Keep a checkpoint of the best model so far.
            if accuracy < curr_accuracy:
                torch.save(model, args.save_model)
                accuracy = curr_accuracy
    return accuracy
def f():
    """Clip gradients of the net's trainable parameters and report the norm.

    Returns a dict with the (possibly capped) pre-clip gradient norm under
    the key ``'grad_norm'``; warns when the norm reaches ``max_grad``.
    """
    trainable = (p for p in net.parameters() if p.requires_grad)
    total_norm = clip_grad_norm_(trainable, clip_grad).item()
    if max_grad is not None and total_norm >= max_grad:
        print('WARNING: Exploding Gradients {:.2f}'.format(total_norm))
        total_norm = max_grad
    return {'grad_norm': total_norm}
def step(self): """Update the model parameters based on current gradients. Optionally, will employ gradient modification or update learning rate. """ learning_rate = self.learning_rate() if self._with_fp16_wrapper: if hasattr(self._optimizer, "update_master_grads"): self._optimizer.update_master_grads() if hasattr(self._optimizer, "clip_master_grads") and \ self._max_grad_norm > 0: self._optimizer.clip_master_grads(self._max_grad_norm) for group in self._optimizer.param_groups: group['lr'] = learning_rate if not self._with_fp16_wrapper and self._max_grad_norm > 0: clip_grad_norm_(group['params'], self._max_grad_norm) self._optimizer.step() self._decay_step += 1 self._training_step += 1
def f():
    """Log per-child-module gradient norms, clip the global gradient norm,
    and return the collected log as a dict.

    Warns and caps the reported norm when it reaches ``max_grad``.
    """
    log = {}
    # L2 gradient norm of each immediate child module of the agent.
    for child_name, child in agent.named_children():
        sq_sum = sum(p.grad.norm(2) ** 2
                     for p in child.parameters() if p.grad is not None)
        log['grad_norm' + child_name] = (sq_sum ** 0.5).item()
    # Global clip over the trainable parameter list.
    total = clip_grad_norm_(
        [p for p in params if p.requires_grad], clip_grad).item()
    if max_grad is not None and total >= max_grad:
        print('WARNING: Exploding Gradients {:.2f}'.format(total))
        total = max_grad
    log['grad_norm'] = total
    return log
def train(self, model, optimizer, scheduler, data_loader, device, writer, args):
    ''' Train one epoch with gradient accumulation and (optional) DDP. '''
    model.train()
    clip = args.get('grad_clip', 50.0)
    log_interval = args.get('log_interval', 10)
    rank = args.get('rank', 0)
    accum_grad = args.get('accum_grad', 1)
    is_distributed = args.get('is_distributed', True)
    logging.info('using accumulate grad, new batch size is {} times'
                 'larger than before'.format(accum_grad))
    num_seen_utts = 0
    num_total_batch = len(data_loader)
    for batch_idx, batch in enumerate(data_loader):
        key, feats, target, feats_lengths, target_lengths = batch
        feats = feats.to(device)
        target = target.to(device)
        feats_lengths = feats_lengths.to(device)
        target_lengths = target_lengths.to(device)
        num_utts = target_lengths.size(0)
        if num_utts == 0:
            continue
        context = None
        # Disable gradient synchronizations across DDP processes.
        # Within this context, gradients will be accumulated on module
        # variables, which will later be synchronized.
        if is_distributed and batch_idx % accum_grad != 0:
            # DDP provides a context manager that temporarily disables
            # gradient synchronization.
            context = model.no_sync
        # Used for single gpu training and DDP gradient synchronization
        # processes.
        else:
            context = nullcontext
        with context():
            loss, loss_att, loss_ctc = model(feats, feats_lengths, target,
                                             target_lengths)
            # Scale so the accumulated gradient matches a larger batch.
            loss = loss / accum_grad
            loss.backward()
        num_seen_utts += num_utts
        # Only step the optimizer every `accum_grad` batches.
        if batch_idx % accum_grad == 0:
            if rank == 0 and writer is not None:
                writer.add_scalar('train_loss', loss, self.step)
            grad_norm = clip_grad_norm_(model.parameters(), clip)
            # Skip the update entirely when the gradient norm is nan/inf.
            if torch.isfinite(grad_norm):
                optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            self.step += 1
        if batch_idx % log_interval == 0:
            lr = optimizer.param_groups[0]['lr']
            log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format(
                batch_idx, num_total_batch, loss.item() * accum_grad)
            if loss_att is not None:
                log_str += 'loss_att {:.6f} '.format(loss_att.item())
            if loss_ctc is not None:
                log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item())
            log_str += 'lr {:.8f} rank {}'.format(lr, rank)
            logging.debug(log_str)
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()
    for group in self.param_groups:
        # group holds 'params' plus hyperparameters (lr, schedule, warmup, ...)
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError(
                    'Adam does not support sparse gradients, please consider SparseAdam instead'
                )
            state = self.state[p]
            # State initialization (first time this parameter is seen)
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(p.data)
            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            beta1, beta2 = group['b1'], group['b2']
            state['step'] += 1
            # Per-parameter gradient clipping
            if group['max_grad_norm'] > 0:
                clip_grad_norm_(p, group['max_grad_norm'])
            # Decay the first and second moment running averages.
            # FIX: use the tensor-first overloads (alpha=/value=); the legacy
            # add_(scalar, tensor) / addcmul_(scalar, t, t) forms are
            # deprecated and rejected by recent PyTorch releases.
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            denom = exp_avg_sq.sqrt().add_(group['e'])
            bias_correction1 = 1 - beta1**state['step']
            bias_correction2 = 1 - beta2**state['step']
            # Warmup/decay schedule scales the base learning rate by progress.
            schedule_fct = SCHEDULES[group['schedule']]
            lr_scheduled = group['lr'] * schedule_fct(
                state['step'] / group['t_total'], group['warmup'])
            step_size = lr_scheduled * math.sqrt(
                bias_correction2) / bias_correction1
            p.data.addcdiv_(exp_avg, denom, value=-step_size)
            # Add weight decay at the end (fixed / decoupled version):
            # applied directly to the weights, not through the moments.
            if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                p.data.add_(p.data, alpha=-lr_scheduled * group['l2'])
    return loss
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()
    warned_for_t_total = False
    for group in self.param_groups:
        for p in group["params"]:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError(
                    "Adam does not support sparse gradients, please consider SparseAdam instead"
                )
            state = self.state[p]
            # State initialization
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["next_m"] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state["next_v"] = torch.zeros_like(p.data)
            next_m, next_v = state["next_m"], state["next_v"]
            beta1, beta2 = group["b1"], group["b2"]
            # Per-parameter gradient clipping
            if group["max_grad_norm"] > 0:
                clip_grad_norm_(p, group["max_grad_norm"])
            # Decay the first and second moment running average coefficient.
            # FIX: use the tensor-first overloads (alpha=/value=); the legacy
            # add_(scalar, tensor) / addcmul_(scalar, t, t) forms are
            # deprecated and rejected by recent PyTorch releases.
            next_m.mul_(beta1).add_(grad, alpha=1 - beta1)
            next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            update = next_m / (next_v.sqrt() + group["e"])
            # Just adding the square of the weights to the loss function is *not*
            # the correct way of using L2 regularization/weight decay with Adam,
            # since that will interact with the m and v parameters in strange ways.
            #
            # Instead we want to decay the weights in a manner that doesn't
            # interact with the m/v parameters (decoupled weight decay).
            if group["weight_decay"] > 0.0:
                update += group["weight_decay"] * p.data
            if group["t_total"] != -1:
                schedule_fct = SCHEDULES[group["schedule"]]
                progress = state["step"] / group["t_total"]
                lr_scheduled = group["lr"] * schedule_fct(
                    progress, group["warmup"])
                # warning for exceeding t_total (only active with warmup_linear)
                if (group["schedule"] == "warmup_linear" and progress > 1.0
                        and not warned_for_t_total):
                    logger.warning(
                        "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. "
                        "Please set 't_total' of {} correctly.".format(
                            group["schedule"], lr_scheduled,
                            self.__class__.__name__))
                    warned_for_t_total = True
                # end warning
            else:
                lr_scheduled = group["lr"]
            update_with_lr = lr_scheduled * update
            p.data.add_(-update_with_lr)
            state["step"] += 1
            # NOTE: this optimizer deliberately applies no bias correction
            # (unlike standard Adam).
    return loss
def update_model(self, experience: TensorTuple, epsilon: float) -> TensorTuple:
    """Update PPO actor and critic networks.

    experience: rollout tensors (states, actions, rewards, values,
        log_probs, next_state, masks).
    epsilon: PPO clipping range for the surrogate ratio and (optionally)
        the value prediction.
    Returns the mean (actor_loss, critic_loss, total_loss) over all
    minibatch updates.  NOTE(review): these are Python floats, so the
    TensorTuple return annotation looks inaccurate — confirm with callers.
    """
    states, actions, rewards, values, log_probs, next_state, masks = experience
    next_state = numpy2floattensor(next_state, self.device)
    # Bootstrap value for GAE; no gradient needed here.
    with torch.no_grad():
        next_value = self.critic(next_state)
    returns = ppo_utils.compute_gae(
        next_value,
        rewards,
        masks,
        values,
        self.hyper_params.gamma,
        self.hyper_params.tau,
    )
    # Flatten the per-step rollout lists into single tensors; detach
    # everything that serves as a fixed target during the PPO epochs.
    states = torch.cat(states)
    actions = torch.cat(actions)
    returns = torch.cat(returns).detach()
    values = torch.cat(values).detach()
    log_probs = torch.cat(log_probs).detach()
    advantages = (returns - values).detach()
    if self.hyper_params.standardize_advantage:
        # 1e-7 guards against division by a zero std.
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-7)
    actor_losses, critic_losses, total_losses = [], [], []
    # Multiple epochs of minibatch updates over the same rollout.
    for (
            state,
            action,
            old_value,
            old_log_prob,
            return_,
            adv,
            _,
    ) in ppo_utils.ppo_iter(
            self.hyper_params.epoch,
            self.hyper_params.batch_size,
            states,
            actions,
            values,
            log_probs,
            returns,
            advantages,
    ):
        gradient_clip_ac = self.hyper_params.gradient_clip_ac
        gradient_clip_cr = self.hyper_params.gradient_clip_cr
        w_value = self.hyper_params.w_value
        # critic_loss
        value = self.critic(state)
        if self.hyper_params.use_clipped_value_loss:
            # Clip the value update around the old value estimate (PPO2-style).
            value_pred_clipped = old_value + torch.clamp(
                (value - old_value), -epsilon, epsilon)
            value_loss_clipped = (return_ - value_pred_clipped).pow(2)
            value_loss = (return_ - value).pow(2)
            critic_loss = 0.5 * torch.max(value_loss, value_loss_clipped).mean()
        else:
            critic_loss = 0.5 * (return_ - value).pow(2).mean()
        critic_loss_ = w_value * critic_loss
        # train critic
        self.critic_optim.zero_grad()
        critic_loss_.backward()
        clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
        self.critic_optim.step()
        # calculate ratios pi_new / pi_old via log-prob difference
        _, dist = self.actor(state)
        log_prob = dist.log_prob(action)
        ratio = (log_prob - old_log_prob).exp()
        # actor_loss: clipped surrogate objective
        surr_loss = ratio * adv
        clipped_surr_loss = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
        actor_loss = -torch.min(surr_loss, clipped_surr_loss).mean()
        # entropy bonus encourages exploration
        entropy = dist.entropy().mean()
        w_entropy = self.hyper_params.w_entropy
        actor_loss_ = actor_loss - w_entropy * entropy
        # train actor
        self.actor_optim.zero_grad()
        actor_loss_.backward()
        clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
        self.actor_optim.step()
        # total_loss (for reporting only; not backpropagated)
        total_loss = critic_loss_ + actor_loss_
        actor_losses.append(actor_loss.item())
        critic_losses.append(critic_loss.item())
        total_losses.append(total_loss.item())
    actor_loss = sum(actor_losses) / len(actor_losses)
    critic_loss = sum(critic_losses) / len(critic_losses)
    total_loss = sum(total_losses) / len(total_losses)
    return actor_loss, critic_loss, total_loss
def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler,
                    accumulated_iter, optim_cfg, rank, tbar, total_it_each_epoch,
                    dataloader_iter, tb_log=None, leave_pbar=False):
    """Train for one epoch and return the updated accumulated iteration count.

    The data loader iterator is restarted transparently when exhausted so
    that `total_it_each_epoch` iterations always run.
    """
    if total_it_each_epoch == len(train_loader):
        dataloader_iter = iter(train_loader)
    if rank == 0:
        pbar = tqdm.tqdm(total=total_it_each_epoch, leave=leave_pbar,
                         desc='train', dynamic_ncols=True)
    for cur_it in range(total_it_each_epoch):
        try:
            batch = next(dataloader_iter)
        except StopIteration:
            dataloader_iter = iter(train_loader)
            batch = next(dataloader_iter)
            print('new iters')
        lr_scheduler.step(accumulated_iter)
        # FIX: the original bare `except:` silently swallowed every exception
        # (including KeyboardInterrupt). Only an optimizer without an `lr`
        # attribute should fall back to reading the param group.
        try:
            cur_lr = float(optimizer.lr)
        except AttributeError:
            cur_lr = optimizer.param_groups[0]['lr']
        if tb_log is not None:
            tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
        model.train()
        optimizer.zero_grad()
        loss, tb_dict, disp_dict = model_func(model, batch)
        loss.backward()
        clip_grad_norm_(model.parameters(), optim_cfg.GRAD_NORM_CLIP)
        optimizer.step()
        accumulated_iter += 1
        disp_dict.update({'loss': loss.item(), 'lr': cur_lr})
        # log to console and tensorboard
        if rank == 0:
            pbar.update()
            pbar.set_postfix(dict(total_it=accumulated_iter))
            tbar.set_postfix(disp_dict)
            tbar.refresh()
            if tb_log is not None:
                tb_log.add_scalar('train_loss', loss, accumulated_iter)
                tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
                for key, val in tb_dict.items():
                    tb_log.add_scalar('train_' + key, val, accumulated_iter)
    if rank == 0:
        pbar.close()
    return accumulated_iter
def main(args):
    """Entry point: build model/optimizer/dataloader from args, optionally
    resume from the latest checkpoint, then run the training loop.

    NOTE: this is a project template — the model, dataset and loss
    computation are placeholders (marked TODO) that must be filled in.
    """
    run_name = args.run_name
    gpus = args.gpu
    save_step = args.save_step
    epochs = args.epochs
    dataset_dir = args.dataset_dir
    checkpoint_dir = args.checkpoint_path
    grad_clip = args.gradient_clip
    resume = args.resume
    # optimizer related args
    learning_rate = args.learning_rate
    scheduler_step = args.scheduler_step
    scheduler_gamma = args.scheduler_gamma
    scheduler_end = args.scheduler_end
    writer = SummaryWriter(comment='/runs/{}'.format(run_name))
    latest_checkpoint_name = '{}-latest.ckpt'.format(run_name)
    latest_checkpoint_path = path.join(checkpoint_dir, latest_checkpoint_name)

    ##################################
    # -- setup dataloader / variables
    if gpus is not None:
        device = torch.device('cuda:{}'.format(gpus))
    else:
        device = torch.device('cpu')

    # Setup default values
    # TODO: setup model
    model = FooModel().to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=scheduler_step,
                                          gamma=scheduler_gamma)
    total_step = 0
    epoch = 0

    # load from previous checkpoint if exists
    # BUGFIX: the original condition was inverted — it attempted to load
    # the checkpoint exactly when it did NOT exist or resume was disabled,
    # which would crash on a fresh run and skip loading on a real resume.
    if path.exists(latest_checkpoint_path) and resume:
        checkpoint = CheckPoint.load(latest_checkpoint_path, device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        epoch = checkpoint['epoch']
        total_step = checkpoint['total_step']

    #################################
    # -- setup datasets
    # TODO: setup dataset
    dataset = BarDataset()
    dataloader = DataLoader(dataset)

    #####################
    # -- Actual training
    for epoch in range(epochs):
        for i, data in enumerate(dataloader):
            if total_step < scheduler_end:
                scheduler.step()
            # TODO: get loss somehow using model — the original template left
            # this assignment blank; the lines below need a real loss tensor.
            loss = None
            message = '[Training] Step: {:06d}, Loss: {:.04f})'
            logging.info(message.format(total_step, loss.item()))
            # reset optimizer (and clear out gradients to be applied)
            optimizer.zero_grad()
            # compute gradient
            loss.backward()
            # clip gradient if grad_clip is given
            if grad_clip:
                utils.clip_grad_norm_(model.parameters(), grad_clip)
            # update optimizer (and actually apply gradients)
            optimizer.step()
            total_step += 1
            # write to tensorboard
            writer.add_scalar('data/loss', loss, total_step)
            # -- save the run every some time
            if total_step % save_step == 0:
                checkpoint_name = '{}-{}.ckpt'.format(run_name, total_step)
                checkpoint_path = path.join(checkpoint_dir, checkpoint_name)
                CheckPoint.save(checkpoint_path, model, optimizer, scheduler,
                                total_step, epoch)
                CheckPoint.save(latest_checkpoint_path, model, optimizer,
                                scheduler, total_step, epoch)
                # write histogram (optional, NOT recommended)
                for name, param in model.named_parameters():
                    writer.add_histogram(name,
                                         param.clone().cpu().data.numpy(),
                                         total_step)
    writer.close()
def train(self):
    """Full training loop: iterate epochs, validate after each one, and
    checkpoint the best models by train loss, valid loss and valid ROC-AUC.
    """
    start_t = time.time()
    print(f'Training started at {datetime.now()}')
    print(f'Total number of batches: {len(self.data_loader_train)}')
    # Initial "best" sentinels (10 as a large starting loss, 0 for ROC-AUC).
    best_valid_loss, best_train_epoch_loss, best_roc_auc = 10, 10, 0
    best_step_train_loss, best_step_valid_loss, best_step_valid_roc = 0, 0, 0
    drop_counter = 0  # global iteration counter across all epochs
    loss_fn = self.model.loss()
    for epoch in range(self.config.num_epochs):
        epoch_loss = 0
        self.model.train()
        ctr = 0
        for ctr, (audio, target, fname) in enumerate(self.data_loader_train):
            #ctr += 1
            drop_counter += 1
            audio = audio.to(self.device)
            target = target.to(self.device)
            # Time-frequency transform
            if self.transforms is not None:
                audio = self.transforms(audio)
            # predict
            out = self.model(audio)
            loss = loss_fn(out, target)
            # back propagation
            self.optimizer.zero_grad()
            loss.backward()
            if self.config.clip_grad > 0:
                clip_grad_norm_(self.model.parameters(), self.config.clip_grad)
            self.optimizer.step()
            epoch_loss += loss.item()
            # print log every `print_every` iterations
            if (ctr) % self.config.print_every == 0:
                print(
                    "[%s] Epoch [%d/%d] Iter [%d/%d] train loss: %.4f Elapsed: %s"
                    % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                       epoch + 1, self.config.num_epochs, ctr,
                       len(self.data_loader_train), loss.item(),
                       timedelta(seconds=time.time() - start_t)))
                # tensorboard scalars, logged at the same cadence
                if self.writer is not None:
                    step = epoch * len(self.data_loader_train) + ctr
                    self.writer.add_scalar('loss', loss.item(), step)
                    self.writer.add_scalar(
                        'learning_rate',
                        self.optimizer.param_groups[0]['lr'], step)
                    self.writer.add_scalar(
                        'grad_norm',
                        utils.grad_norm(self.model.parameters()), step)
            del audio, target
        epoch_loss = epoch_loss / len(self.data_loader_train)
        # validation
        valid_loss, scores, y_true, y_pred = self._validation(start_t, epoch)
        if self.scheduler is not None:
            # ReduceLROnPlateau needs the metric; other schedulers do not.
            if self.config.scheduler == 'plateau':
                self.scheduler.step(valid_loss)
            else:
                self.scheduler.step()
        # Log validation
        if self.writer is not None:
            step = epoch * len(self.data_loader_train) + ctr
            self.writer.add_scalar('valid_loss', valid_loss, step)
            self.writer.add_scalar('valid_roc_auc_macro',
                                   scores['roc_auc_macro'], step)
            if not self.config.debug_mode:
                self.writer.add_figure(
                    'valid_class',
                    utils.compare_predictions(y_true, y_pred, filepath=None),
                    step)
        # Save model, with respect to validation loss
        if valid_loss < best_valid_loss:
            # print('best model: %4f' % valid_loss)
            best_step_valid_loss = drop_counter
            best_valid_loss = valid_loss
            torch.save(
                self.model.state_dict(),
                os.path.join(self.config.checkpoint_dir,
                             'best_model_valid_loss.pth'))
        # Save model, with respect to validation roc_auc
        if scores['roc_auc_macro'] > best_roc_auc:
            best_step_valid_roc = drop_counter
            best_roc_auc = scores['roc_auc_macro']
            torch.save(
                self.model.state_dict(),
                os.path.join(self.config.checkpoint_dir,
                             'best_model_valid_roc.pth'))
        # Save best model according to training loss
        if epoch_loss < best_train_epoch_loss:
            best_step_train_loss = drop_counter
            best_train_epoch_loss = epoch_loss
            torch.save(
                self.model.state_dict(),
                os.path.join(self.config.checkpoint_dir,
                             'best_model_train.pth'))
    print("{} Training finished. ----------------------- Elapsed: {}".
          format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                 timedelta(seconds=time.time() - start_t)))
    print(
        "Best step (validation loss) = {} . ".format(best_step_valid_loss))
    print("Best step (validation roc_auc) = {} .".format(
        best_step_valid_roc))
    print("Best step (training loss) = {} .".format(best_step_train_loss))
    # Save last model
    torch.save(
        self.model.state_dict(),
        os.path.join(self.config.checkpoint_dir, 'best_model_final.pth'))
def step(self, closure=None):
    """Performs a single RAdam optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            # Per-parameter gradient clipping
            if group['max_grad_norm'] > 0:
                clip_grad_norm_(p, group['max_grad_norm'])
            grad = p.grad.data.float()
            if grad.is_sparse:
                raise RuntimeError(
                    'RAdam does not support sparse gradients')
            p_data_fp32 = p.data.float()
            state = self.state[p]
            if len(state) == 0:
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p_data_fp32)
                state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
            else:
                state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
                    p_data_fp32)
            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            beta1, beta2 = group['betas']
            # FIX: use the tensor-first overloads (alpha=/value=); the legacy
            # addcmul_(scalar, t, t) / add_(scalar, tensor) forms are
            # deprecated and rejected by recent PyTorch releases.
            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
            state['step'] += 1
            # N_sma and step_size depend only on the step count, so cache
            # them in a small rolling buffer shared across parameters.
            buffered = group['buffer'][int(state['step'] % 10)]
            if state['step'] == buffered[0]:
                N_sma, step_size = buffered[1], buffered[2]
            else:
                buffered[0] = state['step']
                beta2_t = beta2**state['step']
                N_sma_max = 2 / (1 - beta2) - 1
                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                buffered[1] = N_sma
                # more conservative since it's an approximated value
                if N_sma >= 5:
                    # Variance-rectified Adam step size.
                    step_size = math.sqrt(
                        (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) *
                        (N_sma - 2) / N_sma * N_sma_max /
                        (N_sma_max - 2)) / (1 - beta1**state['step'])
                elif self.degenerated_to_sgd:
                    # Early steps: fall back to bias-corrected SGD.
                    step_size = 1.0 / (1 - beta1**state['step'])
                else:
                    step_size = -1
                buffered[2] = step_size
            # more conservative since it's an approximated value
            if N_sma >= 5:
                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32,
                                     alpha=-group['weight_decay'] * group['lr'])
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                p_data_fp32.addcdiv_(exp_avg, denom,
                                     value=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)
            elif step_size > 0:
                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32,
                                     alpha=-group['weight_decay'] * group['lr'])
                p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
                p.data.copy_(p_data_fp32)
    return loss
def train(train_loader, model, optimizer, lr_scheduler, tb_writer):
    '''
    Distributed training loop: iterates batches, checkpoints at each epoch
    boundary, and logs losses / timings to tensorboard on rank 0.

    :param train_loader: training data loader
    :param model: (DDP-wrapped) model; underlying module at model.module
    :param optimizer: optimizer (may be rebuilt mid-training)
    :param lr_scheduler: learning-rate scheduler (may be rebuilt mid-training)
    :param tb_writer: tensorboard writer (rank 0 only)
    :return: None
    '''
    cur_lr = lr_scheduler.get_cur_lr()  # get the current learning rate
    rank = get_rank()
    average_meter = AverageMeter()

    def is_valid_number(x):
        # reject nan, inf and absurdly large losses
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    world_size = get_world_size()
    num_per_epoch = len(train_loader.dataset) // \
        cfg.TRAIN.EPOCH // (cfg.TRAIN.BATCH_SIZE * world_size)
    start_epoch = cfg.TRAIN.START_EPOCH
    epoch = start_epoch
    if not os.path.exists(cfg.TRAIN.SNAPSHOT_DIR) and \
            get_rank() == 0:
        os.makedirs(cfg.TRAIN.SNAPSHOT_DIR)
    logger.info("model\n{}".format(describe(model.module)))  # log the model structure
    end = time.time()
    for idx, data in enumerate(train_loader):
        # save a checkpoint once at every epoch boundary
        if epoch != idx // num_per_epoch + start_epoch:
            epoch = idx // num_per_epoch + start_epoch
            if get_rank() == 0:
                torch.save(
                    {
                        'epoch': epoch,
                        'state_dict': model.module.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, cfg.TRAIN.SNAPSHOT_DIR + '/checkpoint_e%d.pth' % (epoch))
            if epoch == cfg.TRAIN.EPOCH:
                return
            # After reaching BACKBONE.TRAIN_EPOCH, start fine-tuning the last
            # backbone layers: rebuild optimizer/scheduler so the trainable
            # parameter set and per-group learning-rate factors are reset.
            if cfg.BACKBONE.TRAIN_EPOCH == epoch:
                logger.info('start training backbone.')
                optimizer, lr_scheduler = build_opt_lr(model.module, epoch)
                logger.info("model\n{}".format(describe(model.module)))
            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch: {}'.format(epoch + 1))
        tb_idx = idx  # tensorboard step index
        if idx % num_per_epoch == 0 and idx != 0:
            # log each param group's learning rate to tensorboard
            # NOTE(review): this inner loop shadows the outer `idx` — verify
            # that is intended before relying on `idx` afterwards.
            for idx, pg in enumerate(
                    optimizer.param_groups):
                logger.info('epoch {} lr {}'.format(epoch + 1, pg['lr']))
                if rank == 0:
                    tb_writer.add_scalar('lr/group{}'.format(idx + 1),
                                         pg['lr'], tb_idx)
        data_time = average_reduce(time.time() - end)
        if rank == 0:
            tb_writer.add_scalar('time/data', data_time, tb_idx)
        outputs = model(data)
        loss = outputs['total_loss']
        # only backprop valid losses (filter out nan, +inf, > 1e4)
        if is_valid_number(loss.data.item()):
            optimizer.zero_grad()
            loss.backward()
            reduce_gradients(model)  # all-reduce gradients across processes
            if rank == 0 and cfg.TRAIN.LOG_GRADS:  # monitor gradient stats
                log_grads(model.module, tb_writer, tb_idx)
            # clip gradient
            clip_grad_norm_(model.parameters(), cfg.TRAIN.GRAD_CLIP)
            optimizer.step()
        batch_time = time.time() - end
        batch_info = {}
        batch_info['batch_time'] = average_reduce(batch_time)
        batch_info['data_time'] = average_reduce(data_time)
        for k, v in sorted(outputs.items()):
            batch_info[k] = average_reduce(v.data.item())
        average_meter.update(**batch_info)
        if rank == 0:
            for k, v in batch_info.items():
                tb_writer.add_scalar(k, v, tb_idx)
            if (idx + 1) % cfg.TRAIN.PRINT_FREQ == 0:
                info = "Epoch: [{}][{}/{}] lr: {:.6f}\n".format(
                    epoch + 1, (idx + 1) % num_per_epoch, num_per_epoch,
                    cur_lr)
                for cc, (k, v) in enumerate(batch_info.items()):
                    if cc % 2 == 0:
                        info += ("\t{:s}\t").format(getattr(average_meter, k))
                    else:
                        info += ("{:s}\n").format(getattr(average_meter, k))
                logger.info(info)
                print_speed(idx + 1 + start_epoch * num_per_epoch,
                            average_meter.batch_time.avg,
                            cfg.TRAIN.EPOCH * num_per_epoch)
        end = time.time()
def run_train(self, train_data, dev_data):
    """RL training loop: per-epoch policy-gradient updates on train_data,
    periodic dev-set evaluation, action-dropout annealing, best-checkpoint
    saving and early stopping.
    """
    self.print_all_model_parameters()
    if self.optim is None:
        self.optim = optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=self.learning_rate)
    # Track dev metrics changes
    best_dev_metrics = 0
    dev_metrics_history = []
    for epoch_id in range(self.start_epoch, self.num_epochs):
        print('Epoch {}'.format(epoch_id))
        if self.rl_variation_tag.startswith('rs'):
            # Reward shaping module sanity check:
            # Make sure the reward shaping module output value is in the
            # correct range
            train_scores = self.test_fn(train_data)
            dev_scores = self.test_fn(dev_data)
            print('Train set average fact score: {}'.format(
                float(train_scores.mean())))
            print('Dev set average fact score: {}'.format(
                float(dev_scores.mean())))
        # Update model parameters
        self.train()
        if self.rl_variation_tag.startswith('rs'):
            # Keep the reward-shaping networks frozen during policy updates.
            self.fn.eval()
            self.fn_kg.eval()
            if self.model.endswith('hypere'):
                self.fn_secondary_kg.eval()
        self.batch_size = self.train_batch_size
        random.shuffle(train_data)
        batch_losses = []
        entropies = []
        if self.run_analysis:
            rewards = None
            fns = None
        for example_id in tqdm(range(0, len(train_data), self.batch_size)):
            self.optim.zero_grad()
            mini_batch = train_data[example_id:example_id + self.batch_size]
            # Drop the final partial batch.
            if len(mini_batch) < self.batch_size:
                continue
            loss = self.loss(mini_batch)
            loss['model_loss'].backward()
            if self.grad_norm > 0:
                clip_grad_norm_(self.parameters(), self.grad_norm)
            self.optim.step()
            batch_losses.append(loss['print_loss'])
            if 'entropy' in loss:
                entropies.append(loss['entropy'])
            if self.run_analysis:
                # Accumulate per-batch rewards / false-negative flags.
                if rewards is None:
                    rewards = loss['reward']
                else:
                    rewards = torch.cat([rewards, loss['reward']])
                if fns is None:
                    fns = loss['fn']
                else:
                    fns = torch.cat([fns, loss['fn']])
        # Check training statistics
        stdout_msg = 'Epoch {}: average training loss = {}'.format(
            epoch_id, np.mean(batch_losses))
        if entropies:
            stdout_msg += ' entropy = {}'.format(np.mean(entropies))
        print(stdout_msg)
        self.save_checkpoint(checkpoint_id=epoch_id, epoch_id=epoch_id)
        if self.run_analysis:
            print('* Analysis: # path types seen = {}'.format(
                self.num_path_types))
            num_hits = float(rewards.sum())
            hit_ratio = num_hits / len(rewards)
            print('* Analysis: # hits = {} ({})'.format(num_hits, hit_ratio))
            num_fns = float(fns.sum())
            fn_ratio = num_fns / len(fns)
            print('* Analysis: false negative ratio = {}'.format(fn_ratio))
        # Check dev set performance
        if self.run_analysis or (epoch_id > 0 and
                                 epoch_id % self.num_peek_epochs == 0):
            self.eval()
            self.batch_size = self.dev_batch_size
            dev_scores = self.forward(dev_data, verbose=False)
            print('Dev set performance: (correct evaluation)')
            _, _, _, _, mrr = src.eval.hits_and_ranks(dev_data,
                                                      dev_scores,
                                                      self.kg.dev_objects,
                                                      verbose=True)
            metrics = mrr
            print('Dev set performance: (include test set labels)')
            src.eval.hits_and_ranks(dev_data,
                                    dev_scores,
                                    self.kg.all_objects,
                                    verbose=True)
            # Action dropout annealing: decay the dropout rate when the dev
            # metric stops improving over the last `eta` evaluations.
            if self.model.startswith('point'):
                eta = self.action_dropout_anneal_interval
                if len(dev_metrics_history) > eta and metrics < min(
                        dev_metrics_history[-eta:]):
                    old_action_dropout_rate = self.action_dropout_rate
                    self.action_dropout_rate *= self.action_dropout_anneal_factor
                    print(
                        'Decreasing action dropout rate: {} -> {}'.format(
                            old_action_dropout_rate,
                            self.action_dropout_rate))
            # Save checkpoint when the dev metric improves
            if metrics > best_dev_metrics:
                self.save_checkpoint(checkpoint_id=epoch_id,
                                     epoch_id=epoch_id,
                                     is_best=True)
                best_dev_metrics = metrics
                with open(
                        os.path.join(self.model_dir,
                                     'best_dev_iteration.dat'),
                        'w') as o_f:
                    o_f.write('{}'.format(epoch_id))
            else:
                # Early stopping: no improvement over the recent window
                if epoch_id >= self.num_wait_epochs and metrics < np.mean(
                        dev_metrics_history[-self.num_wait_epochs:]):
                    break
            dev_metrics_history.append(metrics)
            if self.run_analysis:
                # Persist per-epoch analysis stats ('w' on first epoch,
                # append afterwards).
                num_path_types_file = os.path.join(self.model_dir,
                                                   'num_path_types.dat')
                dev_metrics_file = os.path.join(self.model_dir,
                                                'dev_metrics.dat')
                hit_ratio_file = os.path.join(self.model_dir,
                                              'hit_ratio.dat')
                fn_ratio_file = os.path.join(self.model_dir, 'fn_ratio.dat')
                if epoch_id == 0:
                    with open(num_path_types_file, 'w') as o_f:
                        o_f.write('{}\n'.format(self.num_path_types))
                    with open(dev_metrics_file, 'w') as o_f:
                        o_f.write('{}\n'.format(metrics))
                    with open(hit_ratio_file, 'w') as o_f:
                        o_f.write('{}\n'.format(hit_ratio))
                    with open(fn_ratio_file, 'w') as o_f:
                        o_f.write('{}\n'.format(fn_ratio))
                else:
                    with open(num_path_types_file, 'a') as o_f:
                        o_f.write('{}\n'.format(self.num_path_types))
                    with open(dev_metrics_file, 'a') as o_f:
                        o_f.write('{}\n'.format(metrics))
                    with open(hit_ratio_file, 'a') as o_f:
                        o_f.write('{}\n'.format(hit_ratio))
                    with open(fn_ratio_file, 'a') as o_f:
                        o_f.write('{}\n'.format(fn_ratio))
def train(engine, mini_batch):
    """Run one ignite-style training step with AMP and gradient accumulation.

    Performs a forward/backward pass on ``mini_batch``; the optimizer is only
    stepped every ``engine.config.iteration_per_update`` iterations, with
    gradients zeroed at the start of each accumulation window.

    Returns a dict of scalar stats: per-word loss, perplexity, and parameter /
    gradient norms (NaN/Inf norms reported as 0.).
    """
    # You have to reset the gradients of all model parameters
    # before to take another step in gradient descent.
    engine.model.train()
    # Zero gradients only at the first iteration of each accumulation window,
    # and never before the very first iteration (there is nothing to clear).
    if engine.state.iteration % engine.config.iteration_per_update == 1 or \
        engine.config.iteration_per_update == 1:
        if engine.state.iteration > 1:
            engine.optimizer.zero_grad()

    # Move the batch to whichever device the model parameters live on.
    device = next(engine.model.parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

    # Raw target variable has both BOS and EOS token.
    # The output of sequence-to-sequence does not have BOS token.
    # Thus, remove BOS token for reference.
    x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
    # |x| = (batch_size, length)
    # |y| = (batch_size, length)

    with autocast(not engine.config.off_autocast):
        # Take feed-forward
        # Similar as before, the input of decoder does not have EOS token.
        # Thus, remove EOS token for decoder input.
        y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
        # |y_hat| = (batch_size, length, output_size)
        loss = engine.crit(y_hat.contiguous().view(-1, y_hat.size(-1)),
                           y.contiguous().view(-1))
        # Normalize by batch size and by the accumulation factor so the
        # accumulated gradient matches one large-batch update.
        backward_target = loss.div(y.size(0)).div(
            engine.config.iteration_per_update)

    # With mixed precision active, scale the loss to avoid fp16 underflow.
    if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
        engine.scaler.scale(backward_target).backward()
    else:
        backward_target.backward()

    word_count = int(mini_batch.tgt[1].sum())
    p_norm = float(get_parameter_norm(engine.model.parameters()))
    g_norm = float(get_grad_norm(engine.model.parameters()))

    # Step the optimizer only at the end of an accumulation window.
    if engine.state.iteration % engine.config.iteration_per_update == 0 and \
        engine.state.iteration > 0:
        # In order to avoid gradient exploding, we apply gradient clipping.
        torch_utils.clip_grad_norm_(
            engine.model.parameters(),
            engine.config.max_grad_norm,
        )
        # Take a step of gradient descent.
        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            # Use scaler instead of engine.optimizer.step() if using GPU.
            engine.scaler.step(engine.optimizer)
            engine.scaler.update()
        else:
            engine.optimizer.step()

    # Convert to per-word loss; perplexity is its exponential.
    loss = float(loss / word_count)
    ppl = np.exp(loss)

    return {
        'loss': loss,
        'ppl': ppl,
        # Guard against NaN/Inf norms so downstream logging stays finite.
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
def train(model, train_loader, criterion, scheduler, optimizer, epoch, params, args):
    """Run one training epoch and return the mean loss over its batches.

    Args:
        model: detection model returning (regression, classification, anchors).
        train_loader: iterable of (images, targets) batches.
        criterion: loss returning (cls_loss, reg_loss) tensors.
        scheduler: LR scheduler stepped once per epoch on the mean loss.
        optimizer: optimizer; gradients are accumulated over
            ``args.grad_accum_steps`` batches before each step.
        epoch: current epoch index (forwarded to ``iter_step``).
        params, args: configuration objects (device, mixup, clipping, ...).

    Returns:
        float: mean of the per-batch losses seen this epoch (0.0 if every
        batch was skipped for a zero/non-finite loss).
    """
    start = time.time()
    total_loss = []
    # Safe value for scheduler.step() even if every batch is skipped below.
    mean_loss = 0.0
    model.train()
    model.is_training = True
    model.freeze_bn()  # keep BatchNorm statistics frozen during training
    pbar = tqdm(train_loader, desc='==> Train', position=1)
    idx = 0
    for (images, targets) in pbar:
        images = images.to(args.device).float()
        targets = targets.to(args.device)
        if args.mixup:
            images, targets_a, targets_b, lam = mixup_data(
                images, targets, args.alpha, use_cuda=args.is_cuda)
        regression, classification, anchors = model(images)
        if args.mixup:
            cls_loss, reg_loss = mixup_criterion(images, regression,
                                                 classification, anchors,
                                                 targets_a, targets_b, lam)
        else:
            cls_loss, reg_loss = criterion(classification, regression,
                                           anchors, targets)
        cls_loss = cls_loss.mean()
        reg_loss = reg_loss.mean()
        loss = cls_loss + reg_loss
        # Skip degenerate batches (no annotations, or numeric blow-up).
        if loss == 0 or not torch.isfinite(loss):
            print('loss equal zero(0)')
            continue
        loss.backward()
        total_loss.append(loss.item())
        mean_loss = np.mean(total_loss)
        if (idx + 1) % args.grad_accum_steps == 0:
            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            # BUGFIX: step() must run BEFORE zero_grad(). The previous order
            # (zero_grad -> step) erased the freshly accumulated gradients,
            # so the optimizer always stepped on all-zero gradients.
            optimizer.step()
            optimizer.zero_grad()
            iter_step(epoch, mean_loss, cls_loss, reg_loss, optimizer,
                      params, args)
        idx += 1
        pbar.update()
        pbar.set_postfix({
            'Cls_loss': cls_loss.item(),
            'Reg_loss': reg_loss.item(),
            'Mean_loss': mean_loss,
        })
    # end of training epoch
    scheduler.step(mean_loss)
    return mean_loss
# NOTE(review): fragment of an A2C update step — relies on adv_v, log_prob_v,
# logits_v, loss_value_v, actions_t, net, optimizer and tb_tracker being
# defined earlier in the enclosing function (not visible here).
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()
prob_v = F.softmax(logits_v, dim=1)
# Scaled negative entropy: minimizing this term maximizes policy entropy.
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

# calculate policy gradients only
loss_policy_v.backward(retain_graph=True)
# Snapshot the policy-only gradients for diagnostics before the second
# backward pass adds the entropy/value gradients on top of them.
grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                        for p in net.parameters()
                        if p.grad is not None])

# apply entropy and value gradients
loss_v = entropy_loss_v + loss_value_v
loss_v.backward()
nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
optimizer.step()
# get full loss
loss_v += loss_policy_v

tb_tracker.track("advantage", adv_v, step_idx)
tb_tracker.track("values", value_v, step_idx)
tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
tb_tracker.track("loss_policy", loss_policy_v, step_idx)
tb_tracker.track("loss_value", loss_value_v, step_idx)
tb_tracker.track("loss_total", loss_v, step_idx)
# RMS / max / variance of the policy-only gradient vector.
tb_tracker.track("grad_l2", np.sqrt(np.mean(np.square(grads))), step_idx)
tb_tracker.track("grad_max", np.max(np.abs(grads)), step_idx)
tb_tracker.track("grad_var", np.var(grads), step_idx)
def main(args):
    """Train/evaluate a question-asking policy with REINFORCE-style updates.

    Builds the asking agent and user simulator, then for each episode rolls
    out question-asking steps, computes rewards, and updates the policy net
    (and, optionally, fine-tunes the agent's tag/embedding/RNN parameters).
    Progress is logged to TensorBoard and checkpoints are saved periodically.

    args: dict-like configuration (user_type, lr, episodes, ft_* flags, ...).
    """
    aa = AskingAgent(args)
    if args['user_type'] == 'oracle':
        user = NoisyUser(args)
    elif args['user_type'] == 'persona':
        user = PersonaUser(aa, args)
    else:
        print('no user type implemented')
    device = torch.device('cuda') if args['cuda'] else torch.device('cpu')
    print(device)
    writer = SummaryWriter(
        os.path.join(args['tensorboard_dir'],
                     args['comment'] + '_' + args['flavor']))
    writer.add_text('Args', args['comment'] + ' ' + str(args) + '\n')
    save_path = args['checkpoint_path']
    #==========loading data =============
    policynet = Policy(args)
    print('policy network model: ')
    print(policynet.model)
    writer.add_text('model', str(policynet.model))
    optimizer = optim.Adam(policynet.model.parameters(), lr=args['lr'])
    # Collect the agent parameters that should be fine-tuned alongside
    # the policy, depending on the ft_* flags.
    ftparams = []
    if args['ft_tag']:
        ftparams += [aa.tagweight, aa.tagbias, aa.lmda]
    if args['ft_emb']:
        if args['ft_rnn']:
            # Fine-tune the whole encoder; refresh its dropout rates first.
            for m in aa.model.modules():
                if isinstance(m, nn.Dropout):
                    m.p = args['dropout']
                if isinstance(m, SRU):
                    m.dropout = args['dropout']
            ftparams += get_params(aa.model)
        else:
            ftparams += [aa.embedweight]
    if args['ft_emb'] or args['ft_tag']:
        print('Finetuning turned on ')
        nnoptimizer = optim.Adam(ftparams, lr=args['ft_lr'])
    else:
        nnoptimizer = None
    for episode in range(1, args['episodes']):
        # Pick the data split / train-eval mode for this episode.
        if episode % (args['test_every']) == 0:
            batch = aa.testdata()
            mode = 'test'
            policynet.model.eval()
            aa.model.eval()
        elif episode % args['eval_every'] == 0:
            batch = aa.valdata()
            mode = 'val'
            policynet.model.eval()
            aa.model.eval()
        else:
            batch = aa.sampletrain(args['batch_size'])
            mode = 'train'
            policynet.model.train()
            aa.model.train()
        batch_s = len(batch[0])
        rank_batch, p_fx_batch, _ = infogain_rollout(batch, aa, user, args,
                                                     mode)
        # Roll out the policy step-by-step, recording actions and log-probs.
        action_batch = []
        logp_batch = []
        for cnt in range(1, len(p_fx_batch) + 1):
            p_f_x = p_fx_batch[cnt - 1]
            # Without fine-tuning, the belief tensor needs no gradient.
            if not args['ft_tag'] and not args['ft_emb']:
                p_f_x = p_f_x.detach()
            if cnt == args['max_step']:
                # Forced stop at the step budget: action 0 with zero log-prob.
                action = np.zeros(batch_s)
                log_pact = torch.zeros(batch_s).to(device)
            else:
                state = policynet.get_state(p_f_x, cnt)
                action, log_pact, _ = policynet.select_action(state)
            action_batch.append(action)
            logp_batch.append(log_pact)
        rewards, logp_bs, scalars = reprocess_withmask(action_batch,
                                                       rank_batch, logp_batch,
                                                       device, args)
        if mode == 'train':
            if nnoptimizer:
                nnoptimizer.zero_grad()
            scalars = policynet.update_policy(optimizer, rewards, logp_bs,
                                              scalars)
            if nnoptimizer:
                print('fintuning')
                clip_grad_norm_(
                    [p for p in aa.model.parameters() if p.requires_grad],
                    3.0)
                nnoptimizer.step()
                if args['ft_tag']:
                    aa.tag_inference()
                    #print('w: {:.3f}, b: {:.3f}, lmd: {:.3f}'.format(aa.tagweight.item(), aa.tagbias.item(), aa.lmda.item()))
                    #writer.add_scalar('tagmodel/weight', aa.tagweight.item(), episode) #*args['batch_size'])
                    #writer.add_scalar('tagmodel/bias', aa.tagbias.item(), episode) #*args['batch_size'])
                    writer.add_scalar('tagmodel/lmda', aa.lmda.item(),
                                      episode)  #*args['batch_size'])
                    writer.add_scalar('tagmodel/weight',
                                      aa.tagweight.data.norm(),
                                      episode)  #*args['batch_size'])
                    writer.add_scalar('tagmodel/bias',
                                      aa.tagbias.data.norm(),
                                      episode)  #*args['batch_size'])
                if args['ft_emb']:
                    writer.add_scalar('tagmodel/embweight',
                                      aa.embedweight.data.norm(),
                                      episode)  #*args['batch_size'])
                    if args['ft_rnn']:
                        writer.add_scalar('rnn-parameter/rnn_param_norm',
                                          compute_param_norm(aa.model),
                                          episode)
                        writer.add_scalar('rnn-parameter/rnn_grad_norm',
                                          compute_grad_norm(aa.model),
                                          episode)
        if writer is not None:
            for name, value in scalars:
                writer.add_scalar(mode + name, value,
                                  episode)  #*args['batch_size'])
        if episode % args['print_every'] == 0:
            print(mode)
            print('Step: {:,} '.format(episode * args['batch_size']) +
                  ' '.join(['{} = {:.3f}'.format(name, value)
                            for name, value in scalars]))
        if episode % args['save_every'] == 0:
            torch.save(
                aa.state_dict(),
                args['checkpoint_dir'] + '/' + args['flavor'] + '_aa.pt')
            save_path = save_checkpoint(policynet.model, optimizer, episode,
                                        episode * args['batch_size'],
                                        dict(scalars)['/suc_rate'], args,
                                        prev_save_path=save_path)
torch.zeros(num_layers, batch_size, hidden_size).to(device)) for i in range(0, ids.size(1) - seq_length, seq_length): # Get mini-batch inputs and targets inputs = ids[:, i:i+seq_length].to(device) targets = ids[:, (i+1):(i+1)+seq_length].to(device) # Forward pass states = detach(states) outputs, states = model(inputs, states) loss = criterion(outputs, targets.reshape(-1)) # Backward and optimize model.zero_grad() loss.backward() clip_grad_norm_(model.parameters(), 0.5) optimizer.step() step = (i+1) // seq_length if step % 100 == 0: print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}' .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item()))) # Test the model with torch.no_grad(): with open('sample.txt', 'w') as f: # Set intial hidden ane cell states state = (torch.zeros(num_layers, 1, hidden_size).to(device), torch.zeros(num_layers, 1, hidden_size).to(device)) # Select one word id randomly
def main(gpu):
    """Train a DQN agent on Pong until the configured reward is reached.

    gpu: integer CUDA device index used to build the ``cuda:<gpu>`` device
    string. Runs an infinite training loop that only exits when the reward
    tracker reports the stop reward.
    """
    params = configreader.get_config(
        "./common/config/hyperparams.yaml")["pong"]
    params["device"] = f"cuda:{gpu}"
    # One optimizer step every `train_freq` environment frames; the batch is
    # scaled up accordingly so the samples-per-step ratio is preserved.
    params["train_freq"] = 4
    params["batch_size"] *= params["train_freq"]
    init_logger(params)
    env = gym.make(params["env_name"])
    env = ptan.common.wrappers.wrap_dqn(env)
    net = neuralnetworks.DQN(env.observation_space.shape, env.action_space.n)
    net = net.to(params["device"])
    wandb.watch(net)
    # Target network for stable Q-learning bootstrapping.
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params["epsilon_start"])
    epsilon_tracker = trackers.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=params["device"])
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params["gamma"], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params["replay_size"])
    optimizer = optim.Adam(net.parameters(),
                           lr=params["learning_rate"],
                           **params["optim_params"])
    frame_idx = 0
    with trackers.RewardTracker(params["stop_reward"]) as reward_tracker:
        while True:
            frame_idx += params["train_freq"]
            # Collect `train_freq` new transitions before each update.
            buffer.populate(params["train_freq"])
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                # reward() returns True once the stop reward is reached.
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break
            # Don't train until the replay buffer has a warm-up of samples.
            if len(buffer) < params["replay_initial"]:
                continue
            optimizer.zero_grad()
            batch = buffer.sample(params["batch_size"])
            loss = losses.calc_loss_dqn(
                batch,
                net,
                tgt_net.target_model,
                gamma=params["gamma"],
                device=params["device"],
            )
            loss.backward()
            clip_grad_norm_(net.parameters(), params["gradient_clip"])
            optimizer.step()
            # Sync the target net roughly every `target_net_sync` frames
            # (modulo test accounts for frame_idx advancing in train_freq steps).
            if frame_idx % params["target_net_sync"] < params["train_freq"]:
                tgt_net.sync()
def train(model, dataset, model_dir, summary_writer, epochs, lr, conf_thres,
          nms_thres, iou_thres, lambda_coord=5, lambda_no_obj=0.5,
          gradient_accumulations=2, clip_gradients=False, limit=None,
          debug=False, print_every=10, save_every=None, log_to_neptune=False):
    """Train a YOLO-style detector, logging losses/mAP and saving checkpoints.

    Runs `epochs` passes over `dataset`, accumulating gradients over
    `gradient_accumulations` batches per optimizer step. Optionally stops
    early after `limit` batches, logs to Neptune, and saves a model
    checkpoint every `save_every` batches and at the end of each epoch.
    """
    if log_to_neptune:
        env_path = Path(os.environ['HOME'], 'workspace/setup-box/neptune.env')
        load_dotenv(dotenv_path=env_path)
        neptune.init('petersiemen/sandbox',
                     api_token=os.getenv("NEPTUNE_API_TOKEN"))
    total = limit if limit is not None else len(dataset)
    logger.info(
        f'Start training on {total} images. Using lr: {lr}, '
        f'lambda_coord: {lambda_coord}, lambda_no_obj: {lambda_no_obj}, '
        f'conf_thres: {conf_thres}, nms_thres:{nms_thres}, iou_thres: {iou_thres}, '
        f'gradient_accumulations: {gradient_accumulations}, '
        f'clip_gradients: {clip_gradients}, lambda_no_obj: {lambda_no_obj}')
    metrics = Metrics()
    model.to(DEVICE)
    model.train()
    optimizer = torch.optim.Adam(model.get_trainable_parameters(), lr=lr)
    grid_sizes = model.grid_sizes
    data_loader = DataLoader(dataset,
                             batch_size=dataset.batch_size,
                             shuffle=True,
                             collate_fn=dataset.collate_fn)
    class_names = model.class_names
    for epoch in range(1, epochs + 1):
        for batch_i, (images, ground_truth_boxes,
                      image_paths) in tqdm(enumerate(data_loader),
                                           total=total):
            # Partial final batches would break target building; skip them.
            if len(images) != dataset.batch_size:
                logger.warning(
                    f"Skipping batch {batch_i} because it does not have correct size ({dataset.batch_size})"
                )
                continue
            images = images.to(DEVICE)
            coordinates, class_scores, confidence = model(images)
            # Build per-cell target tensors/masks from the ground-truth boxes.
            obj_mask, noobj_mask, cls_mask, target_coordinates, target_confidence, target_class_scores = build_targets(
                coordinates, class_scores, ground_truth_boxes, grid_sizes)
            yolo_loss = YoloLoss(coordinates, confidence, class_scores,
                                 obj_mask, noobj_mask, cls_mask,
                                 target_coordinates, target_confidence,
                                 target_class_scores,
                                 lambda_coord=lambda_coord,
                                 lambda_no_obj=lambda_no_obj)
            # Run NMS on sigmoid-activated scores to collect detections for mAP.
            class_scores = torch.sigmoid(class_scores)
            prediction = torch.cat(
                (coordinates, confidence.unsqueeze(-1), class_scores), -1)
            detections = non_max_suppression(prediction=prediction,
                                             conf_thres=conf_thres,
                                             nms_thres=nms_thres)
            ground_truth_map_objects = list(
                GroundTruth.from_ground_truths(image_paths,
                                               ground_truth_boxes))
            detection_map_objects = list(
                Detection.from_detections(image_paths, detections))
            metrics.add_detections_for_batch(detection_map_objects,
                                             ground_truth_map_objects,
                                             iou_thres=iou_thres)
            if debug:
                plot_batch(detections, ground_truth_boxes, images,
                           class_names)
            loss = yolo_loss.get()
            # backward pass to calculate the weight gradients
            loss.backward()
            if clip_gradients:
                logger.debug("Clipping gradients with max_norm = 1")
                clip_grad_norm_(model.parameters(), max_norm=1)
            if batch_i % print_every == 0:  # print every print_every +1 batches
                yolo_loss.capture(summary_writer, batch_i, during='train')
                #plot_weights_and_gradients(model, summary_writer, epoch * batch_i)
                log_performance(epoch, epochs, batch_i, total, yolo_loss,
                                metrics, class_names, summary_writer,
                                log_to_neptune)
            # Accumulates gradient before each step
            if batch_i % gradient_accumulations == 0:
                logger.debug(
                    f"Updating weights for batch {batch_i} (gradient_accumulations :{gradient_accumulations})"
                )
                # update the weights
                optimizer.step()
                # zero the parameter (weight) gradients
                optimizer.zero_grad()
            # Release GPU memory held by this batch before the next iteration.
            del images
            del ground_truth_boxes
            if limit is not None and batch_i + 1 >= limit:
                logger.info(
                    'Stop here after training {} batches (limit: {})'.format(
                        batch_i, limit))
                log_performance(epoch, epochs, batch_i, total, yolo_loss,
                                metrics, class_names, summary_writer,
                                log_to_neptune)
                save_model(model_dir, model, epoch, batch_i)
                return
            if save_every is not None and batch_i % save_every == 0:
                save_model(model_dir, model, epoch, batch_i)
        # save model after every epoch
        save_model(model_dir, model, epoch, None)
def f_step(config, vocab, model_F, model_D, optimizer_F, batch, temperature,
           drop_decay, cyc_rec_enable=True):
    """One generator (model_F) update for text style transfer.

    Optimizes three terms: self-reconstruction on noised input, cycle
    reconstruction through the opposite style, and an adversarial style
    loss against the (frozen) discriminator model_D.

    Returns a tuple of floats: (slf_rec_loss, cyc_rec_loss, adv_loss);
    the latter two are 0 when ``cyc_rec_enable`` is False.
    """
    # Freeze the discriminator while the generator is being updated.
    model_D.eval()
    pad_idx = vocab.stoi['<pad>']
    eos_idx = vocab.stoi['<eos>']
    unk_idx = vocab.stoi['<unk>']
    vocab_size = len(vocab)
    loss_fn = nn.NLLLoss(reduction='none')

    inp_tokens, inp_lengths, raw_styles = batch_preprocess(
        batch, pad_idx, eos_idx)
    # Binary style labels: flip 0<->1 to get the target (reverse) style.
    rev_styles = 1 - raw_styles
    batch_size = inp_tokens.size(0)
    # Mask so padding positions contribute nothing to the losses.
    token_mask = (inp_tokens != pad_idx).float()

    optimizer_F.zero_grad()

    # self reconstruction loss
    noise_inp_tokens = word_dropout(  #word_drop(
        inp_tokens,
        inp_lengths,
        config.inp_drop_prob * drop_decay,  # dropout prob annealed over time
        unk_idx  #vocab
    )
    noise_inp_lengths = get_lengths(noise_inp_tokens, eos_idx)
    slf_log_probs = model_F(
        noise_inp_tokens,
        inp_tokens,
        noise_inp_lengths,
        raw_styles,
        generate=False,
        differentiable_decode=False,
        temperature=temperature,
    )
    slf_rec_loss = loss_fn(slf_log_probs.transpose(1, 2),
                           inp_tokens) * token_mask
    slf_rec_loss = slf_rec_loss.sum() / batch_size
    slf_rec_loss *= config.slf_factor
    slf_rec_loss.backward()

    # cycle consistency loss
    if not cyc_rec_enable:
        # Early exit: apply the self-reconstruction gradient only.
        optimizer_F.step()
        model_D.train()
        return slf_rec_loss.item(), 0, 0

    # Generate in the reverse style with differentiable (soft) decoding so
    # gradients can flow back through the generated sequence.
    gen_log_probs = model_F(
        inp_tokens,
        None,
        inp_lengths,
        rev_styles,
        generate=True,
        differentiable_decode=True,
        temperature=temperature,
    )
    gen_soft_tokens = gen_log_probs.exp()
    gen_lengths = get_lengths(gen_soft_tokens.argmax(-1), eos_idx)

    # Translate back to the original style; should reconstruct the input.
    cyc_log_probs = model_F(
        gen_soft_tokens,
        inp_tokens,
        gen_lengths,
        raw_styles,
        generate=False,
        differentiable_decode=False,
        temperature=temperature,
    )
    cyc_rec_loss = loss_fn(cyc_log_probs.transpose(1, 2),
                           inp_tokens) * token_mask
    cyc_rec_loss = cyc_rec_loss.sum() / batch_size
    cyc_rec_loss *= config.cyc_factor

    # style consistency loss: fool the discriminator on the generated text.
    adv_log_porbs = model_D(gen_soft_tokens, gen_lengths, rev_styles)
    if config.discriminator_method == 'Multi':
        # Multi-class discriminator: class (style + 1); class 0 means "fake".
        adv_labels = rev_styles + 1
    else:
        # Conditional binary discriminator: label 1 means "real".
        adv_labels = torch.ones_like(rev_styles)
    adv_loss = loss_fn(adv_log_porbs, adv_labels)
    adv_loss = adv_loss.sum() / batch_size
    adv_loss *= config.adv_factor
    (cyc_rec_loss + adv_loss).backward()

    # update parameters
    clip_grad_norm_(model_F.parameters(), 5)
    optimizer_F.step()

    model_D.train()

    return slf_rec_loss.item(), cyc_rec_loss.item(), adv_loss.item()
def train(self):
    """Main iteration-based training loop.

    Runs forward/backward/update for ``max_iter`` iterations, logging train
    metrics every ``print_freq`` iterations (rank 0 only), and evaluating /
    checkpointing at the configured epoch frequency. Supports both
    multi-class (mAP) and single-label (top-1/top-5) metric reporting.
    """
    self.model.cuda().train()
    for iter_idx in range(self.start_iter, self.max_iter):
        # Derive the current epoch from the iteration counter.
        self.cur_epoch = int(float(iter_idx + 1) / self.epoch_iters)
        self.cur_iter = iter_idx
        inputs = self.get_batch('train')
        loss = self.forward(inputs)
        self.backward(loss)
        if self.config.trainer.clip_gradient > 0:
            clip_grad_norm_(self.model.parameters(),
                            self.config.trainer.clip_gradient)
        self.update()
        # Periodic train-side logging; only rank 0 writes logs/TensorBoard.
        if iter_idx % self.config.trainer.print_freq == 0 and self.rank == 0:
            self.tb_logger.add_scalar('loss_train',
                                      self.metrics['losses'].avg, iter_idx)
            self.tb_logger.add_scalar('lr',
                                      self.lr_scheduler.get_lr()[0], iter_idx)
            log_formatter = get_log_format(self.multi_class)
            if self.multi_class:
                self.tb_logger.add_scalar('mAP_train',
                                          self.metrics['mAP'].avg, iter_idx)
                self.logger.info(
                    log_formatter.format(
                        iter_idx,
                        self.max_iter,
                        self.cur_epoch + 1,
                        self.config.trainer.epochs,
                        batch_time=self.metrics['batch_time'],
                        data_time=self.metrics['data_time'],
                        loss=self.metrics['losses'],
                        mAP=self.metrics['mAP'],
                        lr=self.lr_scheduler.get_lr()[0]))
            else:
                self.tb_logger.add_scalar('acc1_train',
                                          self.metrics['top1'].avg, iter_idx)
                self.tb_logger.add_scalar('acc5_train',
                                          self.metrics['top5'].avg, iter_idx)
                self.logger.info(
                    log_formatter.format(
                        iter_idx,
                        self.max_iter,
                        self.cur_epoch + 1,
                        self.config.trainer.epochs,
                        batch_time=self.metrics['batch_time'],
                        data_time=self.metrics['data_time'],
                        loss=self.metrics['losses'],
                        top1=self.metrics['top1'],
                        top5=self.metrics['top5'],
                        lr=self.lr_scheduler.get_lr()[0]))
        # Evaluate at the final iteration, or at each epoch boundary that
        # matches the eval frequency.
        if (iter_idx == self.max_iter - 1) or (iter_idx % self.epoch_iters == 0 and iter_idx > 0 and \
                self.cur_epoch % self.config.trainer.eval_freq == 0):
            metric = self.evaluate()
            if self.rank == 0 and self.tb_logger is not None:
                self.tb_logger.add_scalar('loss_val', metric.loss, iter_idx)
                if self.multi_class:
                    self.tb_logger.add_scalar('mAP_val', metric.top1,
                                              iter_idx)
                else:
                    self.tb_logger.add_scalar('acc1_val', metric.top1,
                                              iter_idx)
                    self.tb_logger.add_scalar('acc5_val', metric.top5,
                                              iter_idx)
            if self.rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = metric.top1 > self.best_prec1
                self.best_prec1 = max(metric.top1, self.best_prec1)
                self.save_checkpoint(
                    {
                        'epoch': self.cur_epoch,
                        'optimizer': self.optimizer.state_dict(),
                        'model': self.model.state_dict(),
                        'lr_scheduler': self.lr_scheduler.state_dict(),
                        'best_prec1': self.best_prec1
                    }, is_best)
                if self.multi_class:
                    self.logger.info(' * Best mAP {:.3f}'.format(
                        self.best_prec1))
                else:
                    self.logger.info(' * Best Prec@1 {:.3f}'.format(
                        self.best_prec1))
        # NOTE(review): `end` is assigned but not used within this block —
        # presumably consumed by batch-time metrics elsewhere; confirm.
        end = time.time()
def d_step(config, vocab, model_F, model_D, optimizer_D, batch, temperature):
    """One discriminator (model_D) update for text style transfer.

    Generates same-style and reverse-style sequences with a frozen generator
    (no grad), then trains the discriminator to separate gold sentences from
    generated ones. Returns the scalar adversarial loss as a float.
    """
    # Freeze the generator while the discriminator is being updated.
    model_F.eval()
    pad_idx = vocab.stoi['<pad>']
    eos_idx = vocab.stoi['<eos>']
    vocab_size = len(vocab)
    loss_fn = nn.NLLLoss(reduction='none')

    inp_tokens, inp_lengths, raw_styles = batch_preprocess(
        batch, pad_idx, eos_idx)
    rev_styles = 1 - raw_styles
    batch_size = inp_tokens.size(0)

    # Sample generator outputs without tracking gradients.
    with torch.no_grad():
        raw_gen_log_probs = model_F(
            inp_tokens,
            None,
            inp_lengths,
            raw_styles,
            generate=True,
            differentiable_decode=True,
            temperature=temperature,
        )
        rev_gen_log_probs = model_F(
            inp_tokens,
            None,
            inp_lengths,
            rev_styles,
            generate=True,
            differentiable_decode=True,
            temperature=temperature,
        )

    raw_gen_soft_tokens = raw_gen_log_probs.exp()
    raw_gen_lengths = get_lengths(raw_gen_soft_tokens.argmax(-1), eos_idx)

    rev_gen_soft_tokens = rev_gen_log_probs.exp()
    rev_gen_lengths = get_lengths(rev_gen_soft_tokens.argmax(-1), eos_idx)

    if config.discriminator_method == 'Multi':
        # Multi-class head: label (style + 1) for real/same-style text,
        # label 0 for reverse-style generations.
        gold_log_probs = model_D(inp_tokens, inp_lengths)
        gold_labels = raw_styles + 1

        raw_gen_log_probs = model_D(raw_gen_soft_tokens, raw_gen_lengths)
        rev_gen_log_probs = model_D(rev_gen_soft_tokens, rev_gen_lengths)
        gen_log_probs = torch.cat((raw_gen_log_probs, rev_gen_log_probs), 0)
        raw_gen_labels = raw_styles + 1
        rev_gen_labels = torch.zeros_like(rev_styles)
        gen_labels = torch.cat((raw_gen_labels, rev_gen_labels), 0)
    else:
        # Conditional binary head: (text, style) pairs labelled 1 when the
        # style matches real text, 0 otherwise.
        raw_gold_log_probs = model_D(inp_tokens, inp_lengths, raw_styles)
        rev_gold_log_probs = model_D(inp_tokens, inp_lengths, rev_styles)
        gold_log_probs = torch.cat((raw_gold_log_probs, rev_gold_log_probs),
                                   0)
        raw_gold_labels = torch.ones_like(raw_styles)
        rev_gold_labels = torch.zeros_like(rev_styles)
        gold_labels = torch.cat((raw_gold_labels, rev_gold_labels), 0)

        raw_gen_log_probs = model_D(raw_gen_soft_tokens, raw_gen_lengths,
                                    raw_styles)
        rev_gen_log_probs = model_D(rev_gen_soft_tokens, rev_gen_lengths,
                                    rev_styles)
        gen_log_probs = torch.cat((raw_gen_log_probs, rev_gen_log_probs), 0)
        raw_gen_labels = torch.ones_like(raw_styles)
        rev_gen_labels = torch.zeros_like(rev_styles)
        gen_labels = torch.cat((raw_gen_labels, rev_gen_labels), 0)

    adv_log_probs = torch.cat((gold_log_probs, gen_log_probs), 0)
    adv_labels = torch.cat((gold_labels, gen_labels), 0)
    adv_loss = loss_fn(adv_log_probs, adv_labels)
    assert len(adv_loss.size()) == 1
    adv_loss = adv_loss.sum() / batch_size
    loss = adv_loss

    optimizer_D.zero_grad()
    loss.backward()
    clip_grad_norm_(model_D.parameters(), 5)
    optimizer_D.step()

    # Restore the generator to training mode.
    model_F.train()

    return adv_loss.item()
def train_epoch(self, train, optimizer, verbose=VERBOSE_BATCH_WISE):
    '''
    Train an epoch with given train iterator and optimizer.

    :param train: iterator yielding mini-batches with .src and .tgt fields
        (presumably a torchtext-style iterator -- confirm against caller).
    :param optimizer: optimizer stepped once per mini-batch (after clipping).
    :param verbose: one of the VERBOSE_* levels; VERBOSE_BATCH_WISE shows a
        tqdm progress bar with running statistics.
    :return: tuple (avg_loss, param_norm, avg_grad_norm) for the epoch.
    '''
    total_loss, total_word_count = 0, 0
    total_grad_norm = 0
    avg_loss, avg_grad_norm = 0, 0
    # BUGFIX: bind param_norm before the loop; previously it was assigned only
    # inside the loop, so an empty iterator made the final return raise
    # NameError.
    param_norm = 0.

    # NOTE: compare verbosity levels with ==, not `is` -- identity comparison
    # on int constants only works by accident of CPython small-int caching.
    if verbose == VERBOSE_BATCH_WISE:
        print(optimizer)
    progress_bar = tqdm(train,
                        desc='Training: ',
                        unit='batch'
                        ) if verbose == VERBOSE_BATCH_WISE else train

    # Iterate whole train-set.
    for idx, mini_batch in enumerate(progress_bar):
        # Raw target variable has both BOS and EOS token.
        # The output of sequence-to-sequence does not have BOS token.
        # Thus, remove BOS token for reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        # You have to reset the gradients of all model parameters before to
        # take another step in gradient descent.
        optimizer.zero_grad()

        # Take feed-forward
        # Similar as before, the input of decoder does not have EOS token.
        # Thus, remove EOS token for decoder input.
        y_hat = self.model(x, mini_batch.tgt[0][:, :-1])
        # |y_hat| = (batch_size, length, output_size)

        # Calcuate loss and gradients with back-propagation.
        loss = self._get_loss(y_hat, y)
        loss.div(y.size(0)).backward()

        # Simple math to show stats.
        # Don't forget to detach final variables.
        total_loss += float(loss)
        total_word_count += int(mini_batch.tgt[1].sum())
        param_norm = float(utils.get_parameter_norm(
            self.model.parameters()))
        total_grad_norm += float(utils.get_grad_norm(
            self.model.parameters()))

        avg_loss = total_loss / total_word_count
        avg_grad_norm = total_grad_norm / (idx + 1)

        if verbose == VERBOSE_BATCH_WISE:
            progress_bar.set_postfix_str(
                '|param|=%.2f |g_param|=%.2f loss=%.4e PPL=%.2f' % (
                    param_norm,
                    avg_grad_norm,
                    avg_loss,
                    exp(avg_loss),
                ))

        # In order to avoid gradient exploding, we apply gradient clipping.
        torch_utils.clip_grad_norm_(self.model.parameters(),
                                    self.config.max_grad_norm,
                                    )
        # Take a step of gradient descent.
        optimizer.step()

        # Allow training on only a fraction of the epoch's batches.
        if idx >= len(progress_bar) * self.config.train_ratio_per_epoch:
            break

    if verbose == VERBOSE_BATCH_WISE:
        progress_bar.close()

    return avg_loss, param_norm, avg_grad_norm
lr=params["learning_rate"], eps=1e-3) batch = [] # TRAINING with logger.RewardTracker(net, writer, stop_reward=195, tag="a2c") as tracker: for step_idx, exp in enumerate(exp_source): batch.append(exp) # handle new rewards new_rewards = exp_source.pop_total_rewards() if new_rewards: if tracker.reward(new_rewards[0], step_idx): break if len(batch) < params["batch_size"]: continue loss_policy, loss_v = calc_a2c_loss(batch, net, params) batch.clear() optimizer.zero_grad() loss_policy.backward(retain_graph=True) loss_v.backward() nn_utils.clip_grad_norm_(net.parameters(), params["grad_clip"]) optimizer.step()
for epoch in range(num_epochs): # Set initial hidden and cell states states = (torch.zeros(num_layers, batch_size, hidden_size).to(device), torch.zeros(num_layers, batch_size, hidden_size).to(device)) for i in range(0, ids.size(1) - seq_length, seq_length): inputs = ids[:, i:i + seq_length].to(device) targets = ids[:, (i + 1):(i + 1 + seq_length)].to(device) states = detach(states) outputs, states = model(inputs, states) loss = criterion(outputs, targets.reshape(-1)) model.zero_grad() loss.backward() clip_grad_norm_(model.parameters(), 0.5) optimizer.step() step = (i + 1) // seq_length if step % 100 == 0: if step % 100 == 0: print( 'Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}' .format(epoch + 1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item()))) # Test the model with torch.no_grad(): with open('sample.txt', 'w') as f: # Set intial hidden ane cell states state = (torch.zeros(num_layers, 1, hidden_size).to(device),
# Train/validate loop: one optimizer step per batch with gradient clipping,
# then a full pass over the validation set accumulating loss and accuracy.
valid_criterion = ValidLoss()
optim = Adam(model.parameters(), lr=lr, weight_decay=1e-6)
# Decay the learning rate by 10x every 5 epochs.
scheduler = StepLR(optim, step_size=5, gamma=0.1)
for e in range(epochs):
    model.train()
    for i, (*inputs, score) in enumerate(dataloader):
        optim.zero_grad()
        inputs = [input_.to(device) for input_ in inputs]
        score = score.to(device).contiguous().view(-1)
        predicted = model(*inputs)
        loss = criterion(predicted, score)
        loss.backward()
        clip_grad_norm_(model.parameters(), 5.0)
        optim.step()
    # Validation pass.
    # NOTE(review): this loop runs without torch.no_grad(), so it builds
    # autograd graphs it never uses -- consider wrapping it; confirm intent.
    valid_loss = 0.0
    acc = 0.0
    model.eval()
    total_num = 0
    for *inputs, score in valid_dataloader:
        batch_size = score.size(0)
        inputs = [input_.to(device) for input_ in inputs]
        score = score.to(device).contiguous().view(-1)
        predicted = model(*inputs)
        # Weight the running loss by batch size for a correct dataset mean.
        valid_loss += valid_criterion(predicted, score).item() * batch_size
        predicted = predicted.argmax(dim=1)
        acc += torch.eq(predicted, score).sum()
        total_num += batch_size
def iterate(self, src_tuple, target_tuple, training=True):
    """Run one forward (and, if training, backward + optimizer) pass.

    src_tuple / target_tuple are (sequence, lengths) pairs; sequences may be
    PackedSequence. Returns (loss_measure, accuracy, num_words) where
    loss_measure is the per-word loss regardless of the normalization used
    for the gradient itself.
    """
    # limit number of tokens o avoid gpu overload
    if self.limit_num_tokens is not None:
        src_tuple, target_tuple = self._batch_limit_tokens(
            src_tuple, target_tuple)
    src, src_length = src_tuple
    target, target_length = target_tuple
    batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
    # Exclude one token per sequence (the shifted-out position) from the count.
    num_words = sum(target_length) - target.size(batch_dim)
    # DataParallel scatters tensors itself; move to device manually otherwise.
    if isinstance(src, PackedSequence) or \
            not isinstance(self.model_with_loss, DataParallel):
        if isinstance(src, PackedSequence):
            src = PackedSequence(src.data.to(self.device),
                                 src.batch_sizes.to(self.device))
        else:
            src = src.to(self.device)
        target = target.to(self.device)
    # Teacher forcing: feed target[:-1], predict target[1:].
    if self.batch_first:
        inputs = (src, target[:, :-1])
        target_labels = target[:, 1:].contiguous()
    else:
        inputs = (src, target[:-1])
        target_labels = target[1:]

    # compute output
    loss, accuracy = self.model_with_loss(inputs, target_labels)
    loss = loss.sum()
    loss_measure = float(loss / num_words)
    # Gradient normalization: per word or per sequence, per configuration.
    if self.avg_loss_time:
        loss /= num_words
    else:
        loss /= target.size(batch_dim)
    accuracy = float(accuracy.sum().float() / num_words)

    if training:
        # compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        if self.grad_clip is not None:
            if isinstance(self.grad_clip, dict):
                # Separate clipping thresholds for encoder and decoder.
                clip_encoder = self.grad_clip.get('encoder', 0)
                clip_decoder = self.grad_clip.get('decoder', 0)
                if clip_encoder > 0:
                    clip_grad_norm_(self.model.encoder.parameters(),
                                    clip_encoder)
                if clip_decoder > 0:
                    clip_grad_norm_(self.model.decoder.parameters(),
                                    clip_decoder)
            elif self.grad_clip > 0:  # grad_clip is a number
                clip_grad_norm_(self.model.parameters(), self.grad_clip)
        # Optional extra (usually tighter) clipping for embedding tables.
        if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
            if hasattr(self.model.encoder, 'embedder'):
                clip_grad_norm_(self.model.encoder.embedder.parameters(),
                                self.embedding_grad_clip)
            if hasattr(self.model.decoder, 'embedder'):
                clip_grad_norm_(self.model.decoder.embedder.parameters(),
                                self.embedding_grad_clip)
        self.optimizer.step()
    return loss_measure, accuracy, num_words
def train(
    self,
    training_batch_size: int = 50,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: Optional[int] = None,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    exclude_invalid_x: bool = True,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch_each_round: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[dict] = None,
) -> DirectPosterior:
    r"""
    Return density estimator that approximates the distribution $p(\theta|x)$.

    Args:
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. If None, we
            train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to the
            simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS 2017.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or
            `x=±∞` during training. Expect errors, silent or explicit, when
            `False`.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        discard_prior_samples: Whether to discard samples simulated in round 1,
            i.e. from the prior. Training may be sped up by ignoring such less
            targeted samples.
        retrain_from_scratch_each_round: Whether to retrain the conditional
            density estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the
            training and validation dataloaders (like, e.g., a collate_fn)

    Returns:
        Density estimator that approximates the distribution $p(\theta|x)$.
    """
    # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
    if calibration_kernel is None:
        calibration_kernel = lambda x: ones([len(x)], device=self._device)

    max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)

    # For non-atomic loss, we can not reuse samples from previous rounds as of now.
    # SNPE-A can, by construction of the algorithm, only use samples from the last
    # round. SNPE-A is the only algorithm that has an attribute `_ran_final_round`,
    # so this is how we check for whether or not we are using SNPE-A.
    if self.use_non_atomic_loss or hasattr(self, "_ran_final_round"):
        start_idx = self._round

    theta, x, prior_masks = self.get_simulations(start_idx, exclude_invalid_x,
                                                 warn_on_invalid=True)

    # Dataset is shared for training and validation loaders.
    dataset = data.TensorDataset(
        theta,
        x,
        prior_masks,
    )

    # Set the proposal to the last proposal that was passed by the user. For
    # atomic SNPE, it does not matter what the proposal is. For non-atomic
    # SNPE, we only use the latest data that was passed, i.e. the one from the
    # last proposal.
    proposal = self._proposal_roundwise[-1]

    train_loader, val_loader = self.get_dataloaders(
        dataset,
        training_batch_size,
        validation_fraction,
        resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )

    # First round or if retraining from scratch:
    # Call the `self._build_neural_net` with the rounds' thetas and xs as
    # arguments, which will build the neural network.
    # This is passed into NeuralPosterior, to create a neural posterior which
    # can `sample()` and `log_prob()`. The network is accessible via `.net`.
    if self._neural_net is None or retrain_from_scratch_each_round:
        self._neural_net = self._build_neural_net(
            theta[self.train_indices], x[self.train_indices])
        # If data on training device already move net as well.
        if (not self._device == "cpu" and
                f"{x.device.type}:{x.device.index}" == self._device):
            self._neural_net.to(self._device)
        test_posterior_net_for_multi_d_x(self._neural_net, theta, x)
        self._x_shape = x_shape_from_simulation(x)

    # Move entire net to device for training.
    self._neural_net.to(self._device)

    if not resume_training:
        self.optimizer = optim.Adam(
            list(self._neural_net.parameters()),
            lr=learning_rate,
        )
        self.epoch, self._val_log_prob = 0, float("-Inf")

    # Train until convergence (early stopping on validation log-prob) or
    # until max_num_epochs is reached.
    while self.epoch <= max_num_epochs and not self._converged(
            self.epoch, stop_after_epochs):

        # Train for a single epoch.
        self._neural_net.train()
        train_log_prob_sum = 0
        epoch_start_time = time.time()
        for batch in train_loader:
            self.optimizer.zero_grad()
            # Get batches on current device.
            theta_batch, x_batch, masks_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
                batch[2].to(self._device),
            )

            batch_loss = torch.mean(
                self._loss(
                    theta_batch,
                    x_batch,
                    masks_batch,
                    proposal,
                    calibration_kernel,
                ))
            # NOTE(review): despite the name, this accumulates the (positive)
            # mean loss per batch, not a log-prob — confirm against `_summarize`.
            train_log_prob_sum += batch_loss.sum().item()

            batch_loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(),
                    max_norm=clip_max_norm,
                )
            self.optimizer.step()

        self.epoch += 1
        # Normalize by the (approximate) number of training examples.
        train_log_prob_sum /= int(theta.shape[0] * (1.0 - validation_fraction))
        self._summary["train_log_probs"].append(train_log_prob_sum)

        # Calculate validation performance.
        self._neural_net.eval()
        log_prob_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch, masks_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                    batch[2].to(self._device),
                )
                # Take negative loss here to get validation log_prob.
                batch_log_prob = -self._loss(
                    theta_batch,
                    x_batch,
                    masks_batch,
                    proposal,
                    calibration_kernel,
                )
                log_prob_sum += batch_log_prob.sum().item()

        # Take mean over all validation samples.
        self._val_log_prob = log_prob_sum / (len(val_loader) *
                                             val_loader.batch_size)
        # Log validation log prob for every epoch.
        self._summary["validation_log_probs"].append(self._val_log_prob)
        self._summary["epoch_durations_sec"].append(time.time() -
                                                    epoch_start_time)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs,
                                    max_num_epochs)

    # Update summary.
    self._summary["epochs"].append(self.epoch)
    self._summary["best_validation_log_probs"].append(
        self._best_val_log_prob)

    # Update tensorboard and summary dict.
    self._summarize(
        round_=self._round,
        x_o=None,
        theta_bank=theta,
        x_bank=x,
    )

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    return deepcopy(self._neural_net)
# NOTE(review): fragment of a REINFORCE/policy-gradient update loop; the code
# that builds batch_scales/states_v/batch_actions_t is outside this view.
scale_std = np.std(batch_scales)  # spread of reward scales; appears unused here — TODO confirm it is consumed later
batch_scale_v = torch.FloatTensor(batch_scales).to(device)

optimizer.zero_grad()
logits_v = net(states_v)
log_prob_v = F.log_softmax(logits_v, dim=1)
# Policy-gradient loss: log-probs of the taken actions, weighted by the
# per-sample scales (presumably baseline-adjusted returns — confirm upstream).
log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
loss_policy_v = -log_prob_actions_v.mean()

prob_v = F.softmax(logits_v, dim=1)
entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
# Entropy bonus: maximizing entropy == minimizing its negative.
entropy_loss_v = -ENTROPY_BETA * entropy_v
loss_v = loss_policy_v + entropy_loss_v

loss_v.backward()
nn_utils.clip_grad_norm_(net.parameters(), GRAD_L2_CLIP)
optimizer.step()

# calc KL-div between the policy before and after the update:
# KL(old || new) = sum_a old(a) * log(old(a) / new(a))
#                = -sum_a old(a) * log(new(a) / old(a)).
new_logits_v = net(states_v)
new_prob_v = F.softmax(new_logits_v, dim=1)
kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
writer.add_scalar("kl", kl_div_v.item(), step_idx)

# Gradient statistics over all parameters: max absolute value and
# accumulated per-tensor RMS (averaged later, presumably — confirm).
grad_max = 0.0
grad_means = 0.0
grad_count = 0
for p in net.parameters():
    grad_max = max(grad_max, p.grad.abs().max().item())
    grad_means += (p.grad ** 2).mean().sqrt().item()
    grad_count += 1
def main(args):
    """Train a SCALOR model on an experience-replay dataset.

    Creates checkpoint/summary directories, optionally resumes from
    `args.last_ckpt`, then runs the epoch loop: sample replay chunks, anneal
    the Gumbel temperature tau, optimize the negative ELBO, and periodically
    log, generate and checkpoint.
    """
    # Fixed random palette used by visualization code — TODO confirm consumer.
    args.color_t = torch.rand(700, 3)

    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    if not os.path.exists(args.summary_dir):
        os.makedirs(args.summary_dir)

    device = torch.device(
        "cuda" if not args.nocuda and torch.cuda.is_available() else "cpu")

    model = SCALOR(args)
    model.to(device)
    model.train()

    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr)

    global_step = 0
    if args.last_ckpt:
        # Resume model/optimizer state and epoch counter from checkpoint.
        global_step, args.start_epoch = \
            load_ckpt(model, optimizer, args.last_ckpt, device)

    writer = SummaryWriter(args.summary_dir)
    args.global_step = global_step
    # Exponential annealing rate so tau decays to tau_end after tau_ep steps.
    log_tau_gamma = np.log(args.tau_end) / args.tau_ep

    D = torch.load(args.experience_replay)
    num_train = D.size

    for epoch in range(int(args.start_epoch), args.epochs):
        local_count = 0
        last_count = 0
        end_time = time.time()
        for _ in range(num_train // args.batch_size):
            # Curriculum: sequence chunk length grows with the epoch,
            # capped at args.chunk_size.
            chunk_size = epoch + 2
            chunk_size = min(chunk_size, args.chunk_size)
            observations, actions, rewards, nonterminals = D.sample(
                args.batch_size, chunk_size)

            # Annealed Gumbel-softmax temperature, floored at tau_end.
            tau = np.exp(global_step * log_tau_gamma)
            tau = max(tau, args.tau_end)
            args.tau = tau

            global_step += 1
            log_phase = global_step % args.print_freq == 0 or global_step == 1
            args.global_step = global_step
            args.log_phase = log_phase

            # Randomly run in generation phase, more often in later epochs
            # (probability min(epoch, 10)/10), but never on a logging step.
            if np.random.binomial(1, min(epoch, 10)/10, 1)[0] and not log_phase:
                args.phase_generate = True
            else:
                args.phase_generate = False

            # Keep RGB channels, move time axis first, scale to [0, 1].
            sample = observations[:,:,0:3].permute(1,0,2,3,4) / 255
            actions = actions.permute(1,0,2)
            imgs = sample.to(device)
            actions = actions.to(device)
            y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                kl_z_pres, kl_z_bg, kl_edge_type, log_imp, counting, \
                log_disc_list, log_prop_list, scalor_log_list = model(imgs, actions)

            # Average each term over the batch dimension.
            log_like = log_like.mean(dim=0)
            kl_z_what = kl_z_what.mean(dim=0)
            kl_z_where = kl_z_where.mean(dim=0)
            kl_z_depth = kl_z_depth.mean(dim=0)
            kl_z_pres = kl_z_pres.mean(dim=0)
            kl_z_bg = kl_z_bg.mean(0)
            kl_edge_type = kl_edge_type.mean(0)

            # Negative ELBO: reconstruction term plus all KL terms.
            total_loss = - log_like + kl_z_what + kl_z_where + kl_z_depth + \
                kl_z_pres + kl_z_bg + kl_edge_type

            optimizer.zero_grad()
            total_loss.backward()
            clip_grad_norm_(model.parameters(), args.cp)
            optimizer.step()

            local_count += imgs.data.shape[0]

            if log_phase:
                time_inter = time.time() - end_time
                end_time = time.time()

                count_inter = local_count - last_count

                print_scalor(global_step, epoch, local_count, count_inter,
                             num_train, total_loss, log_like, kl_z_what,
                             kl_z_where, kl_z_pres, kl_z_depth, time_inter)

                writer.add_scalar('train/total_loss', total_loss.item(), global_step=global_step)
                writer.add_scalar('train/log_like', log_like.item(), global_step=global_step)
                writer.add_scalar('train/What_KL', kl_z_what.item(), global_step=global_step)
                writer.add_scalar('train/Where_KL', kl_z_where.item(), global_step=global_step)
                writer.add_scalar('train/Pres_KL', kl_z_pres.item(), global_step=global_step)
                writer.add_scalar('train/Depth_KL', kl_z_depth.item(), global_step=global_step)
                writer.add_scalar('train/Bg_KL', kl_z_bg.item(), global_step=global_step)
                writer.add_scalar('train/Edge_KL', kl_edge_type.item(), global_step=global_step)
                # writer.add_scalar('train/Bg_alpha_KL', kl_z_bg_mask.item(), global_step=global_step)
                writer.add_scalar('train/tau', tau, global_step=global_step)

                log_summary(args, writer, imgs, y_seq, global_step,
                            log_disc_list, log_prop_list, scalor_log_list,
                            prefix='train')

                last_count = local_count

                #print(args.generate_freq)
                #args.generate_freq = 2
                #if global_step % args.generate_freq == 0:
                ####################################### do generation ####################################
                # Run the same batch once more in generation phase (no grads)
                # and log the generated rollout.
                model.eval()
                with torch.no_grad():
                    args.phase_generate = True
                    y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                        kl_z_pres, kl_z_bg, kl_edge_type, log_imp, counting, \
                        log_disc_list, log_prop_list, scalor_log_list = model(imgs, actions)
                    args.phase_generate = False

                    log_summary(args, writer, imgs, y_seq, global_step,
                                log_disc_list, log_prop_list, scalor_log_list,
                                prefix='generate')
                model.train()
                ####################################### end generation ####################################

            if global_step % args.save_epoch_freq == 0 or global_step == 1:
                save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch,
                          local_count, args.batch_size, num_train)
def train(train_loader, model, criterion, optimizer, epoch):
    """Train `model` for one epoch with gradient accumulation over
    `args.iter_size` mini-batches.

    Args:
        train_loader: iterable of (input, target) batches.
        model: DataParallel-wrapped network exposing `module.partialBN`.
        criterion: loss function.
        optimizer: optimizer whose step is taken every `args.iter_size` batches.
        epoch: current epoch index (for logging only).

    Fixes versus the previous version:
      * ``target.cuda(async=True)`` is a SyntaxError on Python >= 3.7
        (``async`` became a keyword); use ``non_blocking=True``.
      * Gradient clipping previously ran *after* ``optimizer.step()`` and
        ``zero_grad()``, i.e. on already-applied (then zeroed) gradients;
        it now runs on the accumulated gradients right before the step.
      * ``loss_summ`` accumulated tensors, keeping autograd graphs alive;
        it now accumulates plain floats.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # In PyTorch 0.4, "volatile=True" is deprecated.
    torch.set_grad_enabled(True)

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    loss_summ = 0.0
    localtime = time.localtime()
    end_time = time.strftime("%Y/%m/%d-%H:%M:%S", localtime)
    for i, (input, target) in enumerate(train_loader):
        # discard final (possibly smaller) batch
        if i == len(train_loader) - 1:
            break
        # measure data loading time
        data_time.update(time.time() - end)

        # target size: [batch_size]
        target = target.cuda(non_blocking=True)
        input_var = input
        target_var = target

        # compute output, output size: [batch_size, num_class]
        output = model(input_var)
        loss = criterion(output, target_var)
        # Scale the loss so accumulated gradients match a full-size batch.
        loss = loss / args.iter_size
        loss_summ += loss.item()

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss_summ, input.size(0))
        top1.update(prec1.item(), input.size(0))
        top5.update(prec5.item(), input.size(0))

        loss.backward()

        if (i + 1) % args.iter_size == 0:
            # Clip the accumulated gradients BEFORE applying them.
            if args.clip_gradient is not None:
                total_norm = clip_grad_norm_(model.parameters(),
                                             args.clip_gradient)
                if total_norm > args.clip_gradient:
                    print("clipping gradient: {} with coef {}".format(
                        total_norm, args.clip_gradient / total_norm))
            optimizer.step()
            optimizer.zero_grad()
            loss_summ = 0.0

        #if i % args.print_freq == 0:
        print(('Epoch: [{0}][{1}/{2}], lr: {lr:.7f}\t'
               'Time {batch_time.val:.2f} ({batch_time.avg:.2f})\t'
               'UTime {end_time:} \t'
               'Data {data_time.val:.2f} ({data_time.avg:.2f})\t'
               'Loss {loss.val:.3f} ({loss.avg:.3f})\t'
               'Prec@1 {top1.val:.2f} ({top1.avg:.2f})\t'
               'Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   end_time=end_time, data_time=data_time, loss=losses,
                   top1=top1, top5=top5,
                   lr=optimizer.param_groups[-1]['lr'])))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        localtime = time.localtime()
        end_time = time.strftime("%Y/%m/%d-%H:%M:%S", localtime)
def train(engine, mini_batch):
    """One Minimum Risk Training step: sample a translation, compute its
    reward against a sampled baseline, and back-propagate the policy
    gradient; the optimizer steps once per `iteration_per_update` batches.
    """
    # You have to reset the gradients of all model parameters
    # before taking another step in gradient descent.
    engine.model.train()
    # Zero gradients only at the start of each accumulation window
    # (iteration_per_update consecutive batches share one optimizer step).
    if engine.state.iteration % engine.config.iteration_per_update == 1 or \
            engine.config.iteration_per_update == 1:
        if engine.state.iteration > 1:
            engine.optimizer.zero_grad()

    device = next(engine.model.parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

    # Raw target variable has both BOS and EOS token.
    # The output of sequence-to-sequence does not have BOS token.
    # Thus, remove BOS token for reference.
    x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
    # |x| = (batch_size, length)
    # |y| = (batch_size, length)

    # Take sampling process because set False for is_greedy.
    y_hat, indice = engine.model.search(
        x, is_greedy=False, max_length=engine.config.max_length)

    with torch.no_grad():
        # Based on the result of sampling, get reward.
        actor_reward = MinimumRiskTrainingEngine._get_reward(
            indice,
            y,
            n_gram=engine.config.rl_n_gram,
            method=engine.config.rl_reward,
        )
        # |y_hat| = (batch_size, length, output_size)
        # |indice| = (batch_size, length)
        # |actor_reward| = (batch_size)

        # Take samples as many as n_samples, and get average rewards for them.
        # I figured out that n_samples = 1 would be enough.
        baseline = []
        for _ in range(engine.config.rl_n_samples):
            _, sampled_indice = engine.model.search(
                x,
                is_greedy=False,
                max_length=engine.config.max_length,
            )
            baseline += [
                MinimumRiskTrainingEngine._get_reward(
                    sampled_indice,
                    y,
                    n_gram=engine.config.rl_n_gram,
                    method=engine.config.rl_reward,
                )
            ]
        baseline = torch.stack(baseline).mean(dim=0)
        # |baseline| = (n_samples, batch_size) --> (batch_size)

        # Now, we have relatively expected cumulative reward.
        # Which score can be drawn from actor_reward subtracted by baseline.
        reward = actor_reward - baseline
        # |reward| = (batch_size)

    # calculate gradients with back-propagation
    loss = MinimumRiskTrainingEngine._get_loss(y_hat, indice, reward=reward)
    # Normalize by batch size and accumulation window so gradients match a
    # single large batch.
    backward_target = loss.div(y.size(0)).div(
        engine.config.iteration_per_update)
    backward_target.backward()

    p_norm = float(get_parameter_norm(engine.model.parameters()))
    g_norm = float(get_grad_norm(engine.model.parameters()))

    # Step only at the end of each accumulation window.
    if engine.state.iteration % engine.config.iteration_per_update == 0 and \
            engine.state.iteration > 0:
        # In order to avoid gradient exploding, we apply gradient clipping.
        torch_utils.clip_grad_norm_(
            engine.model.parameters(),
            engine.config.max_grad_norm,
        )
        # Take a step of gradient descent.
        engine.optimizer.step()

    # Norms are reported as 0 when non-finite to keep logging robust.
    return {
        'actor': float(actor_reward.mean()),
        'baseline': float(baseline.mean()),
        'reward': float(reward.mean()),
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
def train(train_loader, model, criterion, optimizer, epoch, log, tf_writer):
    """Train `model` for one epoch, logging to stdout, a log file and
    TensorBoard; supports both single-label (top-1/top-5) and multi-label
    (mAP) settings via `args.multi_class`.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    mAPs = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    # NOTE(review): checkpoint_dir is computed but not used in this function —
    # TODO confirm whether it can be removed.
    checkpoint_dir = os.path.join(args.root_model, args.store_name)
    for i, (input, target) in enumerate(train_loader):
        # Per-iteration LR schedule (fractional epoch granularity).
        adjust_learning_rate(optimizer, epoch, args.lr_type, args.lr_steps,
                             epoch + float(i) / len(train_loader))
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda()
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        if args.multi_class:
            mAP = calculate_mAP(output.data, target)
            mAPs.update(mAP, input.size(0))
        else:
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))
        losses.update(loss.item(), input.size(0))

        # compute gradient and do SGD step
        loss.backward()

        if args.clip_gradient is not None:
            # NOTE(review): total_norm is unused afterwards — the clipping
            # itself is the side effect.
            total_norm = clip_grad_norm_(model.parameters(), args.clip_gradient)

        optimizer.step()
        optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            if args.multi_class:
                output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'mAP {mAPs.val:.3f} ({mAPs.avg:.3f})'.format(
                              epoch, i, len(train_loader),
                              batch_time=batch_time, data_time=data_time,
                              loss=losses, mAPs=mAPs,
                              lr=optimizer.param_groups[2]['lr']))
                print(output)
                log.write(output + '\n')
                log.flush()
            else:
                output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                              epoch, i, len(train_loader),
                              batch_time=batch_time, data_time=data_time,
                              loss=losses, top1=top1, top5=top5,
                              lr=optimizer.param_groups[2]['lr']))
                print(output)
                log.write(output + '\n')
                log.flush()

    # End-of-epoch TensorBoard summaries.
    tf_writer.add_scalar('loss/train', losses.avg, epoch)
    if args.multi_class:
        tf_writer.add_scalar('acc/train_mAP', mAPs.avg, epoch)
    else:
        tf_writer.add_scalar('acc/train_top1', top1.avg, epoch)
        tf_writer.add_scalar('acc/train_top5', top5.avg, epoch)
    tf_writer.add_scalar('lr', optimizer.param_groups[-1]['lr'], epoch)
def step(self, closure=None):
    """Performs a single optimization step.

    BertAdam-style update: EMA of gradient and squared gradient without
    bias correction, optional per-parameter gradient clipping, decoupled
    weight decay, and an optional warmup/decay LR schedule.

    Fix: the positional ``Tensor.add_(scalar, tensor)`` and
    ``Tensor.addcmul_(scalar, t1, t2)`` overloads were deprecated in
    PyTorch 1.5 and later removed; the keyword forms (``alpha=``/``value=``)
    below compute exactly the same values.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError('Adam does not support sparse '
                                   'gradients, please consider '
                                   'SparseAdam instead')

            state = self.state[p]

            # State initialization
            if not state:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['next_m'] = torch.zeros_like(p.data)
                # Exponential moving average of squared gradient values
                state['next_v'] = torch.zeros_like(p.data)

            next_m, next_v = state['next_m'], state['next_v']
            beta1, beta2 = group['b1'], group['b2']

            # Add grad clipping (per-parameter, not global)
            if group['max_grad_norm'] > 0:
                clip_grad_norm_(p, group['max_grad_norm'])

            # Decay the first and second moment running average coefficient
            # In-place operations to update the averages at the same time
            next_m.mul_(beta1).add_(grad, alpha=1 - beta1)
            next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
            update = next_m / (next_v.sqrt() + group['e'])

            # Just adding the square of the weights to the loss function
            # is *not* the correct way of using L2 regularization/weight
            # decay with Adam, since that will interact with the m and v
            # parameters in strange ways.
            #
            # Instead we want to decay the weights in a manner that
            # doesn't interact with the m/v parameters. This is
            # equivalent to adding the square of the weights to the loss
            # with plain (non-momentum) SGD.
            if group['weight_decay_rate'] > 0.0:
                update += group['weight_decay_rate'] * p.data

            # Scheduled LR; t_total == -1 means "no schedule".
            if group['t_total'] != -1:
                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(
                    state['step'] / group['t_total'], group['warmup'])
            else:
                lr_scheduled = group['lr']

            update_with_lr = lr_scheduled * update
            p.data.add_(-update_with_lr)

            # NOTE: incremented after the update, so the schedule sees the
            # pre-increment step count — kept as in the original.
            state['step'] += 1

            # step_size = lr_scheduled * math.sqrt(bias_correction2) / \
            #     bias_correction1
            # No bias correction
            # bias_correction1 = 1 - beta1 ** state['step']
            # bias_correction2 = 1 - beta2 ** state['step']
    return loss
def train_epoch(model, bimpm, criterion, train_iter, valid_iter, config,
                start_epoch=1, others_to_save=None, valid_nli_iter=None):
    """Fine-tune a translation model with policy-gradient RL, using BLEU,
    NLI accuracy, or a learned combination of both as the reward; validates
    with greedy-decoding BLEU and checkpoints after every epoch.
    """
    current_lr = config.rl_lr

    highest_valid_bleu = -np.inf
    no_improve_cnt = 0

    # Print initial valid BLEU before we start RL.
    model.eval()
    total_reward, sample_cnt = 0, 0
    for batch_index, batch in enumerate(valid_iter):
        current_batch_word_cnt = torch.sum(batch.tgt[1])
        x = batch.src
        y = batch.tgt[0][:, 1:]
        batch_size = y.size(0)
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        # feed-forward (greedy decoding for evaluation)
        y_hat, indice = model.search(x, is_greedy=True,
                                     max_length=config.max_length)
        # |y_hat| = (batch_size, length, output_size)
        # |indice| = (batch_size, length)

        reward = get_bleu_reward(y, indice, n_gram=config.rl_n_gram)

        total_reward += float(reward.sum())
        sample_cnt += batch_size
        if sample_cnt >= len(valid_iter.dataset.examples):
            break
    avg_bleu = total_reward / sample_cnt
    # You can figure-out improvement.
    print("initial valid BLEU: %.4f" % avg_bleu)

    if valid_nli_iter:
        nli_validation(valid_nli_iter, model, bimpm, config)
    # Now, begin training.
    model.train()

    # Start RL
    nli_criterion = nn.CrossEntropyLoss(reduce=False)
    print("start rl epoch:", start_epoch)
    print("number of epoch to complete:", config.rl_n_epochs + 1)

    if config.reward_mode == 'combined':
        # Learnable uncertainty weights for mixing NLI and BLEU rewards.
        if config.gpu_id >= 0:
            nli_weight = torch.tensor([1.0], requires_grad=True, device="cuda")
            bleu_weight = torch.tensor([1.0], requires_grad=True, device="cuda")
        else:
            nli_weight = torch.tensor([1.0], requires_grad=True)
            bleu_weight = torch.tensor([1.0], requires_grad=True)
        print("nli_weight, bleu_weight:",
              nli_weight.data.cpu().numpy()[0],
              bleu_weight.data.cpu().numpy()[0])
        weight_optimizer = optim.Adam(iter([nli_weight, bleu_weight]),
                                      lr=0.0001)

    optimizer = optim.SGD(
        model.parameters(),
        lr=current_lr,
    )  # Default hyper-parameter is set for SGD.
    print("current learning rate: %f" % current_lr)
    print(optimizer)

    for epoch in range(start_epoch, config.rl_n_epochs + 1):
        sample_cnt = 0
        total_loss, total_actor_loss, total_sample_count, total_word_count, total_parameter_norm, total_grad_norm = 0, 0, 0, 0, 0, 0
        start_time = time.time()
        train_loss = np.inf
        epoch_accuracy = []

        for batch_index, batch in enumerate(train_iter):
            optimizer.zero_grad()

            current_batch_word_cnt = torch.sum(batch.tgt[1])
            x = batch.src
            y = batch.tgt[0][:, 1:]
            batch_size = y.size(0)
            # NLI fields are only present/needed for nli/combined rewards.
            if config.reward_mode != 'bleu':
                premise = batch.premise
                hypothesis = batch.hypothesis
                isSrcPremise = batch.isSrcPremise
                label = batch.labels
            # |x| = (batch_size, length)
            # |y| = (batch_size, length)

            # Take sampling process because set False for is_greedy.
            y_hat, indice = model.search(x, is_greedy=False,
                                         max_length=config.max_length)

            if config.reward_mode == 'bleu':
                q_actor = get_bleu_reward(y, indice, n_gram=config.rl_n_gram)
                epoch_accuracy.append(q_actor.sum() / batch_size)
            else:
                padded_indice, padded_premise, padded_hypothesis = padding_three_tensors(
                    indice, premise, hypothesis, batch_size)

                # put pred sentence into either premise or hypothesis
                for i in range(batch_size):
                    if not isSrcPremise[i]:
                        padded_premise[i] = padded_indice[i]
                    else:
                        padded_hypothesis[i] = padded_indice[i]

                kwargs = {'p': padded_premise, 'h': padded_hypothesis}
                pred_logit = bimpm(**kwargs)
                accuracy = get_accuracy(pred_logit, label)
                epoch_accuracy.append(accuracy)

                # Based on the result of sampling, get reward.
                if config.reward_mode == 'nli':
                    q_actor = -get_nli_reward(pred_logit, label, nli_criterion)
                else:
                    # Uncertainty-weighted combination of NLI and BLEU rewards
                    # plus a log-weight regularizer.
                    q_actor = 1/(2 * nli_weight.pow(2)) * -get_nli_reward(pred_logit, label, nli_criterion) \
                        + 1/(2 * bleu_weight.pow(2)) * (get_bleu_reward(y, indice, n_gram=config.rl_n_gram)/100) \
                        + torch.log(nli_weight * bleu_weight)
            # |y_hat| = (batch_size, length, output_size)
            # |indice| = (batch_size, length)
            # |q_actor| = (batch_size)

            # Take samples as many as n_samples, and get average rewards for them.
            # I figured out that n_samples = 1 would be enough.
            baseline = []
            with torch.no_grad():
                for i in range(config.n_samples):
                    _, sampled_indice = model.search(
                        x, is_greedy=False, max_length=config.max_length)
                    if config.reward_mode == 'bleu':
                        baseline_reward = get_bleu_reward(
                            y, sampled_indice, n_gram=config.rl_n_gram)
                        epoch_accuracy.append(baseline_reward.sum() / batch_size)
                    else:
                        padded_sampled_indice, padded_premise, padded_hypothesis = padding_three_tensors(
                            sampled_indice, premise, hypothesis, batch_size)

                        # put pred sentence into either premise or hypothesis
                        for i in range(batch_size):
                            if not isSrcPremise[i]:
                                padded_premise[i] = padded_sampled_indice[i]
                            else:
                                padded_hypothesis[i] = padded_sampled_indice[i]

                        kwargs = {'p': padded_premise, 'h': padded_hypothesis}
                        pred_logit = bimpm(**kwargs)
                        accuracy = get_accuracy(pred_logit, label)
                        epoch_accuracy.append(accuracy)

                        # Based on the result of sampling, get reward.
                        if config.reward_mode == 'nli':
                            baseline_reward = -get_nli_reward(
                                pred_logit, label, nli_criterion)
                        else:
                            baseline_reward = 1/(2 * nli_weight.pow(2)) * -get_nli_reward(pred_logit, label, nli_criterion) \
                                + 1/(2 * bleu_weight.pow(2)) * (get_bleu_reward(y, sampled_indice, n_gram=config.rl_n_gram)/100) \
                                + torch.log(nli_weight * bleu_weight)

                    baseline += [baseline_reward]
                baseline = torch.stack(baseline).sum(dim=0).div(
                    config.n_samples)
            # |baseline| = (n_samples, batch_size) --> (batch_size)

            # Now, we have relatively expected cumulative reward.
            # Which score can be drawn from q_actor subtracted by baseline.
            tmp_reward = q_actor - baseline
            # |tmp_reward| = (batch_size)
            # calculate gradients with back-propagation
            get_gradient(indice, y_hat, criterion, reward=tmp_reward)

            # simple math to show stats
            total_loss += float(tmp_reward.sum())
            total_actor_loss += float(q_actor.sum())
            total_sample_count += batch_size
            total_word_count += int(current_batch_word_cnt)
            total_parameter_norm += float(
                utils.get_parameter_norm(model.parameters()))
            total_grad_norm += float(utils.get_grad_norm(model.parameters()))

            if (batch_index + 1) % config.print_every == 0:
                avg_loss = total_loss / total_sample_count
                avg_actor_loss = total_actor_loss / total_sample_count
                avg_parameter_norm = total_parameter_norm / config.print_every
                avg_grad_norm = total_grad_norm / config.print_every
                avg_epoch_accuracy = sum(epoch_accuracy) / len(epoch_accuracy)
                elapsed_time = time.time() - start_time

                print(
                    "epoch: %d batch: %d/%d\t|param|: %.2f\t|g_param|: %.2f\trwd: %.4f\tactor loss: %.4f\tAccuracy: %.2f\t%5d words/s %3d secs"
                    % (epoch, batch_index + 1,
                       int(len(train_iter.dataset.examples) // config.batch_size),
                       avg_parameter_norm, avg_grad_norm, avg_loss,
                       avg_actor_loss, avg_epoch_accuracy,
                       total_word_count // elapsed_time, elapsed_time))
                if config.reward_mode == 'combined':
                    print("nli_weight, bleu_weight:",
                          nli_weight.data.cpu().numpy()[0],
                          bleu_weight.data.cpu().numpy()[0])

                # Reset the running stats for the next reporting window.
                total_loss, total_actor_loss, total_sample_count, total_word_count, total_parameter_norm, total_grad_norm = 0, 0, 0, 0, 0, 0
                epoch_accuracy = []
                start_time = time.time()
                train_loss = avg_actor_loss

            # In order to avoid gradient exploding, we apply gradient clipping.
            torch_utils.clip_grad_norm_(model.parameters(),
                                        config.max_grad_norm)

            # Take a step of gradient descent.
            optimizer.step()
            if config.reward_mode == 'combined':
                weight_optimizer.step()

            sample_cnt += batch_size
            if sample_cnt >= len(train_iter.dataset.examples):
                break

        sample_cnt = 0
        total_reward = 0

        # Start validation
        with torch.no_grad():
            model.eval()  # Turn-off drop-out
            for batch_index, batch in enumerate(valid_iter):
                current_batch_word_cnt = torch.sum(batch.tgt[1])
                x = batch.src
                y = batch.tgt[0][:, 1:]
                batch_size = y.size(0)
                # |x| = (batch_size, length)
                # |y| = (batch_size, length)

                # feed-forward
                y_hat, indice = model.search(x, is_greedy=True,
                                             max_length=config.max_length)
                # |y_hat| = (batch_size, length, output_size)
                # |indice| = (batch_size, length)

                reward = get_bleu_reward(y, indice, n_gram=config.rl_n_gram)

                total_reward += float(reward.sum())
                sample_cnt += batch_size
                if sample_cnt >= len(valid_iter.dataset.examples):
                    break

            avg_bleu = total_reward / sample_cnt
            print("valid BLEU: %.4f" % avg_bleu)

            if highest_valid_bleu < avg_bleu:
                highest_valid_bleu = avg_bleu
                no_improve_cnt = 0
            else:
                no_improve_cnt += 1

            if valid_nli_iter:
                nli_validation(valid_nli_iter, model, bimpm, config)
            model.train()

        # Encode epoch number, losses and reward mode into the filename.
        model_fn = config.model.split(".")
        model_fn = model_fn[:-1] + [
            "%02d" % (config.n_epochs + epoch),
            "%.2f-%.4f" % (train_loss, avg_bleu)
        ] + [model_fn[-1]] + [config.reward_mode]

        # PyTorch provides efficient method for save and load model, which uses python pickle.
        to_save = {
            "model": model.state_dict(),
            "config": config,
            "epoch": config.n_epochs + epoch + 1,
            "current_lr": current_lr
        }
        if others_to_save is not None:
            # Save any extra objects (e.g. optimizer state) the caller passed.
            for k, v in others_to_save.items():
                to_save[k] = v
        torch.save(to_save, '.'.join(model_fn))

        if config.early_stop > 0 and no_improve_cnt > config.early_stop:
            break
# NOTE(review): tail of an A2C training loop; the code that fills `batch` and
# builds states_v / actions_t / vals_ref_v is outside this view.
batch.clear()

optimizer.zero_grad()
logits_v, value_v = net(states_v)

# Critic loss: MSE between predicted state values and n-step return targets.
loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

log_prob_v = F.log_softmax(logits_v, dim=1)
# Advantage uses detached values so the policy loss does not back-propagate
# through the critic head.
adv_v = vals_ref_v - value_v.detach()
log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
loss_policy_v = -log_prob_actions_v.mean()

prob_v = F.softmax(logits_v, dim=1)
# sum(p * log p) is negative entropy, so adding this term (scaled by
# ENTROPY_BETA) to the loss encourages exploration.
entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

loss_v = entropy_loss_v + loss_value_v + loss_policy_v
loss_v.backward()
nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
optimizer.step()

# TensorBoard diagnostics for this update.
tb_tracker.track("advantage", adv_v, step_idx)
tb_tracker.track("values", value_v, step_idx)
tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
tb_tracker.track("loss_policy", loss_policy_v, step_idx)
tb_tracker.track("loss_value", loss_value_v, step_idx)
tb_tracker.track("loss_total", loss_v, step_idx)
# len(preprocessor) presumably reports a growing vocabulary size — confirm.
tb_tracker.track("dict_size", len(preprocessor), step_idx)
pass