from copy import deepcopy
from random import randint
from typing import List, Union

import numpy as np
import torch as th
import torch.nn.functional as F

from stable_baselines3.common import logger
from stable_baselines3.common.utils import polyak_update, update_learning_rate


def _update_learning_rate(
    self,
    optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer],
) -> None:
    """
    Update the learning rate of the base optimizers (via the parent class)
    and of the auxiliary optimizer, and log the auxiliary learning rate.
    """
    super(PPG, self)._update_learning_rate(optimizers)
    logger.record("train/aux_learning_rate",
                  self.aux_lr_schedule(self._current_progress_remaining))
    update_learning_rate(self.policy.aux_optimizer,
                         self.aux_lr_schedule(self._current_progress_remaining))
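# For reference, a minimal sketch of the `update_learning_rate` helper used
# above, mirroring `stable_baselines3.common.utils.update_learning_rate`
# (assumption: the project relies on the SB3 helper rather than a custom one;
# the name `_update_lr_sketch` is illustrative only):
def _update_lr_sketch(optimizer: th.optim.Optimizer, learning_rate: float) -> None:
    # Apply the same learning rate to every parameter group of the optimizer
    for param_group in optimizer.param_groups:
        param_group["lr"] = learning_rate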
def _update_learning_rate(self, optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer]) -> None:
    """
    Update the optimizers' learning rate using the current learning rate
    schedule and the current progress remaining (from 1 to 0).

    :param optimizers: (Union[List[th.optim.Optimizer], th.optim.Optimizer])
        An optimizer or a list of optimizers.
    """
    # Log the current learning rate
    logger.record("train/learning_rate", self.lr_schedule(self._current_progress_remaining))

    if not isinstance(optimizers, list):
        optimizers = [optimizers]
    for optimizer in optimizers:
        update_learning_rate(optimizer, self.lr_schedule(self._current_progress_remaining))
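# A minimal sketch of a schedule compatible with `self.lr_schedule` above
# (assumption: as in Stable-Baselines3, a schedule maps the remaining training
# progress, which goes from 1 at the start of training to 0 at the end, to a
# learning rate; `linear_schedule` is a hypothetical helper name):
def linear_schedule(initial_lr: float):
    def schedule(progress_remaining: float) -> float:
        # Anneal linearly from `initial_lr` down to 0 over the whole run
        return progress_remaining * initial_lr
    return schedule

# Usage sketch: linear_schedule(3e-4)(0.5) == 1.5e-4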
def _update_critic_learning_rate(self, optimizers: Union[List[th.optim.Optimizer], th.optim.Optimizer]) -> None:
    """
    Update the critic optimizers' learning rate using the critic learning rate
    schedule and the current progress remaining (from 1 to 0).

    :param optimizers: An optimizer or a list of optimizers.
    """
    # Log the current learning rate
    logger.record("train/learning_rate_critic",
                  self.lr_schedule_critic(self._current_progress_remaining))

    if not isinstance(optimizers, list):
        optimizers = [optimizers]
    for optimizer in optimizers:
        update_learning_rate(optimizer, self.lr_schedule_critic(self._current_progress_remaining))
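# Hypothetical usage sketch: in SB3, a constant or a callable can be turned
# into a schedule with `get_schedule_fn`, so `lr_schedule_critic` could be
# built as follows (assumption: this agent follows the SB3 convention;
# `agent` and the constant 1e-3 are illustrative only):
from stable_baselines3.common.utils import get_schedule_fn

agent.lr_schedule_critic = get_schedule_fn(1e-3)  # constant critic learning rate
agent._update_critic_learning_rate([agent.critic.optimizer])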
def train_mer(self, gradient_steps: int, batch_size: int = 64) -> None:
    """
    MER (Meta-Experience Replay) training loop for DQN: perform
    ``gradient_steps`` Q-learning updates, one of them on the current
    experience (with the learning rate scaled by ``self.mer_s``), then
    apply a Reptile step with respect to the weights saved at the start
    of the loop.
    """
    optimizers = [self.policy.optimizer]

    # Reset optimizers: re-initializing in place clears the internal state
    # (e.g. Adam moments) while keeping the same parameters
    if self.reset_optimizers_during_training:
        for i_optimizer, optimizer in enumerate(optimizers):
            optimizer.__init__(optimizer.param_groups[0]["params"])
            optimizers[i_optimizer] = optimizer

    # Update learning rate according to schedule
    base_lr = self.lr_schedule(self._current_progress_remaining)

    # Save initial weights of model
    models_to_update = [self.policy]
    initial_state_dicts = [deepcopy(model.state_dict()) for model in models_to_update]

    losses = []
    current_example_ind = randint(0, gradient_steps - 1)
    for gradient_step in range(gradient_steps):
        # Sample replay buffer or current example, and update learning rate accordingly
        if gradient_step == current_example_ind:
            replay_data = self.current_experience_buffer.sample(1, env=self._vec_normalize_env)
            for optimizer in optimizers:
                update_learning_rate(optimizer, base_lr * self.mer_s)
        else:
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
            for optimizer in optimizers:
                update_learning_rate(optimizer, base_lr)

        with th.no_grad():
            # Compute the target Q values
            target_q = self.q_net_target(replay_data.next_observations)
            # Follow greedy policy: use the one with the highest value
            target_q, _ = target_q.max(dim=1)
            # Avoid potential broadcast issue
            target_q = target_q.reshape(-1, 1)
            # 1-step TD target
            target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q

        # Get current Q estimates
        current_q = self.q_net(replay_data.observations)

        # Retrieve the q-values for the actions from the replay buffer
        current_q = th.gather(current_q, dim=1, index=replay_data.actions.long())

        # Compute MSE loss
        loss = F.mse_loss(current_q, target_q)
        losses.append(loss.item())

        # Optimize the policy
        self.policy.optimizer.zero_grad()
        loss.backward()
        # Clip gradient norm
        th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
        self.policy.optimizer.step()

    # Perform Reptile step
    for i_model, model in enumerate(models_to_update):
        self.reptile_step_state_dict(model, initial_state_dicts[i_model])

    # Increase update counter
    self._n_updates += gradient_steps

    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/loss", np.mean(losses))
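# A minimal sketch of the Reptile meta-update performed by
# `reptile_step_state_dict` above (assumption: it interpolates the trained
# weights back toward the saved initial weights,
# w <- w_init + gamma * (w_trained - w_init); the standalone form, the
# `reptile_gamma` rate, and all-float state dicts are illustrative
# assumptions, the real method lives on the agent and may differ):
def reptile_step_state_dict_sketch(model: th.nn.Module,
                                   initial_state_dict: dict,
                                   reptile_gamma: float) -> None:
    trained_state_dict = model.state_dict()
    new_state_dict = {
        key: initial_param + reptile_gamma * (trained_state_dict[key] - initial_param)
        for key, initial_param in initial_state_dict.items()
    }
    model.load_state_dict(new_state_dict)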
def train_mer(self, gradient_steps: int, batch_size: int = 64) -> None:
    """
    MER (Meta-Experience Replay) training loop for SAC: perform
    ``gradient_steps`` actor/critic updates, one of them on the current
    experience (with the learning rate scaled by ``self.mer_s``), then
    apply a Reptile step to the actor, the critic and the entropy
    coefficient with respect to the values saved at the start of the loop.
    """
    optimizers = [self.actor.optimizer, self.critic.optimizer]
    if self.ent_coef_optimizer is not None:
        optimizers += [self.ent_coef_optimizer]
    optimizers_to_reset = optimizers

    # Reset optimizers: re-initializing in place clears the internal state
    # (e.g. Adam moments) while keeping the same parameters
    for i_optimizer, optimizer in enumerate(optimizers_to_reset):
        optimizer.__init__(optimizer.param_groups[0]["params"])
        optimizers[i_optimizer] = optimizer

    # Update optimizers' learning rate
    base_lr = self.lr_schedule(self._current_progress_remaining)

    ent_coef_losses, ent_coefs = [], []
    actor_losses, critic_losses = [], []

    # Save initial weights of model - for actor and critic but not for critic_target
    models_to_update = [self.actor, self.critic]
    initial_state_dicts = [deepcopy(model.state_dict()) for model in models_to_update]
    initial_log_ent_coef = self.log_ent_coef.detach().clone()

    current_example_ind = randint(0, gradient_steps - 1)
    for gradient_step in range(gradient_steps):
        # Sample replay buffer or current example, and update learning rate accordingly
        if gradient_step == current_example_ind:
            replay_data = self.current_experience_buffer.sample(1, env=self._vec_normalize_env)
            for optimizer in optimizers:
                update_learning_rate(optimizer, base_lr * self.mer_s)
        else:
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
            for optimizer in optimizers:
                update_learning_rate(optimizer, base_lr)

        # We need to sample because `log_std` may have changed between two gradient steps
        if self.use_sde:
            self.actor.reset_noise()

        # Action by the current actor for the sampled state
        actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
        log_prob = log_prob.reshape(-1, 1)

        ent_coef_loss = None
        if self.ent_coef_optimizer is not None:
            # Important: detach the variable from the graph
            # so we don't change it with other losses
            # see https://github.com/rail-berkeley/softlearning/issues/60
            ent_coef = th.exp(self.log_ent_coef.detach())
            ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
            ent_coef_losses.append(ent_coef_loss.item())
        else:
            ent_coef = self.ent_coef_tensor

        ent_coefs.append(ent_coef.item())

        # Optimize entropy coefficient, also called
        # entropy temperature or alpha in the paper
        if ent_coef_loss is not None:
            self.ent_coef_optimizer.zero_grad()
            ent_coef_loss.backward()
            self.ent_coef_optimizer.step()

        with th.no_grad():
            # Select action according to policy
            next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
            # Compute the target Q value: min over all critics targets
            targets = th.cat(self.critic_target(replay_data.next_observations, next_actions), dim=1)
            target_q, _ = th.min(targets, dim=1, keepdim=True)
            # Add entropy term
            target_q = target_q - ent_coef * next_log_prob.reshape(-1, 1)
            # TD error + entropy term
            q_backup = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q

        # Get current Q estimates for each critic network
        # using action from the replay buffer
        current_q_estimates = self.critic(replay_data.observations, replay_data.actions)

        # Compute critic loss
        critic_loss = 0.5 * sum([F.mse_loss(current_q, q_backup) for current_q in current_q_estimates])
        critic_losses.append(critic_loss.item())

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # Compute actor loss
        # Alternative: actor_loss = th.mean(log_prob - qf1_pi)
        # Mean over all critic networks
        q_values_pi = th.cat(self.critic.forward(replay_data.observations, actions_pi), dim=1)
        min_qf_pi, _ = th.min(q_values_pi, dim=1, keepdim=True)
        actor_loss = (ent_coef * log_prob - min_qf_pi).mean()
        actor_losses.append(actor_loss.item())

        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    # Perform Reptile step
    for i_model, model in enumerate(models_to_update):
        self.reptile_step_state_dict(model, initial_state_dicts[i_model])
    self.log_ent_coef.data = self.reptile_step_tensor(self.log_ent_coef.data, initial_log_ent_coef.data)

    # Update target networks
    polyak_update(self.critic.parameters(), self.critic_target.parameters(), self.tau)

    self._n_updates += gradient_steps

    logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
    logger.record("train/ent_coef", np.mean(ent_coefs))
    logger.record("train/actor_loss", np.mean(actor_losses))
    logger.record("train/critic_loss", np.mean(critic_losses))
    if len(ent_coef_losses) > 0:
        logger.record("train/ent_coef_loss", np.mean(ent_coef_losses))
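# The tensor variant used for `log_ent_coef` above can be sketched the same
# way as the state-dict version (assumption: `reptile_step_tensor` performs
# the same interpolation, with `reptile_gamma` again a hypothetical rate):
def reptile_step_tensor_sketch(current: th.Tensor, initial: th.Tensor,
                               reptile_gamma: float) -> th.Tensor:
    # Move the trained value part of the way back toward its initial value
    return initial + reptile_gamma * (current - initial)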