def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        log_sigma = self.log_sigma
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    else:
        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand so that entropy matches batch size
        log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    else:
        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand so that entropy matches batch size. Note that we're using
        # torch.cat here instead of torch.expand() because expand is not
        # supported in the verified version of Barracuda (1.0.2).
        log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    else:
        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
def forward(self, inputs: torch.Tensor) -> DistInstance:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand so that entropy matches batch size. We use mu * 0 to pick up
        # the batch size implicitly, since Barracuda 1.2.1 throws an error on
        # runtime broadcasting for an unknown reason. This replaces
        # torch.expand(), which is not supported in the verified version of
        # Barracuda (1.0.X).
        log_sigma = mu * 0 + self.log_sigma
    if self.tanh_squash:
        return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
    else:
        return GaussianDistInstance(mu, torch.exp(log_sigma))
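# The snippet below is a self-contained sketch in plain torch (not the ML-Agents
# DistInstance classes) illustrating the two ideas in the forward() variants above:
# broadcasting a shared log_sigma to the batch size via `mu * 0 + log_sigma`, and
# correcting the log-probability when a Gaussian sample is squashed with tanh.
# All names and shapes here are illustrative assumptions.
import torch
from torch.distributions import Normal

batch_size, act_size = 4, 2
mu = torch.randn(batch_size, act_size)
log_sigma = torch.zeros(1, act_size)  # one shared sigma per action dimension

# Broadcast trick: mu * 0 already has the batch shape, so adding log_sigma
# expands it without torch.expand(), as in the comment above.
log_sigma_batched = mu * 0 + log_sigma
assert log_sigma_batched.shape == mu.shape

dist = Normal(mu, torch.exp(log_sigma_batched))
unsquashed = dist.sample()
squashed = torch.tanh(unsquashed)

# Change-of-variables correction for the tanh squash:
# log p(squashed) = log p(unsquashed) - log(1 - tanh(u)^2)
log_prob = dist.log_prob(unsquashed) - torch.log(1.0 - squashed.pow(2) + 1e-6)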
def ppo_policy_loss(
    self,
    advantages: torch.Tensor,
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    loss_masks: torch.Tensor,
) -> torch.Tensor:
    """
    Evaluate PPO policy loss.
    :param advantages: Computed advantages.
    :param log_probs: Current policy probabilities
    :param old_log_probs: Past policy probabilities
    :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
    """
    advantage = advantages.unsqueeze(-1)
    decay_epsilon = self.hyperparameters.epsilon
    r_theta = torch.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = (
        torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
    )
    policy_loss = -1 * ModelUtils.masked_mean(torch.min(p_opt_a, p_opt_b), loss_masks)
    return policy_loss
def sac_policy_loss(
    self,
    log_probs: torch.Tensor,
    q1p_outs: Dict[str, torch.Tensor],
    loss_masks: torch.Tensor,
    discrete: bool,
) -> torch.Tensor:
    _ent_coef = torch.exp(self._log_ent_coef)
    mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0)
    if not discrete:
        mean_q1 = mean_q1.unsqueeze(1)
        batch_policy_loss = torch.mean(_ent_coef * log_probs - mean_q1, dim=1)
        policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
    else:
        action_probs = log_probs.exp()
        branched_per_action_ent = ModelUtils.break_into_branches(
            log_probs * action_probs, self.act_size
        )
        branched_q_term = ModelUtils.break_into_branches(
            mean_q1 * action_probs, self.act_size
        )
        branched_policy_loss = torch.stack(
            [
                torch.sum(_ent_coef[i] * _lp - _qt, dim=1, keepdim=True)
                for i, (_lp, _qt) in enumerate(
                    zip(branched_per_action_ent, branched_q_term)
                )
            ],
            dim=1,
        )
        batch_policy_loss = torch.squeeze(branched_policy_loss)
        policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks)
    return policy_loss
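# Minimal sketch (toy tensors, not the optimizer class) of the continuous branch
# above: the policy is pushed toward actions with high Q while paying an entropy
# price of ent_coef * log_prob. The numbers are made up, and masked_mean is
# re-implemented here in its simplest form purely for illustration.
import torch

ent_coef = torch.tensor(0.2)
log_probs = torch.tensor([[-1.2], [-0.3], [-2.0]])   # log pi(a|s) for sampled actions
mean_q1 = torch.tensor([[0.5], [1.5], [-0.2]])       # Q estimate for those actions
loss_masks = torch.tensor([True, True, False])       # last step is padding

batch_policy_loss = torch.mean(ent_coef * log_probs - mean_q1, dim=1)
policy_loss = (batch_policy_loss * loss_masks).sum() / loss_masks.sum()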
def trust_region_policy_loss(
    advantages: torch.Tensor,
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    loss_masks: torch.Tensor,
    epsilon: float,
) -> torch.Tensor:
    """
    Evaluate policy loss clipped to stay within a trust region. Used for PPO and POCA.
    :param advantages: Computed advantages.
    :param log_probs: Current policy probabilities
    :param old_log_probs: Past policy probabilities
    :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
    :param epsilon: Clipping value for the probability ratio.
    """
    advantage = advantages.unsqueeze(-1)
    r_theta = torch.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
    policy_loss = -1 * ModelUtils.masked_mean(torch.min(p_opt_a, p_opt_b), loss_masks)
    return policy_loss
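# The toy example below (made-up numbers, plain torch) walks through the clipped
# surrogate above: torch.min always keeps the more pessimistic of the clipped and
# unclipped objectives, so once the probability ratio leaves [1 - epsilon, 1 + epsilon]
# it stops improving the loss.
import torch

epsilon = 0.2
advantages = torch.tensor([1.0, -1.0])
log_probs = torch.tensor([[0.1], [0.3]])
old_log_probs = torch.tensor([[0.0], [0.0]])

advantage = advantages.unsqueeze(-1)
r_theta = torch.exp(log_probs - old_log_probs)   # ratios ~1.105 and ~1.350
p_opt_a = r_theta * advantage
p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
policy_loss = -torch.min(p_opt_a, p_opt_b).mean()   # masked_mean would also drop padded steps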
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    :param num_sequences: Number of trajectories in batch.
    :param batch: Experience mini-batch.
    :return: Output from update process.
    """
    rewards = {}
    for name in self.reward_signals:
        rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
    n_obs = len(self.policy.behavior_spec.sensor_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    next_obs = ObsUtil.from_buffer_next(batch, n_obs)
    # Convert to tensors
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
    actions = AgentAction.from_dict(batch)
    memories_list = [
        ModelUtils.list_to_tensor(batch["memory"][i])
        for i in range(0, len(batch["memory"]), self.policy.sequence_length)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    offset = 1 if self.policy.sequence_length > 1 else 0
    next_memories_list = [
        ModelUtils.list_to_tensor(
            batch["memory"][i][self.policy.m_size // 2 :]
        )  # only pass value part of memory to target network
        for i in range(offset, len(batch["memory"]), self.policy.sequence_length)
    ]
    if len(memories_list) > 0:
        memories = torch.stack(memories_list).unsqueeze(0)
        next_memories = torch.stack(next_memories_list).unsqueeze(0)
    else:
        memories = None
        next_memories = None
    # Q network memories are 0'ed out, since we don't have them during inference.
    q_memories = (
        torch.zeros_like(next_memories) if next_memories is not None else None
    )
    # Copy normalizers from policy
    self.value_network.q1_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    self.value_network.q2_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    self.target_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    (
        sampled_actions,
        log_probs,
        _,
        value_estimates,
        _,
    ) = self.policy.actor_critic.get_action_stats_and_value(
        current_obs,
        masks=act_masks,
        memories=memories,
        sequence_length=self.policy.sequence_length,
    )
    cont_sampled_actions = sampled_actions.continuous_tensor
    cont_actions = actions.continuous_tensor
    q1p_out, q2p_out = self.value_network(
        current_obs,
        cont_sampled_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
        q2_grad=False,
    )
    q1_out, q2_out = self.value_network(
        current_obs,
        cont_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
    )
    if self._action_spec.discrete_size > 0:
        disc_actions = actions.discrete_tensor
        q1_stream = self._condense_q_streams(q1_out, disc_actions)
        q2_stream = self._condense_q_streams(q2_out, disc_actions)
    else:
        q1_stream, q2_stream = q1_out, q2_out
    with torch.no_grad():
        target_values, _ = self.target_network(
            next_obs,
            memories=next_memories,
            sequence_length=self.policy.sequence_length,
        )
    masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
    dones = ModelUtils.list_to_tensor(batch["done"])
    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
    entropy_loss = self.sac_entropy_loss(log_probs, masks)
    total_value_loss = q1_loss + q2_loss + value_loss
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()
    ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
    self.value_optimizer.zero_grad()
    total_value_loss.backward()
    self.value_optimizer.step()
    ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
    self.entropy_optimizer.zero_grad()
    entropy_loss.backward()
    self.entropy_optimizer.step()
    # Update target network
    ModelUtils.soft_update(
        self.policy.actor_critic.critic, self.target_network, self.tau
    )
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Discrete Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.discrete)
        ).item(),
        "Policy/Continuous Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.continuous)
        ).item(),
        "Policy/Learning Rate": decay_lr,
    }
    return update_stats
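# The update above ends with ModelUtils.soft_update blending target and live
# critic weights. A minimal stand-alone version of that Polyak-style update
# (assumed behavior, written here only for illustration) looks like this:
import torch

def soft_update_sketch(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    """Move each target parameter a fraction tau toward the matching source parameter."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * s_param.data)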
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    :param num_sequences: Number of trajectories in batch.
    :param batch: Experience mini-batch.
    :return: Output from update process.
    """
    rewards = {}
    for name in self.reward_signals:
        rewards[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.rewards_key(name)]
        )
    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    next_obs = ObsUtil.from_buffer_next(batch, n_obs)
    # Convert to tensors
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)
    memories_list = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    value_memories_list = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]
    if len(memories_list) > 0:
        memories = torch.stack(memories_list).unsqueeze(0)
        value_memories = torch.stack(value_memories_list).unsqueeze(0)
    else:
        memories = None
        value_memories = None
    # Q and V network memories are 0'ed out, since we don't have them during inference.
    q_memories = (
        torch.zeros_like(value_memories) if value_memories is not None else None
    )
    # Copy normalizers from policy
    self.q_network.q1_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self.q_network.q2_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self.target_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self._critic.network_body.copy_normalization(self.policy.actor.network_body)
    sampled_actions, log_probs, _, _ = self.policy.actor.get_action_and_stats(
        current_obs,
        masks=act_masks,
        memories=memories,
        sequence_length=self.policy.sequence_length,
    )
    value_estimates, _ = self._critic.critic_pass(
        current_obs, value_memories, sequence_length=self.policy.sequence_length
    )
    cont_sampled_actions = sampled_actions.continuous_tensor
    cont_actions = actions.continuous_tensor
    q1p_out, q2p_out = self.q_network(
        current_obs,
        cont_sampled_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
        q2_grad=False,
    )
    q1_out, q2_out = self.q_network(
        current_obs,
        cont_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
    )
    if self._action_spec.discrete_size > 0:
        disc_actions = actions.discrete_tensor
        q1_stream = self._condense_q_streams(q1_out, disc_actions)
        q2_stream = self._condense_q_streams(q2_out, disc_actions)
    else:
        q1_stream, q2_stream = q1_out, q2_out
    with torch.no_grad():
        # Since we didn't record the next value memories, evaluate one step in the critic to
        # get them.
        if value_memories is not None:
            # Get the first observation in each sequence
            just_first_obs = [
                _obs[:: self.policy.sequence_length] for _obs in current_obs
            ]
            _, next_value_memories = self._critic.critic_pass(
                just_first_obs, value_memories, sequence_length=1
            )
        else:
            next_value_memories = None
        target_values, _ = self.target_network(
            next_obs,
            memories=next_value_memories,
            sequence_length=self.policy.sequence_length,
        )
    masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE])
    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
    entropy_loss = self.sac_entropy_loss(log_probs, masks)
    total_value_loss = q1_loss + q2_loss
    if self.policy.shared_critic:
        policy_loss += value_loss
    else:
        total_value_loss += value_loss
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()
    ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
    self.value_optimizer.zero_grad()
    total_value_loss.backward()
    self.value_optimizer.step()
    ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
    self.entropy_optimizer.zero_grad()
    entropy_loss.backward()
    self.entropy_optimizer.step()
    # Update target network
    ModelUtils.soft_update(self._critic, self.target_network, self.tau)
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Discrete Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.discrete)
        ).item(),
        "Policy/Continuous Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.continuous)
        ).item(),
        "Policy/Learning Rate": decay_lr,
    }
    return update_stats
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    :param num_sequences: Number of trajectories in batch.
    :param batch: Experience mini-batch.
    :return: Output from update process.
    """
    rewards = {}
    for name in self.reward_signals:
        rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
    vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
    next_vec_obs = [ModelUtils.list_to_tensor(batch["next_vector_in"])]
    act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
    if self.policy.use_continuous_act:
        actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
    else:
        actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
    memories_list = [
        ModelUtils.list_to_tensor(batch["memory"][i])
        for i in range(0, len(batch["memory"]), self.policy.sequence_length)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    offset = 1 if self.policy.sequence_length > 1 else 0
    next_memories_list = [
        ModelUtils.list_to_tensor(
            batch["memory"][i][self.policy.m_size // 2 :]
        )  # only pass value part of memory to target network
        for i in range(offset, len(batch["memory"]), self.policy.sequence_length)
    ]
    if len(memories_list) > 0:
        memories = torch.stack(memories_list).unsqueeze(0)
        next_memories = torch.stack(next_memories_list).unsqueeze(0)
    else:
        memories = None
        next_memories = None
    # Q network memories are 0'ed out, since we don't have them during inference.
    q_memories = (
        torch.zeros_like(next_memories) if next_memories is not None else None
    )
    vis_obs: List[torch.Tensor] = []
    next_vis_obs: List[torch.Tensor] = []
    if self.policy.use_vis_obs:
        vis_obs = []
        for idx, _ in enumerate(
            self.policy.actor_critic.network_body.visual_processors
        ):
            vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
            vis_obs.append(vis_ob)
            next_vis_ob = ModelUtils.list_to_tensor(batch["next_visual_obs%d" % idx])
            next_vis_obs.append(next_vis_ob)
    # Copy normalizers from policy
    self.value_network.q1_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    self.value_network.q2_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    self.target_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    (sampled_actions, _, log_probs, _, _) = self.policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=self.policy.sequence_length,
        all_log_probs=not self.policy.use_continuous_act,
    )
    value_estimates, _ = self.policy.actor_critic.critic_pass(
        vec_obs, vis_obs, memories, sequence_length=self.policy.sequence_length
    )
    if self.policy.use_continuous_act:
        squeezed_actions = actions.squeeze(-1)
        # Only need grad for q1, as that is used for policy.
        q1p_out, q2p_out = self.value_network(
            vec_obs,
            vis_obs,
            sampled_actions,
            memories=q_memories,
            sequence_length=self.policy.sequence_length,
            q2_grad=False,
        )
        q1_out, q2_out = self.value_network(
            vec_obs,
            vis_obs,
            squeezed_actions,
            memories=q_memories,
            sequence_length=self.policy.sequence_length,
        )
        q1_stream, q2_stream = q1_out, q2_out
    else:
        # For discrete, you don't need to backprop through the Q for the policy
        q1p_out, q2p_out = self.value_network(
            vec_obs,
            vis_obs,
            memories=q_memories,
            sequence_length=self.policy.sequence_length,
            q1_grad=False,
            q2_grad=False,
        )
        q1_out, q2_out = self.value_network(
            vec_obs,
            vis_obs,
            memories=q_memories,
            sequence_length=self.policy.sequence_length,
        )
        q1_stream = self._condense_q_streams(q1_out, actions)
        q2_stream = self._condense_q_streams(q2_out, actions)
    with torch.no_grad():
        target_values, _ = self.target_network(
            next_vec_obs,
            next_vis_obs,
            memories=next_memories,
            sequence_length=self.policy.sequence_length,
        )
    masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
    use_discrete = not self.policy.use_continuous_act
    dones = ModelUtils.list_to_tensor(batch["done"])
    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks, use_discrete
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks, use_discrete)
    entropy_loss = self.sac_entropy_loss(log_probs, masks, use_discrete)
    total_value_loss = q1_loss + q2_loss + value_loss
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()
    ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
    self.value_optimizer.zero_grad()
    total_value_loss.backward()
    self.value_optimizer.step()
    ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
    self.entropy_optimizer.zero_grad()
    entropy_loss.backward()
    self.entropy_optimizer.step()
    # Update target network
    ModelUtils.soft_update(
        self.policy.actor_critic.critic, self.target_network, self.tau
    )
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Entropy Coeff": torch.mean(torch.exp(self._log_ent_coef)).item(),
        "Policy/Learning Rate": decay_lr,
    }
    return update_stats
def sac_value_loss(
    self,
    log_probs: torch.Tensor,
    values: Dict[str, torch.Tensor],
    q1p_out: Dict[str, torch.Tensor],
    q2p_out: Dict[str, torch.Tensor],
    loss_masks: torch.Tensor,
    discrete: bool,
) -> torch.Tensor:
    min_policy_qs = {}
    with torch.no_grad():
        _ent_coef = torch.exp(self._log_ent_coef)
        for name in values.keys():
            if not discrete:
                min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name])
            else:
                action_probs = log_probs.exp()
                _branched_q1p = ModelUtils.break_into_branches(
                    q1p_out[name] * action_probs, self.act_size
                )
                _branched_q2p = ModelUtils.break_into_branches(
                    q2p_out[name] * action_probs, self.act_size
                )
                _q1p_mean = torch.mean(
                    torch.stack(
                        [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q1p]
                    ),
                    dim=0,
                )
                _q2p_mean = torch.mean(
                    torch.stack(
                        [torch.sum(_br, dim=1, keepdim=True) for _br in _branched_q2p]
                    ),
                    dim=0,
                )
                min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean)
    value_losses = []
    if not discrete:
        for name in values.keys():
            with torch.no_grad():
                v_backup = min_policy_qs[name] - torch.sum(
                    _ent_coef * log_probs, dim=1
                )
            value_loss = 0.5 * ModelUtils.masked_mean(
                torch.nn.functional.mse_loss(values[name], v_backup), loss_masks
            )
            value_losses.append(value_loss)
    else:
        branched_per_action_ent = ModelUtils.break_into_branches(
            log_probs * log_probs.exp(), self.act_size
        )
        # We have to do entropy bonus per action branch
        branched_ent_bonus = torch.stack(
            [
                torch.sum(_ent_coef[i] * _lp, dim=1, keepdim=True)
                for i, _lp in enumerate(branched_per_action_ent)
            ]
        )
        for name in values.keys():
            with torch.no_grad():
                v_backup = min_policy_qs[name] - torch.mean(branched_ent_bonus, axis=0)
            value_loss = 0.5 * ModelUtils.masked_mean(
                torch.nn.functional.mse_loss(values[name], v_backup.squeeze()),
                loss_masks,
            )
            value_losses.append(value_loss)
    value_loss = torch.mean(torch.stack(value_losses))
    if torch.isinf(value_loss).any() or torch.isnan(value_loss).any():
        raise UnityTrainerException("Inf found")
    return value_loss
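# Toy illustration (made-up numbers, plain torch) of the continuous-action value
# target used above: the V network regresses toward the entropy-augmented backup
# min(Q1, Q2) - sum(ent_coef * log_prob), computed without gradients.
import torch

ent_coef = torch.tensor([0.2])
log_probs = torch.tensor([[-1.0, -0.5], [-2.0, -0.1]])   # per-dimension log pi
q1p = torch.tensor([1.0, 0.4])
q2p = torch.tensor([0.8, 0.6])
values = torch.tensor([0.7, 0.5])                        # current V(s) estimates

with torch.no_grad():
    min_policy_q = torch.min(q1p, q2p)                   # [0.8, 0.4]
    v_backup = min_policy_q - torch.sum(ent_coef * log_probs, dim=1)
value_loss = 0.5 * torch.nn.functional.mse_loss(values, v_backup)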
def pdf(self, value):
    log_prob = self.log_prob(value)
    return torch.exp(log_prob)
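# Quick check of the pdf() helper above using a plain torch Normal: exponentiating
# log_prob recovers the density, e.g. 1 / sqrt(2 * pi) ≈ 0.3989 for a standard
# normal evaluated at 0. Names below are illustrative only.
import torch
from torch.distributions import Normal

dist = Normal(torch.tensor(0.0), torch.tensor(1.0))
value = torch.tensor(0.0)
pdf_value = torch.exp(dist.log_prob(value))   # ≈ 0.3989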