def process_returns(self, samples):
    """
    Compute bootstrapped returns and advantages from a minibatch of samples.
    Uses either discounted returns (if ``self.gae_lambda == 1``) or
    generalized advantage estimation.  Mask out invalid samples according to
    ``mid_batch_reset`` or for recurrent agents.  Optionally, normalize
    advantages.
    """
    reward, done, value, bv = (samples.env.reward, samples.env.done,
        samples.agent.agent_info.value, samples.agent.bootstrap_value)
    done = done.type(reward.dtype)

    if self.normalize_rewards is not None:
        # Normalize and clip rewards before computing advantage.
        if self.normalize_rewards == 'return':
            return_ = discount_return(reward, done, 0., self.discount,
                return_dest=torch.zeros_like(reward))  # No bootstrapping of value.
            reward = self.ret_rms(reward, center=False)
        else:
            reward = self.ret_rms(reward)

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount,
            return_dest=torch.zeros_like(reward))
        advantage = return_ - value
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, value, done, bv, self.discount, self.gae_lambda,
            return_dest=torch.zeros_like(reward),
            advantage_dest=torch.zeros_like(reward))

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage, valid

def process_returns(self, samples):
    reward, done, value, bv = (samples.env.reward, samples.env.done,
        samples.agent.agent_info.value, samples.agent.bootstrap_value)
    done = done.type(reward.dtype)

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount)
        advantage = return_ - value
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, value, done, bv, self.discount, self.gae_lambda)

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage, valid

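# The helpers discount_return(), generalized_advantage_estimation() and
# valid_from_done() used throughout these methods are rlpyt-style algo
# utilities and are not defined here.  Below is a minimal sketch of the
# computation the first two typically perform, assuming time-major
# [T, B, ...] float tensors; the library versions also accept optional
# return_dest / advantage_dest output tensors and may differ in detail, so
# treat these sketches as illustrative only.
import torch


def discount_return_sketch(reward, done, bootstrap_value, discount):
    """Discounted return with bootstrapping, cut off at done flags."""
    not_done = 1. - done
    return_ = torch.zeros_like(reward)
    return_[-1] = reward[-1] + discount * bootstrap_value * not_done[-1]
    for t in reversed(range(len(reward) - 1)):
        return_[t] = reward[t] + discount * return_[t + 1] * not_done[t]
    return return_


def gae_sketch(reward, value, done, bootstrap_value, discount, gae_lambda):
    """Generalized advantage estimation (Schulman et al., 2016)."""
    not_done = 1. - done
    advantage = torch.zeros_like(reward)
    advantage[-1] = (reward[-1] + discount * bootstrap_value * not_done[-1]
        - value[-1])
    for t in reversed(range(len(reward) - 1)):
        delta = reward[t] + discount * value[t + 1] * not_done[t] - value[t]
        advantage[t] = delta + discount * gae_lambda * not_done[t] * advantage[t + 1]
    return advantage, advantage + value  # (advantage, return_)
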
def process_intrinsic_returns(self, int_rew, int_val, int_bootstrap_value):
    """
    Same as ``process_returns``, but the discounted reward signal is carried
    over episodes.  Note that ``int_val`` and ``int_bootstrap_value`` should
    come from a critic model separate from the one used for extrinsic
    rewards, to keep these reward streams distinct.  For more details, see
    https://arxiv.org/abs/1810.12894.
    """
    faux_done = torch.zeros_like(int_rew)  # Faux done signals, all "not done".

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(int_rew, faux_done, int_bootstrap_value,
            self.int_discount)
        advantage = return_ - int_val
    else:
        advantage, return_ = generalized_advantage_estimation(
            int_rew, int_val, faux_done, int_bootstrap_value,
            self.int_discount, self.gae_lambda)

    if self.normalize_advantage:
        adv_mean = advantage.mean()
        adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage

def process_extrinsic_returns(self, ext_rew, done, ext_val, ext_bootstrap_value):
    """
    Identical to ``process_returns``, but expects the relevant fields to have
    been extracted from the samples and passed as parameters, since some
    buffer names changed (e.g. ``value`` to ``ext_value``).  This also
    provides greater flexibility, for example to clip rewards before entering
    this function.
    """
    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(ext_rew, done, ext_bootstrap_value,
            self.discount)
        advantage = return_ - ext_val
    else:
        advantage, return_ = generalized_advantage_estimation(
            ext_rew, ext_val, done, ext_bootstrap_value, self.discount,
            self.gae_lambda)

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage, valid

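# valid_from_done(), used above whenever mid_batch_reset is False or the agent
# is recurrent, builds a float mask that is 1 up to and including the first
# done in each batch column and 0 for every later time step, so padding after
# an episode ends does not contribute to the loss.  A sketch of that behavior
# (illustrative, not necessarily the exact library code):
import torch


def valid_from_done_sketch(done):
    done = done.type(torch.float)
    valid = torch.ones_like(done)
    # Once a done has occurred in a column, all subsequent steps are invalid.
    valid[1:] = 1. - torch.clamp(torch.cumsum(done[:-1], dim=0), max=1.)
    return valid
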
def process_returns(self, samples):
    """
    Compute bootstrapped returns and advantages from a minibatch of samples.
    Uses either discounted returns (if ``self.gae_lambda == 1``) or
    generalized advantage estimation.  Mask out invalid samples according to
    ``mid_batch_reset`` or for recurrent agents.  Optionally, normalize
    advantages.
    """
    reward, done, value, bv, discounted_return = (
        samples.env.reward, samples.env.done,
        samples.agent.agent_info.value, samples.agent.bootstrap_value,
        samples.env.discounted_return)
    done = done.type(reward.dtype)

    # Track running returns and scale rewards by their running std.
    if self.rets is None:
        self.rets = np.zeros(len(reward))
    self.rets = discounted_return.numpy() + reward.numpy()
    self.ret_rms.update(self.rets)
    self.rets[done.numpy().astype(bool)] = 0  # Boolean mask: reset returns where episodes ended.
    pre_reward = reward
    reward = torch.div(reward, np.mean(np.sqrt(self.ret_rms.var + 1e-8)))

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount)
        advantage = return_ - value
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, value, done, bv, self.discount, self.gae_lambda)

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage, valid, value, reward, pre_reward

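# The ret_rms object used just above (and the reward_rms / int_reward_rms
# objects in the curiosity variant further below) expose .update(),
# .update_from_moments() and .var, which matches the baselines-style
# RunningMeanStd.  A minimal sketch under that assumption; the actual class in
# the repo may differ, and the ret_rms called as a module elsewhere in this
# file (e.g. ``self.ret_rms(reward, center=False)``) is a different interface.
import numpy as np


class RunningMeanStdSketch:
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # Parallel (Chan et al.) update of mean/variance from batch moments.
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total
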
def process_returns(self, reward, done, value_prediction, action, dist_info,
        old_dist_info, opt_info):
    done = done.type(reward.dtype)

    if self.pop_art_reward_normalization:
        unnormalized_value = value_prediction
        value_prediction, normalized_value = self.pop_art_normalizer(
            value_prediction)

    bootstrap_value = value_prediction[-1]
    reward, value_prediction, done = (reward[:-1], value_prediction[:-1],
        done[:-1])
    return_ = discount_return(reward, done, bootstrap_value.detach(),
        self.discount)

    if self.pop_art_reward_normalization:
        self.pop_art_normalizer.update_parameters(
            return_.unsqueeze(-1), torch.ones_like(return_.unsqueeze(-1)))
        _, normalized_value = self.pop_art_normalizer(unnormalized_value[:-1])
        return_ = self.pop_art_normalizer.normalize(return_)
        advantage = return_ - normalized_value.detach()
        value_prediction = normalized_value
        opt_info.normalized_return.append(return_.numpy())
    else:
        advantage = return_ - value_prediction.detach()

    valid = valid_from_done(done)  # Recurrent: no reset during training.
    opt_info.advantage.append(advantage.numpy())

    loss, opt_info = self.loss(
        dist_info=dist_info[:-1],
        value=value_prediction,
        action=action[:-1],
        return_=return_,
        advantage=advantage.detach(),
        valid=valid,
        old_dist_info=old_dist_info[:-1],
        opt_info=opt_info)
    return loss, opt_info

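# The pop_art_normalizer above applies PopArt (van Hasselt et al., 2016):
# value targets are normalized with running statistics, and whenever those
# statistics change the final value layer is rescaled so its unnormalized
# outputs are preserved.  The sketch below shows the core update only; the
# class name, method names and signatures are illustrative and do not match
# the pop_art_normalizer object used above.
import torch
import torch.nn as nn


class PopArtValueHeadSketch(nn.Module):
    def __init__(self, in_features, beta=3e-4):
        super().__init__()
        self.linear = nn.Linear(in_features, 1)  # Predicts the *normalized* value.
        self.beta = beta
        self.register_buffer("mu", torch.zeros(1))  # Running first moment.
        self.register_buffer("nu", torch.ones(1))   # Running second moment.

    @property
    def sigma(self):
        return (self.nu - self.mu ** 2).clamp(min=1e-4).sqrt()

    def forward(self, features):
        normalized = self.linear(features)
        return normalized * self.sigma + self.mu, normalized  # (unnormalized, normalized)

    def normalize(self, x):
        return (x - self.mu) / self.sigma

    @torch.no_grad()
    def update_stats(self, targets):
        old_mu, old_sigma = self.mu.clone(), self.sigma.clone()
        self.mu.lerp_(targets.mean().reshape(1), self.beta)
        self.nu.lerp_((targets ** 2).mean().reshape(1), self.beta)
        # Preserve outputs: w <- w * sigma_old / sigma_new,
        #                   b <- (sigma_old * b + mu_old - mu_new) / sigma_new.
        self.linear.weight.mul_(old_sigma / self.sigma)
        self.linear.bias.mul_(old_sigma).add_(old_mu - self.mu).div_(self.sigma)
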
def process_returns(self, samples):
    """
    Compute bootstrapped returns and advantages from a minibatch of samples.
    Uses either discounted returns (if ``self.gae_lambda == 1``) or
    generalized advantage estimation.  Mask out invalid samples according to
    ``mid_batch_reset`` or for recurrent agents.  Optionally, normalize
    advantages.
    """
    reward, done, value, bv = (samples.env.reward, samples.env.done,
        samples.agent.agent_info.value, samples.agent.bootstrap_value)
    # reward = reward.squeeze(-1)  #! The trailing singleton dimension was causing an issue.
    done = done.type(reward.dtype)

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount)
        advantage = return_ - value
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, value, done, bv, self.discount, self.gae_lambda)

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage, valid

def process_returns(self, samples):
    """
    Compute bootstrapped returns and advantages from a minibatch of samples.
    Uses either discounted returns (if ``self.gae_lambda == 1``) or
    generalized advantage estimation.  Mask out invalid samples according to
    ``mid_batch_reset`` or for recurrent agents.  Optionally, normalize
    advantages.
    """
    if self.agent.dual_model:
        reward, done, value, bv, int_value, int_bv = (
            samples.env.reward, samples.env.done,
            samples.agent.agent_info.value, samples.agent.bootstrap_value,
            samples.agent.agent_info.int_value,
            samples.agent.int_bootstrap_value)
    else:
        reward, done, value, bv = (samples.env.reward, samples.env.done,
            samples.agent.agent_info.value, samples.agent.bootstrap_value)
    done = done.type(reward.dtype)

    # Compute intrinsic rewards from the curiosity model (no grad).
    if self.curiosity_type in {'icm', 'disagreement', 'micm'}:
        intrinsic_rewards, _ = self.agent.curiosity_step(
            self.curiosity_type,
            samples.env.observation.clone(),
            samples.env.next_observation.clone(),
            samples.agent.action.clone())
    elif self.curiosity_type == 'ndigo':
        intrinsic_rewards, _ = self.agent.curiosity_step(
            self.curiosity_type,
            samples.env.observation.clone(),
            samples.agent.prev_action.clone(),
            samples.agent.action.clone())
    elif self.curiosity_type == 'rnd':
        intrinsic_rewards, _ = self.agent.curiosity_step(
            self.curiosity_type,
            samples.env.next_observation.clone(),
            done.clone())

    if self.curiosity_type in {'icm', 'disagreement', 'micm', 'ndigo', 'rnd'}:
        intrinsic_rewards_logging = intrinsic_rewards.clone().data.numpy()
        self.intrinsic_rewards = intrinsic_rewards_logging
        self.extint_ratio = reward.clone().data.numpy() / (intrinsic_rewards_logging + 1e-15)
        if self.agent.dual_model:
            int_reward = intrinsic_rewards
        else:
            reward += intrinsic_rewards

    # Optionally scale rewards by the running std of the filtered returns.
    if self.normalize_reward:
        rews = np.array([])
        for rew in reward.clone().detach().data.numpy():
            rews = np.concatenate((rews, self.reward_ff.update(rew)))
        self.reward_rms.update_from_moments(np.mean(rews), np.var(rews), len(rews))
        reward = reward / np.sqrt(self.reward_rms.var)

        if self.agent.dual_model:
            int_rews = np.array([])
            for int_rew in int_reward.clone().detach().data.numpy():
                int_rews = np.concatenate((int_rews, self.int_reward_ff.update(int_rew)))
            self.int_reward_rms.update_from_moments(np.mean(int_rews), np.var(int_rews), len(int_rews))
            int_reward = int_reward / np.sqrt(self.int_reward_rms.var)

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount)
        advantage = return_ - value
        if self.agent.dual_model:
            # Use the intrinsic value head for the intrinsic return/advantage.
            int_return_ = discount_return(int_reward, done, int_bv, self.discount)
            int_advantage = int_return_ - int_value
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, value, done, bv, self.discount, self.gae_lambda)
        if self.agent.dual_model:
            int_advantage, int_return_ = generalized_advantage_estimation(
                int_reward, int_value, done, int_bv, self.discount,
                self.gae_lambda)

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    if self.agent.dual_model:
        return return_, advantage, valid, int_return_, int_advantage
    return return_, advantage, valid

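# The reward_ff / int_reward_ff objects above look like the RND-style
# "reward forward filter" (Burda et al., 2018): a per-environment discounted
# running sum of rewards whose batch statistics feed the RunningMeanStd that
# scales the rewards.  A sketch under that assumption (illustrative only):
class RewardForwardFilterSketch:
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # Running discounted reward sum, one entry per env.

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems
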
def process_returns(self, itr, samples):
    reward, cost = samples.env.reward, samples.env.env_info.cost
    cost /= self.cost_scale
    done = samples.env.done
    value, c_value = samples.agent.agent_info.value  # A named 2-tuple.
    bv, c_bv = samples.agent.bootstrap_value  # A named 2-tuple.

    if self.reward_scale != 1:
        reward *= self.reward_scale
        value *= self.reward_scale  # Keep the value learning the same.
        bv *= self.reward_scale

    done = done.type(reward.dtype)  # rlpyt does this in discount_return?

    if c_value is not None:  # Learning c_value, even if reward-penalized.
        if self.cost_gae_lambda == 1:  # GAE reduces to empirical discounted return.
            c_return = discount_return(cost, done, c_bv, self.cost_discount)
            c_advantage = c_return - c_value
        else:
            c_advantage, c_return = generalized_advantage_estimation(
                cost, c_value, done, c_bv, self.cost_discount,
                self.cost_gae_lambda)
    else:
        c_advantage = c_return = None

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount)
        advantage = return_ - value
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, value, done, bv, self.discount, self.gae_lambda)

    if not self.mid_batch_reset or self.agent.recurrent:
        # Recurrent: no reset during training.
        valid = valid_from_done(done)
        # "done" might stay True until the env resets next batch.
        # Could probably do this formula directly on (1 - done) and use it
        # regardless of mid_batch_reset.
        ep_cost_mask = valid * (1 - torch.cat(
            [valid[1:], torch.ones_like(valid[-1:])]))  # Find where valid turns OFF.
    else:
        valid = None  # OR: torch.ones_like(done)
        ep_cost_mask = done  # Every done marks an episode's final cost.
    ep_costs = samples.env.env_info.cum_cost[ep_cost_mask.type(torch.bool)]

    if self._ddp:
        world_size = torch.distributed.get_world_size()  # Already have self.world_size.

    if ep_costs.numel() > 0:  # Might not have any completed trajectories.
        ep_cost_avg = ep_costs.mean()
        ep_cost_avg /= self.cost_scale
        if self._ddp:
            eca = ep_cost_avg.to(self.agent.device)
            torch.distributed.all_reduce(eca)
            ep_cost_avg = eca.to("cpu")
            ep_cost_avg /= world_size
        a = self.ep_cost_ema_alpha
        self._ep_cost_ema *= a
        self._ep_cost_ema += (1 - a) * ep_cost_avg

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
        if self._ddp:
            mean_std = torch.stack([adv_mean, adv_std])
            mean_std = mean_std.to(self.agent.device)
            torch.distributed.all_reduce(mean_std)
            mean_std = mean_std.to("cpu")
            mean_std /= world_size
            adv_mean, adv_std = mean_std[0], mean_std[1]
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

    # Pretty sure the c_advantage is not supposed to be normalized.
    if self.normalize_cost_advantage:
        if valid is not None:
            valid_mask = valid > 0
            cadv_mean = c_advantage[valid_mask].mean()
            cadv_std = c_advantage[valid_mask].std()
        else:
            cadv_mean = c_advantage.mean()
            cadv_std = c_advantage.std()
        if self._ddp:
            mean_std = torch.stack([cadv_mean, cadv_std])
            mean_std = mean_std.to(self.agent.device)
            torch.distributed.all_reduce(mean_std)
            mean_std = mean_std.to("cpu")
            mean_std /= world_size
            cadv_mean, cadv_std = mean_std[0], mean_std[1]
        c_advantage[:] = (c_advantage - cadv_mean) / max(cadv_std, 1e-6)

    return (return_, advantage, valid, c_return, c_advantage,
        self._ep_cost_ema)

def process_returns(self, samples):
    """
    Compute bootstrapped returns and advantages from a minibatch of samples.
    Uses either discounted returns (if ``self.gae_lambda == 1``) or
    generalized advantage estimation.  Mask out invalid samples according to
    ``mid_batch_reset`` or for recurrent agents.  Optionally, normalize
    advantages.
    """
    reward, done, q, v, termination, o, prev_o, pi_omega, bv = (
        samples.env.reward, samples.env.done, samples.agent.agent_info.q,
        samples.agent.agent_info.value, samples.agent.agent_info.termination,
        samples.agent.agent_info.o, samples.agent.agent_info.prev_o,
        samples.agent.agent_info.dist_info_omega,
        samples.agent.bootstrap_value)
    done = done.type(reward.dtype)
    q_o = select_at_indexes(o, q)

    if self.normalize_rewards is not None:
        # Normalize and clip rewards before computing advantage.
        if self.normalize_rewards == 'return':
            return_ = discount_return(reward, done, 0., self.discount,
                return_dest=torch.zeros_like(reward))  # No bootstrapping of value.
            reward = self.ret_rms(reward, center=False)
        else:
            reward = self.ret_rms(reward)

    # Options: if reset, no termination gradient and no deliberation cost.
    valid_o = torch.ones_like(done)
    valid_o[prev_o == -1] = 0.
    reward[torch.logical_and(valid_o.bool(), termination)] -= self.delib_cost

    if self.gae_lambda == 1:  # GAE reduces to empirical discounted return.
        return_ = discount_return(reward, done, bv, self.discount)
        advantage = return_ - q_o
        op_adv = return_ - v
    else:
        advantage, return_ = generalized_advantage_estimation(
            reward, q_o, done, bv, self.discount, self.gae_lambda,
            return_dest=torch.zeros_like(reward),
            advantage_dest=torch.zeros_like(reward))
        op_adv, _ = generalized_advantage_estimation(
            reward, v, done, bv, self.discount, self.gae_lambda,
            return_dest=torch.zeros_like(reward),
            advantage_dest=torch.zeros_like(reward))

    if not self.mid_batch_reset or self.agent.recurrent:
        valid = valid_from_done(done)  # Recurrent: no reset during training.
    else:
        valid = None  # OR: torch.ones_like(done)

    q_prev_o = select_at_indexes(prev_o, q)
    termination_advantage = q_prev_o - v + self.delib_cost

    if self.normalize_advantage:
        if valid is not None:
            valid_mask = valid > 0
            adv_mean = advantage[valid_mask].mean()
            adv_std = advantage[valid_mask].std()
            op_adv_mean = op_adv[valid_mask].mean()  # Statistics of op_adv itself, not advantage.
            op_adv_std = op_adv[valid_mask].std()
        else:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
            op_adv_mean = op_adv.mean()
            op_adv_std = op_adv.std()
        advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)
        op_adv[:] = (op_adv - op_adv_mean) / max(op_adv_std, 1e-6)

    if self.normalize_termination_advantage:
        valid_mask = valid_o > 0
        adv_mean = termination_advantage[valid_mask].mean()
        adv_std = termination_advantage[valid_mask].std()
        termination_advantage[:] = (termination_advantage - adv_mean) / max(adv_std, 1e-6)

    return return_, advantage, valid, termination_advantage, valid_o, op_adv

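# select_at_indexes(), used above to pick the q-value of the chosen option at
# every (t, b) position, gathers along the trailing dimension of a tensor.  A
# gather-based sketch of that behavior, assuming `tensor` has exactly one more
# dimension than `indexes` (illustrative, not the exact library code):
def select_at_indexes_sketch(indexes, tensor):
    return tensor.gather(-1, indexes.long().unsqueeze(-1)).squeeze(-1)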