Example #1
    def process_returns(self, samples):
        """
        Compute bootstrapped returns and advantages from a minibatch of
        samples.  Uses either discounted returns (if ``self.gae_lambda==1``)
        or generalized advantage estimation.  Mask out invalid samples
        according to ``mid_batch_reset`` or for a recurrent agent.  Optionally,
        normalize advantages.
        """
        reward, done, value, bv = (samples.env.reward, samples.env.done,
                                   samples.agent.agent_info.value,
                                   samples.agent.bootstrap_value)
        done = done.type(reward.dtype)
        if self.normalize_rewards is not None:  # Normalize and clip rewards before computing advantage
            if self.normalize_rewards == 'return':
                return_ = discount_return(
                    reward,
                    done,
                    0.,
                    self.discount,
                    return_dest=torch.zeros_like(
                        reward))  # NO bootstrapping of value
                reward = self.ret_rms(reward, center=False)
            else:
                reward = self.ret_rms(reward)

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward,
                                      done,
                                      bv,
                                      self.discount,
                                      return_dest=torch.zeros_like(reward))
            advantage = return_ - value
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward,
                value,
                done,
                bv,
                self.discount,
                self.gae_lambda,
                return_dest=torch.zeros_like(reward),
                advantage_dest=torch.zeros_like(reward))

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(
                done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR torch.ones_like(done)

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        return return_, advantage, valid
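The helpers used above, discount_return and generalized_advantage_estimation, come from rlpyt's algorithm utilities. Below is a minimal sketch of what they compute, assuming [T, B]-shaped tensors and ignoring the optional *_dest output buffers; the real rlpyt implementations differ in such details.

import torch


def discount_return(reward, done, bootstrap_value, discount):
    """Backward recursion R_t = r_t + discount * (1 - done_t) * R_{t+1},
    seeded with the bootstrap value after the last time step."""
    return_ = torch.zeros_like(reward)
    running = bootstrap_value
    for t in reversed(range(reward.shape[0])):
        running = reward[t] + discount * (1 - done[t]) * running
        return_[t] = running
    return return_


def generalized_advantage_estimation(reward, value, done, bootstrap_value,
                                     discount, gae_lambda):
    """GAE(lambda): delta_t = r_t + discount * (1 - done_t) * V_{t+1} - V_t,
    A_t = delta_t + discount * lambda * (1 - done_t) * A_{t+1}."""
    advantage = torch.zeros_like(reward)
    next_value = bootstrap_value
    next_adv = torch.zeros_like(bootstrap_value)
    for t in reversed(range(reward.shape[0])):
        delta = reward[t] + discount * (1 - done[t]) * next_value - value[t]
        next_adv = delta + discount * gae_lambda * (1 - done[t]) * next_adv
        advantage[t] = next_adv
        next_value = value[t]
    return advantage, advantage + value  # (advantage, return_)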
Example #2
    def process_returns(self, samples):
        reward, done, value, bv = (samples.env.reward, samples.env.done,
                                   samples.agent.agent_info.value,
                                   samples.agent.bootstrap_value)
        done = done.type(reward.dtype)
        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward, done, bv, self.discount)
            advantage = return_ - value
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward, value, done, bv, self.discount, self.gae_lambda)

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(
                done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR: torch.ones_like(done)

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        return return_, advantage, valid
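valid_from_done produces the mask used in the recurrent / no-mid-batch-reset case: every step up to and including the first done in each environment column counts as valid, and everything after it is zeroed out. A sketch consistent with how it is used here (time as the leading dimension):

import torch


def valid_from_done(done):
    """Return a float mask that is 1.0 through the first done=True in each
    column and 0.0 for every step after it (leading dim is time)."""
    done = done.type(torch.float)
    valid = torch.ones_like(done)
    # A step is invalid once any earlier step in the same column was done.
    valid[1:] = 1 - torch.clamp(torch.cumsum(done[:-1], dim=0), max=1)
    return valid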
Example #3
    def process_intrinsic_returns(self, int_rew, int_val, int_bootstrap_value):
        """
        Same as ``process_returns`` but the discounted reward signal is carried
        over episode boundaries.  Note that ``int_val`` and
        ``int_bootstrap_value`` should come from a separate critic model from
        the one used for extrinsic rewards, to keep the two reward streams
        distinct.
        For more details, see https://arxiv.org/abs/1810.12894.
        """
        faux_done = torch.zeros_like(
            int_rew)  # Faux done signals, all "not done"

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(int_rew, faux_done, int_bootstrap_value,
                                      self.int_discount)
            advantage = return_ - int_val
        else:
            advantage, return_ = generalized_advantage_estimation(
                int_rew, int_val, faux_done, int_bootstrap_value,
                self.int_discount, self.gae_lambda)

        if self.normalize_advantage:
            adv_mean = advantage.mean()
            adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        return return_, advantage
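The all-zero faux_done is exactly what makes the intrinsic return non-episodic: a real done would cut the bootstrapped sum, whereas the faux signal lets it flow across episode boundaries, as in the RND paper. A tiny illustration with made-up numbers, reusing the discount_return sketch shown after Example #1:

import torch

int_rew = torch.ones(4, 1)                          # [T=4, B=1] intrinsic rewards
real_done = torch.tensor([[0.], [1.], [0.], [0.]])  # episode ends at t=1
faux_done = torch.zeros_like(int_rew)
bootstrap = torch.tensor([0.5])

episodic = discount_return(int_rew, real_done, bootstrap, 0.99)      # cut at t=1
non_episodic = discount_return(int_rew, faux_done, bootstrap, 0.99)  # carried over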
Example #4
    def process_extrinsic_returns(self, ext_rew, done, ext_val,
                                  ext_bootstrap_value):
        """
        Identical to ``process_returns`` but expects the relevant sample fields
        to have been extracted and passed as parameters, since some buffer
        names changed (e.g. ``value`` to ``ext_value``).  This also provides
        greater flexibility, for example to clip rewards before entering this
        function.
        """
        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(ext_rew, done, ext_bootstrap_value,
                                      self.discount)
            advantage = return_ - ext_val
        else:
            advantage, return_ = generalized_advantage_estimation(
                ext_rew, ext_val, done, ext_bootstrap_value, self.discount,
                self.gae_lambda)

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(
                done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR torch.ones_like(done)

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        return return_, advantage, valid
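As the docstring notes, pulling the fields out of samples yourself leaves room to preprocess rewards first. A hypothetical call site is sketched below; the ext_value and ext_bootstrap_value buffer names are assumptions following the renaming mentioned above, and done is converted to the reward dtype because this variant does not do that internally.

ext_rew = samples.env.reward.clamp(-1., 1.)  # e.g. Atari-style reward clipping
done = samples.env.done.type(ext_rew.dtype)
return_, advantage, valid = self.process_extrinsic_returns(
    ext_rew, done, samples.agent.agent_info.ext_value,
    samples.agent.ext_bootstrap_value)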
Example #5
    def process_returns(self, samples):
        """
        Compute bootstrapped returns and advantages from a minibatch of
        samples.  Uses either discounted returns (if ``self.gae_lambda==1``)
        or generalized advantage estimation.  Mask out invalid samples
        according to ``mid_batch_reset`` or for a recurrent agent.  Optionally,
        normalize advantages.
        """
        reward, done, value, bv, discounted_return = (
            samples.env.reward, samples.env.done,
            samples.agent.agent_info.value, samples.agent.bootstrap_value,
            samples.env.discounted_return)
        done = done.type(reward.dtype)

        # print()
        # print('discounted return', discounted_return)

        if self.rets is None:
            self.rets = np.zeros(len(reward))

        self.rets = discounted_return.numpy() + reward.numpy()

        self.ret_rms.update(self.rets)
        self.rets[done.numpy().astype(bool)] = 0

        pre_reward = reward

        reward = torch.div(reward, np.mean(np.sqrt(self.ret_rms.var + 1e-8)))

        # print('rets', self.rets)
        # print('std', np.mean(np.sqrt(self.ret_rms.var + 1e-8)))

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward, done, bv, self.discount)
            advantage = return_ - value
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward, value, done, bv, self.discount, self.gae_lambda)

        # print('value', value)
        # print('bootstrap_value', bv)

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(
                done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR torch.ones_like(done)

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        return return_, advantage, valid, value, reward, pre_reward
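This variant relies on a ret_rms object with an update() method and a .var attribute, i.e. an OpenAI-baselines-style running mean/std tracker. A minimal sketch of such a tracker is below (the parallel-variance update also covers the update_from_moments call used in Example #8); treat it as an assumption about the object's interface, not this project's exact class.

import numpy as np


class RunningMeanStd:
    """Running mean/variance via the parallel (Chan et al.) update rule."""

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total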
Example #6
    def process_returns(self, reward, done, value_prediction, action,
                        dist_info, old_dist_info, opt_info):
        done = done.type(reward.dtype)
        if self.pop_art_reward_normalization:
            unnormalized_value = value_prediction
            value_prediction, normalized_value = self.pop_art_normalizer(
                value_prediction)

        bootstrap_value = value_prediction[-1]
        reward, value_prediction, done = (reward[:-1], value_prediction[:-1],
                                          done[:-1])

        return_ = discount_return(reward, done, bootstrap_value.detach(),
                                  self.discount)
        if self.pop_art_reward_normalization:
            self.pop_art_normalizer.update_parameters(
                return_.unsqueeze(-1), torch.ones_like(return_.unsqueeze(-1)))
            _, normalized_value = self.pop_art_normalizer(
                unnormalized_value[:-1])
            return_ = self.pop_art_normalizer.normalize(return_)
            advantage = return_ - normalized_value.detach()
            value_prediction = normalized_value
            opt_info.normalized_return.append(return_.numpy())
        else:
            advantage = return_ - value_prediction.detach()

        valid = valid_from_done(done)  # Recurrent: no reset during training.
        opt_info.advantage.append(advantage.numpy())

        loss, opt_info = self.loss(dist_info=dist_info[:-1],
                                   value=value_prediction,
                                   action=action[:-1],
                                   return_=return_,
                                   advantage=advantage.detach(),
                                   valid=valid,
                                   old_dist_info=old_dist_info[:-1],
                                   opt_info=opt_info)
        return loss, opt_info
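The pop_art_normalizer calling convention here is project-specific, but the underlying idea is PopArt (van Hasselt et al., 2016): track the mean and scale of the return targets and rescale the value head so its unnormalized predictions are preserved when the statistics change. A compact sketch of that core mechanism, assuming a linear value head over features; it is not a drop-in replacement for the object used above.

import torch
import torch.nn as nn


class PopArtValueHead(nn.Module):
    """Linear value head trained in normalized space; update() adapts mu/sigma
    to new return targets and rescales the layer so the unnormalized
    prediction stays unchanged."""

    def __init__(self, in_features, beta=1e-4):
        super().__init__()
        self.linear = nn.Linear(in_features, 1)
        self.beta = beta
        self.register_buffer("mu", torch.zeros(1))
        self.register_buffer("nu", torch.ones(1))  # running second moment

    @property
    def sigma(self):
        return (self.nu - self.mu ** 2).clamp(min=1e-4).sqrt()

    def forward(self, features):
        normalized = self.linear(features)
        return normalized * self.sigma + self.mu, normalized  # (value, normalized)

    def normalize(self, x):
        return (x - self.mu) / self.sigma

    @torch.no_grad()
    def update(self, return_targets):
        old_mu, old_sigma = self.mu.clone(), self.sigma.clone()
        self.mu.lerp_(return_targets.mean().reshape(1), self.beta)
        self.nu.lerp_((return_targets ** 2).mean().reshape(1), self.beta)
        # Preserve outputs: w' = w * sigma_old / sigma_new,
        # b' = (sigma_old * b + mu_old - mu_new) / sigma_new.
        self.linear.weight.mul_(old_sigma / self.sigma)
        self.linear.bias.mul_(old_sigma).add_(old_mu - self.mu).div_(self.sigma)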
Example #7
    def process_returns(self, samples):
        """
        Compute bootstrapped returns and advantages from a minibatch of
        samples.  Uses either discounted returns (if ``self.gae_lambda==1``)
        or generalized advantage estimation.  Mask out invalid samples
        according to ``mid_batch_reset`` or for a recurrent agent.  Optionally,
        normalize advantages.
        """
        reward, done, value, bv = (samples.env.reward, samples.env.done,
                                   samples.agent.agent_info.value,
                                   samples.agent.bootstrap_value)
        # reward = reward.squeeze(-1)  # the extra trailing dimension was causing shape issues
        done = done.type(reward.dtype)

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward, done, bv, self.discount)
            advantage = return_ - value
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward, value, done, bv, self.discount, self.gae_lambda)

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(
                done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR torch.ones_like(done)

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        return return_, advantage, valid
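The commented-out squeeze(-1) points at a real pitfall: if reward keeps a trailing singleton dimension while value is [T, B], the subtraction either raises a size mismatch or, e.g. with a single environment, silently broadcasts into a much larger tensor. A small illustrative demonstration:

import torch

T, B = 5, 1
value = torch.zeros(T, B)           # [T, B]
reward_bad = torch.zeros(T, B, 1)   # trailing singleton dim not squeezed
reward_ok = reward_bad.squeeze(-1)  # [T, B]

print((reward_bad - value).shape)   # torch.Size([5, 5, 1]) -- silent blow-up
print((reward_ok - value).shape)    # torch.Size([5, 1])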
Example #8
    def process_returns(self, samples):
        """
        Compute bootstrapped returns and advantages from a minibatch of
        samples.  Uses either discounted returns (if ``self.gae_lambda==1``)
        or generalized advantage estimation.  Mask out invalid samples
        according to ``mid_batch_reset`` or for a recurrent agent.  Optionally,
        normalize advantages.
        """        
        if self.agent.dual_model:
            reward, done, value, bv, int_value, int_bv = (samples.env.reward, samples.env.done, 
                                samples.agent.agent_info.value, samples.agent.bootstrap_value, 
                                samples.agent.agent_info.int_value, samples.agent.int_bootstrap_value)
        else:
            reward, done, value, bv = (samples.env.reward, samples.env.done, samples.agent.agent_info.value, samples.agent.bootstrap_value)        
        done = done.type(reward.dtype)

        if self.curiosity_type in {'icm', 'disagreement', 'micm'}:
            intrinsic_rewards, _ = self.agent.curiosity_step(
                self.curiosity_type, samples.env.observation.clone(),
                samples.env.next_observation.clone(),
                samples.agent.action.clone())
        elif self.curiosity_type == 'ndigo':
            intrinsic_rewards, _ = self.agent.curiosity_step(
                self.curiosity_type, samples.env.observation.clone(),
                samples.agent.prev_action.clone(),
                samples.agent.action.clone())  # no grad
        elif self.curiosity_type == 'rnd':
            intrinsic_rewards, _ = self.agent.curiosity_step(
                self.curiosity_type, samples.env.next_observation.clone(),
                done.clone())
        if self.curiosity_type in {'icm', 'disagreement', 'micm', 'ndigo',
                                   'rnd'}:
            intrinsic_rewards_logging = intrinsic_rewards.clone().data.numpy()
            self.intrinsic_rewards = intrinsic_rewards_logging
            self.extint_ratio = reward.clone().data.numpy() / (
                intrinsic_rewards_logging + 1e-15)
            if self.agent.dual_model:
                int_reward = intrinsic_rewards
            else:
                reward += intrinsic_rewards

        if self.normalize_reward:
            rews = np.array([])
            for rew in reward.clone().detach().data.numpy():
                rews = np.concatenate((rews, self.reward_ff.update(rew)))
            self.reward_rms.update_from_moments(np.mean(rews), np.var(rews),
                                                len(rews))
            reward = reward / np.sqrt(self.reward_rms.var)

            if self.agent.dual_model:
                int_rews = np.array([])
                for int_rew in int_reward.clone().detach().data.numpy():
                    int_rews = np.concatenate(
                        (int_rews, self.int_reward_ff.update(int_rew)))
                self.int_reward_rms.update_from_moments(
                    np.mean(int_rews), np.var(int_rews), len(int_rews))
                int_reward = int_reward / np.sqrt(self.int_reward_rms.var)

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward, done, bv, self.discount)
            advantage = return_ - value
            if self.agent.dual_model:
                int_return_ = discount_return(int_reward, done, int_bv,
                                              self.discount)
                int_advantage = int_return_ - int_value
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward, value, done, bv, self.discount, self.gae_lambda)
            if self.agent.dual_model:
                int_advantage, int_return_ = generalized_advantage_estimation(
                    int_reward, int_value, done, int_bv, self.discount,
                    self.gae_lambda)

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR torch.ones_like(done)

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        if self.agent.dual_model:
            return return_, advantage, valid, int_return_, int_advantage
        else:
            return return_, advantage, valid
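When dual_model is set, the two advantage streams returned here are typically combined downstream with fixed weights, as in the RND paper. A hedged sketch; the coefficient values are illustrative defaults, not taken from this codebase:

def combine_advantages(ext_adv, int_adv, ext_coeff=2.0, int_coeff=1.0):
    """Weighted sum of extrinsic and intrinsic advantages (RND-style)."""
    return ext_coeff * ext_adv + int_coeff * int_adv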
Example #9
    def process_returns(self, itr, samples):
        reward, cost = samples.env.reward, samples.env.env_info.cost
        cost /= self.cost_scale
        done = samples.env.done
        value, c_value = samples.agent.agent_info.value  # A named 2-tuple.
        bv, c_bv = samples.agent.bootstrap_value  # A named 2-tuple.

        if self.reward_scale != 1:
            reward *= self.reward_scale
            value *= self.reward_scale  # Keep the value learning the same.
            bv *= self.reward_scale

        done = done.type(reward.dtype)  # rlpyt does this in discount_returns?

        if c_value is not None:  # Learning c_value, even if reward penalized.
            if self.cost_gae_lambda == 1:  # GAE reduces to empirical discount.
                c_return = discount_return(cost, done, c_bv,
                                           self.cost_discount)
                c_advantage = c_return - c_value
            else:
                c_advantage, c_return = generalized_advantage_estimation(
                    cost, c_value, done, c_bv, self.cost_discount,
                    self.cost_gae_lambda)
        else:
            c_advantage = c_return = None

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward, done, bv, self.discount)
            advantage = return_ - value
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward, value, done, bv, self.discount, self.gae_lambda)

        if not self.mid_batch_reset or self.agent.recurrent:
            # Recurrent: no reset during training.
            valid = valid_from_done(done)
            # "done" might stay True until env resets next batch.
            # Could probably do this formula directly on (1 - done) and use it
            # regardless of mid_batch_reset.
            ep_cost_mask = valid * (1 - torch.cat(
                [valid[1:], torch.ones_like(valid[-1:])])
                                    )  # Find where valid turns OFF.
        else:
            valid = None  # OR: torch.ones_like(done)
            ep_cost_mask = done  # Everywhere a done, is episode final cost.
        ep_costs = samples.env.env_info.cum_cost[ep_cost_mask.type(torch.bool)]

        if self._ddp:
            # (Already have self.world_size.)
            world_size = torch.distributed.get_world_size()
        if ep_costs.numel() > 0:  # Might not have any completed trajectories.
            ep_cost_avg = ep_costs.mean()
            ep_cost_avg /= self.cost_scale
            if self._ddp:
                eca = ep_cost_avg.to(self.agent.device)
                torch.distributed.all_reduce(eca)
                ep_cost_avg = eca.to("cpu")
                ep_cost_avg /= world_size
            a = self.ep_cost_ema_alpha
            self._ep_cost_ema *= a
            self._ep_cost_ema += (1 - a) * ep_cost_avg

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
            if self._ddp:
                mean_std = torch.stack([adv_mean, adv_std])
                mean_std = mean_std.to(self.agent.device)
                torch.distributed.all_reduce(mean_std)
                mean_std = mean_std.to("cpu")
                mean_std /= world_size
                adv_mean, adv_std = mean_std[0], mean_std[1]
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)

        # Pretty sure we're not supposed to normalize c_advantage.
        if self.normalize_cost_advantage:
            if valid is not None:
                valid_mask = valid > 0
                cadv_mean = c_advantage[valid_mask].mean()
                cadv_std = c_advantage[valid_mask].std()
            else:
                cadv_mean = c_advantage.mean()
                cadv_std = c_advantage.std()
            if self._ddp:
                mean_std = torch.stack([cadv_mean, cadv_std])
                mean_std = mean_std.to(self.agent.device)
                torch.distributed.all_reduce(mean_std)
                mean_std = mean_std.to("cpu")
                mean_std /= world_size
                cadv_mean, cadv_std = mean_std[0], mean_std[1]
            c_advantage[:] = (c_advantage - cadv_mean) / max(cadv_std, 1e-6)

        return (return_, advantage, valid, c_return, c_advantage,
                self._ep_cost_ema)
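The distributed branches above repeat one pattern: move a stacked statistic to the agent's device, all-reduce (sum) it, divide by the world size, and bring it back to the CPU. A small helper capturing that pattern, assuming torch.distributed has already been initialized:

import torch
import torch.distributed as dist


def ddp_mean(stats, device):
    """Average a tensor of scalar statistics across DDP workers."""
    stats = stats.to(device)
    dist.all_reduce(stats)  # default op is SUM
    return (stats / dist.get_world_size()).to("cpu")


# e.g.: adv_mean, adv_std = ddp_mean(torch.stack([adv_mean, adv_std]), device)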
Example #10
    def process_returns(self, samples):
        """
        Compute bootstrapped returns and advantages from a minibatch of
        samples.  Uses either discounted returns (if ``self.gae_lambda==1``)
        or generalized advantage estimation.  Mask out invalid samples
        according to ``mid_batch_reset`` or for a recurrent agent.  Optionally,
        normalize advantages.
        """
        reward, done, q, v, termination, o, prev_o, pi_omega, bv = (
            samples.env.reward, samples.env.done, samples.agent.agent_info.q,
            samples.agent.agent_info.value,
            samples.agent.agent_info.termination, samples.agent.agent_info.o,
            samples.agent.agent_info.prev_o,
            samples.agent.agent_info.dist_info_omega,
            samples.agent.bootstrap_value)
        done = done.type(reward.dtype)
        q_o = select_at_indexes(o, q)
        if self.normalize_rewards is not None:  # Normalize and clip rewards before computing advantage
            if self.normalize_rewards == 'return':
                return_ = discount_return(
                    reward,
                    done,
                    0.,
                    self.discount,
                    return_dest=torch.zeros_like(
                        reward))  # NO bootstrapping of value
                reward = self.ret_rms(reward, center=False)
            else:
                reward = self.ret_rms(reward)

        # Options: if reset, no termination gradient, no deliberation cost.
        valid_o = torch.ones_like(done)
        valid_o[prev_o == -1] = 0.
        reward[torch.logical_and(valid_o.bool(),
                                 termination)] -= self.delib_cost

        if self.gae_lambda == 1:  # GAE reduces to empirical discounted.
            return_ = discount_return(reward, done, bv, self.discount)
            advantage = return_ - q_o
            op_adv = return_ - v
        else:
            advantage, return_ = generalized_advantage_estimation(
                reward,
                q_o,
                done,
                bv,
                self.discount,
                self.gae_lambda,
                return_dest=torch.zeros_like(reward),
                advantage_dest=torch.zeros_like(reward))
            op_adv, _ = generalized_advantage_estimation(
                reward,
                v,
                done,
                bv,
                self.discount,
                self.gae_lambda,
                return_dest=torch.zeros_like(reward),
                advantage_dest=torch.zeros_like(reward))

        if not self.mid_batch_reset or self.agent.recurrent:
            valid = valid_from_done(
                done)  # Recurrent: no reset during training.
        else:
            valid = None  # OR torch.ones_like(done)

        q_prev_o = select_at_indexes(prev_o, q)
        termination_advantage = q_prev_o - v + self.delib_cost

        if self.normalize_advantage:
            if valid is not None:
                valid_mask = valid > 0
                adv_mean = advantage[valid_mask].mean()
                adv_std = advantage[valid_mask].std()
                op_adv_mean = op_adv[valid_mask].mean()
                op_adv_std = op_adv[valid_mask].std()
            else:
                adv_mean = advantage.mean()
                adv_std = advantage.std()
                op_adv_mean = op_adv.mean()
                op_adv_std = op_adv.std()
            advantage[:] = (advantage - adv_mean) / max(adv_std, 1e-6)
            op_adv[:] = (op_adv - op_adv_mean) / max(op_adv_std, 1e-6)

        if self.normalize_termination_advantage:
            valid_mask = valid_o > 0
            adv_mean = termination_advantage[valid_mask].mean()
            adv_std = termination_advantage[valid_mask].std()
            termination_advantage[:] = (termination_advantage -
                                        adv_mean) / max(adv_std, 1e-6)

        return return_, advantage, valid, termination_advantage, valid_o, op_adv
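select_at_indexes is another rlpyt utility: for every leading (t, b) position it picks the entry of the trailing dimension named by the index tensor, here the Q-value of the sampled option. A minimal sketch for the single-trailing-dimension case used above; the real rlpyt version also handles extra trailing dims.

import torch


def select_at_indexes(indexes, tensor):
    """Pick tensor[..., indexes[...]] over the leading dims,
    e.g. q_o[t, b] = q[t, b, o[t, b]]."""
    assert tensor.shape[:indexes.dim()] == indexes.shape
    num = indexes.numel()
    flat = tensor.reshape(num, -1)
    picked = flat[torch.arange(num), indexes.reshape(-1).long()]
    return picked.reshape(indexes.shape)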