class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # TODO Implement the following pseudocode: DONE
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # TODO Implement the following pseudocode: DONE
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        V_s = self.critic.forward_np(ob_no)
        V_s_prime = self.critic.forward_np(next_ob_no)
        Q_s_a = re_n + self.gamma * V_s_prime * (1 - terminal_n)
        adv_n = Q_s_a - V_s

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent: def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update'] self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update'] self.device = agent_params['device'] self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.actor = MLPPolicyAC(self.agent_params['ob_dim'], self.agent_params['ac_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['device'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer(agent_params['replay_size']) def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): ob, next_ob, rew, done = map(lambda x: torch.from_numpy(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) value = self.critic.value_func(ob).squeeze() next_value = self.critic.value_func(next_ob).squeeze() * (1 - done) adv_n = rew + (self.gamma * next_value) - value adv_n = adv_n.cpu().detach().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): loss = OrderedDict() for critic_update in range(self.num_critic_updates_per_agent_update): loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n) adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # put final critic loss here for actor_update in range(self.num_actor_updates_per_agent_update): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n) # put final actor loss here return loss def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent): def __init__(self, sess, env, agent_params): super(ACAgent, self).__init__() self.env = env self.sess = sess self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.actor = MLPPolicyAC(sess, self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(sess, self.agent_params) self.replay_buffer = ReplayBuffer() def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) vs = self.sess.run(self.critic.critic_prediction, feed_dict = {self.critic.sy_ob_no : ob_no}) vsprime = self.sess.run(self.critic.critic_prediction, feed_dict = {self.critic.sy_ob_no : next_ob_no})*(1-terminal_n) q_val = re_n + self.gamma * vsprime adv_n = q_val - vs if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic # advantage = estimate_advantage(...) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for x in range(self.agent_params['num_critic_updates_per_agent_update']): closs = self.critic.update(ob_no, next_ob_no, re_n, terminal_n) advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) for x in range(self.agent_params['num_actor_updates_per_agent_update']): aloss = self.actor.update(ob_no, ac_na, advantage) loss = OrderedDict() loss['Critic_Loss'] = closs # put final critic loss here loss['Actor_Loss'] = aloss # put final actor loss here return loss def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()
        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 self.agent_params['device'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 )
        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    (cut off the V(s') term, i.e. set it to 0, at terminal states where terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        value_s = self.critic.value_func(ob).squeeze()
        value_next_s = self.critic.value_func(next_ob).squeeze() * (1 - done)
        q_val = rew + self.gamma * value_next_s
        adv_n = (q_val - value_s).cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()
        # for agent_params['num_critic_updates_per_agent_update'] steps, update the critic
        for _ in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)  # final critic loss
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        # for agent_params['num_actor_updates_per_agent_update'] steps, update the actor
        for _ in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage)  # final actor loss
        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class PGAgent(BaseAgent): def __init__(self, env, agent_params): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.nn_baseline = self.agent_params['nn_baseline'] self.reward_to_go = self.agent_params['reward_to_go'] # actor/policy self.actor = MLPPolicyPG( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], nn_baseline=self.agent_params['nn_baseline'] ) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, observations, actions, rewards_list, next_observations, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) q_values = self.calculate_q_vals(rewards_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantages = self.estimate_advantage(observations, q_values) # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy train_log = self.actor.update(observations, actions, advantages, q_values=q_values) return train_log def calculate_q_vals(self, rewards_list): """ Monte Carlo estimation of the Q function. """ # Case 1: trajectory-based PG # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory if not self.reward_to_go: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} q_values = np.concatenate([self._discounted_return(r) for r in rewards_list]) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} q_values = np.concatenate([self._discounted_cumsum(r) for r in rewards_list]) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # Estimate the advantage when nn_baseline is True, # by querying the neural network that you're using to learn the baseline if self.nn_baseline: baselines_unnormalized = self.actor.run_baseline_prediction(obs) ## ensure that the baseline and q_values have the same dimensionality ## to prevent silent broadcasting errors assert baselines_unnormalized.ndim == q_values.ndim ## baseline was trained with standardized q_values, so ensure that the predictions ## have the same mean and standard deviation as the current batch of q_values baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values) advantages = q_values - baselines # Else, just set the advantage to [Q] else: advantages = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: advantages = utils.normalize(advantages, np.mean(advantages), np.std(advantages)) return advantages ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, 
batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def _discounted_return(self, rewards): """ Helper function Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'} """ discounted_return = sum([(self.gamma**t) * r for t, r in enumerate(rewards)]) return [discounted_return] * len(rewards) def _discounted_cumsum(self, rewards): """ Helper function which -takes a list of rewards {r_0, r_1, ..., r_t', ... r_T}, -and returns a list where the entry in each index t is sum_{t'=t}^T gamma^(t'-t) * r_{t'} """ discounted_returns_to_go = [] for t in range(len(rewards)): return_to_go = sum([(self.gamma**tp) * r for tp, r in enumerate(rewards[t:])]) discounted_returns_to_go.append(return_to_go) return discounted_returns_to_go
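The `_discounted_cumsum` helper above recomputes the inner sum for every t, which is O(T^2) in the trajectory length. A minimal vectorized sketch that computes the same reward-to-go in O(T), assuming NumPy arrays and a scalar `gamma` (the standalone function name is hypothetical):

import numpy as np

def discounted_cumsum(rewards, gamma):
    # out[t] = sum_{t'=t}^{T-1} gamma^(t'-t) * rewards[t']
    rewards = np.asarray(rewards, dtype=np.float64)
    discounts = gamma ** np.arange(len(rewards))
    # discount everything back to t=0, cumulative-sum from the right,
    # then rescale so each entry is discounted relative to its own time step
    tail_sums = np.flip(np.cumsum(np.flip(rewards * discounts)))
    return tail_sums / discounts

Note that dividing by `discounts` can underflow for very long horizons with small gamma; the backward recursion used in other implementations in this section is numerically safer in that regime.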
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic loss = OrderedDict() for _ in range( self.agent_params['num_critic_updates_per_agent_update']): loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for _ in range( self.agent_params['num_actor_updates_per_agent_update']): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantages) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) ob_no = ptu.from_numpy(ob_no) next_ob_no = ptu.from_numpy(next_ob_no) re_n = ptu.from_numpy(re_n) terminal_n = ptu.from_numpy(terminal_n).bool() v_s = self.critic(ob_no) v_sp1 = self.critic(next_ob_no) v_sp1[terminal_n] = 0 q_sa = re_n + self.gamma * v_sp1 adv_n = q_sa - v_s assert adv_n.size() == re_n.size() adv_n = adv_n.detach().cpu().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): ob_no = ptu.from_numpy(ob_no) next_ob_no = ptu.from_numpy(next_ob_no) terminal_n = ptu.from_numpy(terminal_n) re_n = ptu.from_numpy(re_n) ac_na = ptu.from_numpy(ac_na) loss_critic = 0. for i in range( self.agent_params['num_critic_updates_per_agent_update']): loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) # advantage = estimate_advantage(...) : adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # a tensor is returned loss_actor = 0. for i in range( self.agent_params['num_actor_updates_per_agent_update']): loss_actor += self.actor.update(ob_no, ac_na, adv_n) loss = OrderedDict() loss['Critic_Loss'] = loss_critic loss[ 'Actor_Loss'] = loss_actor # in TensorBoard, loss_actor actually increases as we actually minimize -loss_actor return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) # V_s_prime = self.critic.critic_network(next_ob_no) # V_s_prime = V_s_prime.squeeze() # mask = (terminal_n == 1.) # V_s_prime= V_s_prime.masked_fill(mask, 0.) # # V_s = self.critic.critic_network(ob_no) # V_s = V_s.squeeze() # # assert V_s_prime.ndim == V_s.ndim # TODO-assert enable this assert in debug # adv_n2 = re_n + self.gamma * V_s_prime - V_s # another way to calculate: V_s_prime = re_n + ( 1 - terminal_n) * self.gamma * self.critic.forward(next_ob_no) adv_n = V_s_prime - self.critic.forward(ob_no) # assert adv_n2 == adv_n if self.standardize_advantages: adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class PGAgent(BaseAgent): def __init__(self, env, agent_params): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.nn_baseline = self.agent_params['nn_baseline'] self.reward_to_go = self.agent_params['reward_to_go'] # actor/policy self.actor = MLPPolicyPG( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], nn_baseline=self.agent_params['nn_baseline']) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, observations, actions, rewards_list, next_observations, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) q_values = self.calculate_q_vals(rewards_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantages = self.estimate_advantage(observations, q_values) # TODO: step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy ## HINT: `train_log` should be returned by your actor update method train_log = self.actor.update(observations, actions, advantages, q_values) return train_log def calculate_q_vals(self, rewards_list): """ Monte Carlo estimation of the Q function. """ # Case 1: trajectory-based PG # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory if not self.reward_to_go: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} q_values = np.concatenate( [self._discounted_return(r) for r in rewards_list]) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} q_values = np.concatenate( [self._discounted_cumsum(r) for r in rewards_list]) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # Estimate the advantage when nn_baseline is True, # by querying the neural network that you're using to learn the baseline if self.nn_baseline: baselines_unnormalized = self.actor.run_baseline_prediction(obs) ## ensure that the baseline and q_values have the same dimensionality ## to prevent silent broadcasting errors assert baselines_unnormalized.ndim == q_values.ndim ## baseline was trained with standardized q_values, so ensure that the predictions ## have the same mean and standard deviation as the current batch of q_values baselines = baselines_unnormalized * np.std(q_values) + np.mean( q_values) ## TODO: compute advantage estimates using q_values and baselines advantages = q_values - baselines # Else, just set the advantage to [Q] else: advantages = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: ## TODO: standardize the advantages to have a mean of zero ## and a standard deviation of one ## HINT: there is a `normalize` function in `infrastructure.utils` advantages = 
(advantages - advantages.mean()) / (advantages.std() + 1e-8) return advantages ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def _discounted_return(self, rewards): """ Helper function Input: list of rewards {r_0, r_1, ..., r_t', ... r_T} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'} """ # TODO: create list_of_discounted_returns # Hint: note that all entries of this output are equivalent # because each sum is from 0 to T (and doesnt involve t) out = sum(self.gamma**t * rew for t, rew in enumerate(rewards)) return [out for _ in range(len(rewards))] def _discounted_cumsum(self, rewards): """ Helper function which -takes a list of rewards {r_0, r_1, ..., r_t', ... r_T}, -and returns a list where the entry in each index t' is sum_{t'=t}^T gamma^(t'-t) * r_{t'} """ # TODO: create `list_of_discounted_returns` # HINT1: note that each entry of the output should now be unique, # because the summation happens over [t, T] instead of [0, T] # HINT2: it is possible to write a vectorized solution, but a solution # using a for loop is also fine ret, q = [], 0 for rew in reversed(rewards): ret.append(q * self.gamma + rew) q = ret[-1] return ret[::-1]
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.gae = self.agent_params['gae'] self.gae_lambda = self.agent_params['gae_lambda'] self.ppo = self.agent_params['ppo'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], self.agent_params['clip_eps'], ) if self.ppo: self.old_actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], self.agent_params['clip_eps'], ) self.old_actor.load_state_dict(self.actor.state_dict()) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic rewards = np.concatenate([r for r in re_n]) if self.gae else re_n assert rewards.shape == terminal_n.shape for i in range( self.agent_params['num_critic_updates_per_agent_update']): loss_critic = self.critic.update(ob_no, ac_na, next_ob_no, rewards, terminal_n) advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) old_log_prob = self.get_old_prob(self.old_actor, ob_no, ac_na) if self.ppo else None # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for i in range( self.agent_params['num_actor_updates_per_agent_update']): loss_actor = self.actor.update(ob_no, ac_na, advantage, old_log_prob) if self.ppo: self.old_actor.load_state_dict(self.actor.state_dict()) loss = OrderedDict() loss['Critic_Loss'] = loss_critic loss['Actor_Loss'] = loss_actor return loss def get_old_prob(self, old_policy, ob_no, ac_na): observations = ptu.from_numpy(ob_no) actions = ptu.from_numpy(ac_na) log_prob = old_policy.forward(observations).log_prob(actions) return ptu.to_numpy(log_prob) def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) v_s = self.critic.forward_np(ob_no) if not self.gae: v_s_next = self.critic.forward_np(next_ob_no) * (1 - terminal_n) adv_n = re_n + self.gamma * v_s_next - v_s else: index = 0 adv_n = np.zeros(len(ob_no)) for rewards in re_n: gae_deltas = [] for i in range(len(rewards) - 1): delta = rewards[i] + self.gamma * v_s[index + i + 1] - v_s[index + i] gae_deltas.append(delta) i = len(rewards) - 1 gae_deltas.append(rewards[i] - v_s[index + i]) assert len(gae_deltas) == len(rewards) sum_deltas = 0 for t in range(len(gae_deltas) - 1, -1, -1): sum_deltas = gae_deltas[ t] + sum_deltas * self.gamma * self.gae_lambda adv_n[t + index] = sum_deltas index += len(rewards) if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): 
concat_rew = False if self.gae else True return self.replay_buffer.sample_recent_data(batch_size, concat_rew)
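The per-trajectory GAE loop in the agent above can be isolated into a small helper. A sketch under the same assumptions (rewards and critic values for a single trajectory, with V(s_T) bootstrapped to 0 when the trajectory ends at a true terminal state; the function name is hypothetical):

import numpy as np

def gae_advantages(rewards, values, last_value, gamma, lam):
    # values[t] = V(s_t) for t = 0..T-1; last_value bootstraps V(s_T)
    values = np.append(values, last_value)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    return advantages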
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): loss = OrderedDict() ob_no = ptu.from_numpy(ob_no) ac_na = ptu.from_numpy(ac_na) re_n = ptu.from_numpy(re_n) next_ob_no = ptu.from_numpy(next_ob_no) terminal_n = ptu.from_numpy(terminal_n) for _ in range( self.agent_params['num_critic_updates_per_agent_update']): loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) advantages = ptu.from_numpy(advantages) for _ in range( self.agent_params['num_actor_updates_per_agent_update']): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n=advantages) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): v_s_n = self.critic(ob_no) v_s_prime_n = self.critic(next_ob_no) # setting V(s') to zero if the next state is a terminal state q_n = re_n + self.gamma * v_s_prime_n * (1 - terminal_n) adv_n = q_n - v_s_n assert adv_n.size() == re_n.size() adv_n = adv_n.detach().cpu().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
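Several of the agents in this section delegate to `BootstrappedContinuousCritic.update` without showing it. A minimal PyTorch sketch of what such a critic plausibly does (this class, its constructor arguments, and its update signature are assumptions, not the provided implementation): regress V(s) toward the bootstrapped TD target r + gamma * V(s') * (1 - done), holding the target fixed.

import torch
from torch import nn, optim

class ValueCritic(nn.Module):
    def __init__(self, ob_dim, size=64, lr=5e-3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(ob_dim, size), nn.Tanh(),
            nn.Linear(size, size), nn.Tanh(),
            nn.Linear(size, 1),
        )
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def forward(self, ob):
        return self.net(ob).squeeze(-1)

    def update(self, ob, next_ob, rew, done, gamma):
        # bootstrapped TD(0) target, computed without gradients so it stays fixed
        with torch.no_grad():
            target = rew + gamma * self.forward(next_ob) * (1.0 - done)
        loss = self.loss_fn(self.forward(ob), target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()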
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # for agent_params['num_critic_updates_per_agent_update'] steps, update the critic
        # advantage = estimate_advantage(...)
        # for agent_params['num_actor_updates_per_agent_update'] steps, update the actor
        loss = OrderedDict()
        critic_losses = []
        actor_losses = []
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_losses.append(
                self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n))
        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_losses.append(self.actor.update(ob_no, ac_na, advantage))
        loss['Critic_Loss'] = np.mean(critic_losses)
        loss['Actor_Loss'] = np.mean(actor_losses)
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    (cut off the V(s') term, i.e. set it to 0, at terminal states where terminal_n=1)
        # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
        Vs = self.critic(ob_no)
        Vs_prime = self.critic(next_ob_no)
        terminal_index = [i for i, x in enumerate(terminal_n) if x]
        if len(terminal_index):
            Vs_prime[terminal_index] = 0
        Qs = re_n + self.gamma * Vs_prime
        adv_n = Qs - Vs

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class PGAgent(BaseAgent): def __init__(self, env, agent_params, batch_size=500000, **kwargs): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.nn_baseline = self.agent_params['nn_baseline'] self.reward_to_go = self.agent_params['reward_to_go'] # actor/policy if self.agent_params['discrete']: self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size']) else: self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size']) self.policy_optimizer = tf.keras.optimizers.Adam( learning_rate=self.agent_params['learning_rate']) # replay buffer self.replay_buffer = ReplayBuffer(2 * batch_size) self.baseline_model = None if self.agent_params['nn_baseline']: self.baseline_model = build_mlp( (self.agent_params['ob_dim'], ), output_size=1, n_layers=self.agent_params['n_layers'], size=self.agent_params['size'], name='baseline_model') self.baseline_loss = tf.keras.losses.MeanSquaredError() self.baseline_optimizer = tf.keras.optimizers.Adam( learning_rate=self.agent_params['learning_rate']) self.baseline_model.compile(optimizer=self.baseline_optimizer, loss=self.baseline_loss) def train(self, obs, acs, rews_list, next_obs, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. ---------------------------------------------------------------------------------- Recall that the expression for the policy gradient PG is PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t )] where tau=(s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory, Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), b_t is a baseline which may depend on s_t, and (Q_t - b_t ) is the advantage. Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t), and that is exactly what this function provides. 
---------------------------------------------------------------------------------- """ # step 1: calculate q values of each (s_t, a_t) point, # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1}) q_values = self.calculate_q_vals(rews_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantage_values = self.estimate_advantage(obs, q_values) # step 3: # TODO: pass the calculated values above into the actor/policy's update, # which will perform the actual PG update step # TODO: define the loss that should be optimized when training a policy with policy gradient # HINT1: Recall that the expression that we want to MAXIMIZE # is the expectation over collected trajectories of: # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]] # HINT2: see define_log_prob (above) # to get log pi(a_t|s_t) # HINT3: look for a placeholder above that will be populated with advantage values # to get [Q_t - b_t] # HINT4: don't forget that we need to MINIMIZE this self.loss # but the equation above is something that should be maximized # define the log probability of seen actions/observations under the current policy with tf.GradientTape() as tape: log_action_probas = self.actor.get_log_prob(obs, acs) advantage_values_no_grad = tf.stop_gradient(advantage_values) loss = -tf.reduce_mean( advantage_values_no_grad * log_action_probas) actor_vars = self.actor.trainable_variables grads = tape.gradient(loss, actor_vars) self.policy_optimizer.apply_gradients(zip(grads, actor_vars)) if self.nn_baseline: targets_n = (q_values - np.mean(q_values)) / (np.std(q_values) + 1e-8) dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(obs, tf.float32), tf.cast(targets_n, tf.float32))) dataset = dataset.batch(batch_size=targets_n.shape[0]).repeat() # 20 baseline gradient updates with the current data batch. self.baseline_model.fit(dataset, epochs=1, steps_per_epoch=20) return loss.numpy().item() def calculate_q_vals(self, rews_list): """ Monte Carlo estimation of the Q function. arguments: rews_list: length: number of sampled rollouts Each element corresponds to a particular rollout, and contains an array of the rewards for every step of that particular rollout returns: q_values: shape: (sum/total number of steps across the rollouts) Each entry corresponds to the estimated q(s_t,a_t) value of the corresponding obs/ac point at time t. 
""" # Case 1: trajectory-based PG if not self.reward_to_go: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) using rewards from that entire trajectory # HINT1: value of each point (t) = total discounted reward summed over the entire trajectory (from 0 to T-1) # In other words, q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate( [self._discounted_return(r) for r in rews_list]) # Case 2: reward-to-go PG else: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) as the reward-to-go # HINT1: value of each point (t) = total discounted reward summed over the remainder of that trajectory # (from t to T-1) # In other words, q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate( [self._discounted_cumsum(r) for r in rews_list]) return q_values.astype(np.float32) def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # TODO: Estimate the advantage when nn_baseline is True # HINT1: pass obs into the neural network that you're using to learn the baseline # extra hint if you're stuck: see your actor's run_baseline_prediction # HINT2: advantage should be [Q-b] if self.nn_baseline: b_n_unnormalized = self.baseline_model(obs) b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values) adv_n = (q_values - tf.squeeze(b_n)).numpy() # Else, just set the advantage to [Q] else: adv_n = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n.astype(np.float32) ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def _discounted_return(self, rewards): """ Helper function Input: a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'} note that all entries of this output are equivalent because each index t is a sum from 0 to T-1 (and doesnt involve t) """ q = sum(reward * (self.gamma**t) for t, reward in enumerate(rewards)) return [q for _ in rewards] def _discounted_cumsum(self, rewards): """ Input: a list of length T a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T Output: a list of length T a list where the entry in each index t is sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'} """ all_discounted_cumsums = rewards.copy() for t in range(len(all_discounted_cumsums) - 1, 0, -1): all_discounted_cumsums[t - 1] += self.gamma * all_discounted_cumsums[t] return all_discounted_cumsums
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super().__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params['standardize_advantages'] self.n_drivers = self.agent_params['n_drivers'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size_ac'], self.agent_params['shared_exp'], self.agent_params['shared_exp_lambda'], self.agent_params['is_city'], self.agent_params['learning_rate'], self.agent_params['n_drivers'] ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic loss = OrderedDict() for i in range(self.agent_params['num_critic_updates_per_agent_update']): if not self.agent_params['shared_exp']: loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) else: action_distributions = self.actor.shared_forward(ptu.from_numpy(ob_no)) loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n, action_distributions) # advantage = estimate_advantage(...) if self.agent_params['shared_exp']: advantage = self.estimate_shared_advantage(ob_no, next_ob_no, re_n, terminal_n) else: advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for i in range(self.agent_params['num_actor_updates_per_agent_update']): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage) return loss def estimate_shared_advantage(self, ob_no, next_ob_no, re_n, terminal_n): value_s = self.critic.shared_forward(ptu.from_numpy(ob_no)) value_next_s = self.critic.shared_forward(ptu.from_numpy(next_ob_no)) adv_n = dict() for i in range(self.n_drivers): for k in range(self.n_drivers): adv_n[(i,k)] = re_n[:,k] + self.gamma*ptu.to_numpy(value_next_s[(i,k)]) - ptu.to_numpy(value_s[(i,k)]) if self.standardize_advantages: adv_n[(i,k)] = (adv_n[(i,k)]- np.mean(adv_n[(i,k)]))/(np.std(adv_n[(i,k)])+1e-8) return adv_n def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) value_s = self.critic.forward_np(ob_no) value_next_s = self.critic.forward_np(next_ob_no) value_next_s[terminal_n==1] = 0 adv_n = re_n + self.gamma*value_next_s - value_s if self.standardize_advantages: for i in range(self.n_drivers): adv_n[:,i] = (adv_n[:,i] - np.mean(adv_n[:,i])) / (np.std(adv_n[:,i]) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class PPOAgent(BaseAgent): def __init__(self, env, agent_params): super(PPOAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.use_gae = self.agent_params['use_gae'] self.lam = self.agent_params['gae_lam'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.ppo_epochs = self.agent_params['ppo_epochs'] self.ppo_min_bacth_size = self.agent_params['ppo_min_batch_size'] # actor/policy self.actor = PPOPolicy( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['clip_eps'], self.agent_params['ent_coeff'], self.agent_params['max_grad_norm'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate_policyfn'], ) self.critic = PPOCritic(self.agent_params) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, ob_no, ac_no, re_n, next_ob_no, terminal_n, logprobs): """ Training a PPO agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # calculate advantages and target returs for value_function that correspond to each (s_t, a_t) point advantages, targets = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # step 1: use all datapoints (s_t, a_t, q_t, adv_t) to update the PPO actor/policy ## HINT: `train_log` should be returned by your actor update method loss = OrderedDict() #print(self.actor.parameters()) if self.ppo_min_bacth_size: n_batches = math.ceil(terminal_n.shape[0] / self.ppo_min_bacth_size) inds = np.arange(terminal_n.shape[0]) for _ in range(self.ppo_epochs): np.random.shuffle(inds) for i in range(n_batches): rand_indices = inds[slice( i * self.ppo_min_bacth_size, min(inds.shape[0], ((i + 1) * self.ppo_min_bacth_size)))] mb_ob_no = ob_no[rand_indices] mb_ac_no = ac_no[rand_indices] mb_adv = advantages[rand_indices] mb_targets = targets[rand_indices] mb_logprobs = logprobs[rand_indices] loss['critic_loss'] = self.critic.update( mb_ob_no, mb_targets) loss['agent_loss'] = self.actor.update( mb_ob_no, mb_ac_no, mb_adv, mb_logprobs) else: for _ in range(self.ppo_epochs): loss['critic_loss'] = self.critic.update(ob_no, targets) loss['agent_loss'] = self.actor.update(ob_no, ac_no, advantages, logprobs) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): """ Computes advantages (both gae and standard) from the estimated Q values """ v_t = self.critic.forward_np(ob_no) v_tp1 = self.critic.forward_np(next_ob_no) if self.use_gae: last_gae = 0 gaes = np.zeros(re_n.shape[0]) for i in range(re_n.shape[0] - 1, -1, -1): next_value = v_tp1[i] value = v_t[i] delta = re_n[i] + (self.gamma * next_value * (1 - terminal_n[i])) - value last_gae = delta + self.gamma * self.lam * last_gae * ( 1 - terminal_n[i]) gaes[i] = last_gae valuefn_targets = gaes + v_t advantages = gaes else: q_value = re_n + self.gamma * (v_tp1 * (1 - terminal_n)) valuefn_targets = q_value advantages = q_value - v_t # Normalize the resulting advantages if self.standardize_advantages: advantages = normalize(advantages, np.mean(advantages), np.std(advantages)) return advantages, valuefn_targets ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size) 
##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def save(self, path): torch.save( { "actor": self.actor.state_dict(), "critic": self.critic.state_dict(), "actor_optimizer": self.actor.optimizer.state_dict(), "critic_optimizer": self.critic.optimizer.state_dict() }, path)
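`PPOPolicy.update` above is driven by `clip_eps` and the stored `logprobs`, which suggests the standard clipped surrogate objective. A hedged sketch of that loss (the function name is hypothetical; advantages and old log-probabilities are assumed to be detached constants):

import torch

def ppo_policy_loss(new_log_prob, old_log_prob, advantages, clip_eps=0.2):
    # probability ratio pi_new(a|s) / pi_old(a|s)
    ratio = torch.exp(new_log_prob - old_log_prob)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # pessimistic bound, negated so it can be minimized
    return -torch.min(unclipped, clipped).mean()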
class ACAgent(BaseAgent): def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['discrete'], self.agent_params['learning_rate'], ) self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # TODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic # ob_no = ptu.from_numpy(ob_no) # ac_na = ptu.from_numpy(ac_na).to(torch.long) # next_ob_no = ptu.from_numpy(next_ob_no) # re_n = ptu.from_numpy(re_n) # terminal_n = ptu.from_numpy(terminal_n) for _ in range( self.agent_params['num_critic_updates_per_agent_update']): critic_loss = self.critic.update(ob_no=ob_no, ac_na=ac_na, reward_n=re_n, next_ob_no=next_ob_no, terminal_n=terminal_n) # targets = re_n + self.gamma * self.critic(next_ob_no) * (1-terminal_n) # pred = self.critic(ob_no) # #advantage = re_n + self.gamma * self.critic(next_ob_no) * (1-terminal_n) - self.critic(ob_no) # advantage = targets - pred advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor for _ in range( self.agent_params['num_actor_updates_per_agent_update']): actor_loss = self.actor.update(ob_no, ac_na, adv_n=advantage) loss = OrderedDict() loss['Critic_Loss'] = critic_loss loss['Actor_Loss'] = actor_loss return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): # TODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) value_targets = re_n + self.gamma * self.critic(next_ob_no) * ( 1. - terminal_n) value_pred = self.critic(ob_no) #advantage = re_n + self.gamma * self.critic(next_ob_no) * (1-terminal_n) - self.critic(ob_no) adv_n = value_targets - value_pred if self.standardize_advantages: adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8) return adv_n def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
class TRPOAgent(BaseAgent): def __init__(self, env, agent_params): super(TRPOAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params['gamma'] self.use_gae = self.agent_params['use_gae'] self.lam = self.agent_params['gae_lam'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] # actor/policy self.actor = TRPOPolicy( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['cg_steps'], self.agent_params['damping'], self.agent_params['max_backtracks'], self.agent_params['max_kl_increment'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) self.critic = TRPOCritic(self.agent_params) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, ob_no, ac_no, re_n, next_ob_no, terminal_n): """ Training a TRPO agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # calculate advantages and target returs for value_function that correspond to each (s_t, a_t) point advantages, targets = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) loss = OrderedDict() #print(self.actor.parameters()) loss['critic_loss'] = self.critic.update(ob_no, targets) loss['agent_loss'] = self.actor.update(ob_no, ac_no, advantages) return loss def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): """ Computes advantages (both gae and standard) from the estimated Q values """ v_t = self.critic.forward_np(ob_no) v_tp1 = self.critic.forward_np(next_ob_no) if self.use_gae: last_gae = 0 gaes = np.zeros(re_n.shape[0]) for i in range(re_n.shape[0] - 1, -1, -1): next_value = v_tp1[i] value = v_t[i] delta = re_n[i] + (self.gamma * next_value * (1 - terminal_n[i])) - value last_gae = delta + self.gamma * self.lam * last_gae * ( 1 - terminal_n[i]) gaes[i] = last_gae valuefn_targets = gaes + v_t advantages = gaes else: q_value = re_n + self.gamma * (v_tp1 * (1 - terminal_n)) valuefn_targets = q_value advantages = q_value - v_t # Normalize the resulting advantages if self.standardize_advantages: advantages = normalize(advantages, np.mean(advantages), np.std(advantages)) return advantages, valuefn_targets ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### def save(self, path): torch.save( { "actor": self.actor.state_dict(), "critic": self.critic.state_dict(), "actor_optimizer": self.actor.optimizer.state_dict(), "critic_optimizer": self.critic.optimizer.state_dict() }, path)
class ACAgent: def __init__(self, env, agent_params): super(ACAgent, self).__init__() self.env = env self.agent_params = agent_params self.num_critic_updates_per_agent_update = agent_params[ 'num_critic_updates_per_agent_update'] self.num_actor_updates_per_agent_update = agent_params[ 'num_actor_updates_per_agent_update'] self.device = agent_params['device'] self.gamma = self.agent_params['gamma'] self.standardize_advantages = self.agent_params[ 'standardize_advantages'] self.actor = MLPPolicyAC( self.agent_params['ac_dim'], self.agent_params['ob_dim'], self.agent_params['n_layers'], self.agent_params['size'], self.agent_params['device'], discrete=self.agent_params['discrete'], learning_rate=self.agent_params['learning_rate'], ) # introduced in actor-critic to improve advantage function. self.critic = BootstrappedContinuousCritic(self.agent_params) self.replay_buffer = ReplayBuffer() def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): ob, next_ob, rew, done = map( lambda x: torch.from_numpy(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) # DoneTODO Implement the following pseudocode: # 1) query the critic with ob_no, to get V(s) # 2) query the critic with next_ob_no, to get V(s') # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) v_s = self.critic.value_func(ob) v_s_prime = self.critic.value_func(next_ob).squeeze() v_s_prime[done >= 1] = 0 estimated_q = rew + self.gamma * v_s_prime adv_n = estimated_q - v_s adv_n = adv_n.cpu().detach().numpy() if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # DoneTODO Implement the following pseudocode: # for agent_params['num_critic_updates_per_agent_update'] steps, # update the critic # advantage = estimate_advantage(...) # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor loss = OrderedDict() for i in range(self.num_critic_updates_per_agent_update): loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n) adv = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) for i in range(self.num_actor_updates_per_agent_update): loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv) return loss def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size)
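The `MLPPolicyAC.update(ob_no, ac_na, adv)` calls used throughout this section are likewise not shown. A minimal sketch of an advantage-weighted policy-gradient step, assuming the policy's forward pass returns a `torch.distributions` object (for multi-dimensional continuous actions the log-probability may additionally need a sum over the action dimension); the helper name and signature are hypothetical:

import torch

def actor_update(policy, optimizer, ob, ac, adv):
    # maximize E[log pi(a|s) * A(s, a)] by minimizing its negative
    adv = torch.as_tensor(adv, dtype=torch.float32)
    log_prob = policy(ob).log_prob(ac)
    loss = -(log_prob * adv).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()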
class PGAgent(BaseAgent): def __init__(self, env, agent_params): super(PGAgent, self).__init__() # init vars self.env = env self.agent_params = agent_params self.gamma = self.agent_params["gamma"] self.standardize_advantages = self.agent_params["standardize_advantages"] self.nn_baseline = self.agent_params["nn_baseline"] self.reward_to_go = self.agent_params["reward_to_go"] # actor/policy self.actor = MLPPolicyPG( self.agent_params["ac_dim"], self.agent_params["ob_dim"], self.agent_params["n_layers"], self.agent_params["size"], discrete=self.agent_params["discrete"], learning_rate=self.agent_params["learning_rate"], nn_baseline=self.agent_params["nn_baseline"], ) # replay buffer self.replay_buffer = ReplayBuffer(1000000) def train(self, observations, actions, rewards_list, next_observations, terminals): """ Training a PG agent refers to updating its actor using the given observations/actions and the calculated qvals/advantages that come from the seen rewards. """ # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) q_values = self.calculate_q_vals(rewards_list) # step 2: calculate advantages that correspond to each (s_t, a_t) point advantages = self.estimate_advantage(observations, q_values) # step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy train_log = self.actor.update(observations, actions, advantages, q_values) return train_log def calculate_q_vals(self, rewards_list): """ Monte Carlo estimation of the Q function. """ # Case 1: trajectory-based PG # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory if not self.reward_to_go: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} q_values = np.concatenate( [self._discounted_return(r) for r in rewards_list] ) # Case 2: reward-to-go PG # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t else: # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} q_values = np.concatenate( [self._discounted_cumsum(r) for r in rewards_list] ) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # Estimate the advantage when nn_baseline is True, # by querying the neural network that you're using to learn the baseline if self.nn_baseline: baselines_unnormalized = self.actor.run_baseline_prediction(obs) ## ensure that the baseline and q_values have the same dimensionality ## to prevent silent broadcasting errors assert baselines_unnormalized.ndim == q_values.ndim ## baseline was trained with standardized q_values, so ensure that the predictions ## have the same mean and standard deviation as the current batch of q_values baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values) ## TODO: compute advantage estimates using q_values and baselines advantages = q_values - baselines # Else, just set the advantage to [Q] else: advantages = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: ## standardize the advantages to have a mean of zero ## and a standard deviation of one advantages = normalize(advantages) return advantages ##################################################### 
    #####################################################

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False)

    #####################################################
    ################## HELPER FUNCTIONS #################
    #####################################################

    def _discounted_return(self, rewards):
        """
        Helper function

        Input: list of rewards {r_0, r_1, ..., r_t', ... r_T}
            from a single rollout of length T

        Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'}
        """
        T = rewards.shape[0]
        discount_factors = np.power(self.gamma, np.arange(T))
        discounted_rewards = rewards * discount_factors
        ret = np.sum(discounted_rewards)
        return np.repeat(ret, T)

    def _discounted_cumsum(self, rewards):
        """
        Helper function which
        - takes a list of rewards {r_0, r_1, ..., r_t', ... r_T},
        - and returns a list where the entry in each index t is
          sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        """
        # HINT1: note that each entry of the output should now be unique,
        # because the summation happens over [t, T] instead of [0, T]
        # HINT2: it is possible to write a vectorized solution, but a solution
        # using a for loop is also fine
        T = rewards.shape[0]
        discount_factors = np.power(self.gamma, np.arange(T))
        discounted_rewards = rewards * discount_factors
        # We can write RTG(t) = (sum_{t'=t}^T gamma^t' r_{t'}) / gamma^t
        # Need a cumulative sum from the right, i.e. flip -> cumsum -> flip
        partial_sums = np.flip(np.cumsum(np.flip(discounted_rewards)))
        rewards_to_go = partial_sums / discount_factors
        return rewards_to_go
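# A quick, standalone sanity check (not part of the agent) for the vectorized
# reward-to-go trick above: RTG(t) = (sum_{t'>=t} gamma^t' r_{t'}) / gamma^t.
# The reward values here are arbitrary and only serve to compare the
# flip -> cumsum -> flip version against a naive double loop.
import numpy as np

gamma = 0.9
rewards = np.array([1.0, 0.0, 2.0, 3.0])
T = rewards.shape[0]

# vectorized version (same idea as _discounted_cumsum above)
discounts = np.power(gamma, np.arange(T))
rtg_vectorized = np.flip(np.cumsum(np.flip(rewards * discounts))) / discounts

# naive reference: sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
rtg_naive = np.array([
    sum(gamma ** (tp - t) * rewards[tp] for tp in range(t, T))
    for t in range(T)
])

assert np.allclose(rtg_vectorized, rtg_naive)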
class PGAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params[
            'standardize_advantages']
        self.nn_baseline = self.agent_params['nn_baseline']
        self.reward_to_go = self.agent_params['reward_to_go']

        # actor/policy
        # NOTICE that we are using MLPPolicyPG (hw2), instead of MLPPolicySL (hw1),
        # which indicates a similar network structure (layout/inputs/outputs),
        # but a different training procedure
        # (supervised learning vs. policy gradients)
        self.actor = MLPPolicyPG(
            sess,
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
            nn_baseline=self.agent_params['nn_baseline'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(1000000)

    def train(self, obs, acs, rews_list, next_obs, terminals):
        """
        Training a PG agent refers to updating its actor using the given observations/actions
        and the calculated qvals/advantages that come from the seen rewards.

        ----------------------------------------------------------------------------------

        Recall that the expression for the policy gradient PG is

            PG = E_{tau} [sum_{t=0}^{T-1} grad log pi(a_t|s_t) * (Q_t - b_t)]

            where
            tau = (s_0, a_0, s_1, a_1, s_2, a_2, ...) is a trajectory,
            Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
            b_t is a baseline which may depend on s_t,
            and (Q_t - b_t) is the advantage.

        Thus, the PG update performed by the actor needs (s_t, a_t, q_t, adv_t),
        and that is exactly what this function provides.

        ----------------------------------------------------------------------------------
        """

        # step 1: calculate q values of each (s_t, a_t) point,
        # using rewards from that full rollout of length T: (r_0, ..., r_t, ..., r_{T-1})
        q_values = self.calculate_q_vals(rews_list)

        # step 2: calculate advantages that correspond to each (s_t, a_t) point
        advantage_values = self.estimate_advantage(obs, q_values)

        # step 3: pass the calculated values above into the actor/policy's update,
        # which performs the actual PG update step
        loss = self.actor.update(obs, acs, qvals=q_values, adv_n=advantage_values)
        return loss

    def calculate_q_vals(self, rews_list):
        """
        Monte Carlo estimation of the Q function.

        arguments:
            rews_list: length: number of sampled rollouts
                Each element corresponds to a particular rollout,
                and contains an array of the rewards for every step of that particular rollout

        returns:
            q_values: shape: (sum/total number of steps across the rollouts)
                Each entry corresponds to the estimated q(s_t, a_t) value
                of the corresponding obs/ac point at time t.
""" # Case 1: trajectory-based PG if not self.reward_to_go: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) using rewards from that entire trajectory # HINT1: value of each point (t) = total discounted reward summed over the entire trajectory (from 0 to T-1) # In other words, q(s_t, a_t) = sum_{t'=0}^{T-1} gamma^t' r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate([TODO for r in rews_list]) # Case 2: reward-to-go PG else: # TODO: Estimate the Q value Q^{pi}(s_t, a_t) as the reward-to-go # HINT1: value of each point (t) = total discounted reward summed over the remainder of that trajectory (from t to T-1) # In other words, q(s_t, a_t) = sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'} # Hint3: see the helper functions at the bottom of this file q_values = np.concatenate([TODO for r in rews_list]) return q_values def estimate_advantage(self, obs, q_values): """ Computes advantages by (possibly) subtracting a baseline from the estimated Q values """ # TODO: Estimate the advantage when nn_baseline is True # HINT1: pass obs into the neural network that you're using to learn the baseline # extra hint if you're stuck: see your actor's run_baseline_prediction # HINT2: advantage should be [Q-b] if self.nn_baseline: b_n_unnormalized = TODO b_n = b_n_unnormalized * np.std(q_values) + np.mean(q_values) adv_n = TODO # Else, just set the advantage to [Q] else: adv_n = q_values.copy() # Normalize the resulting advantages if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) return adv_n ##################################################### ##################################################### def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) def sample(self, batch_size): return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) ##################################################### ################## HELPER FUNCTIONS ################# ##################################################### # TODO: implement this function def _discounted_return(self, rewards): """ Helper function Input: a list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single rollout of length T Output: list where each index t contains sum_{t'=0}^{T-1} gamma^t' r_{t'} note that all entries of this output are equivalent because each index t is a sum from 0 to T-1 (and doesnt involve t) """ # 1) create a list of indices (t'): from 0 to T-1 indices = TODO # 2) create a list where the entry at each index (t') is gamma^(t') discounts = TODO # 3) create a list where the entry at each index (t') is gamma^(t') * r_{t'} discounted_rewards = TODO # 4) calculate a scalar: sum_{t'=0}^{T-1} gamma^(t') * r_{t'} sum_of_discounted_rewards = TODO # 5) create a list of length T-1, where each entry t contains that scalar list_of_discounted_returns = TODO return list_of_discounted_returns def _discounted_cumsum(self, rewards): """ Input: a list of length T a list of rewards {r_0, r_1, ..., r_t', ... 
            r_{T-1}} from a single rollout of length T

        Output:
            a list of length T
            a list where the entry in each index t is sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
        """
        all_discounted_cumsums = []

        # for loop over steps (t) of the given rollout
        for start_time_index in range(len(rewards)):
            # 1) create a list of indices (t'): goes from t to T-1
            indices = np.arange(start_time_index, len(rewards))
            # 2) create a list where the entry at each index (t') is gamma^(t'-t)
            discounts = np.power(self.gamma, indices - start_time_index)
            # 3) create a list where the entry at each index (t') is gamma^(t'-t) * r_{t'}
            #    (t' goes from t to T-1, so use the rewards from those indices as well)
            discounted_rtg = discounts * np.array(rewards[start_time_index:])
            # 4) calculate a scalar: sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}
            sum_discounted_rtg = np.sum(discounted_rtg)
            # append each of these calculated sums into the list to return
            all_discounted_cumsums.append(sum_discounted_rtg)

        list_of_discounted_cumsums = np.array(all_discounted_cumsums)
        return list_of_discounted_cumsums
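# A tiny, standalone illustration (hypothetical numbers, not part of the agent)
# of the difference between the two Q-value estimators above: with gamma = 1
# and rewards [1, 2, 3], the full-trajectory estimator assigns the same total
# return to every step, while reward-to-go only sums rewards from t onward.
import numpy as np

rewards = np.array([1.0, 2.0, 3.0])
full_trajectory = np.repeat(rewards.sum(), len(rewards))   # -> [6., 6., 6.]
reward_to_go = np.flip(np.cumsum(np.flip(rewards)))        # -> [6., 5., 3.]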
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params, **kwargs):
        super(ACAgent, self).__init__()
        self.env = env
        self.agent_params = agent_params
        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        # actor/policy
        if self.agent_params['discrete']:
            self.actor = DiscreteMLPPolicy(self.agent_params['ac_dim'],
                                           self.agent_params['ob_dim'],
                                           self.agent_params['n_layers'],
                                           self.agent_params['size'])
        else:
            self.actor = ContinuousMLPPolicy(self.agent_params['ac_dim'],
                                             self.agent_params['ob_dim'],
                                             self.agent_params['n_layers'],
                                             self.agent_params['size'])
        self.policy_optimizer = tf.keras.optimizers.Adam(learning_rate=self.agent_params['learning_rate'])

        self.critic = BootstrappedContinuousCritic(self.agent_params)
        self.critic_loss = tf.keras.losses.MeanSquaredError()
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.agent_params['learning_rate'])
        self.critic.nn_critic.compile(optimizer=self.critic_optimizer, loss=self.critic_loss)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no, to get V(s)
        # 2) query the critic with next_ob_no, to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        #    (cut off the V(s') term, i.e. set it to 0, at terminal states where terminal_n=1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        current_state_values = self.critic(ob_no)
        next_state_values = self.gamma * self.critic(next_ob_no) * (
            1.0 - tf.expand_dims(tf.cast(terminal_n, tf.float32), axis=1))
        adv_n = next_state_values + re_n - current_state_values
        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # for agent_params['num_critic_updates_per_agent_update'] steps, update the critic
        # advantage = estimate_advantage(...)
        # for agent_params['num_actor_updates_per_agent_update'] steps, update the actor
        critic_history = None
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            for _ in range(self.agent_params['num_target_updates']):
                critic_targets = self.critic.get_training_targets(next_ob_no, re_n, terminal_n, self.gamma)
                critic_dataset = tf.data.Dataset.from_tensor_slices(
                    (tf.cast(ob_no, tf.float32), tf.cast(critic_targets, tf.float32)))
                critic_dataset = critic_dataset.batch(batch_size=critic_targets.shape[0]).repeat()
                critic_history = self.critic.nn_critic.fit(
                    critic_dataset, epochs=1,
                    steps_per_epoch=self.agent_params['num_grad_steps_per_target_update'])

        advantage = tf.stop_gradient(self.estimate_advantage(
            ob_no, next_ob_no, tf.expand_dims(re_n, axis=1), terminal_n))

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            with tf.GradientTape() as tape:
                log_action_probas = self.actor.get_log_prob(ob_no, ac_na)
                loss = -tf.reduce_mean(advantage * tf.expand_dims(log_action_probas, axis=1))
            actor_vars = self.actor.trainable_variables
            grads = tape.gradient(loss, actor_vars)
            self.policy_optimizer.apply_gradients(zip(grads, actor_vars))

        loss_dict = OrderedDict()
        # final critic loss (last value recorded by Keras during the last fit call)
        loss_dict['Critic_Loss'] = critic_history.history['loss'][-1] if critic_history is not None else 0
        # final actor loss
        loss_dict['Actor_Loss'] = loss.numpy().item()
        return loss_dict

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
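# A minimal sketch (standalone, with made-up numbers) of the actor loss used in
# the GradientTape block above: minimizing -mean(advantage * log pi(a|s)) pushes
# up the log-probability of actions with positive advantage and pushes down
# the log-probability of actions with negative advantage.
import numpy as np

log_probs = np.array([-0.5, -1.2, -0.1])     # hypothetical log pi(a_t | s_t)
advantages = np.array([2.0, -1.0, 0.5])      # hypothetical A(s_t, a_t)

actor_loss = -np.mean(advantages * log_probs)
# the gradient w.r.t. each log-prob is -advantage / N, so gradient descent
# increases log pi for positive-advantage actions and decreases it otherwise
grad_wrt_log_probs = -advantages / len(log_probs)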