# Assumed imports: OrderedDict and numpy are used directly below; MLPPolicyAC,
# BootstrappedContinuousCritic, ReplayBuffer, and BaseAgent are provided by the
# surrounding homework codebase.
from collections import OrderedDict

import numpy as np


class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # Update the critic several times, then use it to estimate advantages,
        # then update the actor several times on those advantages.
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (setting it to 0) at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        # (A small numeric sketch of this computation follows this class.)
        V_s = self.critic.forward_np(ob_no)
        V_s_prime = self.critic.forward_np(next_ob_no)
        Q_s_a = re_n + self.gamma * V_s_prime * (1 - terminal_n)
        adv_n = Q_s_a - V_s

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
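# A minimal numeric sketch (not part of the agent above) of the advantage
# computation it implements: A(s, a) = r(s, a) + gamma * V(s') * (1 - done) - V(s).
# All values below are made up purely for illustration.
import numpy as np

gamma = 0.95
re_n = np.array([1.0, 1.0, 1.0])        # rewards r(s, a)
v_s = np.array([10.0, 9.5, 9.0])        # critic's V(s)
v_s_prime = np.array([9.5, 9.0, 8.5])   # critic's V(s')
terminal_n = np.array([0.0, 0.0, 1.0])  # the last transition ends the episode

# Bootstrapped Q estimate; V(s') is zeroed where the episode terminated.
q_s_a = re_n + gamma * v_s_prime * (1 - terminal_n)
adv_n = q_s_a - v_s
print(adv_n)  # roughly [0.025, 0.05, -8.0]; the terminal step gets no bootstrap term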
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ob_dim'],
            self.agent_params['ac_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['device'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer(agent_params['replay_size'])

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])

        value = self.critic.value_func(ob).squeeze()
        next_value = self.critic.value_func(next_ob).squeeze() * (1 - done)
        adv_n = rew + (self.gamma * next_value) - value
        adv_n = adv_n.cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        for critic_update in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)  # put final critic loss here

        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for actor_update in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n)  # put final actor loss here

        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['device'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )
        # The critic is what actor-critic introduces to improve the advantage estimate.
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])

        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (setting it to 0) at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        v_s = self.critic.value_func(ob).squeeze()  # squeeze so shapes match v_s_prime
        v_s_prime = self.critic.value_func(next_ob).squeeze()
        v_s_prime[done >= 1] = 0
        estimated_q = rew + self.gamma * v_s_prime
        adv_n = estimated_q - v_s
        adv_n = adv_n.cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic;
        # then estimate advantages; then for num_actor_updates_per_agent_update
        # steps, update the actor.
        loss = OrderedDict()

        for i in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)

        adv = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for i in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv)

        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic.
        # ob_no = ptu.from_numpy(ob_no)
        # ac_na = ptu.from_numpy(ac_na).to(torch.long)
        # next_ob_no = ptu.from_numpy(next_ob_no)
        # re_n = ptu.from_numpy(re_n)
        # terminal_n = ptu.from_numpy(terminal_n)
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no=ob_no,
                                             ac_na=ac_na,
                                             reward_n=re_n,
                                             next_ob_no=next_ob_no,
                                             terminal_n=terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # For num_actor_updates_per_agent_update steps, update the actor.
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, adv_n=advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        value_targets = re_n + self.gamma * self.critic(next_ob_no) * (1. - terminal_n)
        value_pred = self.critic(ob_no)
        adv_n = value_targets - value_pred

        if self.standardize_advantages:
            adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.sess = sess
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(sess,
                                 self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 )
        self.critic = BootstrappedContinuousCritic(sess, self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        vs = self.sess.run(self.critic.critic_prediction,
                           feed_dict={self.critic.sy_ob_no: ob_no})
        vsprime = self.sess.run(self.critic.critic_prediction,
                                feed_dict={self.critic.sy_ob_no: next_ob_no}) * (1 - terminal_n)
        q_val = re_n + self.gamma * vsprime
        adv_n = q_val - vs

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic;
        # then estimate advantages; then for num_actor_updates_per_agent_update
        # steps, update the actor.
        for x in range(self.agent_params['num_critic_updates_per_agent_update']):
            closs = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for x in range(self.agent_params['num_actor_updates_per_agent_update']):
            aloss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = closs  # put final critic loss here
        loss['Actor_Loss'] = aloss  # put final actor loss here
        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        # For num_critic_updates_per_agent_update steps, update the critic.
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # For num_actor_updates_per_agent_update steps, update the actor.
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantages)

        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        re_n = ptu.from_numpy(re_n)
        terminal_n = ptu.from_numpy(terminal_n).bool()

        v_s = self.critic(ob_no)
        v_sp1 = self.critic(next_ob_no)
        v_sp1[terminal_n] = 0
        q_sa = re_n + self.gamma * v_sp1
        adv_n = q_sa - v_s
        assert adv_n.size() == re_n.size()
        adv_n = adv_n.detach().cpu().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
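# A minimal, self-contained sketch (an assumption, not the actual
# BootstrappedContinuousCritic used above) of what each critic.update(...) call
# in these agents typically performs: regress V(s) toward the bootstrapped
# target r + gamma * V(s') * (1 - done), holding the target fixed (detached).
import torch
import torch.nn as nn

value_net = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 1))
optimizer = torch.optim.Adam(value_net.parameters(), lr=1e-3)
gamma = 0.99

def critic_update(ob, next_ob, rew, done):
    # Bootstrapped target; computed under no_grad so gradients do not flow through V(s').
    with torch.no_grad():
        target = rew + gamma * value_net(next_ob).squeeze(-1) * (1 - done)
    v_pred = value_net(ob).squeeze(-1)
    loss = nn.functional.mse_loss(v_pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Tiny fake batch, purely for illustration.
ob = torch.randn(8, 4)
next_ob = torch.randn(8, 4)
rew = torch.randn(8)
done = torch.zeros(8)
print(critic_update(ob, next_ob, rew, done))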
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.gae = self.agent_params['gae']
        self.gae_lambda = self.agent_params['gae_lambda']
        self.ppo = self.agent_params['ppo']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
            self.agent_params['clip_eps'],
        )
        if self.ppo:
            # PPO keeps a frozen copy of the policy to compute the old log-probs
            # used in the clipped surrogate objective.
            self.old_actor = MLPPolicyAC(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['discrete'],
                self.agent_params['learning_rate'],
                self.agent_params['clip_eps'],
            )
            self.old_actor.load_state_dict(self.actor.state_dict())

        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic.
        # When GAE is enabled, re_n is a list of per-rollout reward arrays, so
        # concatenate it before passing it to the critic.
        rewards = np.concatenate([r for r in re_n]) if self.gae else re_n
        assert rewards.shape == terminal_n.shape
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic = self.critic.update(ob_no, ac_na, next_ob_no, rewards, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        old_log_prob = self.get_old_prob(self.old_actor, ob_no, ac_na) if self.ppo else None

        # For num_actor_updates_per_agent_update steps, update the actor.
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor = self.actor.update(ob_no, ac_na, advantage, old_log_prob)

        if self.ppo:
            self.old_actor.load_state_dict(self.actor.state_dict())

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        loss['Actor_Loss'] = loss_actor
        return loss

    def get_old_prob(self, old_policy, ob_no, ac_na):
        observations = ptu.from_numpy(ob_no)
        actions = ptu.from_numpy(ac_na)
        log_prob = old_policy.forward(observations).log_prob(actions)
        return ptu.to_numpy(log_prob)

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        # With GAE enabled, advantages are instead built from per-rollout TD
        # errors (see the standalone sketch after this class).
        v_s = self.critic.forward_np(ob_no)
        if not self.gae:
            v_s_next = self.critic.forward_np(next_ob_no) * (1 - terminal_n)
            adv_n = re_n + self.gamma * v_s_next - v_s
        else:
            index = 0
            adv_n = np.zeros(len(ob_no))
            for rewards in re_n:
                # TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);
                # the last step of each rollout has no bootstrap term.
                gae_deltas = []
                for i in range(len(rewards) - 1):
                    delta = rewards[i] + self.gamma * v_s[index + i + 1] - v_s[index + i]
                    gae_deltas.append(delta)
                i = len(rewards) - 1
                gae_deltas.append(rewards[i] - v_s[index + i])
                assert len(gae_deltas) == len(rewards)

                # Backward recursion: A_t = delta_t + gamma * lambda * A_{t+1}.
                sum_deltas = 0
                for t in range(len(gae_deltas) - 1, -1, -1):
                    sum_deltas = gae_deltas[t] + sum_deltas * self.gamma * self.gae_lambda
                    adv_n[t + index] = sum_deltas
                index += len(rewards)

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        # With GAE, keep rewards separated per rollout rather than concatenated.
        concat_rew = False if self.gae else True
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew)
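# A small standalone sketch (not the class above) of the GAE recursion that the
# nested loop in estimate_advantage implements for one finite rollout:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)   (no bootstrap at the last step)
#   A_t     = delta_t + gamma * lambda * A_{t+1}
import numpy as np

def gae_single_rollout(rewards, values, gamma=0.99, lam=0.95):
    T = len(rewards)
    deltas = np.empty(T)
    deltas[:-1] = rewards[:-1] + gamma * values[1:] - values[:-1]
    deltas[-1] = rewards[-1] - values[-1]  # terminal step: V after the rollout treated as 0

    advantages = np.empty(T)
    running = 0.0
    for t in reversed(range(T)):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages

# Made-up numbers, purely for illustration.
print(gae_single_rollout(np.array([1.0, 1.0, 1.0]), np.array([2.5, 2.0, 1.0])))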
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na)
        re_n = ptu.from_numpy(re_n)
        next_ob_no = ptu.from_numpy(next_ob_no)
        terminal_n = ptu.from_numpy(terminal_n)

        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        advantages = ptu.from_numpy(advantages)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n=advantages)

        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        v_s_n = self.critic(ob_no)
        v_s_prime_n = self.critic(next_ob_no)
        # Setting V(s') to zero if the next state is a terminal state.
        q_n = re_n + self.gamma * v_s_prime_n * (1 - terminal_n)
        adv_n = q_n - v_s_n
        assert adv_n.size() == re_n.size()
        adv_n = adv_n.detach().cpu().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
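# A minimal sketch (an assumption, not the actual MLPPolicyAC.update used above)
# of the actor update these loops perform for a discrete policy: a policy-gradient
# step on -E[log pi(a|s) * A(s, a)], with the advantages treated as constants.
import torch
import torch.nn as nn

policy_net = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2))  # 2 discrete actions
optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)

def actor_update(ob, ac, adv):
    dist = torch.distributions.Categorical(logits=policy_net(ob))
    loss = -(dist.log_prob(ac) * adv).mean()  # advantages enter as fixed weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Tiny fake batch, purely for illustration.
ob = torch.randn(8, 4)
ac = torch.randint(0, 2, (8,))
adv = torch.randn(8)
print(actor_update(ob, ac, adv))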
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic;
        # then estimate advantages; then for num_actor_updates_per_agent_update
        # steps, update the actor. Losses are averaged over the update steps.
        loss = OrderedDict()
        critic_losses = []
        actor_losses = []

        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_losses.append(self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n))

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_losses.append(self.actor.update(ob_no, ac_na, advantage))

        loss['Critic_Loss'] = np.mean(critic_losses)
        loss['Actor_Loss'] = np.mean(actor_losses)
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        Vs = self.critic(ob_no)
        Vs_prime = self.critic(next_ob_no)
        terminal_index = [i for i, x in enumerate(terminal_n) if x]
        if len(terminal_index):
            Vs_prime[terminal_index] = 0
        Qs = re_n + self.gamma * Vs_prime
        adv_n = Qs - Vs

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super().__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.n_drivers = self.agent_params['n_drivers']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size_ac'],
            self.agent_params['shared_exp'],
            self.agent_params['shared_exp_lambda'],
            self.agent_params['is_city'],
            self.agent_params['learning_rate'],
            self.agent_params['n_drivers']
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        # For num_critic_updates_per_agent_update steps, update the critic.
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            if not self.agent_params['shared_exp']:
                loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)
            else:
                # With shared experience, the critic update also needs the
                # per-driver action distributions from the actor.
                action_distributions = self.actor.shared_forward(ptu.from_numpy(ob_no))
                loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                                         terminal_n, action_distributions)

        # advantage = estimate_advantage(...)
        if self.agent_params['shared_exp']:
            advantage = self.estimate_shared_advantage(ob_no, next_ob_no, re_n, terminal_n)
        else:
            advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # For num_actor_updates_per_agent_update steps, update the actor.
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage)

        return loss

    def estimate_shared_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # One advantage estimate per (driver i, driver k) pair of critic outputs.
        value_s = self.critic.shared_forward(ptu.from_numpy(ob_no))
        value_next_s = self.critic.shared_forward(ptu.from_numpy(next_ob_no))
        adv_n = dict()
        for i in range(self.n_drivers):
            for k in range(self.n_drivers):
                adv_n[(i, k)] = (re_n[:, k]
                                 + self.gamma * ptu.to_numpy(value_next_s[(i, k)])
                                 - ptu.to_numpy(value_s[(i, k)]))
                if self.standardize_advantages:
                    adv_n[(i, k)] = (adv_n[(i, k)] - np.mean(adv_n[(i, k)])) / (np.std(adv_n[(i, k)]) + 1e-8)
        return adv_n

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        value_s = self.critic.forward_np(ob_no)
        value_next_s = self.critic.forward_np(next_ob_no)
        value_next_s[terminal_n == 1] = 0
        adv_n = re_n + self.gamma * value_next_s - value_s

        if self.standardize_advantages:
            # Standardize per driver (per column).
            for i in range(self.n_drivers):
                adv_n[:, i] = (adv_n[:, i] - np.mean(adv_n[:, i])) / (np.std(adv_n[:, i]) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        terminal_n = ptu.from_numpy(terminal_n)
        re_n = ptu.from_numpy(re_n)
        ac_na = ptu.from_numpy(ac_na)

        loss_critic = 0.
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        # advantage = estimate_advantage(...): a tensor is returned.
        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        loss_actor = 0.
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor += self.actor.update(ob_no, ac_na, adv_n)

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        # In TensorBoard, Actor_Loss actually increases, since we minimize -loss_actor.
        loss['Actor_Loss'] = loss_actor
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        #
        # Equivalent masked_fill-based formulation, kept for reference:
        # V_s_prime = self.critic.critic_network(next_ob_no).squeeze()
        # V_s_prime = V_s_prime.masked_fill(terminal_n == 1., 0.)
        # V_s = self.critic.critic_network(ob_no).squeeze()
        # adv_n = re_n + self.gamma * V_s_prime - V_s

        # Note: despite the name, V_s_prime below is the full bootstrapped target
        # r(s, a) + gamma * V(s') with V(s') cut off at terminal states.
        V_s_prime = re_n + (1 - terminal_n) * self.gamma * self.critic.forward(next_ob_no)
        adv_n = V_s_prime - self.critic.forward(ob_no)

        if self.standardize_advantages:
            adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
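# Standalone check (illustrative only) that the two ways of cutting off V(s')
# used across the variants above agree: multiplying by (1 - done) versus zeroing
# the terminal entries with masked_fill or boolean indexing.
import torch

gamma = 0.9
rew = torch.tensor([1.0, 1.0, 1.0])
v_next = torch.tensor([5.0, 4.0, 3.0])
done = torch.tensor([0.0, 1.0, 0.0])

target_a = rew + gamma * v_next * (1 - done)

v_next_masked = v_next.masked_fill(done == 1.0, 0.0)
target_b = rew + gamma * v_next_masked

print(torch.allclose(target_a, target_b))  # True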