# Assumed imports: OrderedDict and numpy are used directly below; MLPPolicyAC,
# BootstrappedContinuousCritic, ReplayBuffer, and BaseAgent are provided by the
# surrounding homework codebase.
from collections import OrderedDict

import numpy as np


class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # Update the critic several times, then use it to estimate advantages,
        # then update the actor several times on those advantages.
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (setting it to 0) at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        # (A small numeric sketch of this computation follows this class.)
        V_s = self.critic.forward_np(ob_no)
        V_s_prime = self.critic.forward_np(next_ob_no)
        Q_s_a = re_n + self.gamma * V_s_prime * (1 - terminal_n)
        adv_n = Q_s_a - V_s

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
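# A minimal numeric sketch (not part of the agent above) of the advantage
# computation it implements: A(s, a) = r(s, a) + gamma * V(s') * (1 - done) - V(s).
# All values below are made up purely for illustration.
import numpy as np

gamma = 0.95
re_n = np.array([1.0, 1.0, 1.0])        # rewards r(s, a)
v_s = np.array([10.0, 9.5, 9.0])        # critic's V(s)
v_s_prime = np.array([9.5, 9.0, 8.5])   # critic's V(s')
terminal_n = np.array([0.0, 0.0, 1.0])  # the last transition ends the episode

# Bootstrapped Q estimate; V(s') is zeroed where the episode terminated.
q_s_a = re_n + gamma * v_s_prime * (1 - terminal_n)
adv_n = q_s_a - v_s
print(adv_n)  # roughly [0.025, 0.05, -8.0]; the terminal step gets no bootstrap term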
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ob_dim'],
            self.agent_params['ac_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['device'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer(agent_params['replay_size'])

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])

        value = self.critic.value_func(ob).squeeze()
        next_value = self.critic.value_func(next_ob).squeeze() * (1 - done)
        adv_n = rew + (self.gamma * next_value) - value
        adv_n = adv_n.cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        for critic_update in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)  # put final critic loss here

        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for actor_update in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n)  # put final actor loss here

        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent:
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params
        self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update']
        self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']
        self.device = agent_params['device']

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['device'],
            discrete=self.agent_params['discrete'],
            learning_rate=self.agent_params['learning_rate'],
        )
        # The critic is what actor-critic introduces to improve the advantage estimate.
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        ob, next_ob, rew, done = map(
            lambda x: torch.from_numpy(x).to(self.device),
            [ob_no, next_ob_no, re_n, terminal_n])

        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    cutting off the V(s') term (setting it to 0) at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        v_s = self.critic.value_func(ob).squeeze()  # squeeze so shapes match v_s_prime
        v_s_prime = self.critic.value_func(next_ob).squeeze()
        v_s_prime[done >= 1] = 0
        estimated_q = rew + self.gamma * v_s_prime
        adv_n = estimated_q - v_s
        adv_n = adv_n.cpu().detach().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic;
        # then estimate advantages; then for num_actor_updates_per_agent_update
        # steps, update the actor.
        loss = OrderedDict()

        for i in range(self.num_critic_updates_per_agent_update):
            loss['Critic_Loss'] = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)

        adv = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for i in range(self.num_actor_updates_per_agent_update):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv)

        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic.
        # ob_no = ptu.from_numpy(ob_no)
        # ac_na = ptu.from_numpy(ac_na).to(torch.long)
        # next_ob_no = ptu.from_numpy(next_ob_no)
        # re_n = ptu.from_numpy(re_n)
        # terminal_n = ptu.from_numpy(terminal_n)
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_loss = self.critic.update(ob_no=ob_no,
                                             ac_na=ac_na,
                                             reward_n=re_n,
                                             next_ob_no=next_ob_no,
                                             terminal_n=terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # For num_actor_updates_per_agent_update steps, update the actor.
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_loss = self.actor.update(ob_no, ac_na, adv_n=advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = critic_loss
        loss['Actor_Loss'] = actor_loss
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        value_targets = re_n + self.gamma * self.critic(next_ob_no) * (1. - terminal_n)
        value_pred = self.critic(ob_no)
        adv_n = value_targets - value_pred

        if self.standardize_advantages:
            adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, sess, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.sess = sess
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(sess,
                                 self.agent_params['ac_dim'],
                                 self.agent_params['ob_dim'],
                                 self.agent_params['n_layers'],
                                 self.agent_params['size'],
                                 discrete=self.agent_params['discrete'],
                                 learning_rate=self.agent_params['learning_rate'],
                                 )
        self.critic = BootstrappedContinuousCritic(sess, self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        vs = self.sess.run(self.critic.critic_prediction,
                           feed_dict={self.critic.sy_ob_no: ob_no})
        vsprime = self.sess.run(self.critic.critic_prediction,
                                feed_dict={self.critic.sy_ob_no: next_ob_no}) * (1 - terminal_n)
        q_val = re_n + self.gamma * vsprime
        adv_n = q_val - vs

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic;
        # then estimate advantages; then for num_actor_updates_per_agent_update
        # steps, update the actor.
        for x in range(self.agent_params['num_critic_updates_per_agent_update']):
            closs = self.critic.update(ob_no, next_ob_no, re_n, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for x in range(self.agent_params['num_actor_updates_per_agent_update']):
            aloss = self.actor.update(ob_no, ac_na, advantage)

        loss = OrderedDict()
        loss['Critic_Loss'] = closs  # put final critic loss here
        loss['Actor_Loss'] = aloss  # put final actor loss here
        return loss

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        # For num_critic_updates_per_agent_update steps, update the critic.
        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # For num_actor_updates_per_agent_update steps, update the actor.
        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantages)

        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        re_n = ptu.from_numpy(re_n)
        terminal_n = ptu.from_numpy(terminal_n).bool()

        v_s = self.critic(ob_no)
        v_sp1 = self.critic(next_ob_no)
        v_sp1[terminal_n] = 0
        q_sa = re_n + self.gamma * v_sp1
        adv_n = q_sa - v_s
        assert adv_n.size() == re_n.size()
        adv_n = adv_n.detach().cpu().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
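# A minimal, self-contained sketch (an assumption, not the actual
# BootstrappedContinuousCritic used above) of what each critic.update(...) call
# in these agents typically performs: regress V(s) toward the bootstrapped
# target r + gamma * V(s') * (1 - done), holding the target fixed (detached).
import torch
import torch.nn as nn

value_net = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 1))
optimizer = torch.optim.Adam(value_net.parameters(), lr=1e-3)
gamma = 0.99

def critic_update(ob, next_ob, rew, done):
    # Bootstrapped target; computed under no_grad so gradients do not flow through V(s').
    with torch.no_grad():
        target = rew + gamma * value_net(next_ob).squeeze(-1) * (1 - done)
    v_pred = value_net(ob).squeeze(-1)
    loss = nn.functional.mse_loss(v_pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Tiny fake batch, purely for illustration.
ob = torch.randn(8, 4)
next_ob = torch.randn(8, 4)
rew = torch.randn(8)
done = torch.zeros(8)
print(critic_update(ob, next_ob, rew, done))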
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.gae = self.agent_params['gae']
        self.gae_lambda = self.agent_params['gae_lambda']
        self.ppo = self.agent_params['ppo']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
            self.agent_params['clip_eps'],
        )
        if self.ppo:
            # PPO keeps a frozen copy of the policy to compute the old log-probs
            # used in the clipped surrogate objective.
            self.old_actor = MLPPolicyAC(
                self.agent_params['ac_dim'],
                self.agent_params['ob_dim'],
                self.agent_params['n_layers'],
                self.agent_params['size'],
                self.agent_params['discrete'],
                self.agent_params['learning_rate'],
                self.agent_params['clip_eps'],
            )
            self.old_actor.load_state_dict(self.actor.state_dict())

        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic.
        # When GAE is enabled, re_n is a list of per-rollout reward arrays, so
        # concatenate it before passing it to the critic.
        rewards = np.concatenate([r for r in re_n]) if self.gae else re_n
        assert rewards.shape == terminal_n.shape
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic = self.critic.update(ob_no, ac_na, next_ob_no, rewards, terminal_n)

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        old_log_prob = self.get_old_prob(self.old_actor, ob_no, ac_na) if self.ppo else None

        # For num_actor_updates_per_agent_update steps, update the actor.
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor = self.actor.update(ob_no, ac_na, advantage, old_log_prob)

        if self.ppo:
            self.old_actor.load_state_dict(self.actor.state_dict())

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        loss['Actor_Loss'] = loss_actor
        return loss

    def get_old_prob(self, old_policy, ob_no, ac_na):
        observations = ptu.from_numpy(ob_no)
        actions = ptu.from_numpy(ac_na)
        log_prob = old_policy.forward(observations).log_prob(actions)
        return ptu.to_numpy(log_prob)

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        # With GAE enabled, advantages are instead built from per-rollout TD
        # errors (see the standalone sketch after this class).
        v_s = self.critic.forward_np(ob_no)
        if not self.gae:
            v_s_next = self.critic.forward_np(next_ob_no) * (1 - terminal_n)
            adv_n = re_n + self.gamma * v_s_next - v_s
        else:
            index = 0
            adv_n = np.zeros(len(ob_no))
            for rewards in re_n:
                # TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);
                # the last step of each rollout has no bootstrap term.
                gae_deltas = []
                for i in range(len(rewards) - 1):
                    delta = rewards[i] + self.gamma * v_s[index + i + 1] - v_s[index + i]
                    gae_deltas.append(delta)
                i = len(rewards) - 1
                gae_deltas.append(rewards[i] - v_s[index + i])
                assert len(gae_deltas) == len(rewards)

                # Backward recursion: A_t = delta_t + gamma * lambda * A_{t+1}.
                sum_deltas = 0
                for t in range(len(gae_deltas) - 1, -1, -1):
                    sum_deltas = gae_deltas[t] + sum_deltas * self.gamma * self.gae_lambda
                    adv_n[t + index] = sum_deltas
                index += len(rewards)

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        # With GAE, keep rewards separated per rollout rather than concatenated.
        concat_rew = False if self.gae else True
        return self.replay_buffer.sample_recent_data(batch_size, concat_rew)
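# A small standalone sketch (not the class above) of the GAE recursion that the
# nested loop in estimate_advantage implements for one finite rollout:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)   (no bootstrap at the last step)
#   A_t     = delta_t + gamma * lambda * A_{t+1}
import numpy as np

def gae_single_rollout(rewards, values, gamma=0.99, lam=0.95):
    T = len(rewards)
    deltas = np.empty(T)
    deltas[:-1] = rewards[:-1] + gamma * values[1:] - values[:-1]
    deltas[-1] = rewards[-1] - values[-1]  # terminal step: V after the rollout treated as 0

    advantages = np.empty(T)
    running = 0.0
    for t in reversed(range(T)):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages

# Made-up numbers, purely for illustration.
print(gae_single_rollout(np.array([1.0, 1.0, 1.0]), np.array([2.5, 2.0, 1.0])))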
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        ob_no = ptu.from_numpy(ob_no)
        ac_na = ptu.from_numpy(ac_na)
        re_n = ptu.from_numpy(re_n)
        next_ob_no = ptu.from_numpy(next_ob_no)
        terminal_n = ptu.from_numpy(terminal_n)

        for _ in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        advantages = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        advantages = ptu.from_numpy(advantages)

        for _ in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, adv_n=advantages)

        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        v_s_n = self.critic(ob_no)
        v_s_prime_n = self.critic(next_ob_no)
        # Setting V(s') to zero if the next state is a terminal state.
        q_n = re_n + self.gamma * v_s_prime_n * (1 - terminal_n)
        adv_n = q_n - v_s_n
        assert adv_n.size() == re_n.size()
        adv_n = adv_n.detach().cpu().numpy()

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
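# A minimal sketch (an assumption, not the actual MLPPolicyAC.update used above)
# of the actor update these loops perform for a discrete policy: a policy-gradient
# step on -E[log pi(a|s) * A(s, a)], with the advantages treated as constants.
import torch
import torch.nn as nn

policy_net = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2))  # 2 discrete actions
optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-3)

def actor_update(ob, ac, adv):
    dist = torch.distributions.Categorical(logits=policy_net(ob))
    loss = -(dist.log_prob(ac) * adv).mean()  # advantages enter as fixed weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Tiny fake batch, purely for illustration.
ob = torch.randn(8, 4)
ac = torch.randint(0, 2, (8,))
adv = torch.randn(8)
print(actor_update(ob, ac, adv))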
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        # For num_critic_updates_per_agent_update steps, update the critic;
        # then estimate advantages; then for num_actor_updates_per_agent_update
        # steps, update the actor. Losses are averaged over the update steps.
        loss = OrderedDict()
        critic_losses = []
        actor_losses = []

        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            critic_losses.append(self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n))

        advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            actor_losses.append(self.actor.update(ob_no, ac_na, advantage))

        loss['Critic_Loss'] = np.mean(critic_losses)
        loss['Actor_Loss'] = np.mean(actor_losses)
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        Vs = self.critic(ob_no)
        Vs_prime = self.critic(next_ob_no)
        terminal_index = [i for i, x in enumerate(terminal_n) if x]
        if len(terminal_index):
            Vs_prime[terminal_index] = 0
        Qs = re_n + self.gamma * Vs_prime
        adv_n = Qs - Vs

        if self.standardize_advantages:
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super().__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']
        self.n_drivers = self.agent_params['n_drivers']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size_ac'],
            self.agent_params['shared_exp'],
            self.agent_params['shared_exp_lambda'],
            self.agent_params['is_city'],
            self.agent_params['learning_rate'],
            self.agent_params['n_drivers']
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        loss = OrderedDict()

        # For num_critic_updates_per_agent_update steps, update the critic.
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            if not self.agent_params['shared_exp']:
                loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)
            else:
                # With shared experience, the critic update also needs the
                # per-driver action distributions from the actor.
                action_distributions = self.actor.shared_forward(ptu.from_numpy(ob_no))
                loss['Critic_Loss'] = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                                         terminal_n, action_distributions)

        # advantage = estimate_advantage(...)
        if self.agent_params['shared_exp']:
            advantage = self.estimate_shared_advantage(ob_no, next_ob_no, re_n, terminal_n)
        else:
            advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        # For num_actor_updates_per_agent_update steps, update the actor.
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss['Actor_Loss'] = self.actor.update(ob_no, ac_na, advantage)

        return loss

    def estimate_shared_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # One advantage estimate per (driver i, driver k) pair of critic outputs.
        value_s = self.critic.shared_forward(ptu.from_numpy(ob_no))
        value_next_s = self.critic.shared_forward(ptu.from_numpy(next_ob_no))
        adv_n = dict()
        for i in range(self.n_drivers):
            for k in range(self.n_drivers):
                adv_n[(i, k)] = (re_n[:, k]
                                 + self.gamma * ptu.to_numpy(value_next_s[(i, k)])
                                 - ptu.to_numpy(value_s[(i, k)]))
                if self.standardize_advantages:
                    adv_n[(i, k)] = (adv_n[(i, k)] - np.mean(adv_n[(i, k)])) / (np.std(adv_n[(i, k)]) + 1e-8)
        return adv_n

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        value_s = self.critic.forward_np(ob_no)
        value_next_s = self.critic.forward_np(next_ob_no)
        value_next_s[terminal_n == 1] = 0
        adv_n = re_n + self.gamma * value_next_s - value_s

        if self.standardize_advantages:
            # Standardize per driver (per column).
            for i in range(self.n_drivers):
                adv_n[:, i] = (adv_n[:, i] - np.mean(adv_n[:, i])) / (np.std(adv_n[:, i]) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
class ACAgent(BaseAgent):
    def __init__(self, env, agent_params):
        super(ACAgent, self).__init__()

        self.env = env
        self.agent_params = agent_params

        self.gamma = self.agent_params['gamma']
        self.standardize_advantages = self.agent_params['standardize_advantages']

        self.actor = MLPPolicyAC(
            self.agent_params['ac_dim'],
            self.agent_params['ob_dim'],
            self.agent_params['n_layers'],
            self.agent_params['size'],
            self.agent_params['discrete'],
            self.agent_params['learning_rate'],
        )
        self.critic = BootstrappedContinuousCritic(self.agent_params)

        self.replay_buffer = ReplayBuffer()

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        ob_no = ptu.from_numpy(ob_no)
        next_ob_no = ptu.from_numpy(next_ob_no)
        terminal_n = ptu.from_numpy(terminal_n)
        re_n = ptu.from_numpy(re_n)
        ac_na = ptu.from_numpy(ac_na)

        loss_critic = 0.
        for i in range(self.agent_params['num_critic_updates_per_agent_update']):
            loss_critic += self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

        # advantage = estimate_advantage(...): a tensor is returned.
        adv_n = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)

        loss_actor = 0.
        for i in range(self.agent_params['num_actor_updates_per_agent_update']):
            loss_actor += self.actor.update(ob_no, ac_na, adv_n)

        loss = OrderedDict()
        loss['Critic_Loss'] = loss_critic
        # In TensorBoard, Actor_Loss actually increases, since we minimize -loss_actor.
        loss['Actor_Loss'] = loss_actor
        return loss

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        # 1) query the critic with ob_no to get V(s)
        # 2) query the critic with next_ob_no to get V(s')
        # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma * V(s'),
        #    zeroing the V(s') term at terminal states (terminal_n == 1)
        # 4) calculate the advantage as A(s, a) = Q(s, a) - V(s)
        #
        # Equivalent masked_fill-based formulation, kept for reference:
        # V_s_prime = self.critic.critic_network(next_ob_no).squeeze()
        # V_s_prime = V_s_prime.masked_fill(terminal_n == 1., 0.)
        # V_s = self.critic.critic_network(ob_no).squeeze()
        # adv_n = re_n + self.gamma * V_s_prime - V_s

        # Note: despite the name, V_s_prime below is the full bootstrapped target
        # r(s, a) + gamma * V(s') with V(s') cut off at terminal states.
        V_s_prime = re_n + (1 - terminal_n) * self.gamma * self.critic.forward(next_ob_no)
        adv_n = V_s_prime - self.critic.forward(ob_no)

        if self.standardize_advantages:
            adv_n = (adv_n - torch.mean(adv_n)) / (torch.std(adv_n) + 1e-8)
        return adv_n

    def add_to_replay_buffer(self, paths):
        self.replay_buffer.add_rollouts(paths)

    def sample(self, batch_size):
        return self.replay_buffer.sample_recent_data(batch_size)
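# Standalone check (illustrative only) that the two ways of cutting off V(s')
# used across the variants above agree: multiplying by (1 - done) versus zeroing
# the terminal entries with masked_fill or boolean indexing.
import torch

gamma = 0.9
rew = torch.tensor([1.0, 1.0, 1.0])
v_next = torch.tensor([5.0, 4.0, 3.0])
done = torch.tensor([0.0, 1.0, 0.0])

target_a = rew + gamma * v_next * (1 - done)

v_next_masked = v_next.masked_fill(done == 1.0, 0.0)
target_b = rew + gamma * v_next_masked

print(torch.allclose(target_a, target_b))  # True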